Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pick up index dimension on first insert for an empty table #251

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/hnsw.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,23 @@ HnswColumnType GetIndexColumnType(Relation index)
return GetColumnTypeFromOid(attr->atttypid);
}

/*
 * Returns the number of elements held by the vector or array stored in datum.
 *
 * datum: value of the indexed column; it is only read, never modified
 * type:  column type of the index, which selects how datum is decoded
 *
 * For VECTOR the vector's dim field is returned. For REAL_ARRAY and INT_ARRAY
 * the item count computed from the array's dimension header is returned.
 * Any other type is reported via elog(ERROR, "Unsupported type").
 */
int DatumGetLength(Datum datum, HnswColumnType type)
{
    if(type == VECTOR) {
        Vector *vector = DatumGetVector(datum);
        return vector->dim;
    } else if(type == REAL_ARRAY || type == INT_ARRAY) {
        // Only the array header (ndim/dims) is read below, so a private copy
        // of the array is not needed; use the non-copying accessor
        ArrayType *array = DatumGetArrayTypeP(datum);
        return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
    } else {
        elog(ERROR, "Unsupported type");
    }
    // Fallback return for the unsupported-type branch; presumably unreachable
    // because elog(ERROR) does not return, but it keeps every path returning a value
    return -1;
}

/*
* Given vector data and vector type, read it as either a float4 or int32 array and return as void*
*/
Expand Down
1 change: 1 addition & 0 deletions src/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ PGDLLEXPORT Datum vector_cos_dist(PG_FUNCTION_ARGS);

HnswColumnType GetColumnTypeFromOid(Oid oid);
HnswColumnType GetIndexColumnType(Relation index);
int DatumGetLength(Datum datum, HnswColumnType type);
void* DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions);

#define LDB_UNUSED(x) (void)(x)
Expand Down
12 changes: 9 additions & 3 deletions src/hnsw/build.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI
tuple = heap_getnext(scan, ForwardScanDirection);
if(tuple == NULL) {
heap_endscan(scan);
return n_items;
return 0;
}

if(indexInfo->ii_Expressions != NULL) {
Expand Down Expand Up @@ -349,10 +349,16 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i

// If a dimension wasn't specified try to infer it
if(buildstate->dimensions < 1) {
// todo:: isn't calling InferDimension and GetHnswIndexDimensions above redundant?
buildstate->dimensions = InferDimension(heap, indexInfo);
}
/* Require column to have dimensions to be indexed */
if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");

// At this point, (buildstate->dimensions == 0) if this is building an index with no dim specified on an empty table
// The zero is a sentinel value that we check upon the first insertion of a row
// Note that (buildstate->dimensions == -1) if something went wrong
if(buildstate->dimensions < 0) {
elog(ERROR, "could not infer a dimension when no dimension was specified");
}

// not supported because of 8K page limit in postgres WAL pages
// can pass this limit once quantization is supported
Expand Down
24 changes: 20 additions & 4 deletions src/hnsw/insert.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ bool ldb_aminsert(Relation index,
GenericXLogState *state;
uint32 new_tuple_id;
HnswIndexTuple *new_tuple;
HnswColumnType column_type;
usearch_init_options_t opts = {0};
LDB_UNUSED(heap);
LDB_UNUSED(indexInfo);
#if PG_VERSION_NUM >= 140000
LDB_UNUSED(indexUnchanged);
#endif
Expand Down Expand Up @@ -103,8 +105,23 @@ bool ldb_aminsert(Relation index,
hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page);
assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER);

opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
column_type = GetIndexColumnType(index);

// Check if we created an index on an empty-table with no dimension specified
if(hdr->vector_dim == 0) {
opts.dimensions = DatumGetLength(datum, column_type);
if(opts.dimensions < 1)
elog(ERROR,
"Failed to infer dimension of inserted vector upon first insert on an empty table with no index "
"dimension specified.");
// update the index header (we mark hdr_buf dirty later)
hdr->vector_dim = opts.dimensions;
} else {
opts.dimensions = hdr->vector_dim;
CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
}

PopulateUsearchOpts(index, &opts);
opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr);
opts.retriever = ldb_wal_index_node_retriever;
Expand All @@ -125,14 +142,13 @@ bool ldb_aminsert(Relation index,

insertstate->uidx = uidx;
insertstate->retriever_ctx = opts.retriever_ctx;
insertstate->columnType = GetIndexColumnType(index);
insertstate->columnType = column_type;

hdr_page = NULL;

meta = usearch_metadata(uidx, &error);
assert(!error);

datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions);

#if LANTERNDB_COPYNODES
Expand Down
36 changes: 32 additions & 4 deletions test/expected/hnsw_create.out
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,21 @@ CREATE TABLE small_world4 (
id varchar(3),
vector real[]
);
-- If the first row is NULL we do not infer a dimension
-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null)
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
ERROR: column does not have dimensions, please specify one
CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
INFO: done init usearch index
INFO: inserted 0 elements
INFO: done saving 0 vectors
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL),
('001', '{1,0,0,1}');
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
ERROR: column does not have dimensions, please specify one
ERROR: could not infer a dimension when no dimension was specified
rollback;
\set ON_ERROR_STOP on
DROP INDEX first_row_null_idx;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{1,0,0,0}'),
('001', '{1,0,0,1}'),
Expand Down Expand Up @@ -151,3 +154,28 @@ CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construct
INFO: done init usearch index
ERROR: Wrong number of dimensions: 3 instead of 4 expected
\set ON_ERROR_STOP on
-- Test index creation on empty table and no dimension specified
CREATE TABLE small_world5 (
id SERIAL PRIMARY KEY,
v REAL[]
);
-- We can still create an index despite having an empty table and not specifying a dimension during index creation
CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops);
INFO: done init usearch index
INFO: inserted 0 elements
INFO: done saving 0 vectors
begin;
-- Our index then infers the dimension from the first inserted row
INSERT INTO small_world5 (id, v) VALUES
('000', '{1,0,0,0,1}'),
('001', '{1,0,0,1,2}'),
('010', '{1,0,1,0,3}');
rollback;
-- Test that upon infering the dimension from the first inserted row, we do not allow subsequent rows with different dimensions
\set ON_ERROR_STOP off
INSERT INTO small_world5 (id, v) VALUES
('100', '{2,0,0,0,1}'),
('101', '{2,0,0}'),
('110', '{2,0,1,0}');
ERROR: Wrong number of dimensions: 3 instead of 5 expected
\set ON_ERROR_STOP on
32 changes: 30 additions & 2 deletions test/sql/hnsw_create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,17 @@ CREATE TABLE small_world4 (
id varchar(3),
vector real[]
);
-- If the first row is NULL we do not infer a dimension
-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null)
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL),
('001', '{1,0,0,1}');
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
rollback;
\set ON_ERROR_STOP on
DROP INDEX first_row_null_idx;

INSERT INTO small_world4 (id, vector) VALUES
('000', '{1,0,0,0}'),
Expand Down Expand Up @@ -78,3 +79,30 @@ UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001';
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
\set ON_ERROR_STOP on

-- Test index creation on empty table and no dimension specified
CREATE TABLE small_world5 (
id SERIAL PRIMARY KEY,
v REAL[]
);

-- We can still create an index despite having an empty table and not specifying a dimension during index creation
CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops);

begin;
-- Our index then infers the dimension from the first inserted row
INSERT INTO small_world5 (id, v) VALUES
('000', '{1,0,0,0,1}'),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you insert a NULL value here first, before inserting anything else, to trigger this case?

('001', '{1,0,0,1,2}'),
('010', '{1,0,1,0,3}');
rollback;

-- Test that upon infering the dimension from the first inserted row, we do not allow subsequent rows with different dimensions
\set ON_ERROR_STOP off
INSERT INTO small_world5 (id, v) VALUES
('100', '{2,0,0,0,1}'),
('101', '{2,0,0}'),
('110', '{2,0,1,0}');
\set ON_ERROR_STOP on


Loading