From af15c88f64c311881cffbd7838a1c4f7d0176c94 Mon Sep 17 00:00:00 2001 From: Danyil Date: Mon, 18 Dec 2023 19:11:15 -0500 Subject: [PATCH 1/2] on unspecified dim index build on empty table, set index dimension to 0 and then later update it after inferring dimension from first insert --- src/hnsw.c | 17 +++++++++++++++++ src/hnsw.h | 1 + src/hnsw/build.c | 12 +++++++++--- src/hnsw/insert.c | 24 +++++++++++++++++++---- test/expected/hnsw_create.out | 36 +++++++++++++++++++++++++++++++---- test/sql/hnsw_create.sql | 32 +++++++++++++++++++++++++++++-- 6 files changed, 109 insertions(+), 13 deletions(-) diff --git a/src/hnsw.c b/src/hnsw.c index d47656e4..f2b6f6e8 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -447,6 +447,23 @@ HnswColumnType GetIndexColumnType(Relation index) return GetColumnTypeFromOid(attr->atttypid); } +/* + * Returns the element count of a vector/array datum (read-only, so no copy is taken); raises ERROR on unsupported column types + */ +int DatumGetLength(Datum datum, HnswColumnType type) +{ + if(type == VECTOR) { + Vector *vector = DatumGetVector(datum); + return vector->dim; + } else if(type == REAL_ARRAY || type == INT_ARRAY) { + ArrayType *array = DatumGetArrayTypeP(datum); + return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + } else { + elog(ERROR, "Unsupported type"); + } + return -1; +} + /* * Given vector data and vector type, read it as either a float4 or int32 array and return as void* */ diff --git a/src/hnsw.h b/src/hnsw.h index c20911c9..bdd36000 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -37,6 +37,7 @@ PGDLLEXPORT Datum vector_cos_dist(PG_FUNCTION_ARGS); HnswColumnType GetColumnTypeFromOid(Oid oid); HnswColumnType GetIndexColumnType(Relation index); +int DatumGetLength(Datum datum, HnswColumnType type); void* DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions); #define LDB_UNUSED(x) (void)(x) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 7e0e6e1f..4e78dbd8 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -232,7 +232,7 @@ static int GetArrayLengthFromHeap(Relation heap, int 
indexCol, IndexInfo *indexI tuple = heap_getnext(scan, ForwardScanDirection); if(tuple == NULL) { heap_endscan(scan); - return n_items; + return 0; } if(indexInfo->ii_Expressions != NULL) { @@ -349,10 +349,16 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i // If a dimension wasn't specified try to infer it if(buildstate->dimensions < 1) { + // todo:: isn't calling InferDimension and GetHnswIndexDimensions above redundant? buildstate->dimensions = InferDimension(heap, indexInfo); } - /* Require column to have dimensions to be indexed */ - if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one"); + + // At this point, (buildstate->dimensions == 0) if this is building an index with no dim specified on an empty table + // The zero is a sentinel value that we check upon the first insertion of a row + // Note that (buildstate->dimensions == -1) if something went wrong + if(buildstate->dimensions < 0) { + elog(ERROR, "could not infer a dimension when no dimension was specified"); + } // not supported because of 8K page limit in postgres WAL pages // can pass this limit once quantization is supported diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index aaff2690..aa5de4ba 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -70,8 +70,10 @@ bool ldb_aminsert(Relation index, GenericXLogState *state; uint32 new_tuple_id; HnswIndexTuple *new_tuple; + HnswColumnType column_type; usearch_init_options_t opts = {0}; LDB_UNUSED(heap); + LDB_UNUSED(indexInfo); #if PG_VERSION_NUM >= 140000 LDB_UNUSED(indexUnchanged); #endif @@ -103,8 +105,23 @@ bool ldb_aminsert(Relation index, hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page); assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER); - opts.dimensions = GetHnswIndexDimensions(index, indexInfo); - CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); + datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); + column_type = 
GetIndexColumnType(index); + + // Check if we created an index on an empty-table with no dimension specified + if(hdr->vector_dim == 0) { + opts.dimensions = DatumGetLength(datum, column_type); + if(opts.dimensions < 1) + elog(ERROR, + "Failed to infer dimension of inserted vector upon first insert on an empty table with no index " + "dimension specified."); + // update the index header (we mark hdr_buf dirty later) + hdr->vector_dim = opts.dimensions; + } else { + opts.dimensions = hdr->vector_dim; + CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); + } + PopulateUsearchOpts(index, &opts); opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr); opts.retriever = ldb_wal_index_node_retriever; @@ -125,14 +142,13 @@ bool ldb_aminsert(Relation index, insertstate->uidx = uidx; insertstate->retriever_ctx = opts.retriever_ctx; - insertstate->columnType = GetIndexColumnType(index); + insertstate->columnType = column_type; hdr_page = NULL; meta = usearch_metadata(uidx, &error); assert(!error); - datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions); #if LANTERNDB_COPYNODES diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index 2d866705..169b3225 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -92,18 +92,21 @@ CREATE TABLE small_world4 ( id varchar(3), vector real[] ); --- If the first row is NULL we do not infer a dimension +-- If the first inserted row is NULL: we can create an index but we can't infer the dimension from the first inserted row (since it is null) \set ON_ERROR_STOP off -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); -ERROR: column does not have dimensions, please specify one +CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 
0 vectors begin; INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), ('001', '{1,0,0,1}'); CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); -ERROR: column does not have dimensions, please specify one +ERROR: could not infer a dimension when no dimension was specified rollback; \set ON_ERROR_STOP on +DROP INDEX first_row_null_idx; INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), ('001', '{1,0,0,1}'), @@ -151,3 +154,28 @@ CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construct INFO: done init usearch index ERROR: Wrong number of dimensions: 3 instead of 4 expected \set ON_ERROR_STOP on +-- Test index creation on empty table and no dimension specified +CREATE TABLE small_world5 ( + id SERIAL PRIMARY KEY, + v REAL[] +); +-- We can still create an index despite having an empty table and not specifying a dimension during index creation +CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors +begin; +-- Our index then infers the dimension from the first inserted row +INSERT INTO small_world5 (id, v) VALUES +('000', '{1,0,0,0,1}'), +('001', '{1,0,0,1,2}'), +('010', '{1,0,1,0,3}'); +rollback; +-- Test that upon inferring the dimension from the first inserted row, we do not allow subsequent rows with different dimensions +\set ON_ERROR_STOP off +INSERT INTO small_world5 (id, v) VALUES +('100', '{2,0,0,0,1}'), +('101', '{2,0,0}'), +('110', '{2,0,1,0}'); +ERROR: Wrong number of dimensions: 3 instead of 5 expected +\set ON_ERROR_STOP on diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index 776ddb2d..19163a58 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -36,9 +36,9 @@ CREATE TABLE small_world4 ( id varchar(3), vector real[] ); --- If the first row is NULL we do not infer a dimension +-- If the first inserted row is NULL: we can create an 
index but we can't infer the dimension from the first inserted row (since it is null) \set ON_ERROR_STOP off -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +CREATE INDEX first_row_null_idx ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); begin; INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), @@ -46,6 +46,7 @@ INSERT INTO small_world4 (id, vector) VALUES CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); rollback; \set ON_ERROR_STOP on +DROP INDEX first_row_null_idx; INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), @@ -78,3 +79,30 @@ UPDATE small_world4 SET vector = '{0,0,0}' WHERE id = '001'; \set ON_ERROR_STOP off CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); \set ON_ERROR_STOP on + +-- Test index creation on empty table and no dimension specified +CREATE TABLE small_world5 ( + id SERIAL PRIMARY KEY, + v REAL[] +); + +-- We can still create an index despite having an empty table and not specifying a dimension during index creation +CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops); + +begin; +-- Our index then infers the dimension from the first inserted row +INSERT INTO small_world5 (id, v) VALUES +('000', '{1,0,0,0,1}'), +('001', '{1,0,0,1,2}'), +('010', '{1,0,1,0,3}'); +rollback; + +-- Test that upon inferring the dimension from the first inserted row, we do not allow subsequent rows with different dimensions +\set ON_ERROR_STOP off +INSERT INTO small_world5 (id, v) VALUES +('100', '{2,0,0,0,1}'), +('101', '{2,0,0}'), +('110', '{2,0,1,0}'); +\set ON_ERROR_STOP on + + From 4c114a3f0e48adf2c9e7775d9e299cba4ef9860e Mon Sep 17 00:00:00 2001 From: Danyil Date: Wed, 27 Dec 2023 15:26:46 -0500 Subject: [PATCH 2/2] added a null value insert in the empty table empty index test case in hnsw_create --- test/expected/hnsw_create.out | 5 ++++- test/sql/hnsw_create.sql | 6 
+++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index 169b3225..90e9a920 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -165,7 +165,10 @@ INFO: done init usearch index INFO: inserted 0 elements INFO: done saving 0 vectors begin; --- Our index then infers the dimension from the first inserted row +-- Inserting a NULL vector should only insert it into the table and not into our index +-- So, our index is still empty after and is yet to pick up a dimension +INSERT INTO small_world5 (id, v) VALUES ('200', NULL); +-- Our index then infers the dimension from the first inserted non-NULL row INSERT INTO small_world5 (id, v) VALUES ('000', '{1,0,0,0,1}'), ('001', '{1,0,0,1,2}'), diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index 19163a58..a212ca8d 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -90,7 +90,11 @@ CREATE TABLE small_world5 ( CREATE INDEX small_world5_hnsw_idx ON small_world5 USING hnsw (v dist_l2sq_ops); begin; --- Our index then infers the dimension from the first inserted row +-- Inserting a NULL vector should only insert it into the table and not into our index +-- So, our index is still empty after and is yet to pick up a dimension +INSERT INTO small_world5 (id, v) VALUES ('200', NULL); + +-- Our index then infers the dimension from the first inserted non-NULL row INSERT INTO small_world5 (id, v) VALUES ('000', '{1,0,0,0,1}'), ('001', '{1,0,0,1,2}'),