From 535e81d89134d1d35ac38c54f9d34e1c67d124eb Mon Sep 17 00:00:00 2001 From: Danyil Date: Wed, 18 Oct 2023 22:34:16 -0400 Subject: [PATCH 1/5] fixed all bugs and modified test outputs to a point wherethey're working --- src/hnsw.c | 16 ++++++ src/hnsw.h | 1 + src/hnsw/build.c | 61 ++++++++++++++++---- src/hnsw/build.h | 5 +- src/hnsw/external_index.c | 1 + src/hnsw/insert.c | 84 ++++++++++++++++++++++++++-- src/hnsw/options.c | 32 ++++++++--- src/hnsw/options.h | 5 ++ test/expected/hnsw_cost_estimate.out | 3 - test/expected/hnsw_create.out | 8 ++- test/expected/hnsw_dist_func.out | 18 +++--- test/expected/hnsw_insert.out | 12 ++-- test/sql/hnsw_create.sql | 2 +- 13 files changed, 202 insertions(+), 46 deletions(-) diff --git a/src/hnsw.c b/src/hnsw.c index 6e475ff1..1dc1fd57 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -172,6 +172,9 @@ static void hnswcostestimate(PlannerInfo *root, costs.numIndexTuples = estimate_number_tuples_accessed(path->indexinfo->indexoid, num_tuples_in_index); uint64 num_blocks_accessed = estimate_number_blocks_accessed(num_tuples_in_index, path->indexinfo->pages, costs.numIndexTuples); + // choose max{above, 1} since on a postponed index build, we will have 0 for the above quantity... this should only + // affect scans on empty indexes + num_blocks_accessed = (num_blocks_accessed > 1) ? num_blocks_accessed : 1; #if PG_VERSION_NUM >= 120000 genericcostestimate(root, path, loop_count, &costs); @@ -386,6 +389,19 @@ HnswColumnType GetIndexColumnType(Relation index) return GetColumnTypeFromOid(attr->atttypid); } +int DatumGetLength(Datum datum, HnswColumnType type) +{ + if(type == VECTOR) { + Vector *vector = DatumGetVector(datum); + return vector->dim; + } else if(type == REAL_ARRAY || type == INT_ARRAY) { + ArrayType *array = DatumGetArrayTypePCopy(datum); + return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + } else { + elog(ERROR, "Unsupported type"); + } +} + /* * Given vector data and vector type, read it as either a float4 or int32 array and return as void* */ diff --git a/src/hnsw.h b/src/hnsw.h index 570ed6bf..cf33defe 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -34,6 +34,7 @@ PGDLLEXPORT Datum cos_dist(PG_FUNCTION_ARGS); HnswColumnType GetColumnTypeFromOid(Oid oid); HnswColumnType GetIndexColumnType(Relation index); +int DatumGetLength(Datum datum, HnswColumnType type); void *DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions); #define LDB_UNUSED(x) (void)(x) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 7e0e6e1f..f1f8ccfb 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -119,7 +119,11 @@ static void BuildCallback( Relation index, CALLBACK_ITEM_POINTER, Datum *values, bool *isnull, bool tupleIsAlive, void *state) { HnswBuildState *buildstate = (HnswBuildState *)state; - MemoryContext oldCtx; + if(buildstate->postponed && buildstate->reltuples > 0) { + return; + } + //elog(INFO, "Callback for build!"); + MemoryContext oldCtx; // we can later use this for some optimizations I think LDB_UNUSED(tupleIsAlive); @@ -257,6 +261,8 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI return n_items; } +// Attempts to get the number of dimensions from the index, and falls back on a heap scan to get dimensions if that +// fails int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo) { HnswColumnType columnType = GetIndexColumnType(index); @@ -308,6 +314,10 @@ void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions) int n_items; HnswColumnType indexType = GetIndexColumnType(index); + if(dimensions == HNSW_DEFAULT_DIM) { + return; + } + if(indexType == REAL_ARRAY || indexType == INT_ARRAY) { /* Check dimensions of vector */ array = DatumGetArrayTypePCopy(arrayDatum); @@ -344,19 +354,21 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i buildstate->index = index; buildstate->indexInfo = indexInfo; buildstate->columnType = GetIndexColumnType(index); - buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo); + if(!buildstate->postponed) { + buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo); + } buildstate->index_file_path = ldb_HnswGetIndexFilePath(index); // If a dimension wasn't specified try to infer it - if(buildstate->dimensions < 1) { + if(buildstate->dimensions < 1 && !buildstate->postponed) { buildstate->dimensions = InferDimension(heap, indexInfo); } /* Require column to have dimensions to be indexed */ - if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one"); + // if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one"); // not supported because of 8K page limit in postgres WAL pages // can pass this limit once quantization is supported - if(buildstate->dimensions > HNSW_MAX_DIM) + if(buildstate->dimensions > HNSW_MAX_DIM && !buildstate->postponed) elog(ERROR, "vector dimension %d is too large. " "LanternDB currently supports up to %ddim vectors", @@ -408,14 +420,35 @@ static void ScanTable(HnswBuildState *buildstate) /* * Build the index */ -static void BuildIndex( - Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum) +void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum) { usearch_error_t error = NULL; usearch_init_options_t opts; MemSet(&opts, 0, sizeof(opts)); + bool empty_table = RelationGetNumberOfBlocks(heap) == 0; InitBuildState(buildstate, heap, index, indexInfo); + + if(buildstate->dimensions < 1 && !empty_table && !buildstate->postponed) { + elog(ERROR, "Failed to infer dimensions from non-empty table, please specify one"); + return; + } + + if(empty_table && !buildstate->postponed) { + // Postpone creation of index until first insert, where we can get dimension from that inserted vector and build + // the index with the right dimension buildstate has enough information at this point to return in original + // am_build + // TODO: we can only do this if there is no dimension specified in the options(cant be inferred) but that would + // require us passing that state to aminsert so we only build index + // elog(INFO, "Since table is empty, postponing creation of index until first insert"); + // we use indexInfo->AmCache to store this (since it's a void* we can check if it's NULL to encode the boolean + // value of whether we're postponing or not) + // elog(INFO, "BUILD, before setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache); + indexInfo->ii_AmCache = (void *)1; + // elog(INFO, "BUILD, after setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache); + return; + } + opts.dimensions = buildstate->dimensions; PopulateUsearchOpts(index, &opts); @@ -424,7 +457,8 @@ static void BuildIndex( assert(error == NULL); buildstate->hnsw = NULL; - if(buildstate->index_file_path) { + if(buildstate->index_file_path && !buildstate->postponed) { + //elog(INFO, "using index file path to creat index!"); if(access(buildstate->index_file_path, F_OK) != 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path "))); } @@ -444,7 +478,7 @@ static void BuildIndex( } else { BlockNumber numBlocks = RelationGetNumberOfBlocks(heap); uint32_t estimated_row_count = 0; - if(numBlocks > 0) { + if(numBlocks > 0 && !buildstate->postponed) { // Read the first block Buffer buffer = ReadBufferExtended(heap, MAIN_FORKNUM, 0, RBM_NORMAL, NULL); // Lock buffer so there won't be any new writes during this operation @@ -476,10 +510,14 @@ static void BuildIndex( } UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_IN_MEMORY_INSERT); - LanternBench("build hnsw index", ScanTable(buildstate)); - + // if(!empty_table && !buildstate->postponed) { + //elog(INFO, "Before ScanTable"); + LanternBench("build hnsw index", + ScanTable(buildstate)); // can we get away with not calling this on an empty table? + //elog(INFO, "After ScanTable"); elog(INFO, "inserted %ld elements", usearch_size(buildstate->usearch_index, &error)); assert(error == NULL); + //elog(INFO, "After assertion for ScanTable"); } char *result_buf = NULL; @@ -512,6 +550,7 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf { IndexBuildResult *result; HnswBuildState buildstate; + buildstate.postponed = false; BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); diff --git a/src/hnsw/build.h b/src/hnsw/build.h index 272bd394..668981b7 100644 --- a/src/hnsw/build.h +++ b/src/hnsw/build.h @@ -2,6 +2,7 @@ #define LDB_HNSW_BUILD_H #include +#include #include #include @@ -20,6 +21,7 @@ typedef struct HnswBuildState int dimensions; HnswColumnType columnType; char *index_file_path; + bool postponed; /* Statistics */ double tuples_indexed; @@ -36,6 +38,7 @@ typedef struct HnswBuildState IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInfo); void ldb_ambuildunlogged(Relation index); int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo); -void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int deimensions); +void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions); +void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum); // todo: does this render my check unnecessary #endif // LDB_HNSW_BUILD_H diff --git a/src/hnsw/external_index.c b/src/hnsw/external_index.c index ae3bbbd4..85ba3775 100644 --- a/src/hnsw/external_index.c +++ b/src/hnsw/external_index.c @@ -368,6 +368,7 @@ HnswIndexTuple *PrepareIndexTuple(Relation index_rel, // allocate buffer to construct the new node // note that we allocate more than sizeof(HnswIndexTuple) since the struct has a flexible array member // which depends on parameters passed into UsearchNodeBytes above + // TODO: this palloc is throwing an error alloced_tuple = (HnswIndexTuple *)palloc0(sizeof(HnswIndexTuple) + new_tuple_size); alloced_tuple->id = new_tuple_id; alloced_tuple->level = new_tuple_level; diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index aaff2690..104b8303 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -70,12 +70,13 @@ bool ldb_aminsert(Relation index, GenericXLogState *state; uint32 new_tuple_id; HnswIndexTuple *new_tuple; + HnswColumnType column_type; usearch_init_options_t opts = {0}; LDB_UNUSED(heap); #if PG_VERSION_NUM >= 140000 LDB_UNUSED(indexUnchanged); #endif - + HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState)); if(checkUnique != UNIQUE_CHECK_NO) { @@ -88,7 +89,77 @@ bool ldb_aminsert(Relation index, if(isnull[ 0 ]) { return false; } - // todo:: thre is room for optimization for when indexUnchanged is true + // todo:: there is room for optimization for when indexUnchanged is true + //elog(INFO, "RUNNING AMINSERT!"); + + // TODO: what if there are concurrent inserts? can that result in issues with creating this index, which we've + // postponed, to here? + // bool postponed = (indexInfo->ii_AmCache != NULL); + datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); + column_type = GetIndexColumnType(index); + //elog(INFO, "HEAP number of blocks: %d", RelationGetNumberOfBlocks(heap)); + + bool postponed = RelationGetNumberOfBlocks(index) == 0; + + //elog(INFO, "INSERT: ii_AmCache address: %p\n", indexInfo->ii_AmCache); + //elog(INFO, "Num Blocks in insert: %d", RelationGetNumberOfBlocks(index)); + + if(postponed) { + // int ndims = GetHnswIndexDimensions(index, indexInfo); + int ndims = DatumGetLength(datum, column_type); + // TODO: test this ldb_HnswGetDim call... not sure if options will carry over + int index_ndims = ldb_HnswGetDim(index); + // TODO: check if above works... we can then only do this postponed index build stuff if no dimension was + // specified during index declaration + // int index_ndims = -1; + //elog(INFO, "Vector dimension specified during index creation: %d", index_ndims); + //elog(INFO, "ndims is : %d", ndims); + // elog(INFO, "From aminsert, our index_ndims is: %d", index_ndims); + + if(index_ndims >= 1 && index_ndims != ndims) { + elog(ERROR, + "Vector dimension %d of inserted vector does not match vector dimension %d specified during index " + "creation.", + ndims, + index_ndims); + return false; + } + + if(ndims < 1) { + // error is printed in GetHNSWIndexDimensions above + elog(ERROR, "ndims < 1 in insert!"); + return false; + } + + if(ndims > HNSW_MAX_DIM) { + elog(ERROR, + "Vector dimension %d of inserted vector is too large. " + "LanternDB currently supports up to %ddim vectors", + ndims, + HNSW_MAX_DIM); + return false; + } + + // construct index with ndims now + HnswBuildState buildstate; + buildstate.postponed = true; + buildstate.dimensions = ndims; + + // dont think we need this context switching stuff + // MemoryContext prevCtx; + // prevCtx = CurrentMemoryContext; + + //elog(INFO, "About to call build index for the second time"); + BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); + //elog(INFO, "Finished building Index!"); + + // Building the index already inserted this vector since it was written to the heap prior to this AM method + // being called, so we can return to avoid inserting twice + return false; + + // MemoryContextSwitchTo(prevCtx); + // elog(INFO, "Finished switching back to prevCtx!"); + } insertCtx = AllocSetContextCreate(CurrentMemoryContext, "LanternInsertContext", ALLOCSET_DEFAULT_SIZES); oldCtx = MemoryContextSwitchTo(insertCtx); @@ -103,7 +174,11 @@ bool ldb_aminsert(Relation index, hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page); assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER); - opts.dimensions = GetHnswIndexDimensions(index, indexInfo); + //opts.dimensions = GetHnswIndexDimensions(index, indexInfo); + opts.dimensions = hdr->vector_dim; + + //opts.dimensions = DatumGetLength(datum, column_type); + //elog(INFO, "opts.dimensions in insert: %d", (int)opts.dimensions); CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); PopulateUsearchOpts(index, &opts); opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr); @@ -125,14 +200,13 @@ bool ldb_aminsert(Relation index, insertstate->uidx = uidx; insertstate->retriever_ctx = opts.retriever_ctx; - insertstate->columnType = GetIndexColumnType(index); + insertstate->columnType = column_type; hdr_page = NULL; meta = usearch_metadata(uidx, &error); assert(!error); - datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions); #if LANTERNDB_COPYNODES diff --git a/src/hnsw/options.c b/src/hnsw/options.c index c0b84442..239d1221 100644 --- a/src/hnsw/options.c +++ b/src/hnsw/options.c @@ -73,6 +73,13 @@ char *ldb_HnswGetIndexFilePath(Relation index) return (char *)opts + opts->experimantal_index_path_offset; } +bool ldb_HnswGetPostponeIndexBuild(Relation index) +{ + ldb_HnswOptions *opts = (ldb_HnswOptions *)index->rd_options; + if(opts) return opts->postpone_index_build; + return false; +} + usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index) { struct catclist *proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(index->rd_opfamily[ 0 ])); @@ -103,13 +110,15 @@ usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index) */ bytea *ldb_amoptions(Datum reloptions, bool validate) { - static const relopt_parse_elt tab[] - = {{"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)}, - {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)}, - {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)}, - {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)}, - {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)}, - {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}}; + static const relopt_parse_elt tab[] = { + {"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)}, + {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)}, + {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)}, + {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)}, + {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)}, + {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}, + {"_postpone_index_build", RELOPT_TYPE_BOOL, offsetof(ldb_HnswOptions, postpone_index_build)}, + }; #if PG_VERSION_NUM >= 130000 LDB_UNUSED(validate); @@ -205,6 +214,15 @@ void _PG_init(void) #if PG_VERSION_NUM >= 130000 , AccessExclusiveLock +#endif + ); + add_bool_reloption(ldb_hnsw_index_withopts, + "_postpone_index_build", + "Whether or not to postpone creation of the index to the first insert, for empty tables", + false +#if PG_VERSION_NUM >= certain_version + , + AccessExclusiveLock #endif ); DefineCustomIntVariable("hnsw.init_k", diff --git a/src/hnsw/options.h b/src/hnsw/options.h index edcc651e..82550ba6 100644 --- a/src/hnsw/options.h +++ b/src/hnsw/options.h @@ -40,6 +40,11 @@ typedef struct ldb_HnswOptions int ef_construction; int ef; int experimantal_index_path_offset; + + // whether we postpone the process of building the index on the first insert, when the table is empty + // this is done for user convenience so that we can get the dimension from the first insert, and user thus doesn't + // have to specify the dimension in the create index clause + bool postpone_index_build; } ldb_HnswOptions; int ldb_HnswGetDim(Relation index); diff --git a/test/expected/hnsw_cost_estimate.out b/test/expected/hnsw_cost_estimate.out index 7940f805..3e749f96 100644 --- a/test/expected/hnsw_cost_estimate.out +++ b/test/expected/hnsw_cost_estimate.out @@ -52,9 +52,6 @@ SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -- Case 0, sanity check. No data. CREATE TABLE empty_table(id SERIAL PRIMARY KEY, v REAL[2]); CREATE INDEX empty_idx ON empty_table USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=2, dim=2); -INFO: done init usearch index -INFO: inserted 0 elements -INFO: done saving 0 vectors SET _lantern_internal.is_test = true; SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47); DEBUG: LANTERN - Query cost estimator diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index 28eb6a63..cdceb7b1 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -77,14 +77,16 @@ CREATE TABLE small_world4 ( ); -- If the first row is NULL we do not infer a dimension \set ON_ERROR_STOP off -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); -ERROR: column does not have dimensions, please specify one begin; +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), ('001', '{1,0,0,1}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); -ERROR: column does not have dimensions, please specify one +ERROR: Failed to infer dimensions from non-empty table, please specify one rollback; \set ON_ERROR_STOP on INSERT INTO small_world4 (id, vector) VALUES diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out index 78acab3a..7cc439e1 100644 --- a/test/expected/hnsw_dist_func.out +++ b/test/expected/hnsw_dist_func.out @@ -20,20 +20,20 @@ CREATE TABLE small_world_l2 (id VARCHAR(3), v REAL[]); CREATE TABLE small_world_cos (id VARCHAR(3), v REAL[]); CREATE TABLE small_world_ham (id VARCHAR(3), v INTEGER[]); CREATE INDEX ON small_world_l2 USING hnsw (v dist_l2sq_ops) WITH (dim=3); -INFO: done init usearch index -INFO: inserted 0 elements -INFO: done saving 0 vectors CREATE INDEX ON small_world_cos USING hnsw (v dist_cos_ops) WITH (dim=3); -INFO: done init usearch index -INFO: inserted 0 elements -INFO: done saving 0 vectors CREATE INDEX ON small_world_ham USING hnsw (v dist_hamming_ops) WITH (dim=3); -INFO: done init usearch index -INFO: inserted 0 elements -INFO: done saving 0 vectors INSERT INTO small_world_l2 SELECT id, v FROM small_world; +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors INSERT INTO small_world_cos SELECT id, v FROM small_world; +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors INSERT INTO small_world_ham SELECT id, ARRAY[CAST(v[1] AS INTEGER), CAST(v[2] AS INTEGER), CAST(v[3] AS INTEGER)] FROM small_world; +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors SET enable_seqscan = false; -- Verify that the distance functions work (check distances) SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v <-> '{0,1,0}'; diff --git a/test/expected/hnsw_insert.out b/test/expected/hnsw_insert.out index 6140c130..bde3e3b0 100644 --- a/test/expected/hnsw_insert.out +++ b/test/expected/hnsw_insert.out @@ -12,11 +12,11 @@ CREATE TABLE small_world ( v REAL[2] ); CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); -INFO: done init usearch index -INFO: inserted 0 elements -INFO: done saving 0 vectors -- Insert rows with valid vector data INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors INSERT INTO small_world (v) VALUES (NULL); -- Attempt to insert a row with an incorrect vector length \set ON_ERROR_STOP off @@ -107,10 +107,10 @@ CREATE TABLE sift_base10k ( v REAL[128] ); CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); -INFO: done init usearch index -INFO: inserted 0 elements -INFO: done saving 0 vectors \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' QUERY PLAN diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index f8571345..03edbc8f 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -34,8 +34,8 @@ CREATE TABLE small_world4 ( ); -- If the first row is NULL we do not infer a dimension \set ON_ERROR_STOP off -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); begin; +CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), ('001', '{1,0,0,1}'); From 17dc848f9ff36af73dc4b68f0bc74b1925d5172f Mon Sep 17 00:00:00 2001 From: Danyil Date: Wed, 18 Oct 2023 23:08:41 -0400 Subject: [PATCH 2/5] cleaned up and formatted code --- src/hnsw.c | 4 ++++ src/hnsw/build.c | 35 ++++++++++------------------------- src/hnsw/external_index.c | 1 - src/hnsw/insert.c | 36 +++--------------------------------- src/hnsw/options.c | 17 ----------------- src/hnsw/options.h | 5 ----- 6 files changed, 17 insertions(+), 81 deletions(-) diff --git a/src/hnsw.c b/src/hnsw.c index 1dc1fd57..eecc8301 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -389,6 +389,9 @@ HnswColumnType GetIndexColumnType(Relation index) return GetColumnTypeFromOid(attr->atttypid); } +/* + * Returns length of vector from datum + */ int DatumGetLength(Datum datum, HnswColumnType type) { if(type == VECTOR) { @@ -400,6 +403,7 @@ int DatumGetLength(Datum datum, HnswColumnType type) } else { elog(ERROR, "Unsupported type"); } + return -1; } /* diff --git a/src/hnsw/build.c b/src/hnsw/build.c index f1f8ccfb..1b15ec48 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -119,10 +119,14 @@ static void BuildCallback( Relation index, CALLBACK_ITEM_POINTER, Datum *values, bool *isnull, bool tupleIsAlive, void *state) { HnswBuildState *buildstate = (HnswBuildState *)state; + // If this is a postponed index build, we only want to read the first tuple and build the index from that. This is + // relevant when we have postponed an index build (on an empty table) and then the first insert occurs as part of a + // batch (like \COPY from a csv file). When this happens, the batch of tuples will be in the heap by the time the + // first aminsert runs, and we only want to build the index with the first tuple since aminsert will return for only + // that tuple. All the other tuples will subsequently be inserted normally via aminsert after this if(buildstate->postponed && buildstate->reltuples > 0) { return; } - //elog(INFO, "Callback for build!"); MemoryContext oldCtx; // we can later use this for some optimizations I think LDB_UNUSED(tupleIsAlive); @@ -261,8 +265,8 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI return n_items; } -// Attempts to get the number of dimensions from the index, and falls back on a heap scan to get dimensions if that -// fails +// Attempts to get the number of dimensions from the index, and if that fails, falls back on a heap scan to fetch the +// first tuple, and get the length of the vector from that tuple int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo) { HnswColumnType columnType = GetIndexColumnType(index); @@ -314,10 +318,6 @@ void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions) int n_items; HnswColumnType indexType = GetIndexColumnType(index); - if(dimensions == HNSW_DEFAULT_DIM) { - return; - } - if(indexType == REAL_ARRAY || indexType == INT_ARRAY) { /* Check dimensions of vector */ array = DatumGetArrayTypePCopy(arrayDatum); @@ -435,17 +435,8 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt } if(empty_table && !buildstate->postponed) { - // Postpone creation of index until first insert, where we can get dimension from that inserted vector and build - // the index with the right dimension buildstate has enough information at this point to return in original - // am_build - // TODO: we can only do this if there is no dimension specified in the options(cant be inferred) but that would - // require us passing that state to aminsert so we only build index - // elog(INFO, "Since table is empty, postponing creation of index until first insert"); - // we use indexInfo->AmCache to store this (since it's a void* we can check if it's NULL to encode the boolean - // value of whether we're postponing or not) - // elog(INFO, "BUILD, before setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache); - indexInfo->ii_AmCache = (void *)1; - // elog(INFO, "BUILD, after setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache); + // Postpone creation of the index until the first insert, where we can get the dimension from that inserted + // vector and then build the index with that dimension return; } @@ -458,7 +449,6 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt buildstate->hnsw = NULL; if(buildstate->index_file_path && !buildstate->postponed) { - //elog(INFO, "using index file path to creat index!"); if(access(buildstate->index_file_path, F_OK) != 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path "))); } @@ -510,14 +500,9 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt } UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_IN_MEMORY_INSERT); - // if(!empty_table && !buildstate->postponed) { - //elog(INFO, "Before ScanTable"); - LanternBench("build hnsw index", - ScanTable(buildstate)); // can we get away with not calling this on an empty table? - //elog(INFO, "After ScanTable"); + LanternBench("build hnsw index", ScanTable(buildstate)); elog(INFO, "inserted %ld elements", usearch_size(buildstate->usearch_index, &error)); assert(error == NULL); - //elog(INFO, "After assertion for ScanTable"); } char *result_buf = NULL; diff --git a/src/hnsw/external_index.c b/src/hnsw/external_index.c index 85ba3775..ae3bbbd4 100644 --- a/src/hnsw/external_index.c +++ b/src/hnsw/external_index.c @@ -368,7 +368,6 @@ HnswIndexTuple *PrepareIndexTuple(Relation index_rel, // allocate buffer to construct the new node // note that we allocate more than sizeof(HnswIndexTuple) since the struct has a flexible array member // which depends on parameters passed into UsearchNodeBytes above - // TODO: this palloc is throwing an error alloced_tuple = (HnswIndexTuple *)palloc0(sizeof(HnswIndexTuple) + new_tuple_size); alloced_tuple->id = new_tuple_id; alloced_tuple->level = new_tuple_level; diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index 104b8303..487c0aec 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -76,7 +76,6 @@ bool ldb_aminsert(Relation index, #if PG_VERSION_NUM >= 140000 LDB_UNUSED(indexUnchanged); #endif - HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState)); if(checkUnique != UNIQUE_CHECK_NO) { @@ -90,31 +89,16 @@ bool ldb_aminsert(Relation index, return false; } // todo:: there is room for optimization for when indexUnchanged is true - //elog(INFO, "RUNNING AMINSERT!"); - // TODO: what if there are concurrent inserts? can that result in issues with creating this index, which we've - // postponed, to here? - // bool postponed = (indexInfo->ii_AmCache != NULL); datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); column_type = GetIndexColumnType(index); - //elog(INFO, "HEAP number of blocks: %d", RelationGetNumberOfBlocks(heap)); bool postponed = RelationGetNumberOfBlocks(index) == 0; - //elog(INFO, "INSERT: ii_AmCache address: %p\n", indexInfo->ii_AmCache); - //elog(INFO, "Num Blocks in insert: %d", RelationGetNumberOfBlocks(index)); - + // TODO: what if there are concurrent inserts? can that result in issues with creating this postponed index? if(postponed) { - // int ndims = GetHnswIndexDimensions(index, indexInfo); int ndims = DatumGetLength(datum, column_type); - // TODO: test this ldb_HnswGetDim call... not sure if options will carry over int index_ndims = ldb_HnswGetDim(index); - // TODO: check if above works... we can then only do this postponed index build stuff if no dimension was - // specified during index declaration - // int index_ndims = -1; - //elog(INFO, "Vector dimension specified during index creation: %d", index_ndims); - //elog(INFO, "ndims is : %d", ndims); - // elog(INFO, "From aminsert, our index_ndims is: %d", index_ndims); if(index_ndims >= 1 && index_ndims != ndims) { elog(ERROR, @@ -126,8 +110,7 @@ bool ldb_aminsert(Relation index, } if(ndims < 1) { - // error is printed in GetHNSWIndexDimensions above - elog(ERROR, "ndims < 1 in insert!"); + elog(ERROR, "Could not identify dimension of inserted vector!"); return false; } @@ -140,25 +123,16 @@ bool ldb_aminsert(Relation index, return false; } - // construct index with ndims now + // We now build the postponed index, using ndims HnswBuildState buildstate; buildstate.postponed = true; buildstate.dimensions = ndims; - // dont think we need this context switching stuff - // MemoryContext prevCtx; - // prevCtx = CurrentMemoryContext; - - //elog(INFO, "About to call build index for the second time"); BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); - //elog(INFO, "Finished building Index!"); // Building the index already inserted this vector since it was written to the heap prior to this AM method // being called, so we can return to avoid inserting twice return false; - - // MemoryContextSwitchTo(prevCtx); - // elog(INFO, "Finished switching back to prevCtx!"); } insertCtx = AllocSetContextCreate(CurrentMemoryContext, "LanternInsertContext", ALLOCSET_DEFAULT_SIZES); @@ -174,11 +148,7 @@ bool ldb_aminsert(Relation index, hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page); assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER); - //opts.dimensions = GetHnswIndexDimensions(index, indexInfo); opts.dimensions = hdr->vector_dim; - - //opts.dimensions = DatumGetLength(datum, column_type); - //elog(INFO, "opts.dimensions in insert: %d", (int)opts.dimensions); CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions); PopulateUsearchOpts(index, &opts); opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr); diff --git a/src/hnsw/options.c b/src/hnsw/options.c index 239d1221..47886fa4 100644 --- a/src/hnsw/options.c +++ b/src/hnsw/options.c @@ -73,13 +73,6 @@ char *ldb_HnswGetIndexFilePath(Relation index) return (char *)opts + opts->experimantal_index_path_offset; } -bool ldb_HnswGetPostponeIndexBuild(Relation index) -{ - ldb_HnswOptions *opts = (ldb_HnswOptions *)index->rd_options; - if(opts) return opts->postpone_index_build; - return false; -} - usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index) { struct catclist *proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(index->rd_opfamily[ 0 ])); @@ -117,7 +110,6 @@ bytea *ldb_amoptions(Datum reloptions, bool validate) {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)}, {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)}, {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}, - {"_postpone_index_build", RELOPT_TYPE_BOOL, offsetof(ldb_HnswOptions, postpone_index_build)}, }; #if PG_VERSION_NUM >= 130000 @@ -214,15 +206,6 @@ void _PG_init(void) #if PG_VERSION_NUM >= 130000 , AccessExclusiveLock -#endif - ); - add_bool_reloption(ldb_hnsw_index_withopts, - "_postpone_index_build", - "Whether or not to postpone creation of the index to the first insert, for empty tables", - false -#if PG_VERSION_NUM >= certain_version - , - AccessExclusiveLock #endif ); DefineCustomIntVariable("hnsw.init_k", diff --git a/src/hnsw/options.h b/src/hnsw/options.h index 82550ba6..edcc651e 100644 --- a/src/hnsw/options.h +++ b/src/hnsw/options.h @@ -40,11 +40,6 @@ typedef struct ldb_HnswOptions int ef_construction; int ef; int experimantal_index_path_offset; - - // whether we postpone the process of building the index on the first insert, when the table is empty - // this is done for user convenience so that we can get the dimension from the first insert, and user thus doesn't - // have to specify the dimension in the create index clause - bool postpone_index_build; } ldb_HnswOptions; int ldb_HnswGetDim(Relation index); From c304a8a271b8dedcbda1acf83aeebb2e1b77716c Mon Sep 17 00:00:00 2001 From: Danyil Date: Wed, 18 Oct 2023 23:53:43 -0400 Subject: [PATCH 3/5] expanded on hnsw_create test to test postponed index build functionality --- test/expected/hnsw_create.out | 102 +++++++++++++++++++++++++++++++++- test/sql/hnsw_create.sql | 92 +++++++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 5 deletions(-) diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index cdceb7b1..23bf039a 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -75,20 +75,116 @@ CREATE TABLE small_world4 ( id varchar(3), vector real[] ); --- If the first row is NULL we do not infer a dimension -\set ON_ERROR_STOP off +-- Test postponing of index creation on an empty table +-- no options and single insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +-- We need to vacuum or else we won't detect that the table is empty in ambuild +VACUUM small_world4; +SELECT * FROM small_world4; + id | vector +----+-------- +(0 rows) + +-- no options and batch insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'), +('001', '{1,0,0,1}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- some options but no dim and single insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- some options but no dim and batch insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'), +('001', '{1,0,0,1}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- dim specified and single insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- dim specified and batch insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'), +('001', '{1,0,0,1}'); +INFO: done init usearch index +INFO: inserted 1 elements +INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- Test cases where a NULL vector is inserted, and dim is not specified +-- Create postponed index on empty table with batch insert where one vector is NULL +-- this should ignore all NULL vectors and build index upon encountering insertion of first non-NULL entry +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); begin; -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), ('001', '{1,0,0,1}'); INFO: done init usearch index INFO: inserted 1 elements INFO: done saving 1 vectors +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- Empty table with first insert where vector is NULL +-- This should ignore the NULL vector and NOT build the postponed index +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', NULL); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; +-- If the first row is NULL and index is not postponed (non-empty table) then we can't infer dimension and this will error +\set ON_ERROR_STOP off +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', NULL), +('001', '{1,0,0,1}'); CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); ERROR: Failed to infer dimensions from non-empty table, please specify one rollback; \set ON_ERROR_STOP on +VACUUM small_world4; INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), ('001', '{1,0,0,1}'), diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index 03edbc8f..63136516 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -32,16 +32,104 @@ CREATE TABLE small_world4 ( id varchar(3), vector real[] ); --- If the first row is NULL we do not infer a dimension + +-- Test postponing of index creation on an empty table +-- no options and single insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'); +rollback; +DROP INDEX small_world4_idx1; +-- We need to vacuum or else we won't detect that the table is empty in ambuild +VACUUM small_world4; + +SELECT * FROM small_world4; + +-- no options and batch insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'), +('001', '{1,0,0,1}'); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + + +-- some options but no dim and single insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + + +-- some options but no dim and batch insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'), +('001', '{1,0,0,1}'); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + + +-- dim specified and single insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + +-- dim specified and batch insert +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', '{0,1,2,0}'), +('001', '{1,0,0,1}'); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + +-- Test cases where a NULL vector is inserted, and dim is not specified + +-- Create postponed index on empty table with batch insert where one vector is NULL +-- this should ignore all NULL vectors and build index upon encountering insertion of first non-NULL entry +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', NULL), +('001', '{1,0,0,1}'); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + +-- Empty table with first insert where vector is NULL +-- This should ignore the NULL vector and NOT build the postponed index +CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); +begin; +INSERT INTO small_world4 (id, vector) VALUES +('000', NULL); +rollback; +DROP INDEX small_world4_idx1; +VACUUM small_world4; + +-- If the first row is NULL and index is not postponed (non-empty table) then we can't infer dimension and this will error \set ON_ERROR_STOP off begin; -CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); INSERT INTO small_world4 (id, vector) VALUES ('000', NULL), ('001', '{1,0,0,1}'); CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2); rollback; \set ON_ERROR_STOP on +VACUUM small_world4; INSERT INTO small_world4 (id, vector) VALUES ('000', '{1,0,0,0}'), From 788d19b8743bdc8d1e99cf1c7a60a5f57c60d92c Mon Sep 17 00:00:00 2001 From: Danyil Date: Thu, 19 Oct 2023 00:21:06 -0400 Subject: [PATCH 4/5] postponing index build only when dimension is not specified now --- src/hnsw/build.c | 2 +- src/hnsw/insert.c | 12 ++++++++---- test/expected/hnsw_cost_estimate.out | 3 +++ test/expected/hnsw_create.out | 16 ++++++++-------- test/expected/hnsw_dist_func.out | 18 +++++++++--------- test/expected/hnsw_insert.out | 12 ++++++------ test/sql/hnsw_create.sql | 4 ++-- 7 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 1b15ec48..c59f1391 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -434,7 +434,7 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt return; } - if(empty_table && !buildstate->postponed) { + if(empty_table && buildstate->dimensions < 1 && !buildstate->postponed) { // Postpone creation of the index until the first insert, where we can get the dimension from that inserted // vector and then build the index with that dimension return; diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index 487c0aec..ccb4e26b 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -93,14 +93,17 @@ bool ldb_aminsert(Relation index, datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ])); column_type = GetIndexColumnType(index); - bool postponed = RelationGetNumberOfBlocks(index) == 0; + int index_ndims = ldb_HnswGetDim(index); + bool index_ndims_exists = index_ndims >= 1; + bool index_empty = RelationGetNumberOfBlocks(index) == 0; + bool postponed = index_empty && !index_ndims_exists; // TODO: what if there are concurrent inserts? can that result in issues with creating this postponed index? if(postponed) { int ndims = DatumGetLength(datum, column_type); - int index_ndims = ldb_HnswGetDim(index); - if(index_ndims >= 1 && index_ndims != ndims) { + /* + if(index_ndims_exists && index_ndims != ndims) { elog(ERROR, "Vector dimension %d of inserted vector does not match vector dimension %d specified during index " "creation.", @@ -108,6 +111,7 @@ bool ldb_aminsert(Relation index, index_ndims); return false; } + */ if(ndims < 1) { elog(ERROR, "Could not identify dimension of inserted vector!"); @@ -130,7 +134,7 @@ bool ldb_aminsert(Relation index, BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM); - // Building the index already inserted this vector since it was written to the heap prior to this AM method + // Building the index already inserted this vector since it was written to the heap prior to this function // being called, so we can return to avoid inserting twice return false; } diff --git a/test/expected/hnsw_cost_estimate.out b/test/expected/hnsw_cost_estimate.out index 3e749f96..7940f805 100644 --- a/test/expected/hnsw_cost_estimate.out +++ b/test/expected/hnsw_cost_estimate.out @@ -52,6 +52,9 @@ SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -- Case 0, sanity check. No data. CREATE TABLE empty_table(id SERIAL PRIMARY KEY, v REAL[2]); CREATE INDEX empty_idx ON empty_table USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=2, dim=2); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors SET _lantern_internal.is_test = true; SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47); DEBUG: LANTERN - Query cost estimator diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out index 23bf039a..0d7cffe7 100644 --- a/test/expected/hnsw_create.out +++ b/test/expected/hnsw_create.out @@ -128,26 +128,26 @@ INFO: done saving 1 vectors rollback; DROP INDEX small_world4_idx1; VACUUM small_world4; --- dim specified and single insert +-- dim specified and single insert (this should NOT postpone index build since dim is specified) CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors begin; INSERT INTO small_world4 (id, vector) VALUES ('000', '{0,1,2,0}'); -INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors rollback; DROP INDEX small_world4_idx1; VACUUM small_world4; --- dim specified and batch insert +-- dim specified and batch insert (this should NOT postpone index build since dim is specified) CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors begin; INSERT INTO small_world4 (id, vector) VALUES ('000', '{0,1,2,0}'), ('001', '{1,0,0,1}'); -INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors rollback; DROP INDEX small_world4_idx1; VACUUM small_world4; diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out index 7cc439e1..78acab3a 100644 --- a/test/expected/hnsw_dist_func.out +++ b/test/expected/hnsw_dist_func.out @@ -20,20 +20,20 @@ CREATE TABLE small_world_l2 (id VARCHAR(3), v REAL[]); CREATE TABLE small_world_cos (id VARCHAR(3), v REAL[]); CREATE TABLE small_world_ham (id VARCHAR(3), v INTEGER[]); CREATE INDEX ON small_world_l2 USING hnsw (v dist_l2sq_ops) WITH (dim=3); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors CREATE INDEX ON small_world_cos USING hnsw (v dist_cos_ops) WITH (dim=3); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors CREATE INDEX ON small_world_ham USING hnsw (v dist_hamming_ops) WITH (dim=3); -INSERT INTO small_world_l2 SELECT id, v FROM small_world; INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors +INFO: inserted 0 elements +INFO: done saving 0 vectors +INSERT INTO small_world_l2 SELECT id, v FROM small_world; INSERT INTO small_world_cos SELECT id, v FROM small_world; -INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors INSERT INTO small_world_ham SELECT id, ARRAY[CAST(v[1] AS INTEGER), CAST(v[2] AS INTEGER), CAST(v[3] AS INTEGER)] FROM small_world; -INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors SET enable_seqscan = false; -- Verify that the distance functions work (check distances) SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v <-> '{0,1,0}'; diff --git a/test/expected/hnsw_insert.out b/test/expected/hnsw_insert.out index bde3e3b0..6140c130 100644 --- a/test/expected/hnsw_insert.out +++ b/test/expected/hnsw_insert.out @@ -12,11 +12,11 @@ CREATE TABLE small_world ( v REAL[2] ); CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3); +INFO: done init usearch index +INFO: inserted 0 elements +INFO: done saving 0 vectors -- Insert rows with valid vector data INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); -INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors INSERT INTO small_world (v) VALUES (NULL); -- Attempt to insert a row with an incorrect vector length \set ON_ERROR_STOP off @@ -107,10 +107,10 @@ CREATE TABLE sift_base10k ( v REAL[128] ); CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); -\COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; INFO: done init usearch index -INFO: inserted 1 elements -INFO: done saving 1 vectors +INFO: inserted 0 elements +INFO: done saving 0 vectors +\COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' QUERY PLAN diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql index 63136516..f49e690d 100644 --- a/test/sql/hnsw_create.sql +++ b/test/sql/hnsw_create.sql @@ -78,7 +78,7 @@ DROP INDEX small_world4_idx1; VACUUM small_world4; --- dim specified and single insert +-- dim specified and single insert (this should NOT postpone index build since dim is specified) CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); begin; INSERT INTO small_world4 (id, vector) VALUES @@ -87,7 +87,7 @@ rollback; DROP INDEX small_world4_idx1; VACUUM small_world4; --- dim specified and batch insert +-- dim specified and batch insert (this should NOT postpone index build since dim is specified) CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4); begin; INSERT INTO small_world4 (id, vector) VALUES From a8d7396d8f6ceb7a00cdac5bb8dce5a06af320cf Mon Sep 17 00:00:00 2001 From: Danyil Date: Thu, 19 Oct 2023 00:27:39 -0400 Subject: [PATCH 5/5] minor formatting --- src/hnsw/build.c | 3 +-- src/hnsw/insert.c | 12 +----------- src/hnsw/options.c | 15 +++++++-------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index c59f1391..53966a69 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -363,8 +363,6 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i if(buildstate->dimensions < 1 && !buildstate->postponed) { buildstate->dimensions = InferDimension(heap, indexInfo); } - /* Require column to have dimensions to be indexed */ - // if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one"); // not supported because of 8K page limit in postgres WAL pages // can pass this limit once quantization is supported @@ -501,6 +499,7 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_IN_MEMORY_INSERT); LanternBench("build hnsw index", ScanTable(buildstate)); + elog(INFO, "inserted %ld elements", usearch_size(buildstate->usearch_index, &error)); assert(error == NULL); } diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c index ccb4e26b..0e2679af 100644 --- a/src/hnsw/insert.c +++ b/src/hnsw/insert.c @@ -76,6 +76,7 @@ bool ldb_aminsert(Relation index, #if PG_VERSION_NUM >= 140000 LDB_UNUSED(indexUnchanged); #endif + HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState)); if(checkUnique != UNIQUE_CHECK_NO) { @@ -102,17 +103,6 @@ bool ldb_aminsert(Relation index, if(postponed) { int ndims = DatumGetLength(datum, column_type); - /* - if(index_ndims_exists && index_ndims != ndims) { - elog(ERROR, - "Vector dimension %d of inserted vector does not match vector dimension %d specified during index " - "creation.", - ndims, - index_ndims); - return false; - } - */ - if(ndims < 1) { elog(ERROR, "Could not identify dimension of inserted vector!"); return false; diff --git a/src/hnsw/options.c b/src/hnsw/options.c index 47886fa4..c0b84442 100644 --- a/src/hnsw/options.c +++ b/src/hnsw/options.c @@ -103,14 +103,13 @@ usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index) */ bytea *ldb_amoptions(Datum reloptions, bool validate) { - static const relopt_parse_elt tab[] = { - {"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)}, - {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)}, - {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)}, - {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)}, - {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)}, - {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}, - }; + static const relopt_parse_elt tab[] + = {{"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)}, + {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)}, + {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)}, + {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)}, + {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)}, + {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}}; #if PG_VERSION_NUM >= 130000 LDB_UNUSED(validate);