From 535e81d89134d1d35ac38c54f9d34e1c67d124eb Mon Sep 17 00:00:00 2001
From: Danyil <danyilblyschak@gmail.com>
Date: Wed, 18 Oct 2023 22:34:16 -0400
Subject: [PATCH 1/5] fixed all bugs and modified test outputs to a point
 wherethey're working

---
 src/hnsw.c                           | 16 ++++++
 src/hnsw.h                           |  1 +
 src/hnsw/build.c                     | 61 ++++++++++++++++----
 src/hnsw/build.h                     |  5 +-
 src/hnsw/external_index.c            |  1 +
 src/hnsw/insert.c                    | 84 ++++++++++++++++++++++++++--
 src/hnsw/options.c                   | 32 ++++++++---
 src/hnsw/options.h                   |  5 ++
 test/expected/hnsw_cost_estimate.out |  3 -
 test/expected/hnsw_create.out        |  8 ++-
 test/expected/hnsw_dist_func.out     | 18 +++---
 test/expected/hnsw_insert.out        | 12 ++--
 test/sql/hnsw_create.sql             |  2 +-
 13 files changed, 202 insertions(+), 46 deletions(-)

diff --git a/src/hnsw.c b/src/hnsw.c
index 6e475ff1..1dc1fd57 100644
--- a/src/hnsw.c
+++ b/src/hnsw.c
@@ -172,6 +172,9 @@ static void hnswcostestimate(PlannerInfo *root,
     costs.numIndexTuples = estimate_number_tuples_accessed(path->indexinfo->indexoid, num_tuples_in_index);
     uint64 num_blocks_accessed
         = estimate_number_blocks_accessed(num_tuples_in_index, path->indexinfo->pages, costs.numIndexTuples);
+    // choose max{above, 1} since on a postponed index build, we will have 0 for the above quantity... this should only
+    // affect scans on empty indexes
+    num_blocks_accessed = (num_blocks_accessed > 1) ? num_blocks_accessed : 1;
 
 #if PG_VERSION_NUM >= 120000
     genericcostestimate(root, path, loop_count, &costs);
@@ -386,6 +389,19 @@ HnswColumnType GetIndexColumnType(Relation index)
     return GetColumnTypeFromOid(attr->atttypid);
 }
 
+int DatumGetLength(Datum datum, HnswColumnType type)
+{
+    if(type == VECTOR) {
+        Vector *vector = DatumGetVector(datum);
+        return vector->dim;
+    } else if(type == REAL_ARRAY || type == INT_ARRAY) {
+        ArrayType *array = DatumGetArrayTypePCopy(datum);
+        return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+    } else {
+        elog(ERROR, "Unsupported type");
+    }
+}
+
 /*
  * Given vector data and vector type, read it as either a float4 or int32 array and return as void*
  */
diff --git a/src/hnsw.h b/src/hnsw.h
index 570ed6bf..cf33defe 100644
--- a/src/hnsw.h
+++ b/src/hnsw.h
@@ -34,6 +34,7 @@ PGDLLEXPORT Datum cos_dist(PG_FUNCTION_ARGS);
 
 HnswColumnType GetColumnTypeFromOid(Oid oid);
 HnswColumnType GetIndexColumnType(Relation index);
+int            DatumGetLength(Datum datum, HnswColumnType type);
 void          *DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions);
 
 #define LDB_UNUSED(x) (void)(x)
diff --git a/src/hnsw/build.c b/src/hnsw/build.c
index 7e0e6e1f..f1f8ccfb 100644
--- a/src/hnsw/build.c
+++ b/src/hnsw/build.c
@@ -119,7 +119,11 @@ static void BuildCallback(
     Relation index, CALLBACK_ITEM_POINTER, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 {
     HnswBuildState *buildstate = (HnswBuildState *)state;
-    MemoryContext   oldCtx;
+    if(buildstate->postponed && buildstate->reltuples > 0) {
+        return;
+    }
+    //elog(INFO, "Callback for build!");
+    MemoryContext oldCtx;
     // we can later use this for some optimizations I think
     LDB_UNUSED(tupleIsAlive);
 
@@ -257,6 +261,8 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI
     return n_items;
 }
 
+// Attempts to get the number of dimensions from the index, and falls back on a heap scan to get dimensions if that
+// fails
 int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo)
 {
     HnswColumnType columnType = GetIndexColumnType(index);
@@ -308,6 +314,10 @@ void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions)
     int            n_items;
     HnswColumnType indexType = GetIndexColumnType(index);
 
+    if(dimensions == HNSW_DEFAULT_DIM) {
+        return;
+    }
+
     if(indexType == REAL_ARRAY || indexType == INT_ARRAY) {
         /* Check dimensions of vector */
         array = DatumGetArrayTypePCopy(arrayDatum);
@@ -344,19 +354,21 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i
     buildstate->index = index;
     buildstate->indexInfo = indexInfo;
     buildstate->columnType = GetIndexColumnType(index);
-    buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo);
+    if(!buildstate->postponed) {
+        buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo);
+    }
     buildstate->index_file_path = ldb_HnswGetIndexFilePath(index);
 
     // If a dimension wasn't specified try to infer it
-    if(buildstate->dimensions < 1) {
+    if(buildstate->dimensions < 1 && !buildstate->postponed) {
         buildstate->dimensions = InferDimension(heap, indexInfo);
     }
     /* Require column to have dimensions to be indexed */
-    if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");
+    // if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");
 
     // not supported because of 8K page limit in postgres WAL pages
     // can pass this limit once quantization is supported
-    if(buildstate->dimensions > HNSW_MAX_DIM)
+    if(buildstate->dimensions > HNSW_MAX_DIM && !buildstate->postponed)
         elog(ERROR,
              "vector dimension %d is too large. "
              "LanternDB currently supports up to %ddim vectors",
@@ -408,14 +420,35 @@ static void ScanTable(HnswBuildState *buildstate)
 /*
  * Build the index
  */
-static void BuildIndex(
-    Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum)
+void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum)
 {
     usearch_error_t        error = NULL;
     usearch_init_options_t opts;
     MemSet(&opts, 0, sizeof(opts));
+    bool empty_table = RelationGetNumberOfBlocks(heap) == 0;
 
     InitBuildState(buildstate, heap, index, indexInfo);
+
+    if(buildstate->dimensions < 1 && !empty_table && !buildstate->postponed) {
+        elog(ERROR, "Failed to infer dimensions from non-empty table, please specify one");
+        return;
+    }
+
+    if(empty_table && !buildstate->postponed) {
+        // Postpone creation of index until first insert, where we can get dimension from that inserted vector and build
+        // the index with the right dimension buildstate has enough information at this point to return in original
+        // am_build
+        // TODO: we can only do this if there is no dimension specified in the options(cant be inferred) but that would
+        // require us passing that state to aminsert so we only build index
+        // elog(INFO, "Since table is empty, postponing creation of index until first insert");
+        // we use indexInfo->AmCache to store this (since it's a void* we can check if it's NULL to encode the boolean
+        // value of whether we're postponing or not)
+        // elog(INFO, "BUILD, before setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache);
+        indexInfo->ii_AmCache = (void *)1;
+        // elog(INFO, "BUILD, after setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache);
+        return;
+    }
+
     opts.dimensions = buildstate->dimensions;
     PopulateUsearchOpts(index, &opts);
 
@@ -424,7 +457,8 @@ static void BuildIndex(
     assert(error == NULL);
 
     buildstate->hnsw = NULL;
-    if(buildstate->index_file_path) {
+    if(buildstate->index_file_path && !buildstate->postponed) {
+        //elog(INFO, "using index file path to creat index!");
         if(access(buildstate->index_file_path, F_OK) != 0) {
             ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path ")));
         }
@@ -444,7 +478,7 @@ static void BuildIndex(
     } else {
         BlockNumber numBlocks = RelationGetNumberOfBlocks(heap);
         uint32_t    estimated_row_count = 0;
-        if(numBlocks > 0) {
+        if(numBlocks > 0 && !buildstate->postponed) {
             // Read the first block
             Buffer buffer = ReadBufferExtended(heap, MAIN_FORKNUM, 0, RBM_NORMAL, NULL);
             // Lock buffer so there won't be any new writes during this operation
@@ -476,10 +510,14 @@ static void BuildIndex(
         }
 
         UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_IN_MEMORY_INSERT);
-        LanternBench("build hnsw index", ScanTable(buildstate));
-
+        // if(!empty_table && !buildstate->postponed) {
+        //elog(INFO, "Before ScanTable");
+        LanternBench("build hnsw index",
+                     ScanTable(buildstate));  // can we get away with not calling this on an empty table?
+        //elog(INFO, "After ScanTable");
         elog(INFO, "inserted %ld elements", usearch_size(buildstate->usearch_index, &error));
         assert(error == NULL);
+        //elog(INFO, "After assertion for ScanTable");
     }
 
     char *result_buf = NULL;
@@ -512,6 +550,7 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf
 {
     IndexBuildResult *result;
     HnswBuildState    buildstate;
+    buildstate.postponed = false;
 
     BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);
 
diff --git a/src/hnsw/build.h b/src/hnsw/build.h
index 272bd394..668981b7 100644
--- a/src/hnsw/build.h
+++ b/src/hnsw/build.h
@@ -2,6 +2,7 @@
 #define LDB_HNSW_BUILD_H
 
 #include <access/genam.h>
+#include <common/relpath.h>
 #include <nodes/execnodes.h>
 #include <utils/relcache.h>
 
@@ -20,6 +21,7 @@ typedef struct HnswBuildState
     int            dimensions;
     HnswColumnType columnType;
     char          *index_file_path;
+    bool           postponed;
 
     /* Statistics */
     double tuples_indexed;
@@ -36,6 +38,7 @@ typedef struct HnswBuildState
 IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInfo);
 void              ldb_ambuildunlogged(Relation index);
 int               GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo);
-void              CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int deimensions);
+void              CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions);
+void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum);
 // todo: does this render my check unnecessary
 #endif  // LDB_HNSW_BUILD_H
diff --git a/src/hnsw/external_index.c b/src/hnsw/external_index.c
index ae3bbbd4..85ba3775 100644
--- a/src/hnsw/external_index.c
+++ b/src/hnsw/external_index.c
@@ -368,6 +368,7 @@ HnswIndexTuple *PrepareIndexTuple(Relation             index_rel,
     // allocate buffer to construct the new node
     // note that we allocate more than sizeof(HnswIndexTuple) since the struct has a flexible array member
     // which depends on parameters passed into UsearchNodeBytes above
+    // TODO: this palloc is throwing an error
     alloced_tuple = (HnswIndexTuple *)palloc0(sizeof(HnswIndexTuple) + new_tuple_size);
     alloced_tuple->id = new_tuple_id;
     alloced_tuple->level = new_tuple_level;
diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c
index aaff2690..104b8303 100644
--- a/src/hnsw/insert.c
+++ b/src/hnsw/insert.c
@@ -70,12 +70,13 @@ bool ldb_aminsert(Relation         index,
     GenericXLogState      *state;
     uint32                 new_tuple_id;
     HnswIndexTuple        *new_tuple;
+    HnswColumnType         column_type;
     usearch_init_options_t opts = {0};
     LDB_UNUSED(heap);
 #if PG_VERSION_NUM >= 140000
     LDB_UNUSED(indexUnchanged);
 #endif
-
+    
     HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState));
 
     if(checkUnique != UNIQUE_CHECK_NO) {
@@ -88,7 +89,77 @@ bool ldb_aminsert(Relation         index,
     if(isnull[ 0 ]) {
         return false;
     }
-    // todo:: thre is room for optimization for when indexUnchanged is true
+    // todo:: there is room for optimization for when indexUnchanged is true
+    //elog(INFO, "RUNNING AMINSERT!");
+
+    // TODO: what if there are concurrent inserts? can that result in issues with creating this index, which we've
+    // postponed, to here?
+    // bool postponed = (indexInfo->ii_AmCache != NULL);
+    datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
+    column_type = GetIndexColumnType(index);
+    //elog(INFO, "HEAP number of blocks: %d", RelationGetNumberOfBlocks(heap));
+
+    bool postponed = RelationGetNumberOfBlocks(index) == 0;
+
+    //elog(INFO, "INSERT: ii_AmCache address: %p\n", indexInfo->ii_AmCache);
+    //elog(INFO, "Num Blocks in insert: %d", RelationGetNumberOfBlocks(index));
+
+    if(postponed) {
+        // int ndims = GetHnswIndexDimensions(index, indexInfo);
+        int ndims = DatumGetLength(datum, column_type);
+        // TODO: test this ldb_HnswGetDim call... not sure if options will carry over
+        int index_ndims = ldb_HnswGetDim(index);
+        // TODO: check if above works... we can then only do this postponed index build stuff if no dimension was
+        // specified during index declaration
+        // int index_ndims = -1;
+        //elog(INFO, "Vector dimension specified during index creation: %d", index_ndims);
+        //elog(INFO, "ndims is : %d", ndims);
+        // elog(INFO, "From aminsert, our index_ndims is: %d", index_ndims);
+
+        if(index_ndims >= 1 && index_ndims != ndims) {
+            elog(ERROR,
+                 "Vector dimension %d of inserted vector does not match vector dimension %d specified during index "
+                 "creation.",
+                 ndims,
+                 index_ndims);
+            return false;
+        }
+
+        if(ndims < 1) {
+            // error is printed in GetHNSWIndexDimensions above
+            elog(ERROR, "ndims < 1 in insert!");
+            return false;
+        }
+
+        if(ndims > HNSW_MAX_DIM) {
+            elog(ERROR,
+                 "Vector dimension %d of inserted vector is too large. "
+                 "LanternDB currently supports up to %ddim vectors",
+                 ndims,
+                 HNSW_MAX_DIM);
+            return false;
+        }
+
+        // construct index with ndims now
+        HnswBuildState buildstate;
+        buildstate.postponed = true;
+        buildstate.dimensions = ndims;
+
+        // dont think we need this context switching stuff
+        // MemoryContext prevCtx;
+        // prevCtx = CurrentMemoryContext;
+
+        //elog(INFO, "About to call build index for the second time");
+        BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);
+        //elog(INFO, "Finished building Index!");
+
+        // Building the index already inserted this vector since it was written to the heap prior to this AM method
+        // being called, so we can return to avoid inserting twice
+        return false;
+
+        // MemoryContextSwitchTo(prevCtx);
+        // elog(INFO, "Finished switching back to prevCtx!");
+    }
 
     insertCtx = AllocSetContextCreate(CurrentMemoryContext, "LanternInsertContext", ALLOCSET_DEFAULT_SIZES);
     oldCtx = MemoryContextSwitchTo(insertCtx);
@@ -103,7 +174,11 @@ bool ldb_aminsert(Relation         index,
     hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page);
     assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER);
 
-    opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
+    //opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
+    opts.dimensions = hdr->vector_dim;
+
+    //opts.dimensions = DatumGetLength(datum, column_type);
+    //elog(INFO, "opts.dimensions in insert: %d", (int)opts.dimensions);
     CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
     PopulateUsearchOpts(index, &opts);
     opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr);
@@ -125,14 +200,13 @@ bool ldb_aminsert(Relation         index,
 
     insertstate->uidx = uidx;
     insertstate->retriever_ctx = opts.retriever_ctx;
-    insertstate->columnType = GetIndexColumnType(index);
+    insertstate->columnType = column_type;
 
     hdr_page = NULL;
 
     meta = usearch_metadata(uidx, &error);
     assert(!error);
 
-    datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
     void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions);
 
 #if LANTERNDB_COPYNODES
diff --git a/src/hnsw/options.c b/src/hnsw/options.c
index c0b84442..239d1221 100644
--- a/src/hnsw/options.c
+++ b/src/hnsw/options.c
@@ -73,6 +73,13 @@ char *ldb_HnswGetIndexFilePath(Relation index)
     return (char *)opts + opts->experimantal_index_path_offset;
 }
 
+bool ldb_HnswGetPostponeIndexBuild(Relation index)
+{
+    ldb_HnswOptions *opts = (ldb_HnswOptions *)index->rd_options;
+    if(opts) return opts->postpone_index_build;
+    return false;
+}
+
 usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index)
 {
     struct catclist *proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(index->rd_opfamily[ 0 ]));
@@ -103,13 +110,15 @@ usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index)
  */
 bytea *ldb_amoptions(Datum reloptions, bool validate)
 {
-    static const relopt_parse_elt tab[]
-        = {{"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)},
-           {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)},
-           {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)},
-           {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)},
-           {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)},
-           {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}};
+    static const relopt_parse_elt tab[] = {
+        {"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)},
+        {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)},
+        {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)},
+        {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)},
+        {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)},
+        {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)},
+        {"_postpone_index_build", RELOPT_TYPE_BOOL, offsetof(ldb_HnswOptions, postpone_index_build)},
+    };
 
 #if PG_VERSION_NUM >= 130000
     LDB_UNUSED(validate);
@@ -205,6 +214,15 @@ void _PG_init(void)
 #if PG_VERSION_NUM >= 130000
                          ,
                          AccessExclusiveLock
+#endif
+    );
+    add_bool_reloption(ldb_hnsw_index_withopts,
+                       "_postpone_index_build",
+                       "Whether or not to postpone creation of the index to the first insert, for empty tables",
+                       false
+#if PG_VERSION_NUM >= certain_version
+                       ,
+                       AccessExclusiveLock
 #endif
     );
     DefineCustomIntVariable("hnsw.init_k",
diff --git a/src/hnsw/options.h b/src/hnsw/options.h
index edcc651e..82550ba6 100644
--- a/src/hnsw/options.h
+++ b/src/hnsw/options.h
@@ -40,6 +40,11 @@ typedef struct ldb_HnswOptions
     int   ef_construction;
     int   ef;
     int   experimantal_index_path_offset;
+
+    // whether we postpone the process of building the index on the first insert, when the table is empty
+    // this is done for user convenience so that we can get the dimension from the first insert, and user thus doesn't
+    // have to specify the dimension in the create index clause
+    bool postpone_index_build;
 } ldb_HnswOptions;
 
 int                   ldb_HnswGetDim(Relation index);
diff --git a/test/expected/hnsw_cost_estimate.out b/test/expected/hnsw_cost_estimate.out
index 7940f805..3e749f96 100644
--- a/test/expected/hnsw_cost_estimate.out
+++ b/test/expected/hnsw_cost_estimate.out
@@ -52,9 +52,6 @@ SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
 -- Case 0, sanity check. No data.
 CREATE TABLE empty_table(id SERIAL PRIMARY KEY, v REAL[2]);
 CREATE INDEX empty_idx ON empty_table USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=2, dim=2);
-INFO:  done init usearch index
-INFO:  inserted 0 elements
-INFO:  done saving 0 vectors
 SET _lantern_internal.is_test = true;
 SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47);
 DEBUG:  LANTERN - Query cost estimator
diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out
index 28eb6a63..cdceb7b1 100644
--- a/test/expected/hnsw_create.out
+++ b/test/expected/hnsw_create.out
@@ -77,14 +77,16 @@ CREATE TABLE small_world4 (
 );
 -- If the first row is NULL we do not infer a dimension
 \set ON_ERROR_STOP off
-CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
-ERROR:  column does not have dimensions, please specify one
 begin;
+CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', NULL),
 ('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
 CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
-ERROR:  column does not have dimensions, please specify one
+ERROR:  Failed to infer dimensions from non-empty table, please specify one
 rollback;
 \set ON_ERROR_STOP on
 INSERT INTO small_world4 (id, vector) VALUES
diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out
index 78acab3a..7cc439e1 100644
--- a/test/expected/hnsw_dist_func.out
+++ b/test/expected/hnsw_dist_func.out
@@ -20,20 +20,20 @@ CREATE TABLE small_world_l2 (id VARCHAR(3), v REAL[]);
 CREATE TABLE small_world_cos (id VARCHAR(3), v REAL[]);
 CREATE TABLE small_world_ham (id VARCHAR(3), v INTEGER[]);
 CREATE INDEX ON small_world_l2 USING hnsw (v dist_l2sq_ops) WITH (dim=3);
-INFO:  done init usearch index
-INFO:  inserted 0 elements
-INFO:  done saving 0 vectors
 CREATE INDEX ON small_world_cos USING hnsw (v dist_cos_ops) WITH (dim=3);
-INFO:  done init usearch index
-INFO:  inserted 0 elements
-INFO:  done saving 0 vectors
 CREATE INDEX ON small_world_ham USING hnsw (v dist_hamming_ops) WITH (dim=3);
-INFO:  done init usearch index
-INFO:  inserted 0 elements
-INFO:  done saving 0 vectors
 INSERT INTO small_world_l2 SELECT id, v FROM small_world;
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
 INSERT INTO small_world_cos SELECT id, v FROM small_world;
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
 INSERT INTO small_world_ham SELECT id, ARRAY[CAST(v[1] AS INTEGER), CAST(v[2] AS INTEGER), CAST(v[3] AS INTEGER)] FROM small_world;
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
 SET enable_seqscan = false;
 -- Verify that the distance functions work (check distances)
 SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v <-> '{0,1,0}';
diff --git a/test/expected/hnsw_insert.out b/test/expected/hnsw_insert.out
index 6140c130..bde3e3b0 100644
--- a/test/expected/hnsw_insert.out
+++ b/test/expected/hnsw_insert.out
@@ -12,11 +12,11 @@ CREATE TABLE small_world (
     v REAL[2]
 );
 CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3);
-INFO:  done init usearch index
-INFO:  inserted 0 elements
-INFO:  done saving 0 vectors
 -- Insert rows with valid vector data
 INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
 INSERT INTO small_world (v) VALUES (NULL);
 -- Attempt to insert a row with an incorrect vector length
 \set ON_ERROR_STOP off
@@ -107,10 +107,10 @@ CREATE TABLE sift_base10k (
     v REAL[128]
 );
 CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128);
-INFO:  done init usearch index
-INFO:  inserted 0 elements
-INFO:  done saving 0 vectors
 \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV;
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
 SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
 EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'
                                                                                                                                                                                        QUERY PLAN                                                                                                                                                                                        
diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql
index f8571345..03edbc8f 100644
--- a/test/sql/hnsw_create.sql
+++ b/test/sql/hnsw_create.sql
@@ -34,8 +34,8 @@ CREATE TABLE small_world4 (
 );
 -- If the first row is NULL we do not infer a dimension
 \set ON_ERROR_STOP off
-CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 begin;
+CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', NULL),
 ('001', '{1,0,0,1}');

From 17dc848f9ff36af73dc4b68f0bc74b1925d5172f Mon Sep 17 00:00:00 2001
From: Danyil <danyilblyschak@gmail.com>
Date: Wed, 18 Oct 2023 23:08:41 -0400
Subject: [PATCH 2/5] cleaned up and formatted code

---
 src/hnsw.c                |  4 ++++
 src/hnsw/build.c          | 35 ++++++++++-------------------------
 src/hnsw/external_index.c |  1 -
 src/hnsw/insert.c         | 36 +++---------------------------------
 src/hnsw/options.c        | 17 -----------------
 src/hnsw/options.h        |  5 -----
 6 files changed, 17 insertions(+), 81 deletions(-)

diff --git a/src/hnsw.c b/src/hnsw.c
index 1dc1fd57..eecc8301 100644
--- a/src/hnsw.c
+++ b/src/hnsw.c
@@ -389,6 +389,9 @@ HnswColumnType GetIndexColumnType(Relation index)
     return GetColumnTypeFromOid(attr->atttypid);
 }
 
+/*
+ * Returns length of vector from datum
+ */
 int DatumGetLength(Datum datum, HnswColumnType type)
 {
     if(type == VECTOR) {
@@ -400,6 +403,7 @@ int DatumGetLength(Datum datum, HnswColumnType type)
     } else {
         elog(ERROR, "Unsupported type");
     }
+    return -1;
 }
 
 /*
diff --git a/src/hnsw/build.c b/src/hnsw/build.c
index f1f8ccfb..1b15ec48 100644
--- a/src/hnsw/build.c
+++ b/src/hnsw/build.c
@@ -119,10 +119,14 @@ static void BuildCallback(
     Relation index, CALLBACK_ITEM_POINTER, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 {
     HnswBuildState *buildstate = (HnswBuildState *)state;
+    // If this is a postponed index build, we only want to read the first tuple and build the index from that. This is
+    // relevant when we have postponed an index build (on an empty table) and then the first insert occurs as part of a
+    // batch (like \COPY from a csv file). When this happens, the batch of tuples will be in the heap by the time the
+    // first aminsert runs, and we only want to build the index with the first tuple since aminsert will return for only
+    // that tuple. All the other tuples will subsequently be inserted normally via aminsert after this
     if(buildstate->postponed && buildstate->reltuples > 0) {
         return;
     }
-    //elog(INFO, "Callback for build!");
     MemoryContext oldCtx;
     // we can later use this for some optimizations I think
     LDB_UNUSED(tupleIsAlive);
@@ -261,8 +265,8 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI
     return n_items;
 }
 
-// Attempts to get the number of dimensions from the index, and falls back on a heap scan to get dimensions if that
-// fails
+// Attempts to get the number of dimensions from the index, and if that fails, falls back on a heap scan to fetch the
+// first tuple, and get the length of the vector from that tuple
 int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo)
 {
     HnswColumnType columnType = GetIndexColumnType(index);
@@ -314,10 +318,6 @@ void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions)
     int            n_items;
     HnswColumnType indexType = GetIndexColumnType(index);
 
-    if(dimensions == HNSW_DEFAULT_DIM) {
-        return;
-    }
-
     if(indexType == REAL_ARRAY || indexType == INT_ARRAY) {
         /* Check dimensions of vector */
         array = DatumGetArrayTypePCopy(arrayDatum);
@@ -435,17 +435,8 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt
     }
 
     if(empty_table && !buildstate->postponed) {
-        // Postpone creation of index until first insert, where we can get dimension from that inserted vector and build
-        // the index with the right dimension buildstate has enough information at this point to return in original
-        // am_build
-        // TODO: we can only do this if there is no dimension specified in the options(cant be inferred) but that would
-        // require us passing that state to aminsert so we only build index
-        // elog(INFO, "Since table is empty, postponing creation of index until first insert");
-        // we use indexInfo->AmCache to store this (since it's a void* we can check if it's NULL to encode the boolean
-        // value of whether we're postponing or not)
-        // elog(INFO, "BUILD, before setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache);
-        indexInfo->ii_AmCache = (void *)1;
-        // elog(INFO, "BUILD, after setting it to true! ii_AmCache address: %p\n", indexInfo->ii_AmCache);
+        // Postpone creation of the index until the first insert, where we can get the dimension from that inserted
+        // vector and then build the index with that dimension
         return;
     }
 
@@ -458,7 +449,6 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt
 
     buildstate->hnsw = NULL;
     if(buildstate->index_file_path && !buildstate->postponed) {
-        //elog(INFO, "using index file path to creat index!");
         if(access(buildstate->index_file_path, F_OK) != 0) {
             ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path ")));
         }
@@ -510,14 +500,9 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt
         }
 
         UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_IN_MEMORY_INSERT);
-        // if(!empty_table && !buildstate->postponed) {
-        //elog(INFO, "Before ScanTable");
-        LanternBench("build hnsw index",
-                     ScanTable(buildstate));  // can we get away with not calling this on an empty table?
-        //elog(INFO, "After ScanTable");
+        LanternBench("build hnsw index", ScanTable(buildstate));
         elog(INFO, "inserted %ld elements", usearch_size(buildstate->usearch_index, &error));
         assert(error == NULL);
-        //elog(INFO, "After assertion for ScanTable");
     }
 
     char *result_buf = NULL;
diff --git a/src/hnsw/external_index.c b/src/hnsw/external_index.c
index 85ba3775..ae3bbbd4 100644
--- a/src/hnsw/external_index.c
+++ b/src/hnsw/external_index.c
@@ -368,7 +368,6 @@ HnswIndexTuple *PrepareIndexTuple(Relation             index_rel,
     // allocate buffer to construct the new node
     // note that we allocate more than sizeof(HnswIndexTuple) since the struct has a flexible array member
     // which depends on parameters passed into UsearchNodeBytes above
-    // TODO: this palloc is throwing an error
     alloced_tuple = (HnswIndexTuple *)palloc0(sizeof(HnswIndexTuple) + new_tuple_size);
     alloced_tuple->id = new_tuple_id;
     alloced_tuple->level = new_tuple_level;
diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c
index 104b8303..487c0aec 100644
--- a/src/hnsw/insert.c
+++ b/src/hnsw/insert.c
@@ -76,7 +76,6 @@ bool ldb_aminsert(Relation         index,
 #if PG_VERSION_NUM >= 140000
     LDB_UNUSED(indexUnchanged);
 #endif
-    
     HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState));
 
     if(checkUnique != UNIQUE_CHECK_NO) {
@@ -90,31 +89,16 @@ bool ldb_aminsert(Relation         index,
         return false;
     }
     // todo:: there is room for optimization for when indexUnchanged is true
-    //elog(INFO, "RUNNING AMINSERT!");
 
-    // TODO: what if there are concurrent inserts? can that result in issues with creating this index, which we've
-    // postponed, to here?
-    // bool postponed = (indexInfo->ii_AmCache != NULL);
     datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
     column_type = GetIndexColumnType(index);
-    //elog(INFO, "HEAP number of blocks: %d", RelationGetNumberOfBlocks(heap));
 
     bool postponed = RelationGetNumberOfBlocks(index) == 0;
 
-    //elog(INFO, "INSERT: ii_AmCache address: %p\n", indexInfo->ii_AmCache);
-    //elog(INFO, "Num Blocks in insert: %d", RelationGetNumberOfBlocks(index));
-
+    // TODO: what if there are concurrent inserts? can that result in issues with creating this postponed index?
     if(postponed) {
-        // int ndims = GetHnswIndexDimensions(index, indexInfo);
         int ndims = DatumGetLength(datum, column_type);
-        // TODO: test this ldb_HnswGetDim call... not sure if options will carry over
         int index_ndims = ldb_HnswGetDim(index);
-        // TODO: check if above works... we can then only do this postponed index build stuff if no dimension was
-        // specified during index declaration
-        // int index_ndims = -1;
-        //elog(INFO, "Vector dimension specified during index creation: %d", index_ndims);
-        //elog(INFO, "ndims is : %d", ndims);
-        // elog(INFO, "From aminsert, our index_ndims is: %d", index_ndims);
 
         if(index_ndims >= 1 && index_ndims != ndims) {
             elog(ERROR,
@@ -126,8 +110,7 @@ bool ldb_aminsert(Relation         index,
         }
 
         if(ndims < 1) {
-            // error is printed in GetHNSWIndexDimensions above
-            elog(ERROR, "ndims < 1 in insert!");
+            elog(ERROR, "Could not identify dimension of inserted vector!");
             return false;
         }
 
@@ -140,25 +123,16 @@ bool ldb_aminsert(Relation         index,
             return false;
         }
 
-        // construct index with ndims now
+        // We now build the postponed index, using ndims
         HnswBuildState buildstate;
         buildstate.postponed = true;
         buildstate.dimensions = ndims;
 
-        // dont think we need this context switching stuff
-        // MemoryContext prevCtx;
-        // prevCtx = CurrentMemoryContext;
-
-        //elog(INFO, "About to call build index for the second time");
         BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);
-        //elog(INFO, "Finished building Index!");
 
         // Building the index already inserted this vector since it was written to the heap prior to this AM method
         // being called, so we can return to avoid inserting twice
         return false;
-
-        // MemoryContextSwitchTo(prevCtx);
-        // elog(INFO, "Finished switching back to prevCtx!");
     }
 
     insertCtx = AllocSetContextCreate(CurrentMemoryContext, "LanternInsertContext", ALLOCSET_DEFAULT_SIZES);
@@ -174,11 +148,7 @@ bool ldb_aminsert(Relation         index,
     hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page);
     assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER);
 
-    //opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
     opts.dimensions = hdr->vector_dim;
-
-    //opts.dimensions = DatumGetLength(datum, column_type);
-    //elog(INFO, "opts.dimensions in insert: %d", (int)opts.dimensions);
     CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
     PopulateUsearchOpts(index, &opts);
     opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr);
diff --git a/src/hnsw/options.c b/src/hnsw/options.c
index 239d1221..47886fa4 100644
--- a/src/hnsw/options.c
+++ b/src/hnsw/options.c
@@ -73,13 +73,6 @@ char *ldb_HnswGetIndexFilePath(Relation index)
     return (char *)opts + opts->experimantal_index_path_offset;
 }
 
-bool ldb_HnswGetPostponeIndexBuild(Relation index)
-{
-    ldb_HnswOptions *opts = (ldb_HnswOptions *)index->rd_options;
-    if(opts) return opts->postpone_index_build;
-    return false;
-}
-
 usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index)
 {
     struct catclist *proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(index->rd_opfamily[ 0 ]));
@@ -117,7 +110,6 @@ bytea *ldb_amoptions(Datum reloptions, bool validate)
         {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)},
         {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)},
         {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)},
-        {"_postpone_index_build", RELOPT_TYPE_BOOL, offsetof(ldb_HnswOptions, postpone_index_build)},
     };
 
 #if PG_VERSION_NUM >= 130000
@@ -214,15 +206,6 @@ void _PG_init(void)
 #if PG_VERSION_NUM >= 130000
                          ,
                          AccessExclusiveLock
-#endif
-    );
-    add_bool_reloption(ldb_hnsw_index_withopts,
-                       "_postpone_index_build",
-                       "Whether or not to postpone creation of the index to the first insert, for empty tables",
-                       false
-#if PG_VERSION_NUM >= certain_version
-                       ,
-                       AccessExclusiveLock
 #endif
     );
     DefineCustomIntVariable("hnsw.init_k",
diff --git a/src/hnsw/options.h b/src/hnsw/options.h
index 82550ba6..edcc651e 100644
--- a/src/hnsw/options.h
+++ b/src/hnsw/options.h
@@ -40,11 +40,6 @@ typedef struct ldb_HnswOptions
     int   ef_construction;
     int   ef;
     int   experimantal_index_path_offset;
-
-    // whether we postpone the process of building the index on the first insert, when the table is empty
-    // this is done for user convenience so that we can get the dimension from the first insert, and user thus doesn't
-    // have to specify the dimension in the create index clause
-    bool postpone_index_build;
 } ldb_HnswOptions;
 
 int                   ldb_HnswGetDim(Relation index);

From c304a8a271b8dedcbda1acf83aeebb2e1b77716c Mon Sep 17 00:00:00 2001
From: Danyil <danyilblyschak@gmail.com>
Date: Wed, 18 Oct 2023 23:53:43 -0400
Subject: [PATCH 3/5] expanded on hnsw_create test to test postponed index
 build functionality

---
 test/expected/hnsw_create.out | 102 +++++++++++++++++++++++++++++++++-
 test/sql/hnsw_create.sql      |  92 +++++++++++++++++++++++++++++-
 2 files changed, 189 insertions(+), 5 deletions(-)

diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out
index cdceb7b1..23bf039a 100644
--- a/test/expected/hnsw_create.out
+++ b/test/expected/hnsw_create.out
@@ -75,20 +75,116 @@ CREATE TABLE small_world4 (
     id varchar(3),
     vector real[]
 );
--- If the first row is NULL we do not infer a dimension
-\set ON_ERROR_STOP off
+-- Test postponing of index creation on an empty table
+-- no options and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+-- We need to vacuum or else we won't detect that the table is empty in ambuild
+VACUUM small_world4;
+SELECT * FROM small_world4;
+ id | vector 
+----+--------
+(0 rows)
+
+-- no options and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- some options but no dim and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- some options but no dim and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- dim specified and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- dim specified and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- Test cases where a NULL vector is inserted, and dim is not specified
+-- Create postponed index on empty table with batch insert where one vector is NULL
+-- this should ignore all NULL vectors and build index upon encountering insertion of first non-NULL entry
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 begin;
-CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', NULL),
 ('001', '{1,0,0,1}');
 INFO:  done init usearch index
 INFO:  inserted 1 elements
 INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- Empty table with first insert where vector is NULL
+-- This should ignore the NULL vector and NOT build the postponed index
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', NULL);
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- If the first row is NULL and index is not postponed (non-empty table) then we can't infer dimension and this will error
+\set ON_ERROR_STOP off
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', NULL),
+('001', '{1,0,0,1}');
 CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 ERROR:  Failed to infer dimensions from non-empty table, please specify one
 rollback;
 \set ON_ERROR_STOP on
+VACUUM small_world4;
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', '{1,0,0,0}'),
 ('001', '{1,0,0,1}'),
diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql
index 03edbc8f..63136516 100644
--- a/test/sql/hnsw_create.sql
+++ b/test/sql/hnsw_create.sql
@@ -32,16 +32,104 @@ CREATE TABLE small_world4 (
     id varchar(3),
     vector real[]
 );
--- If the first row is NULL we do not infer a dimension
+
+-- Test postponing of index creation on an empty table
+-- no options and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+rollback;
+DROP INDEX small_world4_idx1;
+-- We need to vacuum or else we won't detect that the table is empty in ambuild
+VACUUM small_world4;
+
+SELECT * FROM small_world4;
+
+-- no options and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+
+-- some options but no dim and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+
+-- some options but no dim and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+
+-- dim specified and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+-- dim specified and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+-- Test cases where a NULL vector is inserted, and dim is not specified
+
+-- Create postponed index on empty table with batch insert where one vector is NULL
+-- this should ignore all NULL vectors and build index upon encountering insertion of first non-NULL entry
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', NULL),
+('001', '{1,0,0,1}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+-- Empty table with first insert where vector is NULL
+-- This should ignore the NULL vector and NOT build the postponed index
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', NULL);
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+
+-- If the first row is NULL and index is not postponed (non-empty table) then we can't infer dimension and this will error
 \set ON_ERROR_STOP off
 begin;
-CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', NULL),
 ('001', '{1,0,0,1}');
 CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
 rollback;
 \set ON_ERROR_STOP on
+VACUUM small_world4;
 
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', '{1,0,0,0}'),

From 788d19b8743bdc8d1e99cf1c7a60a5f57c60d92c Mon Sep 17 00:00:00 2001
From: Danyil <danyilblyschak@gmail.com>
Date: Thu, 19 Oct 2023 00:21:06 -0400
Subject: [PATCH 4/5] postponing index build only when dimension is not
 specified now

---
 src/hnsw/build.c                     |  2 +-
 src/hnsw/insert.c                    | 12 ++++++++----
 test/expected/hnsw_cost_estimate.out |  3 +++
 test/expected/hnsw_create.out        | 16 ++++++++--------
 test/expected/hnsw_dist_func.out     | 18 +++++++++---------
 test/expected/hnsw_insert.out        | 12 ++++++------
 test/sql/hnsw_create.sql             |  4 ++--
 7 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/hnsw/build.c b/src/hnsw/build.c
index 1b15ec48..c59f1391 100644
--- a/src/hnsw/build.c
+++ b/src/hnsw/build.c
@@ -434,7 +434,7 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt
         return;
     }
 
-    if(empty_table && !buildstate->postponed) {
+    if(empty_table && buildstate->dimensions < 1 && !buildstate->postponed) {
         // Postpone creation of the index until the first insert, where we can get the dimension from that inserted
         // vector and then build the index with that dimension
         return;
diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c
index 487c0aec..ccb4e26b 100644
--- a/src/hnsw/insert.c
+++ b/src/hnsw/insert.c
@@ -93,14 +93,17 @@ bool ldb_aminsert(Relation         index,
     datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
     column_type = GetIndexColumnType(index);
 
-    bool postponed = RelationGetNumberOfBlocks(index) == 0;
+    int  index_ndims = ldb_HnswGetDim(index);
+    bool index_ndims_exists = index_ndims >= 1;
+    bool index_empty = RelationGetNumberOfBlocks(index) == 0;
+    bool postponed = index_empty && !index_ndims_exists;
 
     // TODO: what if there are concurrent inserts? can that result in issues with creating this postponed index?
     if(postponed) {
         int ndims = DatumGetLength(datum, column_type);
-        int index_ndims = ldb_HnswGetDim(index);
 
-        if(index_ndims >= 1 && index_ndims != ndims) {
+        /*
+        if(index_ndims_exists && index_ndims != ndims) {
             elog(ERROR,
                  "Vector dimension %d of inserted vector does not match vector dimension %d specified during index "
                  "creation.",
@@ -108,6 +111,7 @@ bool ldb_aminsert(Relation         index,
                  index_ndims);
             return false;
         }
+        */
 
         if(ndims < 1) {
             elog(ERROR, "Could not identify dimension of inserted vector!");
@@ -130,7 +134,7 @@ bool ldb_aminsert(Relation         index,
 
         BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);
 
-        // Building the index already inserted this vector since it was written to the heap prior to this AM method
+        // Building the index already inserted this vector since it was written to the heap prior to this function
         // being called, so we can return to avoid inserting twice
         return false;
     }
diff --git a/test/expected/hnsw_cost_estimate.out b/test/expected/hnsw_cost_estimate.out
index 3e749f96..7940f805 100644
--- a/test/expected/hnsw_cost_estimate.out
+++ b/test/expected/hnsw_cost_estimate.out
@@ -52,6 +52,9 @@ SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
 -- Case 0, sanity check. No data.
 CREATE TABLE empty_table(id SERIAL PRIMARY KEY, v REAL[2]);
 CREATE INDEX empty_idx ON empty_table USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=2, dim=2);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
 SET _lantern_internal.is_test = true;
 SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47);
 DEBUG:  LANTERN - Query cost estimator
diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out
index 23bf039a..0d7cffe7 100644
--- a/test/expected/hnsw_create.out
+++ b/test/expected/hnsw_create.out
@@ -128,26 +128,26 @@ INFO:  done saving 1 vectors
 rollback;
 DROP INDEX small_world4_idx1;
 VACUUM small_world4;
--- dim specified and single insert
+-- dim specified and single insert (this should NOT postpone index build since dim is specified)
 CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
 begin;
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', '{0,1,2,0}');
-INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
 rollback;
 DROP INDEX small_world4_idx1;
 VACUUM small_world4;
--- dim specified and batch insert
+-- dim specified and batch insert (this should NOT postpone index build since dim is specified)
 CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
 begin;
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', '{0,1,2,0}'),
 ('001', '{1,0,0,1}');
-INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
 rollback;
 DROP INDEX small_world4_idx1;
 VACUUM small_world4;
diff --git a/test/expected/hnsw_dist_func.out b/test/expected/hnsw_dist_func.out
index 7cc439e1..78acab3a 100644
--- a/test/expected/hnsw_dist_func.out
+++ b/test/expected/hnsw_dist_func.out
@@ -20,20 +20,20 @@ CREATE TABLE small_world_l2 (id VARCHAR(3), v REAL[]);
 CREATE TABLE small_world_cos (id VARCHAR(3), v REAL[]);
 CREATE TABLE small_world_ham (id VARCHAR(3), v INTEGER[]);
 CREATE INDEX ON small_world_l2 USING hnsw (v dist_l2sq_ops) WITH (dim=3);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
 CREATE INDEX ON small_world_cos USING hnsw (v dist_cos_ops) WITH (dim=3);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
 CREATE INDEX ON small_world_ham USING hnsw (v dist_hamming_ops) WITH (dim=3);
-INSERT INTO small_world_l2 SELECT id, v FROM small_world;
 INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
+INSERT INTO small_world_l2 SELECT id, v FROM small_world;
 INSERT INTO small_world_cos SELECT id, v FROM small_world;
-INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
 INSERT INTO small_world_ham SELECT id, ARRAY[CAST(v[1] AS INTEGER), CAST(v[2] AS INTEGER), CAST(v[3] AS INTEGER)] FROM small_world;
-INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
 SET enable_seqscan = false;
 -- Verify that the distance functions work (check distances)
 SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v <-> '{0,1,0}';
diff --git a/test/expected/hnsw_insert.out b/test/expected/hnsw_insert.out
index bde3e3b0..6140c130 100644
--- a/test/expected/hnsw_insert.out
+++ b/test/expected/hnsw_insert.out
@@ -12,11 +12,11 @@ CREATE TABLE small_world (
     v REAL[2]
 );
 CREATE INDEX ON small_world USING hnsw (v) WITH (dim=3);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
 -- Insert rows with valid vector data
 INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}');
-INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
 INSERT INTO small_world (v) VALUES (NULL);
 -- Attempt to insert a row with an incorrect vector length
 \set ON_ERROR_STOP off
@@ -107,10 +107,10 @@ CREATE TABLE sift_base10k (
     v REAL[128]
 );
 CREATE INDEX hnsw_idx ON sift_base10k USING hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128);
-\COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV;
 INFO:  done init usearch index
-INFO:  inserted 1 elements
-INFO:  done saving 1 vectors
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
+\COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV;
 SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset
 EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'
                                                                                                                                                                                        QUERY PLAN                                                                                                                                                                                        
diff --git a/test/sql/hnsw_create.sql b/test/sql/hnsw_create.sql
index 63136516..f49e690d 100644
--- a/test/sql/hnsw_create.sql
+++ b/test/sql/hnsw_create.sql
@@ -78,7 +78,7 @@ DROP INDEX small_world4_idx1;
 VACUUM small_world4;
 
 
--- dim specified and single insert
+-- dim specified and single insert (this should NOT postpone index build since dim is specified)
 CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
 begin;
 INSERT INTO small_world4 (id, vector) VALUES
@@ -87,7 +87,7 @@ rollback;
 DROP INDEX small_world4_idx1;
 VACUUM small_world4;
 
--- dim specified and batch insert
+-- dim specified and batch insert (this should NOT postpone index build since dim is specified)
 CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
 begin;
 INSERT INTO small_world4 (id, vector) VALUES

From a8d7396d8f6ceb7a00cdac5bb8dce5a06af320cf Mon Sep 17 00:00:00 2001
From: Danyil <danyilblyschak@gmail.com>
Date: Thu, 19 Oct 2023 00:27:39 -0400
Subject: [PATCH 5/5] minor formatting

---
 src/hnsw/build.c   |  3 +--
 src/hnsw/insert.c  | 12 +-----------
 src/hnsw/options.c | 15 +++++++--------
 3 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/src/hnsw/build.c b/src/hnsw/build.c
index c59f1391..53966a69 100644
--- a/src/hnsw/build.c
+++ b/src/hnsw/build.c
@@ -363,8 +363,6 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i
     if(buildstate->dimensions < 1 && !buildstate->postponed) {
         buildstate->dimensions = InferDimension(heap, indexInfo);
     }
-    /* Require column to have dimensions to be indexed */
-    // if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");
 
     // not supported because of 8K page limit in postgres WAL pages
     // can pass this limit once quantization is supported
@@ -501,6 +499,7 @@ void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildSt
 
         UpdateProgress(PROGRESS_CREATEIDX_PHASE, PROGRESS_HNSW_PHASE_IN_MEMORY_INSERT);
         LanternBench("build hnsw index", ScanTable(buildstate));
+
         elog(INFO, "inserted %ld elements", usearch_size(buildstate->usearch_index, &error));
         assert(error == NULL);
     }
diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c
index ccb4e26b..0e2679af 100644
--- a/src/hnsw/insert.c
+++ b/src/hnsw/insert.c
@@ -76,6 +76,7 @@ bool ldb_aminsert(Relation         index,
 #if PG_VERSION_NUM >= 140000
     LDB_UNUSED(indexUnchanged);
 #endif
+
     HnswInsertState *insertstate = palloc0(sizeof(HnswInsertState));
 
     if(checkUnique != UNIQUE_CHECK_NO) {
@@ -102,17 +103,6 @@ bool ldb_aminsert(Relation         index,
     if(postponed) {
         int ndims = DatumGetLength(datum, column_type);
 
-        /*
-        if(index_ndims_exists && index_ndims != ndims) {
-            elog(ERROR,
-                 "Vector dimension %d of inserted vector does not match vector dimension %d specified during index "
-                 "creation.",
-                 ndims,
-                 index_ndims);
-            return false;
-        }
-        */
-
         if(ndims < 1) {
             elog(ERROR, "Could not identify dimension of inserted vector!");
             return false;
diff --git a/src/hnsw/options.c b/src/hnsw/options.c
index 47886fa4..c0b84442 100644
--- a/src/hnsw/options.c
+++ b/src/hnsw/options.c
@@ -103,14 +103,13 @@ usearch_metric_kind_t ldb_HnswGetMetricKind(Relation index)
  */
 bytea *ldb_amoptions(Datum reloptions, bool validate)
 {
-    static const relopt_parse_elt tab[] = {
-        {"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)},
-        {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)},
-        {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)},
-        {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)},
-        {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)},
-        {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)},
-    };
+    static const relopt_parse_elt tab[]
+        = {{"dim", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, dim)},
+           {"element_limit", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, element_limit)},
+           {"m", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, m)},
+           {"ef_construction", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef_construction)},
+           {"ef", RELOPT_TYPE_INT, offsetof(ldb_HnswOptions, ef)},
+           {"_experimental_index_path", RELOPT_TYPE_STRING, offsetof(ldb_HnswOptions, experimantal_index_path_offset)}};
 
 #if PG_VERSION_NUM >= 130000
     LDB_UNUSED(validate);