lanterndata · therealdarkknight · Oct 19, 2023 · Oct 19, 2023 · Oct 19, 2023 · Oct 19, 2023
diff --git a/src/hnsw.c b/src/hnsw.c
@@ -172,6 +172,9 @@ static void hnswcostestimate(PlannerInfo *root,
     costs.numIndexTuples = estimate_number_tuples_accessed(path->indexinfo->indexoid, num_tuples_in_index);
     uint64 num_blocks_accessed
         = estimate_number_blocks_accessed(num_tuples_in_index, path->indexinfo->pages, costs.numIndexTuples);
+    // Use max(num_blocks_accessed, 1): on a postponed index build the estimate above is 0. This should only affect
+    // scans on empty indexes
+    num_blocks_accessed = (num_blocks_accessed > 1) ? num_blocks_accessed : 1;
 
 #if PG_VERSION_NUM >= 120000
     genericcostestimate(root, path, loop_count, &costs);
@@ -386,6 +389,23 @@ HnswColumnType GetIndexColumnType(Relation index)
     return GetColumnTypeFromOid(attr->atttypid);
 }
 
+/*
+ * Returns the element count of the vector/array held in datum; raises ERROR for unsupported column types
+ */
+int DatumGetLength(Datum datum, HnswColumnType type)
+{
+    if(type == VECTOR) {
+        Vector *vector = DatumGetVector(datum);
+        return vector->dim;
+    } else if(type == REAL_ARRAY || type == INT_ARRAY) {
+        ArrayType *array = DatumGetArrayTypeP(datum);  // read-only access: no need for a palloc'd copy
+        return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+    } else {
+        elog(ERROR, "Unsupported type");
+    }
+    return -1;  // not reached: elog(ERROR) does not return; keeps compilers quiet
+}
+
 /*
  * Given vector data and vector type, read it as either a float4 or int32 array and return as void*
  */

diff --git a/src/hnsw.h b/src/hnsw.h
@@ -34,6 +34,7 @@ PGDLLEXPORT Datum cos_dist(PG_FUNCTION_ARGS);
 
 HnswColumnType GetColumnTypeFromOid(Oid oid);
 HnswColumnType GetIndexColumnType(Relation index);
+int            DatumGetLength(Datum datum, HnswColumnType type);
 void          *DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions);
 
 #define LDB_UNUSED(x) (void)(x)

diff --git a/src/hnsw/build.c b/src/hnsw/build.c
@@ -119,7 +119,15 @@ static void BuildCallback(
     Relation index, CALLBACK_ITEM_POINTER, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 {
     HnswBuildState *buildstate = (HnswBuildState *)state;
-    MemoryContext   oldCtx;
+    // If this is a postponed index build, we only want to read the first tuple and build the index from that. This is
+    // relevant when we have postponed an index build (on an empty table) and then the first insert occurs as part of a
+    // batch (like \COPY from a csv file). When this happens, the whole batch of tuples is already in the heap by the
+    // time the first aminsert runs, but that aminsert call accounts for only the first tuple, so we index just that
+    // one here. All the other tuples will subsequently be inserted normally via their own aminsert calls
+    if(buildstate->postponed && buildstate->reltuples > 0) {
+        return;
+    }
+    MemoryContext oldCtx;
     // we can later use this for some optimizations I think
     LDB_UNUSED(tupleIsAlive);
 
@@ -257,6 +265,8 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI
     return n_items;
 }
 
+// Attempts to get the number of dimensions from the index, and if that fails, falls back on a heap scan to fetch the
+// first tuple, and get the length of the vector from that tuple
 int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo)
 {
     HnswColumnType columnType = GetIndexColumnType(index);
@@ -344,19 +354,19 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i
     buildstate->index = index;
     buildstate->indexInfo = indexInfo;
     buildstate->columnType = GetIndexColumnType(index);
-    buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo);
+    if(!buildstate->postponed) {
+        buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo);
+    }
     buildstate->index_file_path = ldb_HnswGetIndexFilePath(index);
 
     // If a dimension wasn't specified try to infer it
-    if(buildstate->dimensions < 1) {
+    if(buildstate->dimensions < 1 && !buildstate->postponed) {
         buildstate->dimensions = InferDimension(heap, indexInfo);
     }
-    /* Require column to have dimensions to be indexed */
-    if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");
 
     // not supported because of 8K page limit in postgres WAL pages
     // can pass this limit once quantization is supported
-    if(buildstate->dimensions > HNSW_MAX_DIM)
+    if(buildstate->dimensions > HNSW_MAX_DIM && !buildstate->postponed)
         elog(ERROR,
              "vector dimension %d is too large. "
              "LanternDB currently supports up to %ddim vectors",
@@ -408,14 +418,26 @@ static void ScanTable(HnswBuildState *buildstate)
 /*
  * Build the index
  */
-static void BuildIndex(
-    Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum)
+void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum)
 {
     usearch_error_t        error = NULL;
     usearch_init_options_t opts;
     MemSet(&opts, 0, sizeof(opts));
+    bool empty_table = RelationGetNumberOfBlocks(heap) == 0;
 
     InitBuildState(buildstate, heap, index, indexInfo);
+
+    if(buildstate->dimensions < 1 && !empty_table && !buildstate->postponed) {
+        elog(ERROR, "Failed to infer dimensions from non-empty table, please specify one");
+        return;
+    }
+
+    if(empty_table && buildstate->dimensions < 1 && !buildstate->postponed) {
+        // Postpone creation of the index until the first insert, where we can get the dimension from that inserted
+        // vector and then build the index with that dimension
+        return;
+    }
+
     opts.dimensions = buildstate->dimensions;
     PopulateUsearchOpts(index, &opts);
 
@@ -424,7 +446,7 @@ static void BuildIndex(
     assert(error == NULL);
 
     buildstate->hnsw = NULL;
-    if(buildstate->index_file_path) {
+    if(buildstate->index_file_path && !buildstate->postponed) {
         if(access(buildstate->index_file_path, F_OK) != 0) {
             ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path ")));
         }
@@ -444,7 +466,7 @@ static void BuildIndex(
     } else {
         BlockNumber numBlocks = RelationGetNumberOfBlocks(heap);
         uint32_t    estimated_row_count = 0;
-        if(numBlocks > 0) {
+        if(numBlocks > 0 && !buildstate->postponed) {
             // Read the first block
             Buffer buffer = ReadBufferExtended(heap, MAIN_FORKNUM, 0, RBM_NORMAL, NULL);
             // Lock buffer so there won't be any new writes during this operation
@@ -512,6 +534,7 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf
 {
     IndexBuildResult *result;
     HnswBuildState    buildstate;
+    buildstate.postponed = false;
 
     BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);
 

diff --git a/src/hnsw/build.h b/src/hnsw/build.h
@@ -2,6 +2,7 @@
 #define LDB_HNSW_BUILD_H
 
 #include <access/genam.h>
+#include <common/relpath.h>
 #include <nodes/execnodes.h>
 #include <utils/relcache.h>
 
@@ -20,6 +21,7 @@ typedef struct HnswBuildState
     int            dimensions;
     HnswColumnType columnType;
     char          *index_file_path;
+    bool           postponed;
 
     /* Statistics */
     double tuples_indexed;
@@ -36,6 +38,7 @@ typedef struct HnswBuildState
 IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInfo);
 void              ldb_ambuildunlogged(Relation index);
 int               GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo);
-void              CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int deimensions);
+void              CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions);
+void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum);
 // todo: does this render my check unnecessary
 #endif  // LDB_HNSW_BUILD_H
diff --git a/src/hnsw/insert.c b/src/hnsw/insert.c
@@ -70,6 +70,7 @@ bool ldb_aminsert(Relation         index,
     GenericXLogState      *state;
     uint32                 new_tuple_id;
     HnswIndexTuple        *new_tuple;
+    HnswColumnType         column_type;
     usearch_init_options_t opts = {0};
     LDB_UNUSED(heap);
 #if PG_VERSION_NUM >= 140000
@@ -88,7 +89,45 @@ bool ldb_aminsert(Relation         index,
     if(isnull[ 0 ]) {
         return false;
     }
-    // todo:: thre is room for optimization for when indexUnchanged is true
+    // todo:: there is room for optimization for when indexUnchanged is true
+
+    datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
+    column_type = GetIndexColumnType(index);
+
+    int  index_ndims = ldb_HnswGetDim(index);
+    bool index_ndims_exists = index_ndims >= 1;
+    bool index_empty = RelationGetNumberOfBlocks(index) == 0;
+    bool postponed = index_empty && !index_ndims_exists;
+
+    // TODO: what if there are concurrent inserts? can that result in issues with creating this postponed index?
+    if(postponed) {
+        int ndims = DatumGetLength(datum, column_type);
+
+        if(ndims < 1) {
+            elog(ERROR, "Could not identify dimension of inserted vector!");
+            return false;
+        }
+
+        if(ndims > HNSW_MAX_DIM) {
+            elog(ERROR,
+                 "Vector dimension %d of inserted vector is too large. "
+                 "LanternDB currently supports up to %ddim vectors",
+                 ndims,
+                 HNSW_MAX_DIM);
+            return false;
+        }
+
+        // We now build the postponed index, using ndims
+        HnswBuildState buildstate;
+        buildstate.postponed = true;
+        buildstate.dimensions = ndims;
+
+        BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);
+
+        // Building the index already inserted this vector since it was written to the heap prior to this function
+        // being called, so we can return to avoid inserting twice
+        return false;
+    }
 
     insertCtx = AllocSetContextCreate(CurrentMemoryContext, "LanternInsertContext", ALLOCSET_DEFAULT_SIZES);
     oldCtx = MemoryContextSwitchTo(insertCtx);
@@ -103,7 +142,7 @@ bool ldb_aminsert(Relation         index,
     hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page);
     assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER);
 
-    opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
+    opts.dimensions = hdr->vector_dim;
     CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
     PopulateUsearchOpts(index, &opts);
     opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr);
@@ -125,14 +164,13 @@ bool ldb_aminsert(Relation         index,
 
     insertstate->uidx = uidx;
     insertstate->retriever_ctx = opts.retriever_ctx;
-    insertstate->columnType = GetIndexColumnType(index);
+    insertstate->columnType = column_type;
 
     hdr_page = NULL;
 
     meta = usearch_metadata(uidx, &error);
     assert(!error);
 
-    datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
     void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions);
 
 #if LANTERNDB_COPYNODES

diff --git a/test/expected/hnsw_create.out b/test/expected/hnsw_create.out
@@ -75,18 +75,116 @@ CREATE TABLE small_world4 (
     id varchar(3),
     vector real[]
 );
--- If the first row is NULL we do not infer a dimension
+-- Test postponing of index creation on an empty table
+-- no options and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+-- We need to vacuum or else we won't detect that the table is empty in ambuild
+VACUUM small_world4;
+SELECT * FROM small_world4;
+ id | vector 
+----+--------
+(0 rows)
+
+-- no options and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- some options but no dim and single insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- some options but no dim and batch insert
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- dim specified and single insert (this should NOT postpone index build since dim is specified)
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- dim specified and batch insert (this should NOT postpone index build since dim is specified)
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
+INFO:  done init usearch index
+INFO:  inserted 0 elements
+INFO:  done saving 0 vectors
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', '{0,1,2,0}'),
+('001', '{1,0,0,1}');
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- Test cases where a NULL vector is inserted, and dim is not specified
+-- Create postponed index on empty table with batch insert where one vector is NULL
+-- this should ignore all NULL vectors and build index upon encountering insertion of first non-NULL entry
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', NULL),
+('001', '{1,0,0,1}');
+INFO:  done init usearch index
+INFO:  inserted 1 elements
+INFO:  done saving 1 vectors
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- Empty table with first insert where vector is NULL
+-- This should ignore the NULL vector and NOT build the postponed index
+CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
+begin;
+INSERT INTO small_world4 (id, vector) VALUES
+('000', NULL);
+rollback;
+DROP INDEX small_world4_idx1;
+VACUUM small_world4;
+-- If the first row is NULL and index is not postponed (non-empty table) then we can't infer dimension and this will error
 \set ON_ERROR_STOP off
-CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
-ERROR:  column does not have dimensions, please specify one
 begin;
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', NULL),
 ('001', '{1,0,0,1}');
 CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
-ERROR:  column does not have dimensions, please specify one
+ERROR:  Failed to infer dimensions from non-empty table, please specify one
 rollback;
 \set ON_ERROR_STOP on
+VACUUM small_world4;
 INSERT INTO small_world4 (id, vector) VALUES
 ('000', '{1,0,0,0}'),
 ('001', '{1,0,0,1}'),