Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Postpone index build for empty table to first insert #209

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/hnsw.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ static void hnswcostestimate(PlannerInfo *root,
costs.numIndexTuples = estimate_number_tuples_accessed(path->indexinfo->indexoid, num_tuples_in_index);
uint64 num_blocks_accessed
= estimate_number_blocks_accessed(num_tuples_in_index, path->indexinfo->pages, costs.numIndexTuples);
// Clamp to at least 1: on a postponed index build the estimate above is 0. This should only affect scans on
// empty indexes.
num_blocks_accessed = (num_blocks_accessed > 1) ? num_blocks_accessed : 1;

#if PG_VERSION_NUM >= 120000
genericcostestimate(root, path, loop_count, &costs);
Expand Down Expand Up @@ -386,6 +389,23 @@ HnswColumnType GetIndexColumnType(Relation index)
return GetColumnTypeFromOid(attr->atttypid);
}

/*
 * Returns the number of elements of the vector stored in datum.
 *
 * datum: the value to measure
 * type:  how datum is decoded. VECTOR reads the dim field of the Vector;
 *        REAL_ARRAY and INT_ARRAY count the items of the array over all of its dimensions.
 *
 * Any other type is reported with elog(ERROR, "Unsupported type").
 */
int DatumGetLength(Datum datum, HnswColumnType type)
{
    if(type == VECTOR) {
        Vector *vector = DatumGetVector(datum);
        return vector->dim;
    }

    if(type == REAL_ARRAY || type == INT_ARRAY) {
        // Only the array header (ndim and dims) is read, so a detoasted reference is enough. Taking a private copy
        // of the whole array just to count its items would allocate memory that is never used or freed here
        ArrayType *array = DatumGetArrayTypeP(datum);
        return ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
    }

    elog(ERROR, "Unsupported type");
    // elog(ERROR) is not expected to return; this keeps a return value on every path for the compiler
    return -1;
}

/*
* Given vector data and vector type, read it as either a float4 or int32 array and return as void*
*/
Expand Down
1 change: 1 addition & 0 deletions src/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ PGDLLEXPORT Datum cos_dist(PG_FUNCTION_ARGS);

HnswColumnType GetColumnTypeFromOid(Oid oid);
HnswColumnType GetIndexColumnType(Relation index);
int DatumGetLength(Datum datum, HnswColumnType type);
void *DatumGetSizedArray(Datum datum, HnswColumnType type, int dimensions);

#define LDB_UNUSED(x) (void)(x)
Expand Down
43 changes: 33 additions & 10 deletions src/hnsw/build.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,15 @@ static void BuildCallback(
Relation index, CALLBACK_ITEM_POINTER, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
{
HnswBuildState *buildstate = (HnswBuildState *)state;
MemoryContext oldCtx;
// If this is a postponed index build, we only want to read the first tuple and build the index from that. This
// matters when an index build was postponed (on an empty table) and the first insert then arrives as part of a
// batch (like \COPY from a csv file). In that case the whole batch is already in the heap by the time the first
// aminsert runs, but only that first aminsert call returns early after building the index, so the index must be
// built from the first tuple alone. All the other tuples are subsequently inserted normally by their own aminsert
// calls.
if(buildstate->postponed && buildstate->reltuples > 0) {
return;
}
MemoryContext oldCtx;
// we can later use this for some optimizations I think
LDB_UNUSED(tupleIsAlive);

Expand Down Expand Up @@ -257,6 +265,8 @@ static int GetArrayLengthFromHeap(Relation heap, int indexCol, IndexInfo *indexI
return n_items;
}

// Attempts to get the number of dimensions from the index, and if that fails, falls back on a heap scan to fetch the
// first tuple, and get the length of the vector from that tuple
int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo)
{
HnswColumnType columnType = GetIndexColumnType(index);
Expand Down Expand Up @@ -344,19 +354,19 @@ static void InitBuildState(HnswBuildState *buildstate, Relation heap, Relation i
buildstate->index = index;
buildstate->indexInfo = indexInfo;
buildstate->columnType = GetIndexColumnType(index);
buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo);
if(!buildstate->postponed) {
buildstate->dimensions = GetHnswIndexDimensions(index, indexInfo);
}
buildstate->index_file_path = ldb_HnswGetIndexFilePath(index);

// If a dimension wasn't specified try to infer it
if(buildstate->dimensions < 1) {
if(buildstate->dimensions < 1 && !buildstate->postponed) {
buildstate->dimensions = InferDimension(heap, indexInfo);
}
/* Require column to have dimensions to be indexed */
if(buildstate->dimensions < 1) elog(ERROR, "column does not have dimensions, please specify one");

// not supported because of 8K page limit in postgres WAL pages
// can pass this limit once quantization is supported
if(buildstate->dimensions > HNSW_MAX_DIM)
if(buildstate->dimensions > HNSW_MAX_DIM && !buildstate->postponed)
elog(ERROR,
"vector dimension %d is too large. "
"LanternDB currently supports up to %ddim vectors",
Expand Down Expand Up @@ -408,14 +418,26 @@ static void ScanTable(HnswBuildState *buildstate)
/*
* Build the index
*/
static void BuildIndex(
Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum)
void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum)
{
usearch_error_t error = NULL;
usearch_init_options_t opts;
MemSet(&opts, 0, sizeof(opts));
bool empty_table = RelationGetNumberOfBlocks(heap) == 0;

InitBuildState(buildstate, heap, index, indexInfo);

if(buildstate->dimensions < 1 && !empty_table && !buildstate->postponed) {
elog(ERROR, "Failed to infer dimensions from non-empty table, please specify one");
return;
}

if(empty_table && buildstate->dimensions < 1 && !buildstate->postponed) {
// Postpone creation of the index until the first insert, where we can get the dimension from that inserted
// vector and then build the index with that dimension
return;
}

opts.dimensions = buildstate->dimensions;
PopulateUsearchOpts(index, &opts);

Expand All @@ -424,7 +446,7 @@ static void BuildIndex(
assert(error == NULL);

buildstate->hnsw = NULL;
if(buildstate->index_file_path) {
if(buildstate->index_file_path && !buildstate->postponed) {
if(access(buildstate->index_file_path, F_OK) != 0) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("Invalid index file path ")));
}
Expand All @@ -444,7 +466,7 @@ static void BuildIndex(
} else {
BlockNumber numBlocks = RelationGetNumberOfBlocks(heap);
uint32_t estimated_row_count = 0;
if(numBlocks > 0) {
if(numBlocks > 0 && !buildstate->postponed) {
// Read the first block
Buffer buffer = ReadBufferExtended(heap, MAIN_FORKNUM, 0, RBM_NORMAL, NULL);
// Lock buffer so there won't be any new writes during this operation
Expand Down Expand Up @@ -512,6 +534,7 @@ IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInf
{
IndexBuildResult *result;
HnswBuildState buildstate;
buildstate.postponed = false;

BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);

Expand Down
5 changes: 4 additions & 1 deletion src/hnsw/build.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define LDB_HNSW_BUILD_H

#include <access/genam.h>
#include <common/relpath.h>
#include <nodes/execnodes.h>
#include <utils/relcache.h>

Expand All @@ -20,6 +21,7 @@ typedef struct HnswBuildState
int dimensions;
HnswColumnType columnType;
char *index_file_path;
bool postponed;

/* Statistics */
double tuples_indexed;
Expand All @@ -36,6 +38,7 @@ typedef struct HnswBuildState
IndexBuildResult *ldb_ambuild(Relation heap, Relation index, IndexInfo *indexInfo);
void ldb_ambuildunlogged(Relation index);
int GetHnswIndexDimensions(Relation index, IndexInfo *indexInfo);
void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int deimensions);
void CheckHnswIndexDimensions(Relation index, Datum arrayDatum, int dimensions);
void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, HnswBuildState *buildstate, ForkNumber forkNum);
// todo: does this render my check unnecessary
#endif // LDB_HNSW_BUILD_H
46 changes: 42 additions & 4 deletions src/hnsw/insert.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ bool ldb_aminsert(Relation index,
GenericXLogState *state;
uint32 new_tuple_id;
HnswIndexTuple *new_tuple;
HnswColumnType column_type;
usearch_init_options_t opts = {0};
LDB_UNUSED(heap);
#if PG_VERSION_NUM >= 140000
Expand All @@ -88,7 +89,45 @@ bool ldb_aminsert(Relation index,
if(isnull[ 0 ]) {
return false;
}
// todo:: thre is room for optimization for when indexUnchanged is true
// todo:: there is room for optimization for when indexUnchanged is true

datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
column_type = GetIndexColumnType(index);

int index_ndims = ldb_HnswGetDim(index);
bool index_ndims_exists = index_ndims >= 1;
bool index_empty = RelationGetNumberOfBlocks(index) == 0;
bool postponed = index_empty && !index_ndims_exists;

// TODO: what if there are concurrent inserts? can that result in issues with creating this postponed index?
if(postponed) {
int ndims = DatumGetLength(datum, column_type);

if(ndims < 1) {
elog(ERROR, "Could not identify dimension of inserted vector!");
return false;
}

if(ndims > HNSW_MAX_DIM) {
elog(ERROR,
"Vector dimension %d of inserted vector is too large. "
"LanternDB currently supports up to %ddim vectors",
ndims,
HNSW_MAX_DIM);
return false;
}

// We now build the postponed index, using ndims
HnswBuildState buildstate;
buildstate.postponed = true;
buildstate.dimensions = ndims;

BuildIndex(heap, index, indexInfo, &buildstate, MAIN_FORKNUM);

// Building the index already inserted this vector since it was written to the heap prior to this function
// being called, so we can return to avoid inserting twice
return false;
}

insertCtx = AllocSetContextCreate(CurrentMemoryContext, "LanternInsertContext", ALLOCSET_DEFAULT_SIZES);
oldCtx = MemoryContextSwitchTo(insertCtx);
Expand All @@ -103,7 +142,7 @@ bool ldb_aminsert(Relation index,
hdr = (HnswIndexHeaderPage *)PageGetContents(hdr_page);
assert(hdr->magicNumber == LDB_WAL_MAGIC_NUMBER);

opts.dimensions = GetHnswIndexDimensions(index, indexInfo);
opts.dimensions = hdr->vector_dim;
CheckHnswIndexDimensions(index, values[ 0 ], opts.dimensions);
PopulateUsearchOpts(index, &opts);
opts.retriever_ctx = ldb_wal_retriever_area_init(index, hdr);
Expand All @@ -125,14 +164,13 @@ bool ldb_aminsert(Relation index,

insertstate->uidx = uidx;
insertstate->retriever_ctx = opts.retriever_ctx;
insertstate->columnType = GetIndexColumnType(index);
insertstate->columnType = column_type;

hdr_page = NULL;

meta = usearch_metadata(uidx, &error);
assert(!error);

datum = PointerGetDatum(PG_DETOAST_DATUM(values[ 0 ]));
void *vector = DatumGetSizedArray(datum, insertstate->columnType, opts.dimensions);

#if LANTERNDB_COPYNODES
Expand Down
106 changes: 102 additions & 4 deletions test/expected/hnsw_create.out
Original file line number Diff line number Diff line change
Expand Up @@ -75,18 +75,116 @@ CREATE TABLE small_world4 (
id varchar(3),
vector real[]
);
-- If the first row is NULL we do not infer a dimension
-- Test postponing of index creation on an empty table
-- no options and single insert
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{0,1,2,0}');
INFO: done init usearch index
INFO: inserted 1 elements
INFO: done saving 1 vectors
rollback;
DROP INDEX small_world4_idx1;
-- We need to vacuum or else we won't detect that the table is empty in ambuild
VACUUM small_world4;
SELECT * FROM small_world4;
id | vector
----+--------
(0 rows)

-- no options and batch insert
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{0,1,2,0}'),
('001', '{1,0,0,1}');
INFO: done init usearch index
INFO: inserted 1 elements
INFO: done saving 1 vectors
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- some options but no dim and single insert
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{0,1,2,0}');
INFO: done init usearch index
INFO: inserted 1 elements
INFO: done saving 1 vectors
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- some options but no dim and batch insert
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{0,1,2,0}'),
('001', '{1,0,0,1}');
INFO: done init usearch index
INFO: inserted 1 elements
INFO: done saving 1 vectors
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- dim specified and single insert (this should NOT postpone index build since dim is specified)
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
INFO: done init usearch index
INFO: inserted 0 elements
INFO: done saving 0 vectors
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{0,1,2,0}');
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- dim specified and batch insert (this should NOT postpone index build since dim is specified)
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2, dim=4);
INFO: done init usearch index
INFO: inserted 0 elements
INFO: done saving 0 vectors
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{0,1,2,0}'),
('001', '{1,0,0,1}');
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- Test cases where a NULL vector is inserted, and dim is not specified
-- Create postponed index on empty table with batch insert where one vector is NULL
-- this should ignore all NULL vectors and build index upon encountering insertion of first non-NULL entry
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL),
('001', '{1,0,0,1}');
INFO: done init usearch index
INFO: inserted 1 elements
INFO: done saving 1 vectors
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- Empty table with first insert where vector is NULL
-- This should ignore the NULL vector and NOT build the postponed index
CREATE INDEX small_world4_idx1 ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL);
rollback;
DROP INDEX small_world4_idx1;
VACUUM small_world4;
-- If the first row is NULL and index is not postponed (non-empty table) then we can't infer dimension and this will error
\set ON_ERROR_STOP off
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
ERROR: column does not have dimensions, please specify one
begin;
INSERT INTO small_world4 (id, vector) VALUES
('000', NULL),
('001', '{1,0,0,1}');
CREATE INDEX ON small_world4 USING hnsw (vector) WITH (M=14, ef=22, ef_construction=2);
ERROR: column does not have dimensions, please specify one
ERROR: Failed to infer dimensions from non-empty table, please specify one
rollback;
\set ON_ERROR_STOP on
VACUUM small_world4;
INSERT INTO small_world4 (id, vector) VALUES
('000', '{1,0,0,0}'),
('001', '{1,0,0,1}'),
Expand Down
Loading