Skip to content

Commit

Permalink
Add flag to disable operator rewriting hooks and make pgvector-compat…
Browse files Browse the repository at this point in the history
…ible (#240)

* Add flag to disable operator rewriting hooks

* Add operators for cosine and hamming distances to work on pgvector compatibility mode

* Add update sql file

* Run pgvector tests in pgvector_compat mode

* Fix vector tests

* Chown pgvector dir for postgres

* remove pgvector directory before installing

* Fix update path

* Keep original hooks every time the pgvector_compat guc is changed

* Reset original hooks only if changed in fini

* Set pgvector_compat to TRUE by default and update tests

* Update README

* Fix brew symlink issue

* Remove symlink before brew install

* Ignore brew install error
  • Loading branch information
var77 authored Dec 9, 2023
1 parent 37a8519 commit 6223b7a
Show file tree
Hide file tree
Showing 27 changed files with 711 additions and 39 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.3)

set(LANTERNDB_VERSION 0.0.8)
set(LANTERNDB_VERSION 0.0.9)

project(
LanternDB
Expand Down Expand Up @@ -189,6 +189,7 @@ set (_update_files
sql/updates/0.0.5--0.0.6.sql
sql/updates/0.0.6--0.0.7.sql
sql/updates/0.0.7--0.0.8.sql
sql/updates/0.0.8--0.0.9.sql
)

add_custom_command(
Expand Down
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,27 @@ FROM small_world ORDER BY vector <-> ARRAY[0,0,0] LIMIT 1;

### A note on operators and operator classes

Lantern supports several distance functions in the index. You only need to specify the distance function used for a column at index creation time. Lantern will automatically infer the distance function to use for search so you always use `<->` operator in search queries.
Lantern supports several distance functions in the index and it has 2 modes for operators:

1. `lantern.pgvector_compat=TRUE` (default)
In this mode there are 3 operators available: `<->` (l2sq), `<=>` (cosine), `<+>` (hamming).
You need to use the right operator in order to trigger an index scan.

2. `lantern.pgvector_compat=FALSE`
In this mode you only need to specify the distance function used for a column at index creation time. Lantern will automatically infer the distance function to use for search, so you always use the `<->` operator in search queries.

Note that the operator `<->` is intended exclusively for use with index lookups. If you expect to not use the index in a query, just use the distance function directly (e.g. `l2sq_dist(v1, v2)`)

> To switch between modes, set the `lantern.pgvector_compat` variable to `TRUE` or `FALSE`.

There are six defined operator classes that can be employed during index creation:

- **`dist_l2sq_ops`**: Default for the type `real[]`
- **`dist_vec_l2sq_ops`**: Default for the type `vector`
- **`dist_cos_ops`**: Applicable to the type `real[]`
- **`dist_hamming_ops`**: Applicable for the type `integer[]`
- **`dist_vec_cos_ops`**: Applicable to the type `vector`
- **`dist_hamming_ops`**: Applicable to the type `integer[]`
- **`dist_vec_hamming_ops`**: Applicable to the type `vector`

### Index Construction Parameters

Expand Down
1 change: 1 addition & 0 deletions ci/scripts/build-linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ function cleanup_environment() {

# Chown to postgres for running tests
chown -R postgres:postgres /tmp/lantern
chown -R postgres:postgres /tmp/pgvector
}
2 changes: 1 addition & 1 deletion ci/scripts/build-mac.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ function setup_locale_and_install_packages() {
}

function setup_postgres() {
cmd="brew install postgresql@${PG_VERSION} clang-format"
cmd="brew install postgresql@${PG_VERSION} clang-format || true" # ignoring brew linking errors
if [[ $USER == "root" ]]
then
# Runner is github CI user
Expand Down
4 changes: 3 additions & 1 deletion ci/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ function install_external_dependencies() {
PGVECTOR_VERSION=0.5.0
wget -O pgvector.tar.gz https://github.com/pgvector/pgvector/archive/refs/tags/v${PGVECTOR_VERSION}.tar.gz
tar xzf pgvector.tar.gz
pushd pgvector-${PGVECTOR_VERSION}
rm -rf pgvector || true
mv pgvector-${PGVECTOR_VERSION} pgvector
pushd pgvector
make && make install
popd
popd
Expand Down
23 changes: 22 additions & 1 deletion ci/scripts/run-tests-linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,33 @@ function wait_for_pg(){
done
}

# Run pgvector's own regression suite from /tmp/pgvector with the lantern
# extension loaded and lantern.pgvector_compat enabled.
#
# Returns the exit status of `make installcheck` (or 1 if /tmp/pgvector cannot
# be entered). The status is propagated explicitly because this function is
# called inside an `&&` list, where `set -e` is ignored within the function
# body; without it the function would return the status of the final `popd`
# and a failing test run would go unnoticed.
function run_pgvector_tests(){
  local status=0
  local file

  # Bail out instead of patching files in whatever directory we happen to be in
  pushd /tmp/pgvector || return 1

  # Add lantern to load-extension in pgregress
  sed -i '/REGRESS_OPTS \=/ s/$/ --load-extension lantern/' Makefile

  # Set pgvector_compat flag in test files
  for file in ./test/sql/*; do
    echo 'SET lantern.pgvector_compat=TRUE;' | cat - "$file" > temp && mv temp "$file"
  done

  # Set pgvector_compat flag in result files
  # (the expected output has to echo the extra SET statement as well)
  for file in ./test/expected/*.out; do
    echo 'SET lantern.pgvector_compat=TRUE;' | cat - "$file" > temp && mv temp "$file"
  done

  # Run tests, remembering the result so that popd does not mask a failure
  make installcheck || status=$?

  popd
  return $status
}

function run_db_tests(){
if [[ "$RUN_TESTS" == "1" ]]
then
cd $WORKDIR/build && \
make test && \
make test-client
make test-client && \
run_pgvector_tests && \
killall postgres && \
gcovr -r $WORKDIR/src/ --object-directory $WORKDIR/build/ --xml /tmp/coverage.xml
fi
Expand Down
68 changes: 61 additions & 7 deletions sql/lantern.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,52 @@ CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
CREATE FUNCTION ldb_generic_dist(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION ldb_generic_dist(integer[], integer[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION l2sq_dist(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- this function is needed, as we should also use <-> operator
-- with integer[] type (to overwrite hamming dist function in our hooks)
-- and if we do not create l2sq_dist for integer[] type it will fail to cast in pgvector_compat mode
CREATE FUNCTION l2sq_dist(integer[], integer[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION cos_dist(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- functions _with_guard suffix are used to forbid operator usage
-- if operator hooks are enabled (lantern.pgvector_compat=FALSE)
CREATE FUNCTION cos_dist_with_guard(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION hamming_dist(integer[], integer[]) RETURNS integer
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;


CREATE FUNCTION hamming_dist_with_guard(integer[], integer[]) RETURNS integer
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- operators
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = ldb_generic_dist,
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
);

CREATE OPERATOR <-> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = ldb_generic_dist,
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
);

CREATE OPERATOR <=> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = cos_dist_with_guard,
COMMUTATOR = '<=>'
);

CREATE OPERATOR <+> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = hamming_dist_with_guard,
COMMUTATOR = '<+>'
);


CREATE SCHEMA _lantern_internal;

CREATE FUNCTION _lantern_internal.validate_index(index regclass, print_info boolean DEFAULT true) RETURNS VOID
Expand Down Expand Up @@ -56,14 +79,20 @@ BEGIN
CREATE OPERATOR CLASS dist_cos_ops
FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(real[], real[]);
FUNCTION 1 cos_dist(real[], real[]),
-- it is important to set the function with guard the second
-- as op rewriting hook takes the first function to use
OPERATOR 2 <=> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist_with_guard(real[], real[]);
';

dist_hamming_ops := '
CREATE OPERATOR CLASS dist_hamming_ops
FOR TYPE integer[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (integer[], integer[]) FOR ORDER BY float_ops,
FUNCTION 1 hamming_dist(integer[], integer[]);
FUNCTION 1 hamming_dist(integer[], integer[]),
OPERATOR 2 <+> (integer[], integer[]) FOR ORDER BY integer_ops,
FUNCTION 2 hamming_dist_with_guard(integer[], integer[]);
';

-- Execute the dynamic SQL statement.
Expand Down Expand Up @@ -107,10 +136,35 @@ BEGIN
CREATE FUNCTION l2sq_dist(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_l2sq_dist' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION cos_dist(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_cos_dist' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION hamming_dist(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_hamming_dist' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE OPERATOR <+> (
LEFTARG = vector, RIGHTARG = vector, PROCEDURE = hamming_dist,
COMMUTATOR = '<+>'
);

CREATE OPERATOR CLASS dist_vec_l2sq_ops
DEFAULT FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(vector, vector);

CREATE OPERATOR CLASS dist_vec_cos_ops
FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(vector, vector),
OPERATOR 2 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist(vector, vector);

CREATE OPERATOR CLASS dist_vec_hamming_ops
FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 hamming_dist(vector, vector),
OPERATOR 2 <+> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 hamming_dist(vector, vector);
END IF;


Expand Down
169 changes: 169 additions & 0 deletions sql/updates/0.0.8--0.0.9.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
-- Upgrade script from 0.0.8 to 0.0.9.
--
-- What this block does, in order:
--   1. (Re)defines _lantern_internal._recreate_ldb_operator_classes.
--   2. If a type named 'vector' exists, adds cos_dist/hamming_dist for vector,
--      the <+> operator for vector and the dist_vec_cos_ops /
--      dist_vec_hamming_ops operator classes.
--   3. Adds l2sq_dist(integer[], integer[]) and the *_with_guard functions.
--   4. Remembers the definitions of the existing indexes of the access method.
--   5. Re-creates the <-> operators on top of l2sq_dist and adds <=> and <+>.
--   6. Recreates the operator classes and then re-runs the remembered index
--      definitions.
DO $BODY$
DECLARE
-- TRUE when pg_type contains a type named 'vector'
pgvector_exists boolean;
-- access method to operate on: 'hnsw', or 'lantern_hnsw' when the vector type exists
am_name TEXT;
-- current row while iterating over pg_indexes
r pg_indexes%ROWTYPE;
indexes_cursor REFCURSOR;
-- names and definitions (pg_indexes.indexdef) of the indexes to re-create at the end
index_names TEXT[] := '{}';
index_definitions TEXT[] := '{}';
BEGIN
-- Function to recreate operator classes for specified access method
-- Drops (IF EXISTS ... CASCADE) and re-creates dist_l2sq_ops, dist_cos_ops and
-- dist_hamming_ops for the given access method; always returns TRUE.
CREATE OR REPLACE FUNCTION _lantern_internal._recreate_ldb_operator_classes(access_method_name TEXT) RETURNS BOOLEAN AS $$
DECLARE
dist_l2sq_ops TEXT;
dist_l2sq_ops_drop TEXT;
dist_cos_ops TEXT;
dist_cos_ops_drop TEXT;
dist_hamming_ops TEXT;
dist_hamming_ops_drop TEXT;
BEGIN

-- Construct the SQL statements to drop and create the operator classes dynamically.
-- access_method_name is concatenated into the statements as-is (no quoting).
dist_l2sq_ops_drop := 'DROP OPERATOR CLASS IF EXISTS dist_l2sq_ops USING ' || access_method_name || ' CASCADE;';
dist_l2sq_ops := '
CREATE OPERATOR CLASS dist_l2sq_ops
DEFAULT FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 l2sq_dist(real[], real[]);
';

dist_cos_ops_drop := 'DROP OPERATOR CLASS IF EXISTS dist_cos_ops USING ' || access_method_name || ' CASCADE;';
dist_cos_ops := '
CREATE OPERATOR CLASS dist_cos_ops
FOR TYPE real[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(real[], real[]),
-- it is important to set the function with guard the second
-- as op rewriting hook takes the first function to use
OPERATOR 2 <=> (real[], real[]) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist_with_guard(real[], real[]);
';


dist_hamming_ops_drop := 'DROP OPERATOR CLASS IF EXISTS dist_hamming_ops USING ' || access_method_name || ' CASCADE;';
dist_hamming_ops := '
CREATE OPERATOR CLASS dist_hamming_ops
FOR TYPE integer[] USING ' || access_method_name || ' AS
OPERATOR 1 <-> (integer[], integer[]) FOR ORDER BY float_ops,
FUNCTION 1 hamming_dist(integer[], integer[]),
OPERATOR 2 <+> (integer[], integer[]) FOR ORDER BY integer_ops,
FUNCTION 2 hamming_dist_with_guard(integer[], integer[]);
';


-- Execute the dynamic SQL statements: each class is dropped right before it is re-created.
EXECUTE dist_l2sq_ops_drop;
EXECUTE dist_l2sq_ops;
EXECUTE dist_cos_ops_drop;
EXECUTE dist_cos_ops;
EXECUTE dist_hamming_ops_drop;
EXECUTE dist_hamming_ops;

RETURN TRUE;
END;
$$ LANGUAGE plpgsql VOLATILE;

-- Check if the vector type from pgvector exists
-- (the lookup is by type name only; the schema is not considered)
SELECT EXISTS (
SELECT 1
FROM pg_type
WHERE typname = 'vector'
) INTO pgvector_exists;

-- default access method name; overridden below when the vector type exists
am_name := 'hnsw';


IF pgvector_exists THEN
-- distance functions, operator and operator classes for the vector type
CREATE FUNCTION cos_dist(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_cos_dist' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION hamming_dist(vector, vector) RETURNS float8
AS 'MODULE_PATHNAME', 'vector_hamming_dist' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;


CREATE OPERATOR <+> (
LEFTARG = vector, RIGHTARG = vector, PROCEDURE = hamming_dist,
COMMUTATOR = '<+>'
);

CREATE OPERATOR CLASS dist_vec_cos_ops
FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 cos_dist(vector, vector),
OPERATOR 2 <=> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 cos_dist(vector, vector);

CREATE OPERATOR CLASS dist_vec_hamming_ops
FOR TYPE vector USING lantern_hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 hamming_dist(vector, vector),
OPERATOR 2 <+> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 2 hamming_dist(vector, vector);

am_name := 'lantern_hnsw';
END IF;

-- this function is needed, as we should also use <-> operator
-- with integer[] type (to overwrite hamming dist function in our hooks)
-- and if we do not create l2sq_dist for integer[] type it will fail to cast in pgvector_compat mode
CREATE OR REPLACE FUNCTION l2sq_dist(integer[], integer[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

-- functions _with_guard suffix are used to forbid operator usage
-- if operator hooks are enabled (lantern.pgvector_compat=FALSE)
CREATE FUNCTION cos_dist_with_guard(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;

CREATE FUNCTION hamming_dist_with_guard(integer[], integer[]) RETURNS integer
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;


-- Remember the existing indexes of this access method so they can be re-created later:
-- the indexes have to be dropped in order to change the operator classes.
-- This must happen before the DROP ... CASCADE statements below.
OPEN indexes_cursor FOR SELECT * FROM pg_indexes WHERE indexdef ILIKE '%USING ' || am_name || '%';
-- Fetch index names and definitions into the arrays
LOOP
FETCH indexes_cursor INTO r;
EXIT WHEN NOT FOUND;

-- Append index name and definition to the arrays (same position in both)
index_names := array_append(index_names, r.indexname);
index_definitions := array_append(index_definitions, r.indexdef);
END LOOP;

CLOSE indexes_cursor;

-- operators
-- <-> is dropped and re-created so that it is backed by l2sq_dist
DROP OPERATOR <->(real[], real[]) CASCADE;
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
);

DROP OPERATOR <->(integer[], integer[]) CASCADE;
CREATE OPERATOR <-> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = l2sq_dist,
COMMUTATOR = '<->'
);

-- new operators backed by the guarded distance functions
CREATE OPERATOR <=> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = cos_dist_with_guard,
COMMUTATOR = '<=>'
);

CREATE OPERATOR <+> (
LEFTARG = integer[], RIGHTARG = integer[], PROCEDURE = hamming_dist_with_guard,
COMMUTATOR = '<+>'
);

-- recreate the operator classes now that the operators and functions exist
PERFORM _lantern_internal._recreate_ldb_operator_classes(am_name);

-- NOTE(review): this is a plain SET (not SET LOCAL), presumably so the notices below are shown;
-- confirm that leaving client_min_messages changed afterwards is intended
SET client_min_messages TO NOTICE;
-- reindex indexes: re-run every index definition captured above
-- (coalesce covers the empty array, for which array_length returns NULL)
FOR i IN 1..coalesce(array_length(index_names, 1), 0) LOOP
RAISE NOTICE 'Reindexing index %', index_names[i];
EXECUTE index_definitions[i];
RAISE NOTICE 'Reindexed index: %', index_names[i];
END LOOP;
END;
$BODY$
LANGUAGE plpgsql;
Loading

0 comments on commit 6223b7a

Please sign in to comment.