diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7e09301fe..82397bbcb 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -54,6 +54,7 @@ jobs: - panng_ngt - pg_embedding - pgvector + - pgvecto_rs - pynndescent - redisearch - qdrant diff --git a/README.md b/README.md index 357fab209..cf592573a 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Evaluated * [Milvus](https://github.com/milvus-io/milvus) ![https://img.shields.io/github/stars/milvus-io/milvus?style=social](https://img.shields.io/github/stars/milvus-io/milvus?style=social): [Knowhere](https://github.com/milvus-io/knowhere) * [Zilliz(Glass)](https://github.com/hhy3/pyglass) * [pgvector](https://github.com/pgvector/pgvector) ![https://img.shields.io/github/stars/pgvector/pgvector?style=social](https://img.shields.io/github/stars/pgvector/pgvector?style=social) +* [pgvecto.rs](https://github.com/tensorchord/pgvecto.rs) ![https://img.shields.io/github/stars/tensorchord/pgvecto.rs?style=social](https://img.shields.io/github/stars/tensorchord/pgvecto.rs?style=social) * [RediSearch](https://github.com/redisearch/redisearch) ![https://img.shields.io/github/stars/redisearch/redisearch?style=social](https://img.shields.io/github/stars/redisearch/redisearch?style=social) * [pg_embedding](https://github.com/neondatabase/pg_embedding) ![https://img.shields.io/github/stars/pg_embedding/pg_embedding?style=social](https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social) * [Descartes(01AI)](https://github.com/xiaoming-01ai/descartes) diff --git a/ann_benchmarks/algorithms/pgvecto_rs/Dockerfile b/ann_benchmarks/algorithms/pgvecto_rs/Dockerfile new file mode 100644 index 000000000..bbde1e67a --- /dev/null +++ b/ann_benchmarks/algorithms/pgvecto_rs/Dockerfile @@ -0,0 +1,26 @@ +FROM tensorchord/pgvecto-rs:pg16-v0.3.0-alpha.1 + +# https://github.com/tensorchord/pgvecto.rs + +RUN apt-get update \ + && apt-get install -y python3-pip + +WORKDIR /home/app +COPY requirements.txt . + +RUN python3 -m pip install --break-system-packages -r requirements.txt +RUN python3 -m pip install --break-system-packages psycopg[binary] + +COPY run_algorithm.py . + +ENV POSTGRES_PASSWORD=password +ENV POSTGRES_USER=postgres + +RUN printf '#!/bin/bash\n\ +runuser -u postgres -- initdb \n\ +runuser -u postgres -- postgres -c shared_preload_libraries=vectors.so &\n\ +sleep 5\n\ +python3 -u run_algorithm.py "$@"' > entrypoint.sh \ + && chmod u+x entrypoint.sh + +ENTRYPOINT ["/home/app/entrypoint.sh"] diff --git a/ann_benchmarks/algorithms/pgvecto_rs/config.yml b/ann_benchmarks/algorithms/pgvecto_rs/config.yml new file mode 100644 index 000000000..bf57b0f6c --- /dev/null +++ b/ann_benchmarks/algorithms/pgvecto_rs/config.yml @@ -0,0 +1,17 @@ +float: + any: + - base_args: ['@metric'] + constructor: PGVectoRS + disabled: false + docker_tag: ann-benchmarks-pgvecto_rs + module: ann_benchmarks.algorithms.pgvecto_rs + name: pgvecto_rs + run_groups: + M-16: + arg_groups: [{M: 16, efConstruction: 200}] + args: {} + query_args: [[10, 20, 40, 80, 120, 200, 400, 800]] + M-24: + arg_groups: [{M: 24, efConstruction: 200}] + args: {} + query_args: [[10, 20, 40, 80, 120, 200, 400, 800]] diff --git a/ann_benchmarks/algorithms/pgvecto_rs/module.py b/ann_benchmarks/algorithms/pgvecto_rs/module.py new file mode 100644 index 000000000..3b0b41b39 --- /dev/null +++ b/ann_benchmarks/algorithms/pgvecto_rs/module.py @@ -0,0 +1,99 @@ +import struct +import time + +import numpy as np +import psycopg +from psycopg.adapt import Dumper, Loader +from psycopg.pq import Format +from psycopg.types import TypeInfo + +from ..base.module import BaseANN + + +class VectorDumper(Dumper): + format = Format.BINARY + + def dump(self, obj): + return struct.pack(f" None: + self.metric = metric + self.m = method_param["M"] + self.ef_construction = method_param["efConstruction"] + self.ef_search = 100 + + if metric == "angular": + self.query_sql = "SELECT id FROM items ORDER BY embedding <=> %s LIMIT %s" + self.index_sql = f"CREATE INDEX ON items USING vectors (embedding vector_cos_ops) WITH (options = $$[indexing.hnsw]\nm = {self.m}\nef_construction = {self.ef_construction}$$)" + elif metric == "euclidean": + self.query_sql = "SELECT id FROM items ORDER BY embedding <-> %s LIMIT %s" + self.index_sql = f"CREATE INDEX ON items USING vectors (embedding vector_l2_ops) WITH (options = $$[indexing.hnsw]\nm = {self.m}\nef_construction = {self.ef_construction}$$)" + else: + raise RuntimeError(f"unknown metric {metric}") + + self.connect = psycopg.connect(user="postgres", password="password", autocommit=True) + self.connect.execute("SET search_path = \"$user\", public, vectors") + self.connect.execute("CREATE EXTENSION IF NOT EXISTS vectors") + register_vector(self.connect) + + def fit(self, X): + dim = X.shape[1] + + cur = self.connect.cursor() + cur.execute("DROP TABLE IF EXISTS items") + cur.execute(f"CREATE TABLE items (id int, embedding vector({dim}))") + with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.set_types(["int4", "vector"]) + for i, emb in enumerate(X): + copy.write_row((i, emb)) + + cur.execute(self.index_sql) + print("waiting for indexing to finish...") + for _ in range(3600): + cur.execute("SELECT idx_indexing FROM vectors.pg_vector_index_stat WHERE tablename='items'") + if not cur.fetchone()[0]: + break + time.sleep(10) + + def set_query_arguments(self, ef_search): + self.ef_search = ef_search + self.connect.execute(f"SET vectors.hnsw_ef_search = {ef_search}") + + def query(self, vec, num): + cur = self.connect.execute(self.query_sql, (vec, num), binary=True, prepare=True) + return [id for (id,) in cur.fetchall()] + + def __str__(self): + return ( + f"PGVectoRS(metric={self.metric}, m={self.m}, " + f"ef_construction={self.ef_construction}, ef_search={self.ef_search})" + )