Add pgvectorscale

This commit is inspired by the Timescale blog post [1] about performance improvements over the current pgvector [2] implementation. It adds a benchmark to evaluate the implementation suggested by the PgVectorScale project. [1]: https://www.timescale.com/blog/pgvector-is-now-as-fast-as-pinecone-at-75-less-cost/ [2]: https://github.com/timescale/pgvectorscale/tree/main Signed-off-by: Artem Barger <[email protected]>
erikbern · Aug 6, 2024 · c4464b5 · c4464b5
1 parent ebfea23
commit c4464b5
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 0 deletions.
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -57,6 +57,7 @@ jobs:
           - parlayann
           - pg_embedding
           - pgvector
+          - pgvectorscale
           - pgvecto_rs
           - pynndescent
           - redisearch

diff --git a/README.md b/README.md
@@ -50,6 +50,7 @@ Evaluated
   * [pg_embedding](https://github.com/neondatabase/pg_embedding) ![https://img.shields.io/github/stars/pg_embedding/pg_embedding?style=social](https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social)
 * [Descartes(01AI)](https://github.com/xiaoming-01ai/descartes)
 * [kgn](https://github.com/Henry-yan/kgn)
+* [PGVectorScale](https://github.com/timescale/pgvectorscale/tree/main)
 
 Data sets
 =========

diff --git a/ann_benchmarks/algorithms/pgvectorscale/Dockerfile b/ann_benchmarks/algorithms/pgvectorscale/Dockerfile
@@ -0,0 +1,59 @@
+# Image for the pgvectorscale benchmark: PostgreSQL 16 with pgvector and
+# pgvectorscale built from source, plus the Python client libraries.
+FROM ann-benchmarks
+
+# Fetch the pgvector sources; they are compiled further below.
+RUN git clone https://github.com/pgvector/pgvector /tmp/pgvector
+
+# tzdata is installed with a non-interactive frontend so the build never
+# stops at a configuration prompt.
+RUN DEBIAN_FRONTEND=noninteractive apt-get -y install tzdata
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential postgresql-common
+# Run the repository setup script shipped with postgresql-common, then
+# install PostgreSQL 16 and its server development package.
+RUN /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y
+RUN apt-get install -y --no-install-recommends postgresql-16 postgresql-server-dev-16
+# Overwrite pg_hba.conf with a single rule that trusts all local connections.
+RUN sh -c 'echo "local all all trust" > /etc/postgresql/16/main/pg_hba.conf'
+
+# Dynamically set OPTFLAGS based on the architecture
+RUN ARCH=$(uname -m) && \
+    if [ "$ARCH" = "aarch64" ]; then \
+        OPTFLAGS="-march=native -msve-vector-bits=512"; \
+    elif [ "$ARCH" = "x86_64" ]; then \
+        OPTFLAGS="-march=native -mprefer-vector-width=512"; \
+    else \
+        OPTFLAGS="-march=native"; \
+    fi && \
+    cd /tmp/pgvector && \
+    make clean && \
+    make OPTFLAGS="$OPTFLAGS" && \
+    make install
+
+# Install necessary dependencies
+RUN apt-get update && apt-get install -y \
+    curl \
+    build-essential \
+    libssl-dev \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust and Cargo
+RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+
+# Ensure the cargo bin directory is in the PATH
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install pgrx using cargo
+RUN cargo install --locked cargo-pgrx
+# Initialize pgrx with the PostgreSQL version
+RUN cargo pgrx init --pg16 pg_config
+# Fetch pgvectorscale and build/install the extension in release mode.
+RUN git clone https://github.com/timescale/pgvectorscale /tmp/pgvectorscale
+RUN cd /tmp/pgvectorscale/pgvectorscale && \
+    cargo pgrx install --release
+
+# As the postgres user: create the "ann" role and database, grant it access,
+# enable the vectorscale extension (CASCADE also installs the extensions it
+# depends on), and set memory / parallelism settings used by the benchmark.
+USER postgres
+RUN service postgresql start && \
+    psql -c "CREATE USER ann WITH ENCRYPTED PASSWORD 'ann'" && \
+    psql -c "CREATE DATABASE ann" && \
+    psql -c "GRANT ALL PRIVILEGES ON DATABASE ann TO ann" && \
+    psql -d ann -c "GRANT ALL ON SCHEMA public TO ann" && \
+    psql -d ann -c "CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE" && \
+    psql -c "ALTER USER ann SET maintenance_work_mem = '4GB'" && \
+    psql -c "ALTER USER ann SET max_parallel_maintenance_workers = 0" && \
+    psql -c "ALTER SYSTEM SET shared_buffers = '4GB'"
+USER root
+
+# Python client libraries imported by the benchmark module (psycopg, pgvector).
+RUN pip install psycopg[binary] pgvector
diff --git a/ann_benchmarks/algorithms/pgvectorscale/config.yml b/ann_benchmarks/algorithms/pgvectorscale/config.yml
@@ -0,0 +1,33 @@
+# Benchmark definition for the pgvectorscale diskann index.
+# Each arg_groups entry carries the num_neighbors, search_list_size and
+# max_alpha keys that the PGDiskANN constructor reads from its parameters.
+# query_args are presumably handed one value at a time to
+# set_query_arguments -- confirm against the benchmark runner.
+float:
+  any:
+  - base_args: ['@metric']
+    constructor: PGDiskANN
+    disabled: false
+    docker_tag: ann-benchmarks-pgvectorscale
+    module: ann_benchmarks.algorithms.pgvectorscale
+    name: pgvectorscale
+    run_groups:
+      N-50:
+        arg_groups: [{num_neighbors: 50, search_list_size: 100, max_alpha: 1.2}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
+      N-100:
+        arg_groups: [{num_neighbors: 100, search_list_size: 100, max_alpha: 1.2}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
+      # The ALPHA-* groups vary only max_alpha.
+      ALPHA-1.00:
+        arg_groups: [{num_neighbors: 100, search_list_size: 100, max_alpha: 1}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
+      # NOTE(review): ALPHA-1.20 has the same arg_groups and query_args as N-100.
+      ALPHA-1.20:
+        arg_groups: [{num_neighbors: 100, search_list_size: 100, max_alpha: 1.2}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
+      ALPHA-1.50:
+        arg_groups: [{num_neighbors: 100, search_list_size: 100, max_alpha: 1.5}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
+      ALPHA-2.00:
+        arg_groups: [{num_neighbors: 100, search_list_size: 100, max_alpha: 2}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
diff --git a/ann_benchmarks/algorithms/pgvectorscale/module.py b/ann_benchmarks/algorithms/pgvectorscale/module.py
@@ -0,0 +1,54 @@
+import subprocess
+import sys
+
+import pgvector.psycopg
+import psycopg
+
+from ..base.module import BaseANN
+
+class PGDiskANN(BaseANN):
+    def __init__(self, metric, method_param):
+        print(f"running constructor")
+        self._metric = metric
+        self._cur = None
+        self._query = "SELECT id FROM items ORDER BY embedding <=> %s LIMIT %s"
+        self._num_neighbors = method_param['num_neighbors']
+        self._search_list_size = method_param['search_list_size']
+        self._max_alpha = method_param['max_alpha']
+        print(f"running only {self._metric} and {self._query}")
+
+    def fit(self, X):
+        print("running fit")
+        subprocess.run("service postgresql start", shell=True, check=True, stdout=sys.stdout, stderr=sys.stderr)
+        conn = psycopg.connect(user="ann", password="ann", dbname="ann", autocommit=True)
+        pgvector.psycopg.register_vector(conn)
+        cur = conn.cursor()
+        cur.execute("DROP TABLE IF EXISTS items")
+        cur.execute("CREATE TABLE items (id int, embedding vector(%d))" % X.shape[1])
+        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")
+        print("copying data...")
+        with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
+            copy.set_types(["int4", "vector"])
+            for i, embedding in enumerate(X):
+                copy.write_row((i, embedding))
+        print("creating index...")
+        cur.execute("CREATE INDEX ON items USING diskann(embedding) WITH (num_neighbors = %d, search_list_size = %d, max_alpha = %d)" % (self._num_neighbors, self._search_list_size, self._max_alpha))
+        print("done!")
+        self._cur = cur
+
+    def query(self, v, n):
+        self._cur.execute(self._query, (v, n), binary=True, prepare=True)
+        return [id for id, in self._cur.fetchall()]
+
+    def set_query_arguments(self, list_size):
+        self._list_size = list_size
+        self._cur.execute("SET diskann.query_search_list_size = %d" % list_size)
+
+    def get_memory_usage(self):
+        if self._cur is None:
+            return 0
+        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
+        return self._cur.fetchone()[0] / 1024
+
+    def __str__(self):
+        return f"PGDiskANN()"