diff --git a/ann_benchmarks/algorithms/luceneknn/Dockerfile b/ann_benchmarks/algorithms/luceneknn/Dockerfile index b4688d332..8a089813e 100644 --- a/ann_benchmarks/algorithms/luceneknn/Dockerfile +++ b/ann_benchmarks/algorithms/luceneknn/Dockerfile @@ -8,25 +8,29 @@ RUN apt-get install -y wget apt-transport-https gnupg RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - RUN echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list RUN apt-get update -RUN apt-get install -y temurin-17-jdk +RUN apt-get install -y temurin-20-jdk temurin-17-jdk # Install PyLucene & JCC -RUN wget https://dlcdn.apache.org/lucene/pylucene/pylucene-9.4.1-src.tar.gz -RUN tar -xzf pylucene-9.4.1-src.tar.gz -ENV JCC_JDK=/usr/lib/jvm/temurin-17-jdk-amd64 -WORKDIR /home/app/pylucene-9.4.1/jcc +RUN wget https://dlcdn.apache.org/lucene/pylucene/pylucene-9.7.0-src.tar.gz +RUN tar -xzf pylucene-9.7.0-src.tar.gz +ENV JCC_JDK=/usr/lib/jvm/temurin-20-jdk-amd64 +WORKDIR /home/app/pylucene-9.7.0/jcc RUN python3 ./setup.py build RUN python3 ./setup.py install -WORKDIR /home/app/pylucene-9.4.1 +WORKDIR /home/app/pylucene-9.7.0 ENV PYTHON=python3 ENV JCC="$(PYTHON) -m jcc --shared" ENV NUM_FILES=16 -RUN make lucene-java-9.4.1 -RUN mkdir lucene-java-9.4.1/lucene/extensions/src/java/org/apache/pylucene/codecs -RUN wget -O ./lucene-java-9.4.1/lucene/extensions/src/java/org/apache/pylucene/codecs/PyLucene94Codec.java https://gist.githubusercontent.com/benwtrent/f3a6c4a9ce9749e702285dc82f39a129/raw/4742cf91401103f86809655d5c708b833beae43f/PyLucene94Codec.java +# Needed as current Lucene gradle version doesn't support java 20 on build +ENV JAVA_HOME=/usr/lib/jvm/temurin-17-jdk-amd64 +RUN make lucene-java-9.7.0 +RUN mkdir lucene-java-9.7.0/lucene/extensions/src/java/org/apache/pylucene/codecs +RUN wget -O ./lucene-java-9.7.0/lucene/extensions/src/java/org/apache/pylucene/codecs/PyLucene95Codec.java https://gist.githubusercontent.com/benwtrent/79d70d59716f0e25833c5ea84d956c12/raw/8f529f3437c2fb8318f0127ecd71c960e43e0a7f/PyLucene95Codec.java RUN make RUN make install +# Switch back to jdk20 for panama vectorization support +ENV JAVA_HOME=/usr/lib/jvm/temurin-20-jdk-amd64 # Reset the work dir so scripts can be ran WORKDIR /home/app \ No newline at end of file diff --git a/ann_benchmarks/algorithms/luceneknn/module.py b/ann_benchmarks/algorithms/luceneknn/module.py index a4bc0b5a8..0b3d7c846 100644 --- a/ann_benchmarks/algorithms/luceneknn/module.py +++ b/ann_benchmarks/algorithms/luceneknn/module.py @@ -7,21 +7,21 @@ import sklearn.preprocessing from java.nio.file import Paths from lucene import JArray -from org.apache.lucene.codecs.lucene94 import Lucene94HnswVectorsFormat +from org.apache.lucene.codecs.lucene95 import Lucene95HnswVectorsFormat from org.apache.lucene.document import Document, KnnVectorField, StoredField from org.apache.lucene.index import (DirectoryReader, IndexWriter, IndexWriterConfig, VectorSimilarityFunction) from org.apache.lucene.search import IndexSearcher, KnnVectorQuery from org.apache.lucene.store import FSDirectory -from org.apache.pylucene.codecs import PyLucene94Codec +from org.apache.pylucene.codecs import PyLucene95Codec from ..base.module import BaseANN -class Codec(PyLucene94Codec): +class Codec(PyLucene95Codec): """ - Custom codec so that the appropriate Lucene94 codec can be returned with the configured M and efConstruction + Custom codec so that the appropriate Lucene95 codec can be returned with the configured M and efConstruction """ def __init__(self, M, efConstruction): @@ -30,7 +30,7 @@ def __init__(self, M, efConstruction): self.efConstruction = efConstruction def getKnnVectorsFormatForField(self, field): - return Lucene94HnswVectorsFormat(self.M, self.efConstruction) + return Lucene95HnswVectorsFormat(self.M, self.efConstruction) class PyLuceneKNN(BaseANN): @@ -40,9 +40,13 @@ class PyLuceneKNN(BaseANN): def __init__(self, metric: str, dimension: int, param): try: - lucene.initVM(vmargs=["-Djava.awt.headless=true -Xmx6g -Xms6g"]) - except ValueError: - print("VM already initialized") + lucene.initVM( + initialheap="6g", + maxheap="6g", + vmargs=["--add-modules=jdk.incubator.vector"] + ) + except ValueError as e: + print(f"VM already initialized: {e}") self.metric = metric self.dimension = dimension self.param = param @@ -78,7 +82,7 @@ def fit(self, X): doc.add(StoredField("id", id)) iw.addDocument(doc) id += 1 - if id + 1 % 1000 == 0: + if (id + 1) % 1000 == 0: print(f"LuceneKNN: written {id} docs") # Force merge so only one HNSW graph is searched. iw.forceMerge(1)