Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/update luceneknn #443

Merged
merged 2 commits into from
Oct 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions ann_benchmarks/algorithms/luceneknn/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,29 @@ RUN apt-get install -y wget apt-transport-https gnupg
RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add -
RUN echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list
RUN apt-get update
RUN apt-get install -y temurin-17-jdk
RUN apt-get install -y temurin-20-jdk temurin-17-jdk

# Install PyLucene & JCC
RUN wget https://dlcdn.apache.org/lucene/pylucene/pylucene-9.4.1-src.tar.gz
RUN tar -xzf pylucene-9.4.1-src.tar.gz
ENV JCC_JDK=/usr/lib/jvm/temurin-17-jdk-amd64
WORKDIR /home/app/pylucene-9.4.1/jcc
RUN wget https://dlcdn.apache.org/lucene/pylucene/pylucene-9.7.0-src.tar.gz
RUN tar -xzf pylucene-9.7.0-src.tar.gz
ENV JCC_JDK=/usr/lib/jvm/temurin-20-jdk-amd64
WORKDIR /home/app/pylucene-9.7.0/jcc
RUN python3 ./setup.py build
RUN python3 ./setup.py install

WORKDIR /home/app/pylucene-9.4.1
WORKDIR /home/app/pylucene-9.7.0
ENV PYTHON=python3
ENV JCC="$(PYTHON) -m jcc --shared"
ENV NUM_FILES=16
RUN make lucene-java-9.4.1
RUN mkdir lucene-java-9.4.1/lucene/extensions/src/java/org/apache/pylucene/codecs
RUN wget -O ./lucene-java-9.4.1/lucene/extensions/src/java/org/apache/pylucene/codecs/PyLucene94Codec.java https://gist.githubusercontent.com/benwtrent/f3a6c4a9ce9749e702285dc82f39a129/raw/4742cf91401103f86809655d5c708b833beae43f/PyLucene94Codec.java
# Build Lucene with JDK 17: the Gradle version Lucene currently uses doesn't support Java 20 for the build
ENV JAVA_HOME=/usr/lib/jvm/temurin-17-jdk-amd64
RUN make lucene-java-9.7.0
RUN mkdir lucene-java-9.7.0/lucene/extensions/src/java/org/apache/pylucene/codecs
RUN wget -O ./lucene-java-9.7.0/lucene/extensions/src/java/org/apache/pylucene/codecs/PyLucene95Codec.java https://gist.githubusercontent.com/benwtrent/79d70d59716f0e25833c5ea84d956c12/raw/8f529f3437c2fb8318f0127ecd71c960e43e0a7f/PyLucene95Codec.java
RUN make
RUN make install
# Switch back to JDK 20 for Panama vectorization support
ENV JAVA_HOME=/usr/lib/jvm/temurin-20-jdk-amd64

# Reset the work dir so scripts can be run
WORKDIR /home/app
22 changes: 13 additions & 9 deletions ann_benchmarks/algorithms/luceneknn/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,21 @@
import sklearn.preprocessing
from java.nio.file import Paths
from lucene import JArray
from org.apache.lucene.codecs.lucene94 import Lucene94HnswVectorsFormat
from org.apache.lucene.codecs.lucene95 import Lucene95HnswVectorsFormat
from org.apache.lucene.document import Document, KnnVectorField, StoredField
from org.apache.lucene.index import (DirectoryReader, IndexWriter,
IndexWriterConfig,
VectorSimilarityFunction)
from org.apache.lucene.search import IndexSearcher, KnnVectorQuery
from org.apache.lucene.store import FSDirectory
from org.apache.pylucene.codecs import PyLucene94Codec
from org.apache.pylucene.codecs import PyLucene95Codec

from ..base.module import BaseANN


class Codec(PyLucene94Codec):
class Codec(PyLucene95Codec):
"""
Custom codec so that the appropriate Lucene94 codec can be returned with the configured M and efConstruction
Custom codec so that the appropriate Lucene95 codec can be returned with the configured M and efConstruction
"""

def __init__(self, M, efConstruction):
Expand All @@ -30,7 +30,7 @@ def __init__(self, M, efConstruction):
self.efConstruction = efConstruction

def getKnnVectorsFormatForField(self, field):
return Lucene94HnswVectorsFormat(self.M, self.efConstruction)
return Lucene95HnswVectorsFormat(self.M, self.efConstruction)


class PyLuceneKNN(BaseANN):
Expand All @@ -40,9 +40,13 @@ class PyLuceneKNN(BaseANN):

def __init__(self, metric: str, dimension: int, param):
try:
lucene.initVM(vmargs=["-Djava.awt.headless=true -Xmx6g -Xms6g"])
except ValueError:
print("VM already initialized")
lucene.initVM(
initialheap="6g",
maxheap="6g",
vmargs=["--add-modules=jdk.incubator.vector"]
)
except ValueError as e:
print(f"VM already initialized: {e}")
self.metric = metric
self.dimension = dimension
self.param = param
Expand Down Expand Up @@ -78,7 +82,7 @@ def fit(self, X):
doc.add(StoredField("id", id))
iw.addDocument(doc)
id += 1
if id + 1 % 1000 == 0:
if (id + 1) % 1000 == 0:
print(f"LuceneKNN: written {id} docs")
# Force merge so only one HNSW graph is searched.
iw.forceMerge(1)
Expand Down
Loading