From edd8a0c097d30302119540f9aeef24faf5231312 Mon Sep 17 00:00:00 2001
From: Connor <36115510+ScarFX@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:04:18 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=80=20refactor:=20Langchain=20v0.3=20(?=
 =?UTF-8?q?#74)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ↪️refactor: moved packages out of langchain community and fixed embeddings abstraction

* ↪️refactor: update requirements.txt

* ✓chore: removed long list of requirements.txt

* fix: mongo db query and embed/upload

* refactor: change env var name

* docs: Atlas MongoDB update env var setting

* docs: MongoDB Atlas direction

* docs: update atlas mongoDB directions

* docs: add deprecated env variable

* refactor: add mongo db env variable MONGO_VECTOR_COLLECTION backwards compatibility

---
 README.md        |  7 +++++--
 config.py        | 24 +++++++++++++-----------
 requirements.txt | 19 +++++++++++--------
 store.py         |  8 ++++++++
 store_factory.py | 27 ++++++++++++++++++---------
 5 files changed, 55 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index e581c6d..3146fe3 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,8 @@ The following environment variables are required to run the application:
   - Note: `AZURE_OPENAI_ENDPOINT` will work but `RAG_AZURE_OPENAI_ENDPOINT` will override it in order to not conflict with LibreChat setting.
 - `HF_TOKEN`: (Optional) if needed for `huggingface` option.
 - `OLLAMA_BASE_URL`: (Optional) defaults to `http://ollama:11434`.
+- `ATLAS_SEARCH_INDEX`: (Optional) the name of the vector search index when using Atlas MongoDB; defaults to `vector_index`.
+- `MONGO_VECTOR_COLLECTION`: Deprecated for MongoDB; use `ATLAS_SEARCH_INDEX` and `COLLECTION_NAME` instead.
 
 Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.
 
@@ -87,10 +89,11 @@ Instead of using the default pgvector, we could use [Atlas MongoDB](https://www.
 ```env
 VECTOR_DB_TYPE=atlas-mongo
 ATLAS_MONGO_DB_URI=
-MONGO_VECTOR_COLLECTION=
+COLLECTION_NAME=
+ATLAS_SEARCH_INDEX=
 ```
 
-The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$MONGO_VECTOR_COLLECTION` collection needs to be a completely new one, separate from all collections used by LibreChat. In additional, create a vector search index for `$MONGO_VECTOR_COLLECTION` with the following json:
+The `ATLAS_MONGO_DB_URI` can be the same as or different from the one used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for the collection above (its name must match `$ATLAS_SEARCH_INDEX`) with the following JSON:
 
 ```json
 {
diff --git a/config.py b/config.py
index e426361..36f9202 100644
--- a/config.py
+++ b/config.py
@@ -5,11 +5,8 @@ from enum import Enum
 from datetime import datetime
 
 from dotenv import find_dotenv, load_dotenv
-from langchain_community.embeddings import (
-    HuggingFaceEmbeddings,
-    HuggingFaceHubEmbeddings,
-    OllamaEmbeddings,
-)
+from langchain_ollama import OllamaEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpointEmbeddings
 from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
 from starlette.middleware.base import BaseHTTPMiddleware
 from store_factory import get_vector_store
@@ -60,10 +57,10 @@ def get_env_variable(
 ATLAS_MONGO_DB_URI = get_env_variable(
     "ATLAS_MONGO_DB_URI", "mongodb://127.0.0.1:27018/LibreChat"
 )
+ATLAS_SEARCH_INDEX = get_env_variable("ATLAS_SEARCH_INDEX", "vector_index")
 MONGO_VECTOR_COLLECTION = get_env_variable(
-    "MONGO_VECTOR_COLLECTION", "vector_collection"
-)
-
+    "MONGO_VECTOR_COLLECTION", None
+)  # Deprecated; kept for backwards compatibility
 CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
 CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))
 
@@ -195,7 +192,7 @@ def init_embeddings(provider, model):
             model_name=model, encode_kwargs={"normalize_embeddings": True}
         )
     elif provider == EmbeddingsProvider.HUGGINGFACETEI:
-        return HuggingFaceHubEmbeddings(model=model)
+        return HuggingFaceEndpointEmbeddings(model=model)
     elif provider == EmbeddingsProvider.OLLAMA:
         return OllamaEmbeddings(model=model, base_url=OLLAMA_BASE_URL)
     else:
@@ -236,12 +233,17 @@ def init_embeddings(provider, model):
         mode="async",
     )
 elif VECTOR_DB_TYPE == VectorDBType.ATLAS_MONGO:
-    logger.warning("Using Atlas MongoDB as vector store is not fully supported yet.")
+    # Backwards compatibility check
+    if MONGO_VECTOR_COLLECTION:
+        logger.info("DEPRECATED: Please remove the env var MONGO_VECTOR_COLLECTION and use COLLECTION_NAME and ATLAS_SEARCH_INDEX instead. They may be set to the same value, but this is not necessary. See the README for more information.")
+        ATLAS_SEARCH_INDEX = MONGO_VECTOR_COLLECTION
+        COLLECTION_NAME = MONGO_VECTOR_COLLECTION
     vector_store = get_vector_store(
         connection_string=ATLAS_MONGO_DB_URI,
         embeddings=embeddings,
-        collection_name=MONGO_VECTOR_COLLECTION,
+        collection_name=COLLECTION_NAME,
         mode="atlas-mongo",
+        search_index=ATLAS_SEARCH_INDEX,
     )
 else:
     raise ValueError(f"Unsupported vector store type: {VECTOR_DB_TYPE}")
diff --git a/requirements.txt b/requirements.txt
index f6b788f..f015b94 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
-langchain==0.1.12
-langchain_community==0.0.34
-langchain_openai==0.0.8
-langchain_core==0.1.45
+langchain==0.3
+langchain_community==0.3
+langchain_openai==0.2.0
+langchain_core==0.3.5
 sqlalchemy==2.0.28
 python-dotenv==1.0.1
 fastapi==0.110.0
@@ -9,7 +9,7 @@ psycopg2-binary==2.9.9
 pgvector==0.2.5
 uvicorn==0.28.0
 pypdf==4.1.0
-unstructured==0.12.6
+unstructured==0.15.13
 markdown==3.6
 networkx==3.2.1
 pandas==2.2.1
@@ -19,12 +19,15 @@ pypandoc==1.13
 PyJWT==2.8.0
 asyncpg==0.29.0
 python-multipart==0.0.9
-sentence_transformers==2.5.1
+sentence_transformers==3.1.1
 aiofiles==23.2.1
-rapidocr-onnxruntime==1.3.17
+rapidocr-onnxruntime==1.3.24
 opencv-python-headless==4.9.0.80
 pymongo==4.6.3
-langchain-mongodb==0.1.3
+langchain-mongodb==0.2.0
+langchain-ollama==0.2.0
+langchain-openai==0.2.0
+langchain-huggingface==0.1.0
 cryptography==42.0.7
 python-magic==0.4.27
 python-pptx==0.6.23
diff --git a/store.py b/store.py
index f6fd1e0..92694ef 100644
--- a/store.py
+++ b/store.py
@@ -80,6 +80,14 @@ class AtlasMongoVector(MongoDBAtlasVectorSearch):
 
     @property
     def embedding_function(self) -> Embeddings:
         return self.embeddings
+
+    def add_documents(self, docs: list[Document], ids: list[str]):
+        # Re-key the chunks with deterministic ids of the form {file_id}_{idx}
+        new_ids = [id for id in range(len(ids))]
+        file_id = docs[0].metadata['file_id']
+        f_ids = [f'{file_id}_{id}' for id in new_ids]
+        return super().add_documents(docs, f_ids)
+
     def similarity_search_with_score_by_vector(
         self,
diff --git a/store_factory.py b/store_factory.py
index 16b81ef..6549e29 100644
--- a/store_factory.py
+++ b/store_factory.py
@@ -1,14 +1,16 @@
-from langchain_community.embeddings import OpenAIEmbeddings
-
+from typing import Optional
+from langchain_core.embeddings import Embeddings
 from store import AsyncPgVector, ExtendedPgVector
 from store import AtlasMongoVector
 from pymongo import MongoClient
 
+
 def get_vector_store(
     connection_string: str,
-    embeddings: OpenAIEmbeddings,
+    embeddings: Embeddings,
     collection_name: str,
     mode: str = "sync",
+    search_index: Optional[str] = None
 ):
     if mode == "sync":
         return ExtendedPgVector(
@@ -25,7 +27,9 @@
     elif mode == "atlas-mongo":
         mongo_db = MongoClient(connection_string).get_database()
         mong_collection = mongo_db[collection_name]
-        return AtlasMongoVector(collection=mong_collection, embedding=embeddings, index_name=collection_name)
+        return AtlasMongoVector(
+            collection=mong_collection, embedding=embeddings, index_name=search_index
+        )
     else:
         raise ValueError("Invalid mode specified. Choose 'sync' or 'async'.")
 
@@ -35,20 +39,25 @@ async def create_index_if_not_exists(conn, table_name: str, column_name: str):
 
     # Construct index name conventionally
     index_name = f"idx_{table_name}_{column_name}"
     # Check if index exists
-    exists = await conn.fetchval(f"""
+    exists = await conn.fetchval(
+        f"""
         SELECT EXISTS (
             SELECT FROM pg_class c
            JOIN pg_namespace n ON n.oid = c.relnamespace
            WHERE c.relname = $1 AND n.nspname = 'public' -- Or specify your schema if different
         );
-    """, index_name)
+    """,
+        index_name,
+    )
     # Create the index if it does not exist
     if not exists:
-        await conn.execute(f"""
+        await conn.execute(
+            f"""
             CREATE INDEX CONCURRENTLY IF NOT EXISTS {index_name}
             ON public.{table_name} ({column_name});
-        """)
+        """
+        )
         print(f"Index {index_name} created on {table_name}.{column_name}")
     else:
-        print(f"Index {index_name} already exists on {table_name}.{column_name}")
\ No newline at end of file
+        print(f"Index {index_name} already exists on {table_name}.{column_name}")
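
Not part of the patch: a minimal sketch of how the refactored factory is expected to be wired for the Atlas MongoDB path, based on the `get_vector_store` signature above. The URI, collection name, and index name below are placeholders standing in for `ATLAS_MONGO_DB_URI`, `COLLECTION_NAME`, and `ATLAS_SEARCH_INDEX`, and `OpenAIEmbeddings` stands in for whichever provider `config.py` actually selects:

```python
# Sketch only: wiring the refactored factory for the Atlas MongoDB mode.
# All literal values are placeholders for the env vars documented in the README.
from langchain_openai import OpenAIEmbeddings

from store_factory import get_vector_store

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # any langchain_core Embeddings works

vector_store = get_vector_store(
    connection_string="mongodb://127.0.0.1:27018/LibreChat",  # $ATLAS_MONGO_DB_URI
    embeddings=embeddings,
    collection_name="vector_collection",                      # $COLLECTION_NAME
    mode="atlas-mongo",
    search_index="vector_index",                              # $ATLAS_SEARCH_INDEX
)
```

Passing the index name separately from the collection name is what allows `COLLECTION_NAME` and `ATLAS_SEARCH_INDEX` to differ; the previous `MONGO_VECTOR_COLLECTION`-only setup used one value for both.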
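
The overridden `AtlasMongoVector.add_documents` in store.py discards the caller-supplied ids and re-keys each chunk as `{file_id}_{idx}`. A small illustration of the resulting ids (not part of the patch), using a made-up `file_id`:

```python
# Illustration only: the ids AtlasMongoVector.add_documents generates for a
# file split into three chunks. "abc123" is a made-up file_id value.
incoming_ids = ["x", "y", "z"]   # whatever the caller passed in; only the count matters
file_id = "abc123"               # in the patch this comes from docs[0].metadata["file_id"]
f_ids = [f"{file_id}_{idx}" for idx in range(len(incoming_ids))]
print(f_ids)                     # ['abc123_0', 'abc123_1', 'abc123_2']
```

This presumably keeps ids deterministic per uploaded file, though it assumes every chunk passed in a single call shares the same `file_id`.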