From d2dfe55610d3dd0507bc0b4b4361bbde7db421c1 Mon Sep 17 00:00:00 2001
From: Bharat Ramanathan
Date: Wed, 31 Jul 2024 12:21:23 +0530
Subject: [PATCH] fix: config issues in ingestion

---
 src/wandbot/ingestion/config.py       | 6 ++++--
 src/wandbot/ingestion/vectorstores.py | 9 ++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/wandbot/ingestion/config.py b/src/wandbot/ingestion/config.py
index 670a81b..20aed51 100644
--- a/src/wandbot/ingestion/config.py
+++ b/src/wandbot/ingestion/config.py
@@ -17,8 +17,7 @@
 from urllib.parse import urlparse
 
 from pydantic import BaseModel, Field, model_validator
-from pydantic_settings import BaseSettings
-
+from pydantic_settings import BaseSettings, SettingsConfigDict
 from wandbot.utils import get_logger
 
 logger = get_logger(__name__)
@@ -273,6 +272,9 @@ def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
 
 
 class VectorStoreConfig(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_file=".env", env_file_encoding="utf-8", extra="allow"
+    )
     collection_name: str = "vectorstore"
     persist_dir: pathlib.Path = pathlib.Path("data/cache/vectorstore")
     embedding_model_name: str = "text-embedding-3-small"
diff --git a/src/wandbot/ingestion/vectorstores.py b/src/wandbot/ingestion/vectorstores.py
index f35f4b7..f364767 100644
--- a/src/wandbot/ingestion/vectorstores.py
+++ b/src/wandbot/ingestion/vectorstores.py
@@ -16,12 +16,11 @@
 import pathlib
 from typing import List
 
+import wandb
 from langchain_chroma import Chroma
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
 from tqdm import trange
-
-import wandb
 from wandbot.ingestion.config import VectorStoreConfig
 from wandbot.utils import get_logger
 
@@ -61,7 +60,8 @@ def load(
 
     # Todo: Change to LiteLLM Embeddings
     embedding_fn = OpenAIEmbeddings(
-        model=config.embeddings_model, dimensions=config.embedding_dim
+        model=config.embedding_model_name,
+        dimensions=config.embedding_dimensions,
     )
     vectorstore_dir = config.persist_dir
     vectorstore_dir.mkdir(parents=True, exist_ok=True)
@@ -77,14 +77,13 @@ def load(
         transformed_documents.append(Document(**json.loads(line)))
 
     chroma = Chroma(
-        collection_name=config.name,
+        collection_name=config.collection_name,
         embedding_function=embedding_fn,
         persist_directory=str(config.persist_dir),
     )
     for batch_idx in trange(0, len(transformed_documents), config.batch_size):
         batch = transformed_documents[batch_idx : batch_idx + config.batch_size]
         chroma.add_documents(batch)
-        chroma.persist()
 
     result_artifact = wandb.Artifact(
         name=result_artifact_name,
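
A note on the config.py hunks: the new model_config block is what actually points VectorStoreConfig at the .env file; without it, BaseSettings only reads process environment variables. A minimal sketch of the resulting behaviour, assuming pydantic-settings v2 (the COLLECTION_NAME override below is hypothetical, not part of the patch):

    import pathlib

    from pydantic_settings import BaseSettings, SettingsConfigDict


    class VectorStoreConfig(BaseSettings):
        # As in the patch: read overrides from .env, and tolerate keys
        # in that file that do not map to a declared field.
        model_config = SettingsConfigDict(
            env_file=".env", env_file_encoding="utf-8", extra="allow"
        )
        collection_name: str = "vectorstore"
        persist_dir: pathlib.Path = pathlib.Path("data/cache/vectorstore")
        embedding_model_name: str = "text-embedding-3-small"


    # Matching is case-insensitive: COLLECTION_NAME=docs in .env (or in
    # the environment) wins over the declared default; with no override,
    # the default is used.
    print(VectorStoreConfig().collection_name)

Without extra="allow" (or "ignore"), pydantic-settings v2 raises a ValidationError for any .env key it cannot map to a declared field, which matters when several settings classes share one .env file.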
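
The vectorstores.py hunks track the renamed config fields and drop the manual persist call. A sketch of the corrected load path, assuming embedding_dimensions and batch_size are defined on VectorStoreConfig (they sit outside the hunks shown above):

    from langchain_chroma import Chroma
    from langchain_core.documents import Document
    from langchain_openai import OpenAIEmbeddings
    from tqdm import trange

    from wandbot.ingestion.config import VectorStoreConfig

    config = VectorStoreConfig()

    # Field renames picked up by this patch:
    #   embeddings_model -> embedding_model_name
    #   embedding_dim    -> embedding_dimensions
    embedding_fn = OpenAIEmbeddings(
        model=config.embedding_model_name,
        dimensions=config.embedding_dimensions,
    )

    chroma = Chroma(
        collection_name=config.collection_name,  # was config.name
        embedding_function=embedding_fn,
        persist_directory=str(config.persist_dir),
    )

    # In the real code these are read from the transformed_data artifact.
    docs: list[Document] = []

    for i in trange(0, len(docs), config.batch_size):
        chroma.add_documents(docs[i : i + config.batch_size])
    # No chroma.persist() afterwards: with persist_directory set, Chroma
    # 0.4+ writes through to disk on every add, and the langchain_chroma
    # package no longer exposes the old persist() method.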