
fix: config issues in ingestion
parambharat committed Jul 31, 2024
1 parent 6d72f54 commit d2dfe55
Showing 2 changed files with 8 additions and 7 deletions.
6 changes: 4 additions & 2 deletions src/wandbot/ingestion/config.py
@@ -17,8 +17,7 @@
 from urllib.parse import urlparse
 
 from pydantic import BaseModel, Field, model_validator
-from pydantic_settings import BaseSettings
-
+from pydantic_settings import BaseSettings, SettingsConfigDict
 from wandbot.utils import get_logger
 
 logger = get_logger(__name__)
@@ -273,6 +272,9 @@ def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
 
 
 class VectorStoreConfig(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_file=".env", env_file_encoding="utf-8", extra="allow"
+    )
     collection_name: str = "vectorstore"
     persist_dir: pathlib.Path = pathlib.Path("data/cache/vectorstore")
     embedding_model_name: str = "text-embedding-3-small"
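
The new model_config is what makes VectorStoreConfig behave as a settings class: pydantic-settings now overrides the defaults above with matching variables from the environment or a local .env file, and extra="allow" keeps unrecognized keys in that file from failing validation. A minimal sketch of the resulting behavior, with hypothetical .env contents:

# .env (hypothetical contents)
#   COLLECTION_NAME=wandbot-docs
#   EMBEDDING_MODEL_NAME=text-embedding-3-large

from wandbot.ingestion.config import VectorStoreConfig

config = VectorStoreConfig()  # pydantic-settings reads .env via model_config
print(config.collection_name)       # "wandbot-docs" from .env; default is "vectorstore"
print(config.embedding_model_name)  # "text-embedding-3-large" from .env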
9 changes: 4 additions & 5 deletions src/wandbot/ingestion/vectorstores.py
@@ -16,12 +16,11 @@
 import pathlib
 from typing import List
 
-import wandb
 from langchain_chroma import Chroma
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
 from tqdm import trange
-
+import wandb
 from wandbot.ingestion.config import VectorStoreConfig
 from wandbot.utils import get_logger
 
@@ -61,7 +60,8 @@ def load(
 
     # Todo: Change to LiteLLM Embeddings
     embedding_fn = OpenAIEmbeddings(
-        model=config.embeddings_model, dimensions=config.embedding_dim
+        model=config.embedding_model_name,
+        dimensions=config.embedding_dimensions,
    )
     vectorstore_dir = config.persist_dir
     vectorstore_dir.mkdir(parents=True, exist_ok=True)
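
This hunk is the core of the fix: OpenAIEmbeddings was being built from attributes (embeddings_model, embedding_dim) that VectorStoreConfig never defined, so ingestion would fail with an AttributeError before embedding anything. A minimal sketch of the corrected call, assuming OPENAI_API_KEY is set; embedding_dimensions is presumably defined further down in config.py, outside the visible hunk:

from langchain_openai import OpenAIEmbeddings
from wandbot.ingestion.config import VectorStoreConfig

config = VectorStoreConfig()
embedding_fn = OpenAIEmbeddings(
    model=config.embedding_model_name,       # "text-embedding-3-small" by default
    dimensions=config.embedding_dimensions,  # field defined below the shown hunk
)
vector = embedding_fn.embed_query("How do I log a wandb artifact?")  # hypothetical query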
@@ -77,14 +77,13 @@ def load(
             transformed_documents.append(Document(**json.loads(line)))
 
     chroma = Chroma(
-        collection_name=config.name,
+        collection_name=config.collection_name,
         embedding_function=embedding_fn,
         persist_directory=str(config.persist_dir),
     )
     for batch_idx in trange(0, len(transformed_documents), config.batch_size):
         batch = transformed_documents[batch_idx : batch_idx + config.batch_size]
         chroma.add_documents(batch)
-    chroma.persist()
 
     result_artifact = wandb.Artifact(
         name=result_artifact_name,
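
Two fixes here: collection_name replaces a nonexistent config.name field, and dropping chroma.persist() is consistent with the newer langchain_chroma package, where a Chroma built with persist_directory writes through to disk on every add and no longer exposes a persist() method, so the old call would raise at runtime. A short sketch of the batched-write pattern, with a hypothetical corpus and embedding_fn from the sketch above:

from langchain_chroma import Chroma
from langchain_core.documents import Document
from tqdm import trange

docs = [Document(page_content=f"doc {i}") for i in range(100)]  # hypothetical corpus
chroma = Chroma(
    collection_name="vectorstore",
    embedding_function=embedding_fn,
    persist_directory="data/cache/vectorstore",
)
batch_size = 32  # config.batch_size in the real code
for batch_idx in trange(0, len(docs), batch_size):
    chroma.add_documents(docs[batch_idx : batch_idx + batch_size])
# no explicit persist(): the on-disk store is updated as documents are added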
