diff --git a/src/wandbot/api/app.py b/src/wandbot/api/app.py index 3e6c0c2..7d5fb49 100644 --- a/src/wandbot/api/app.py +++ b/src/wandbot/api/app.py @@ -33,9 +33,9 @@ from datetime import datetime, timezone import pandas as pd +import wandb from fastapi import FastAPI -import wandb from wandbot.api.routers import chat as chat_router from wandbot.api.routers import database as database_router from wandbot.api.routers import retrieve as retrieve_router diff --git a/src/wandbot/api/routers/database.py b/src/wandbot/api/routers/database.py index b45b710..a0b5bb8 100644 --- a/src/wandbot/api/routers/database.py +++ b/src/wandbot/api/routers/database.py @@ -1,8 +1,8 @@ +import wandb from fastapi import APIRouter from starlette import status from starlette.responses import Response -import wandb from wandbot.database.client import DatabaseClient from wandbot.database.database import engine from wandbot.database.models import Base diff --git a/src/wandbot/chat/chat.py b/src/wandbot/chat/chat.py index 55ec833..974850d 100644 --- a/src/wandbot/chat/chat.py +++ b/src/wandbot/chat/chat.py @@ -26,9 +26,9 @@ """ from typing import List +import wandb from weave.monitoring import StreamTable -import wandb from wandbot.chat.config import ChatConfig from wandbot.chat.rag import RAGPipeline, RAGPipelineOutput from wandbot.chat.schemas import ChatRequest, ChatResponse diff --git a/src/wandbot/evaluation/eval/async_main.py b/src/wandbot/evaluation/eval/async_main.py index d219d5c..c9166b2 100644 --- a/src/wandbot/evaluation/eval/async_main.py +++ b/src/wandbot/evaluation/eval/async_main.py @@ -7,11 +7,11 @@ import aiofiles import httpx import pandas as pd +import wandb from llama_index.llms.openai import OpenAI from tenacity import retry, stop_after_attempt, wait_random_exponential from tqdm import tqdm -import wandb from wandbot.evaluation.config import EvalConfig from wandbot.evaluation.eval.correctness import ( CORRECTNESS_EVAL_TEMPLATE, diff --git a/src/wandbot/ingestion/config.py b/src/wandbot/ingestion/config.py index 91c9683..9cd4654 100644 --- a/src/wandbot/ingestion/config.py +++ b/src/wandbot/ingestion/config.py @@ -18,7 +18,6 @@ from pydantic import BaseModel, Field, model_validator from pydantic_settings import BaseSettings - from wandbot.utils import get_logger logger = get_logger(__name__) @@ -32,6 +31,7 @@ class DataSource(BaseSettings): remote_path: str = "" repo_path: str = "" local_path: Optional[pathlib.Path] = None + branch: Optional[str] = None base_path: Optional[str] = "" file_patterns: List[str] = ["*.*"] is_git_repo: bool = False @@ -98,14 +98,30 @@ class DocodileJapaneseStoreConfig(DataStoreConfig): data_source: DataSource = DataSource( remote_path="https://docs.wandb.ai/ja/", repo_path="https://github.com/wandb/docodile", - base_path="i18n/ja/docusaurus-plugin-content-docs/current", + base_path="docs", file_patterns=["*.md"], is_git_repo=True, + branch="japanese_docs", ) language: str = "ja" docstore_dir: pathlib.Path = pathlib.Path("wandb_documentation_ja") +class DocodileKoreanStoreConfig(DataStoreConfig): + name: str = "Korean Documentation" + source_type: str = "documentation" + data_source: DataSource = DataSource( + remote_path="https://docs.wandb.ai/ko/", + repo_path="https://github.com/wandb/docodile", + base_path="docs", + file_patterns=["*.md"], + is_git_repo=True, + branch="korean_docs", + ) + language: str = "ko" + docstore_dir: pathlib.Path = pathlib.Path("wandb_documentation_ko") + + class ExampleCodeStoreConfig(DataStoreConfig): name: str = "Examples code" source_type: str = "code" @@ -184,6 +200,22 @@ class WeaveExamplesStoreConfig(DataStoreConfig): docstore_dir: pathlib.Path = pathlib.Path("weave_examples") +class WeaveDocStoreConfig(DataStoreConfig): + name: str = "Weave Documentation" + source_type: str = "documentation" + data_source: DataSource = DataSource( + remote_path="https://wandb.github.io/weave/", + repo_path="https://github.com/wandb/weave", + base_path="docs/docs", + file_patterns=[ + "*.md", + ], + is_git_repo=True, + ) + language: str = "en" + docstore_dir: pathlib.Path = pathlib.Path("weave_documentation") + + class WandbEduCodeStoreConfig(DataStoreConfig): name: str = "Wandb Edu code" source_type: str = "code" diff --git a/src/wandbot/ingestion/prepare_data.py b/src/wandbot/ingestion/prepare_data.py index c98a852..ad66a21 100644 --- a/src/wandbot/ingestion/prepare_data.py +++ b/src/wandbot/ingestion/prepare_data.py @@ -23,17 +23,17 @@ import nbformat import pandas as pd +import wandb from google.cloud import bigquery from langchain.schema import Document from langchain_community.document_loaders import TextLoader from langchain_community.document_loaders.base import BaseLoader from nbconvert import MarkdownExporter - -import wandb from wandbot.ingestion.config import ( DataStoreConfig, DocodileEnglishStoreConfig, DocodileJapaneseStoreConfig, + DocodileKoreanStoreConfig, ExampleCodeStoreConfig, ExampleNotebookStoreConfig, FCReportsStoreConfig, @@ -41,6 +41,7 @@ SDKTestsStoreConfig, WandbEduCodeStoreConfig, WeaveCodeStoreConfig, + WeaveDocStoreConfig, WeaveExamplesStoreConfig, ) from wandbot.ingestion.utils import ( @@ -253,7 +254,7 @@ def lazy_load( document.metadata["source"] )[-1] document.metadata["source"] = document_files[ - document.metadata["source"] + pathlib.Path(document.metadata["source"]) ] document.metadata["language"] = self.config.language document.metadata["description"] = self.extract_description( @@ -270,6 +271,24 @@ def lazy_load( ) +class WeaveDocsDataLoader(DocodileDataLoader): + def generate_site_url( + self, base_path: pathlib.Path, file_path: pathlib.Path + ) -> str: + chapter = "" + slug = "" + file_loc = "" + + file_name = file_path.stem + if file_path.name in ("introduction.md",): + file_name = "" + site_relative_path = os.path.join(chapter, slug, file_loc, file_name) + site_url = urljoin( + str(self.config.data_source.remote_path), str(site_relative_path) + ) + return site_url + + class CodeDataLoader(DataLoader): def lazy_load(self) -> Iterator[Document]: """A lazy loader for code documents. @@ -788,15 +807,35 @@ def lazy_load(self) -> Iterator[Document]: SOURCE_TYPE_TO_LOADER_MAP = { - "documentation": DocodileDataLoader, + "wandb_documentation": DocodileDataLoader, + "weave_documentation": WeaveDocsDataLoader, "code": CodeDataLoader, "notebook": CodeDataLoader, "report": FCReportsDataLoader, } +def get_loader_from_config(config: DataStoreConfig) -> DataLoader: + """Get the DataLoader class based on the source type. + + Args: + config: The configuration for the data store. + + Returns: + The DataLoader class. + """ + source_type = config.source_type + if source_type == "documentation": + if "weave" in config.name.lower(): + source_type = "weave_documentation" + else: + source_type = "wandb_documentation" + + return SOURCE_TYPE_TO_LOADER_MAP[source_type](config) + + def load_from_config(config: DataStoreConfig) -> pathlib.Path: - loader = SOURCE_TYPE_TO_LOADER_MAP[config.source_type](config) + loader = get_loader_from_config(config) loader.config.docstore_dir.mkdir(parents=True, exist_ok=True) with (loader.config.docstore_dir / "config.json").open("w") as f: @@ -844,10 +883,12 @@ def load( configs = [ DocodileEnglishStoreConfig(), DocodileJapaneseStoreConfig(), + DocodileKoreanStoreConfig(), ExampleCodeStoreConfig(), ExampleNotebookStoreConfig(), SDKCodeStoreConfig(), SDKTestsStoreConfig(), + WeaveDocStoreConfig(), WeaveCodeStoreConfig(), WeaveExamplesStoreConfig(), WandbEduCodeStoreConfig(), diff --git a/src/wandbot/ingestion/preprocess_data.py b/src/wandbot/ingestion/preprocess_data.py index af883f9..2cc96f2 100644 --- a/src/wandbot/ingestion/preprocess_data.py +++ b/src/wandbot/ingestion/preprocess_data.py @@ -24,9 +24,9 @@ from typing import Any, List, Sequence import tiktoken +import wandb from langchain_core.documents import BaseDocumentTransformer, Document -import wandb from wandbot.ingestion.preprocessors.markdown import MarkdownTextTransformer from wandbot.ingestion.preprocessors.source_code import CodeTextTransformer from wandbot.utils import ( diff --git a/src/wandbot/ingestion/report.py b/src/wandbot/ingestion/report.py index 68b7a79..760fa6c 100644 --- a/src/wandbot/ingestion/report.py +++ b/src/wandbot/ingestion/report.py @@ -19,9 +19,8 @@ import pathlib from datetime import datetime -import wandb.apis.reports as wr - import wandb +import wandb.apis.reports as wr def log_raw_counts(metadata: dict[str, dict[str, int]]) -> list[str]: diff --git a/src/wandbot/ingestion/utils.py b/src/wandbot/ingestion/utils.py index 8548efe..8f63ff4 100644 --- a/src/wandbot/ingestion/utils.py +++ b/src/wandbot/ingestion/utils.py @@ -38,7 +38,6 @@ import markdownify from bs4 import BeautifulSoup, Comment from git import Repo - from wandbot.utils import get_logger logger = get_logger(__name__) @@ -131,6 +130,8 @@ def fetch_git_repo(paths: Any, id_file: Path) -> Dict[str, str]: f"Repo {paths.local_path} already exists... Pulling changes from {repo.remotes.origin.url}" ) with repo.git.custom_environment(GIT_SSH_COMMAND=git_command): + if paths.branch is not None: + repo.git.checkout(paths.branch) repo.remotes.origin.pull() else: remote_url = giturlparse.parse(f"{paths.repo_path}").urls.get("ssh") @@ -139,6 +140,8 @@ def fetch_git_repo(paths: Any, id_file: Path) -> Dict[str, str]: repo = Repo.clone_from( remote_url, paths.local_path, env=dict(GIT_SSH_COMMAND=git_command) ) + if paths.branch is not None: + repo.git.checkout(paths.branch) return fetch_repo_metadata(repo) diff --git a/src/wandbot/ingestion/vectorstores.py b/src/wandbot/ingestion/vectorstores.py index 5498ed1..f3ef115 100644 --- a/src/wandbot/ingestion/vectorstores.py +++ b/src/wandbot/ingestion/vectorstores.py @@ -16,12 +16,12 @@ import pathlib from typing import List +import wandb from langchain_community.vectorstores.chroma import Chroma from langchain_core.documents import Document from langchain_openai import OpenAIEmbeddings from tqdm import trange -import wandb from wandbot.ingestion.config import VectorStoreConfig from wandbot.utils import get_logger diff --git a/src/wandbot/rag/retrieval.py b/src/wandbot/rag/retrieval.py index 1a72ac9..0af9c7e 100644 --- a/src/wandbot/rag/retrieval.py +++ b/src/wandbot/rag/retrieval.py @@ -3,7 +3,6 @@ from langchain.retrievers.document_compressors import CohereRerank from langchain_core.documents import Document from langchain_core.runnables import Runnable, RunnablePassthrough - from wandbot.rag.utils import get_web_contexts from wandbot.retriever.base import VectorStore from wandbot.retriever.web_search import YouSearch, YouSearchConfig @@ -64,7 +63,7 @@ def __init__( self.top_k = top_k self.search_type = search_type - self.retriever = self.vectorstore.as_parent_retriever( + self.retriever = self.vectorstore.as_retriever( search_type=self.search_type, search_kwargs={"k": self.top_k} ) diff --git a/src/wandbot/retriever/base.py b/src/wandbot/retriever/base.py index 5b8a479..990150d 100644 --- a/src/wandbot/retriever/base.py +++ b/src/wandbot/retriever/base.py @@ -1,12 +1,12 @@ from operator import itemgetter from typing import List +import wandb from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores.chroma import Chroma from langchain_core.documents import Document from langchain_core.runnables import RunnableLambda, RunnableParallel -import wandb from wandbot.ingestion.config import VectorStoreConfig from wandbot.retriever.reranking import CohereRerankChain from wandbot.retriever.utils import OpenAIEmbeddingsModel diff --git a/src/wandbot/utils.py b/src/wandbot/utils.py index 3a7ccd3..3d2d793 100644 --- a/src/wandbot/utils.py +++ b/src/wandbot/utils.py @@ -36,12 +36,11 @@ import fasttext import nest_asyncio import tiktoken +import wandb from langchain_core.documents import Document from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict -import wandb - def get_logger(name: str) -> logging.Logger: """Creates and returns a logger with the specified name.