Skip to content

Commit

Permalink
Merge pull request #73 from wandb/feat/add-weave-docs
Browse files Browse the repository at this point in the history
feat: add weave documentation to ingestion pipeline
  • Loading branch information
morganmcg1 authored Apr 25, 2024
2 parents 75928fa + d9126dd commit cd9fb7a
Show file tree
Hide file tree
Showing 13 changed files with 94 additions and 21 deletions.
2 changes: 1 addition & 1 deletion src/wandbot/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
from datetime import datetime, timezone

import pandas as pd
import wandb
from fastapi import FastAPI

import wandb
from wandbot.api.routers import chat as chat_router
from wandbot.api.routers import database as database_router
from wandbot.api.routers import retrieve as retrieve_router
Expand Down
2 changes: 1 addition & 1 deletion src/wandbot/api/routers/database.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import wandb
from fastapi import APIRouter
from starlette import status
from starlette.responses import Response

import wandb
from wandbot.database.client import DatabaseClient
from wandbot.database.database import engine
from wandbot.database.models import Base
Expand Down
2 changes: 1 addition & 1 deletion src/wandbot/chat/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
"""
from typing import List

import wandb
from weave.monitoring import StreamTable

import wandb
from wandbot.chat.config import ChatConfig
from wandbot.chat.rag import RAGPipeline, RAGPipelineOutput
from wandbot.chat.schemas import ChatRequest, ChatResponse
Expand Down
2 changes: 1 addition & 1 deletion src/wandbot/evaluation/eval/async_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
import aiofiles
import httpx
import pandas as pd
import wandb
from llama_index.llms.openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm import tqdm

import wandb
from wandbot.evaluation.config import EvalConfig
from wandbot.evaluation.eval.correctness import (
CORRECTNESS_EVAL_TEMPLATE,
Expand Down
36 changes: 34 additions & 2 deletions src/wandbot/ingestion/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from pydantic import BaseModel, Field, model_validator
from pydantic_settings import BaseSettings

from wandbot.utils import get_logger

logger = get_logger(__name__)
Expand All @@ -32,6 +31,7 @@ class DataSource(BaseSettings):
remote_path: str = ""
repo_path: str = ""
local_path: Optional[pathlib.Path] = None
branch: Optional[str] = None
base_path: Optional[str] = ""
file_patterns: List[str] = ["*.*"]
is_git_repo: bool = False
Expand Down Expand Up @@ -98,14 +98,30 @@ class DocodileJapaneseStoreConfig(DataStoreConfig):
data_source: DataSource = DataSource(
remote_path="https://docs.wandb.ai/ja/",
repo_path="https://github.com/wandb/docodile",
base_path="i18n/ja/docusaurus-plugin-content-docs/current",
base_path="docs",
file_patterns=["*.md"],
is_git_repo=True,
branch="japanese_docs",
)
language: str = "ja"
docstore_dir: pathlib.Path = pathlib.Path("wandb_documentation_ja")


class DocodileKoreanStoreConfig(DataStoreConfig):
    """Data store settings for the Korean W&B documentation.

    Markdown files (``*.md``) are taken from ``docs`` on the ``korean_docs``
    branch of the docodile repository, and the store is tagged with the
    language code ``"ko"``.
    """

    name: str = "Korean Documentation"
    source_type: str = "documentation"
    data_source: DataSource = DataSource(
        # Git location: the ``korean_docs`` branch is requested explicitly
        # instead of leaving ``branch`` unset.
        repo_path="https://github.com/wandb/docodile",
        branch="korean_docs",
        is_git_repo=True,
        # Files to ingest.
        base_path="docs",
        file_patterns=["*.md"],
        # Base URL of the published Korean docs (presumably used to build
        # links back to the site -- confirm against the loader).
        remote_path="https://docs.wandb.ai/ko/",
    )
    language: str = "ko"
    docstore_dir: pathlib.Path = pathlib.Path("wandb_documentation_ko")


class ExampleCodeStoreConfig(DataStoreConfig):
name: str = "Examples code"
source_type: str = "code"
Expand Down Expand Up @@ -184,6 +200,22 @@ class WeaveExamplesStoreConfig(DataStoreConfig):
docstore_dir: pathlib.Path = pathlib.Path("weave_examples")


class WeaveDocStoreConfig(DataStoreConfig):
    """Data store settings for the English Weave documentation.

    Markdown files (``*.md``) are taken from ``docs/docs`` in the Weave
    repository. No branch is given, so ``DataSource.branch`` keeps its
    default value.
    """

    # NOTE: get_loader_from_config checks (case-insensitively) whether this
    # name contains "weave" to pick the Weave-specific documentation loader.
    name: str = "Weave Documentation"
    source_type: str = "documentation"
    data_source: DataSource = DataSource(
        # Git location.
        repo_path="https://github.com/wandb/weave",
        is_git_repo=True,
        # Files to ingest.
        base_path="docs/docs",
        file_patterns=["*.md"],
        # Base URL that WeaveDocsDataLoader.generate_site_url resolves page
        # names against.
        remote_path="https://wandb.github.io/weave/",
    )
    language: str = "en"
    docstore_dir: pathlib.Path = pathlib.Path("weave_documentation")


class WandbEduCodeStoreConfig(DataStoreConfig):
name: str = "Wandb Edu code"
source_type: str = "code"
Expand Down
51 changes: 46 additions & 5 deletions src/wandbot/ingestion/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,25 @@

import nbformat
import pandas as pd
import wandb
from google.cloud import bigquery
from langchain.schema import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders.base import BaseLoader
from nbconvert import MarkdownExporter

import wandb
from wandbot.ingestion.config import (
DataStoreConfig,
DocodileEnglishStoreConfig,
DocodileJapaneseStoreConfig,
DocodileKoreanStoreConfig,
ExampleCodeStoreConfig,
ExampleNotebookStoreConfig,
FCReportsStoreConfig,
SDKCodeStoreConfig,
SDKTestsStoreConfig,
WandbEduCodeStoreConfig,
WeaveCodeStoreConfig,
WeaveDocStoreConfig,
WeaveExamplesStoreConfig,
)
from wandbot.ingestion.utils import (
Expand Down Expand Up @@ -253,7 +254,7 @@ def lazy_load(
document.metadata["source"]
)[-1]
document.metadata["source"] = document_files[
document.metadata["source"]
pathlib.Path(document.metadata["source"])
]
document.metadata["language"] = self.config.language
document.metadata["description"] = self.extract_description(
Expand All @@ -270,6 +271,24 @@ def lazy_load(
)


class WeaveDocsDataLoader(DocodileDataLoader):
    """Documentation loader that builds page URLs for the Weave docs site."""

    def generate_site_url(
        self, base_path: pathlib.Path, file_path: pathlib.Path
    ) -> str:
        """Return the site URL for a Weave documentation file.

        The file's stem (name without extension) is resolved against the
        configured ``remote_path`` with ``urljoin``. The directory part of
        ``file_path`` does not contribute to the URL.

        Args:
            base_path: Accepted but not used by this implementation.
            file_path: Path of the documentation file.

        Returns:
            The URL as a string. For ``introduction.md`` this is
            ``remote_path`` itself.
        """
        # The introduction page gets no page segment, so the URL is the
        # site root.
        is_intro_page = file_path.name == "introduction.md"
        page_segment = "" if is_intro_page else file_path.stem

        site_root = str(self.config.data_source.remote_path)
        return urljoin(site_root, page_segment)


class CodeDataLoader(DataLoader):
def lazy_load(self) -> Iterator[Document]:
"""A lazy loader for code documents.
Expand Down Expand Up @@ -788,15 +807,35 @@ def lazy_load(self) -> Iterator[Document]:


# Registry of data loader classes keyed by source type. get_loader_from_config
# looks a key up here and calls the class with the store config; it maps the
# generic "documentation" source type onto the "wandb_documentation" /
# "weave_documentation" keys based on the config name.
SOURCE_TYPE_TO_LOADER_MAP = {
"documentation": DocodileDataLoader,
"wandb_documentation": DocodileDataLoader,
"weave_documentation": WeaveDocsDataLoader,
"code": CodeDataLoader,
"notebook": CodeDataLoader,
"report": FCReportsDataLoader,
}


def get_loader_from_config(config: DataStoreConfig) -> DataLoader:
    """Build the data loader that matches a data store configuration.

    The loader class is looked up in ``SOURCE_TYPE_TO_LOADER_MAP`` by
    ``config.source_type``. The generic ``"documentation"`` type is first
    narrowed to ``"weave_documentation"`` when ``config.name`` contains
    "weave" (case-insensitive), and to ``"wandb_documentation"`` otherwise.

    Args:
        config: The configuration for the data store.

    Returns:
        A loader instance constructed with ``config``.

    Raises:
        KeyError: If the resolved source type has no entry in
            ``SOURCE_TYPE_TO_LOADER_MAP``.
    """
    loader_key = config.source_type
    if loader_key == "documentation":
        # Both documentation flavours share one source type, so the store
        # name is what tells them apart.
        mentions_weave = "weave" in config.name.lower()
        loader_key = (
            "weave_documentation" if mentions_weave else "wandb_documentation"
        )

    loader_cls = SOURCE_TYPE_TO_LOADER_MAP[loader_key]
    return loader_cls(config)


def load_from_config(config: DataStoreConfig) -> pathlib.Path:
loader = SOURCE_TYPE_TO_LOADER_MAP[config.source_type](config)
loader = get_loader_from_config(config)
loader.config.docstore_dir.mkdir(parents=True, exist_ok=True)

with (loader.config.docstore_dir / "config.json").open("w") as f:
Expand Down Expand Up @@ -844,10 +883,12 @@ def load(
configs = [
DocodileEnglishStoreConfig(),
DocodileJapaneseStoreConfig(),
DocodileKoreanStoreConfig(),
ExampleCodeStoreConfig(),
ExampleNotebookStoreConfig(),
SDKCodeStoreConfig(),
SDKTestsStoreConfig(),
WeaveDocStoreConfig(),
WeaveCodeStoreConfig(),
WeaveExamplesStoreConfig(),
WandbEduCodeStoreConfig(),
Expand Down
2 changes: 1 addition & 1 deletion src/wandbot/ingestion/preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
from typing import Any, List, Sequence

import tiktoken
import wandb
from langchain_core.documents import BaseDocumentTransformer, Document

import wandb
from wandbot.ingestion.preprocessors.markdown import MarkdownTextTransformer
from wandbot.ingestion.preprocessors.source_code import CodeTextTransformer
from wandbot.utils import (
Expand Down
3 changes: 1 addition & 2 deletions src/wandbot/ingestion/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@
import pathlib
from datetime import datetime

import wandb.apis.reports as wr

import wandb
import wandb.apis.reports as wr


def log_raw_counts(metadata: dict[str, dict[str, int]]) -> list[str]:
Expand Down
5 changes: 4 additions & 1 deletion src/wandbot/ingestion/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
import markdownify
from bs4 import BeautifulSoup, Comment
from git import Repo

from wandbot.utils import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -131,6 +130,8 @@ def fetch_git_repo(paths: Any, id_file: Path) -> Dict[str, str]:
f"Repo {paths.local_path} already exists... Pulling changes from {repo.remotes.origin.url}"
)
with repo.git.custom_environment(GIT_SSH_COMMAND=git_command):
if paths.branch is not None:
repo.git.checkout(paths.branch)
repo.remotes.origin.pull()
else:
remote_url = giturlparse.parse(f"{paths.repo_path}").urls.get("ssh")
Expand All @@ -139,6 +140,8 @@ def fetch_git_repo(paths: Any, id_file: Path) -> Dict[str, str]:
repo = Repo.clone_from(
remote_url, paths.local_path, env=dict(GIT_SSH_COMMAND=git_command)
)
if paths.branch is not None:
repo.git.checkout(paths.branch)
return fetch_repo_metadata(repo)


Expand Down
2 changes: 1 addition & 1 deletion src/wandbot/ingestion/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
import pathlib
from typing import List

import wandb
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from tqdm import trange

import wandb
from wandbot.ingestion.config import VectorStoreConfig
from wandbot.utils import get_logger

Expand Down
3 changes: 1 addition & 2 deletions src/wandbot/rag/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from langchain.retrievers.document_compressors import CohereRerank
from langchain_core.documents import Document
from langchain_core.runnables import Runnable, RunnablePassthrough

from wandbot.rag.utils import get_web_contexts
from wandbot.retriever.base import VectorStore
from wandbot.retriever.web_search import YouSearch, YouSearchConfig
Expand Down Expand Up @@ -64,7 +63,7 @@ def __init__(
self.top_k = top_k
self.search_type = search_type

self.retriever = self.vectorstore.as_parent_retriever(
self.retriever = self.vectorstore.as_retriever(
search_type=self.search_type, search_kwargs={"k": self.top_k}
)

Expand Down
2 changes: 1 addition & 1 deletion src/wandbot/retriever/base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from operator import itemgetter
from typing import List

import wandb
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda, RunnableParallel

import wandb
from wandbot.ingestion.config import VectorStoreConfig
from wandbot.retriever.reranking import CohereRerankChain
from wandbot.retriever.utils import OpenAIEmbeddingsModel
Expand Down
3 changes: 1 addition & 2 deletions src/wandbot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,11 @@
import fasttext
import nest_asyncio
import tiktoken
import wandb
from langchain_core.documents import Document
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

import wandb


def get_logger(name: str) -> logging.Logger:
"""Creates and returns a logger with the specified name.
Expand Down

0 comments on commit cd9fb7a

Please sign in to comment.