Add working hardcoded workflow
- Allow custom indexing details in YAML format, based on the ingestion data store config
- Add accessory flags and functions to map and parse the provided repo appropriately
- Allow serving the index via the "WANDB_INDEX_ARTIFACT" env var
ash0ts committed Nov 22, 2023
1 parent bc9fff6 commit 045966a
Showing 7 changed files with 168 additions and 7 deletions.
36 changes: 36 additions & 0 deletions README.md
@@ -42,6 +42,32 @@ poetry run python -m src.wandbot.ingestion
You will notice that the data is ingested into the `data/cache` directory and stored in separate subdirectories (such as `raw_data` and `vectorstore`), with individual files for each step of the ingestion process.
These datasets are also stored as wandb artifacts in the project defined in the environment variable `WANDB_PROJECT` and can be accessed from the [wandb dashboard](https://wandb.ai/wandb/wandbot-dev).
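For example, a previously ingested dataset can be pulled back down with the W&B API. This is a minimal sketch; the artifact name `raw_dataset` is an assumption, so substitute the name logged by your ingestion run:

```python
import wandb

# Download an ingested dataset artifact logged by the steps above.
# "raw_dataset" is an assumed artifact name; use the one your run logged.
api = wandb.Api()
artifact = api.artifact("wandbot/wandbot-dev/raw_dataset:latest")
local_dir = artifact.download()
print(local_dir)
```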

#### Custom Dataset

To run the Data Ingestion with a custom dataset you can use the following command:

```bash
poetry run python -m src.wandbot.ingestion --custom --custom_dataset_config_yaml <path_to_yaml>
```

where

- `--custom` -> Flag for ingesting a custom dataset. If this flag is not set, the default wandbot data flow is used.
- `--custom_dataset_config_yaml` -> Path to the custom dataset config YAML file. An example is provided in `src/wandbot/ingestion/custom_dataset.yaml` (see the invocation below).
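For example, using the config file added in this commit:

```bash
poetry run python -m src.wandbot.ingestion --custom \
    --custom_dataset_config_yaml src/wandbot/ingestion/custom_dataset.yaml
```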

The YAML is structured as follows. Each entry's top-level key (here `CustomConfig`) is only a label that the parser discards; the body maps onto `DataStoreConfig`, with the nested `data_source` block mapping onto `DataSource` (see `src/wandbot/ingestion/config.py`):
```yaml
- CustomConfig:
    name: "custom_store"
    data_source:
      remote_path: "https://docs.wandb.ai/"
      repo_path: "https://github.com/wandb/docodile"
      base_path: "docs"
      file_pattern: "*.md"
      is_git_repo: true
    language: "en"
    docstore_dir: "custom_store_en"
```
### Running the Q&A Bot
@@ -65,6 +91,8 @@ WANDB_PROJECT="wandbot-dev"
WANDB_ENTITY="wandbot"
```

Note: ensure that you have a git identity file that points to the credentials created for SSH access to repositories. The git identity file is typically located at `~/.ssh/id_rsa`, and the corresponding public key should be added to your GitHub account.
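If you have not created one yet, a minimal sketch assuming a standard OpenSSH setup:

```bash
# Generate an RSA key pair at the default location, then register the public
# half with GitHub (Settings -> SSH and GPG keys).
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub
```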

Once these environment variables are set, you can start the Q&A bot application using the following commands:

@@ -78,6 +106,14 @@ For more detailed instructions on installing and running the bot, please refer t

Executing these commands will launch the API, Slackbot, and Discord bot applications, enabling you to interact with the bot and ask questions related to the Weights & Biases documentation.

#### Custom Dataset

To load an index based on the custom dataset as defined above, you can set the following environment variable to an artifact path:

```bash
WANDB_INDEX_ARTIFACT="{ENTITY}/{PROJECT}/custom_index:latest"
```
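For example, with the default entity and project used elsewhere in this README (the artifact name here is a placeholder; use whatever your ingestion run produced):

```bash
export WANDB_INDEX_ARTIFACT="wandbot/wandbot-dev/custom_index:latest"
```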

### Evaluation

To evaluate the performance of the Q&A bot, the provided evaluation script (…) can be used. This script utilizes a separate dataset for evaluation, which can be stored as a W&B Artifact. The evaluation script calculates retrieval accuracy, average string distance, and chat model accuracy.
27 changes: 21 additions & 6 deletions src/wandbot/ingestion/__main__.py
@@ -1,22 +1,37 @@
import argparse
import os
import pathlib

from wandbot.ingestion import prepare_data, vectorstores
from wandbot.ingestion.report import create_ingestion_report
from wandbot.ingestion.utils import load_custom_dataset_configs_from_yaml
from wandbot.utils import get_logger

logger = get_logger(__name__)


def main(custom: bool, custom_dataset_config_yaml: pathlib.Path):
    project = os.environ.get("WANDB_PROJECT", "wandbot-dev")
    entity = os.environ.get("WANDB_ENTITY", "wandbot")

    if custom and custom_dataset_config_yaml.is_file():
        configs = load_custom_dataset_configs_from_yaml(custom_dataset_config_yaml)
        # TODO: Add the full list of configs as opposed to limiting to one
        # TODO: Add the ability to define which dataloader to use in the config yaml itself
        config = configs[0]
        raw_artifact = prepare_data.load_custom(
            project, entity, "custom_raw_dataset", config, "docodile"
        )
    else:
        raw_artifact = prepare_data.load(project, entity)
    vectorstore_artifact = vectorstores.load(project, entity, raw_artifact)
    # TODO: include ingestion report
    create_ingestion_report(project, entity, raw_artifact, vectorstore_artifact)
    print(vectorstore_artifact)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Ingest data into wandbot's document and vector stores."
    )
    parser.add_argument(
        "--custom",
        action="store_true",
        help="Flag for ingesting a custom dataset; without it, the default wandbot data flow is used",
    )
    parser.add_argument(
        "--custom_dataset_config_yaml",
        type=pathlib.Path,
        default=pathlib.Path(__file__).parent / "custom_dataset.yaml",
        help="Path to the custom dataset config yaml file",
    )
    args = parser.parse_args()

    main(args.custom, args.custom_dataset_config_yaml)
12 changes: 11 additions & 1 deletion src/wandbot/ingestion/config.py
@@ -41,6 +41,7 @@ class DataStoreConfig(BaseModel):
name: str = "docstore"
data_source: DataSource = DataSource()
docstore_dir: pathlib.Path = pathlib.Path("docstore")
language: Optional[str] = None

@model_validator(mode="after")
def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
@@ -59,6 +60,7 @@ def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
            data_source.cache_dir / values.name / local_path
        )
        if data_source.is_git_repo:
            # TODO: Remove this for public repos and credential-less access
            if data_source.git_id_file is None:
                logger.debug(
                    "The source data is a git repo but no git_id_file is set."
@@ -70,6 +72,14 @@ def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
        values.data_source = data_source

        return values

    @classmethod
    def from_dict(cls, config_dict: dict) -> "DataStoreConfig":
        return cls(
            name=config_dict.get("name"),
            data_source=DataSource(**config_dict.get("data_source")),
            docstore_dir=pathlib.Path(config_dict.get("docstore_dir")),
            language=config_dict.get("language"),
        )
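For illustration, a parsed YAML entry from `custom_dataset.yaml` maps onto the config roughly like this (a sketch, using the example values from the README):

```python
config = DataStoreConfig.from_dict({
    "name": "custom_store",
    "data_source": {
        "remote_path": "https://docs.wandb.ai/",
        "repo_path": "https://github.com/wandb/docodile",
        "base_path": "docs",
        "file_pattern": "*.md",
        "is_git_repo": True,
    },
    "language": "en",
    "docstore_dir": "custom_store_en",
})
```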


class DocodileEnglishStoreConfig(DataStoreConfig):
@@ -189,4 +199,4 @@ class VectorStoreConfig(BaseSettings):
    chat_model_name: str = "gpt-3.5-turbo-0613"
    temperature: float = 0.1
    max_retries: int = 3
    embeddings_cache: pathlib.Path = pathlib.Path("data/cache/embeddings")
10 changes: 10 additions & 0 deletions src/wandbot/ingestion/custom_dataset.yaml
@@ -0,0 +1,10 @@
- CustomConfig:
    name: "custom_store"
    data_source:
      remote_path: "https://docs.wandb.ai/"
      repo_path: "https://github.com/wandb/docodile"
      base_path: "docs"
      file_pattern: "*.md"
      is_git_repo: true
    language: "en"
    docstore_dir: "custom_store_en"
66 changes: 66 additions & 0 deletions src/wandbot/ingestion/prepare_data.py
@@ -17,6 +17,7 @@
import json
import os
import pathlib
from pathlib import Path
from typing import Iterator
from urllib.parse import urljoin

@@ -35,6 +36,7 @@
    ExampleCodeStoreConfig,
    ExampleNotebookStoreConfig,
)
from wandbot.ingestion.typings import DataStoreConfigDict
from wandbot.ingestion.utils import (
    EXTENSION_MAP,
    clean_contents,
@@ -343,3 +345,67 @@ def load(
    run.log_artifact(artifact)
    run.finish()
    return f"{entity}/{project}/{result_artifact_name}:latest"

def load_custom(
    project: str,
    entity: str,
    result_artifact_name: str = "custom_raw_dataset",
    custom_dataset_args: DataStoreConfigDict = None,
    dataset_type: str = "docodile",
):
    """Load and prepare a custom dataset for a chatbot system.

    This function initializes a Wandb run, creates an artifact for a defined
    dataset, and loads and prepares the data with the appropriate loader. The
    prepared data is then saved in the docstore directory and added to the
    artifact.

    Args:
        project: The name of the Wandb project.
        entity: The name of the Wandb entity.
        result_artifact_name: The name of the result artifact. Default is "custom_raw_dataset".
        custom_dataset_args: The arguments for the custom dataset to load into the config and loader.
        dataset_type: The type of dataloader to use for the custom dataset.

    Returns:
        The latest version of the prepared dataset artifact in the format
        "{entity}/{project}/{result_artifact_name}:latest".
    """
    run = wandb.init(project=project, entity=entity, job_type="prepare_dataset")
    artifact = wandb.Artifact(
        result_artifact_name,
        type="dataset",
        description="Raw documents for custom dataset",
    )

    # TODO: Allow for an arbitrary number of custom datasets mapped to the
    # proper config and loader based on appropriately popped args
    if dataset_type == "docodile":
        CustomDataLoader = DocodileDataLoader
    elif dataset_type == "code":
        CustomDataLoader = CodeDataLoader
    else:
        raise ValueError(f"Dataset type {dataset_type} not supported")
    custom_dataset_loader = CustomDataLoader(
        DataStoreConfig.from_dict(custom_dataset_args or {})
    )
    for loader in [custom_dataset_loader]:
        loader.config.docstore_dir.mkdir(parents=True, exist_ok=True)

        with (loader.config.docstore_dir / "config.json").open("w") as f:
            f.write(loader.config.model_dump_json())

        with (loader.config.docstore_dir / "documents.jsonl").open("w") as f:
            for document in loader.load():
                document_json = {
                    "page_content": document.page_content,
                    "metadata": document.metadata,
                }
                f.write(json.dumps(document_json) + "\n")

        with (loader.config.docstore_dir / "metadata.json").open("w") as f:
            json.dump(loader.metadata, f)

        artifact.add_dir(
            str(loader.config.docstore_dir),
            name=loader.config.docstore_dir.name,
        )
    run.log_artifact(artifact)
    run.finish()
    return f"{entity}/{project}/{result_artifact_name}:latest"
15 changes: 15 additions & 0 deletions src/wandbot/ingestion/typings.py
@@ -0,0 +1,15 @@
from typing import get_type_hints, Optional, Union, List
from wandbot.ingestion.config import DataSource, DataStoreConfig

# Get the type hints for DataSource and DataStoreConfig
DataSourceDict = get_type_hints(DataSource)
DataStoreConfigDict = get_type_hints(DataStoreConfig)

# Replace the types that are not JSON serializable
DataSourceDict["cache_dir"] = str
DataSourceDict["local_path"] = Optional[str]
DataSourceDict["git_id_file"] = Optional[str]
DataSourceDict["file_pattern"] = Union[str, List[str]]

DataStoreConfigDict["data_source"] = DataSourceDict
DataStoreConfigDict["docstore_dir"] = str
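These derived dicts are lightweight, JSON-serializable descriptions of the config schema; `prepare_data.load_custom` uses `DataStoreConfigDict` as the annotation for its `custom_dataset_args` parameter rather than as a runtime-checked type.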
9 changes: 9 additions & 0 deletions src/wandbot/ingestion/utils.py
@@ -9,6 +9,7 @@
- `fetch_repo_metadata`: Fetches the metadata of the git repository.
- `fetch_git_repo`: Fetches the git repository.
- `concatenate_cells`: Combines cells information in a readable format.
- `load_custom_dataset_configs_from_yaml`: Loads the config from a yaml file.
The module also includes the following constants:
- `EXTENSION_MAP`: A dictionary mapping file extensions to programming languages.
@@ -36,6 +37,8 @@
import markdownify
from bs4 import BeautifulSoup, Comment
from git import Repo
import yaml


from wandbot.utils import get_logger

@@ -51,6 +54,7 @@ def get_git_command(id_file: Path) -> str:
    Returns:
        The git command with the id file.
    """
    logger.debug(f"Using git identity file: {id_file}")
    assert id_file.is_file()

    git_command = f"ssh -v -i /{id_file}"
@@ -276,3 +280,8 @@ def clean_contents(contents: str) -> str:
    cleaned_document = re.sub(r"\[([^]]+)\]\([^)]+\)", r"\1", cleaned_document)

    return cleaned_document

def load_custom_dataset_configs_from_yaml(file_path: str) -> list:
    """Load a list of custom dataset configs from a YAML file, dropping each
    entry's wrapper key (e.g. ``CustomConfig``) and keeping only the body."""
    with open(file_path, "r") as file:
        config_list = yaml.safe_load(file)
    return [next(iter(config.values())) for config in config_list]
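For illustration, parsing the example `custom_dataset.yaml` yields the config bodies with their wrapper keys stripped (a sketch):

```python
configs = load_custom_dataset_configs_from_yaml(
    "src/wandbot/ingestion/custom_dataset.yaml"
)
# Each YAML entry is a single-key mapping ({"CustomConfig": {...}}); the
# wrapper key is dropped, so configs[0] is the body itself:
# {"name": "custom_store", "data_source": {...}, "language": "en",
#  "docstore_dir": "custom_store_en"}
```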
