diff --git a/README.md b/README.md
index faee7cb..79e81d4 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,32 @@ poetry run python -m src.wandbot.ingestion
 
 You will notice that the data is ingested into the `data/cache` directory and stored in separate directories (`raw_data`, `vectorstore`) with individual files for each step of the ingestion process. These datasets are also stored as wandb artifacts in the project defined in the environment variable `WANDB_PROJECT` and can be accessed from the [wandb dashboard](https://wandb.ai/wandb/wandbot-dev).
 
+#### Custom Dataset
+
+To run the data ingestion with a custom dataset, use the following command:
+
+```bash
+poetry run python -m src.wandbot.ingestion --custom --custom_dataset_config_yaml <path/to/config.yaml>
+```
+
+where
+
+- `--custom` -> Flag for ingesting a custom dataset. If this flag is not set, the default wandbot data flow is used.
+- `--custom_dataset_config_yaml` -> Path to the custom dataset config yaml file. An example is provided in `src/wandbot/ingestion/custom_dataset.yaml`.
+
+The YAML file is structured as follows:
+```yaml
+- CustomConfig:
+    name: "custom_store"
+    data_source:
+      remote_path: "https://docs.wandb.ai/"
+      repo_path: "https://github.com/wandb/docodile"
+      base_path: "docs"
+      file_pattern: "*.md"
+      is_git_repo: true
+    language: "en"
+    docstore_dir: "custom_store_en"
+```
 
 ### Running the Q&A Bot
 
@@ -65,6 +91,8 @@ WANDB_PROJECT="wandbot-dev"
 WANDB_ENTITY="wandbot"
 ```
 
+Note: ensure that you have a git identity file that points to the credentials created for SSH access to the repositories. The git identity file is typically located at `~/.ssh/id_rsa`, and the corresponding public key should be added to your GitHub account.
+
 Once these environment variables are set, you can start the Q&A bot application using the following commands:
 
 ```bash
@@ -78,6 +106,14 @@ For more detailed instructions on installing and running the bot, please refer to t
 
 Executing these commands will launch the API, Slackbot, and Discord bot applications, enabling you to interact with the bot and ask questions related to the Weights & Biases documentation.
 
+#### Custom Dataset
+
+To load an index based on the custom dataset defined above, set the following environment variable to the corresponding artifact path:
+
+```bash
+WANDB_INDEX_ARTIFACT="{ENTITY}/{PROJECT}/custom_index:latest"
+```
+
 ### Evaluation
 
 To evaluate the performance of the Q&A bot, the provided evaluation script (…) can be used. This script utilizes a separate dataset for evaluation, which can be stored as a W&B Artifact. The evaluation script calculates retrieval accuracy, average string distance, and chat model accuracy.
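For readers wiring this up, the custom index artifact named in `WANDB_INDEX_ARTIFACT` can also be fetched programmatically. Below is a minimal sketch using the public `wandb` API; the entity, project, and `custom_index` artifact name are placeholders taken from the README example, not fixed values:

```python
import os

import wandb

# Hypothetical values: substitute your own entity/project. "custom_index"
# mirrors the WANDB_INDEX_ARTIFACT example above and is an assumption.
entity = os.environ.get("WANDB_ENTITY", "wandbot")
project = os.environ.get("WANDB_PROJECT", "wandbot-dev")

api = wandb.Api()
artifact = api.artifact(f"{entity}/{project}/custom_index:latest")
local_dir = artifact.download()  # fetches the index files to a local cache dir
print(f"Custom index downloaded to {local_dir}")
```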
diff --git a/src/wandbot/ingestion/__main__.py b/src/wandbot/ingestion/__main__.py
index f2b4e50..6497060 100644
--- a/src/wandbot/ingestion/__main__.py
+++ b/src/wandbot/ingestion/__main__.py
@@ -1,22 +1,39 @@
+import argparse
 import os
+import pathlib
 
 from wandbot.ingestion import prepare_data, vectorstores
 from wandbot.ingestion.report import create_ingestion_report
+from wandbot.ingestion.utils import load_custom_dataset_configs_from_yaml
 from wandbot.utils import get_logger
 
 logger = get_logger(__name__)
 
 
-def main():
+def main(custom: bool, custom_dataset_config_yaml: pathlib.Path):
     project = os.environ.get("WANDB_PROJECT", "wandbot-dev")
     entity = os.environ.get("WANDB_ENTITY", "wandbot")
 
-    raw_artifact = prepare_data.load(project, entity)
+    if custom and custom_dataset_config_yaml.is_file():
+        configs = load_custom_dataset_configs_from_yaml(custom_dataset_config_yaml)
+        # TODO: Support the full list of configs instead of limiting to one
+        # TODO: Allow the dataloader type to be defined in the config yaml itself
+        config = configs[0]
+        raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", config, "docodile")
+    else:
+        raw_artifact = prepare_data.load(project, entity)
     vectorstore_artifact = vectorstores.load(project, entity, raw_artifact)
-    # TODO: include ingestion report
     create_ingestion_report(project, entity, raw_artifact, vectorstore_artifact)
     print(vectorstore_artifact)
 
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(description="Run the wandbot data ingestion, optionally on a custom dataset.")
+    parser.add_argument("--custom", action="store_true",
+                        help="Flag for ingesting a custom dataset")
+    parser.add_argument("--custom_dataset_config_yaml", type=pathlib.Path,
+                        default=pathlib.Path(__file__).parent / "custom_dataset.yaml",
+                        help="Path to the custom dataset config yaml file")
+    args = parser.parse_args()
+
+    main(args.custom, args.custom_dataset_config_yaml)
diff --git a/src/wandbot/ingestion/config.py b/src/wandbot/ingestion/config.py
index 9bdc636..7781b1b 100644
--- a/src/wandbot/ingestion/config.py
+++ b/src/wandbot/ingestion/config.py
@@ -41,6 +41,7 @@ class DataStoreConfig(BaseModel):
     name: str = "docstore"
     data_source: DataSource = DataSource()
     docstore_dir: pathlib.Path = pathlib.Path("docstore")
+    language: Optional[str] = None
 
     @model_validator(mode="after")
     def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
@@ -59,6 +60,7 @@ def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
             data_source.cache_dir / values.name / local_path
         )
         if data_source.is_git_repo:
+            # TODO: Remove this check for public repos and credential-less access
             if data_source.git_id_file is None:
                 logger.debug(
                     "The source data is a git repo but no git_id_file is set."
@@ -70,6 +72,15 @@ def _set_cache_paths(cls, values: "DataStoreConfig") -> "DataStoreConfig":
         values.data_source = data_source
 
         return values
+
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "DataStoreConfig":
+        return cls(
+            name=config_dict.get("name"),
+            data_source=DataSource(**config_dict.get("data_source")),
+            docstore_dir=pathlib.Path(config_dict.get("docstore_dir")),
+            language=config_dict.get("language"),
+        )
 
 
 class DocodileEnglishStoreConfig(DataStoreConfig):
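The `--custom` flag above uses `action="store_true"`, so the custom path only runs when the flag is passed explicitly and the default wandbot data flow is used otherwise. A standalone sketch (not part of the PR) of those semantics:

```python
# Demo of the flag handling used in __main__.py: --custom defaults to False,
# and --custom_dataset_config_yaml is parsed into a pathlib.Path.
import argparse
import pathlib

parser = argparse.ArgumentParser()
parser.add_argument("--custom", action="store_true")
parser.add_argument(
    "--custom_dataset_config_yaml",
    type=pathlib.Path,
    default=pathlib.Path("custom_dataset.yaml"),
)

print(parser.parse_args([]).custom)            # False -> default data flow
print(parser.parse_args(["--custom"]).custom)  # True  -> custom ingestion
```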
diff --git a/src/wandbot/ingestion/custom_dataset.yaml b/src/wandbot/ingestion/custom_dataset.yaml
new file mode 100644
index 0000000..b6e77df
--- /dev/null
+++ b/src/wandbot/ingestion/custom_dataset.yaml
@@ -0,0 +1,10 @@
+- CustomConfig:
+    name: "custom_store"
+    data_source:
+      remote_path: "https://docs.wandb.ai/"
+      repo_path: "https://github.com/wandb/docodile"
+      base_path: "docs"
+      file_pattern: "*.md"
+      is_git_repo: true
+    language: "en"
+    docstore_dir: "custom_store_en"
diff --git a/src/wandbot/ingestion/prepare_data.py b/src/wandbot/ingestion/prepare_data.py
index e46e437..f2be746 100644
--- a/src/wandbot/ingestion/prepare_data.py
+++ b/src/wandbot/ingestion/prepare_data.py
@@ -35,6 +35,7 @@
     ExampleCodeStoreConfig,
     ExampleNotebookStoreConfig,
 )
+from wandbot.ingestion.typings import DataStoreConfigDict
 from wandbot.ingestion.utils import (
     EXTENSION_MAP,
     clean_contents,
@@ -343,3 +344,74 @@ def load(
     run.log_artifact(artifact)
     run.finish()
     return f"{entity}/{project}/{result_artifact_name}:latest"
+
+
+def load_custom(
+    project: str,
+    entity: str,
+    result_artifact_name: str = "custom_raw_dataset",
+    custom_dataset_args: DataStoreConfigDict = {},
+    dataset_type: str = "docodile",
+):
+    """Load and prepare a custom dataset for the chatbot system.
+
+    This function initializes a Wandb run, creates an artifact for the defined
+    dataset, and loads and prepares the data with the appropriate loader. The
+    prepared data is then saved in the docstore directory and added to the
+    artifact.
+
+    Args:
+        project: The name of the Wandb project.
+        entity: The name of the Wandb entity.
+        result_artifact_name: The name of the result artifact. Default is
+            "custom_raw_dataset".
+        custom_dataset_args: The config arguments for the custom dataset,
+            loaded into the config and loader.
+        dataset_type: The type of dataloader to use for the custom dataset.
+
+    Returns:
+        The latest version of the prepared dataset artifact in the format
+        "{entity}/{project}/{result_artifact_name}:latest".
+    """
+    run = wandb.init(project=project, entity=entity, job_type="prepare_dataset")
+    artifact = wandb.Artifact(
+        result_artifact_name,
+        type="dataset",
+        description="Raw documents for custom dataset",
+    )
+
+    # TODO: Allow an arbitrary number of custom datasets mapped to the proper
+    # config and loader based on the appropriate popped args
+    if dataset_type == "docodile":
+        CustomDataLoader = DocodileDataLoader
+    elif dataset_type == "code":
+        CustomDataLoader = CodeDataLoader
+    else:
+        raise ValueError(f"Dataset type {dataset_type} not supported")
+    custom_dataset_loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args))
+    for loader in [custom_dataset_loader]:
+        loader.config.docstore_dir.mkdir(parents=True, exist_ok=True)
+
+        with (loader.config.docstore_dir / "config.json").open("w") as f:
+            f.write(loader.config.model_dump_json())
+
+        with (loader.config.docstore_dir / "documents.jsonl").open("w") as f:
+            for document in loader.load():
+                document_json = {
+                    "page_content": document.page_content,
+                    "metadata": document.metadata,
+                }
+                f.write(json.dumps(document_json) + "\n")
+
+        with (loader.config.docstore_dir / "metadata.json").open("w") as f:
+            json.dump(loader.metadata, f)
+
+        artifact.add_dir(
+            str(loader.config.docstore_dir),
+            name=loader.config.docstore_dir.name,
+        )
+    run.log_artifact(artifact)
+    run.finish()
+    return f"{entity}/{project}/{result_artifact_name}:latest"
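To illustrate the round trip, here is a minimal, untested sketch of how one parsed YAML entry feeds `DataStoreConfig.from_dict`; the field values are copied from `custom_dataset.yaml` above:

```python
from wandbot.ingestion.config import DataStoreConfig

config_dict = {
    "name": "custom_store",
    "data_source": {
        "remote_path": "https://docs.wandb.ai/",
        "repo_path": "https://github.com/wandb/docodile",
        "base_path": "docs",
        "file_pattern": "*.md",
        "is_git_repo": True,
    },
    "language": "en",
    "docstore_dir": "custom_store_en",
}

config = DataStoreConfig.from_dict(config_dict)
# Note: the _set_cache_paths validator may rewrite paths under the local
# cache directory after construction.
print(config.docstore_dir)
```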
+ """ + run = wandb.init(project=project, entity=entity, job_type="prepare_dataset") + artifact = wandb.Artifact( + result_artifact_name, + type="dataset", + description="Raw documents for custom dataset", + ) + + #TODO: Allow for an arbitrary amount of custom datasets mapped to the proper config and loader based on appropriate popped args + if dataset_type == "docodile": + CustomDataLoader = DocodileDataLoader + elif dataset_type == "code": + CustomDataLoader = CodeDataLoader + else: + raise ValueError(f"Dataset type {dataset_type} not supported") + custom_dataset_loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args)) + for loader in [ + custom_dataset_loader + ]: + loader.config.docstore_dir.mkdir(parents=True, exist_ok=True) + + with (loader.config.docstore_dir / "config.json").open("w") as f: + f.write(loader.config.model_dump_json()) + + with (loader.config.docstore_dir / "documents.jsonl").open("w") as f: + for document in loader.load(): + document_json = { + "page_content": document.page_content, + "metadata": document.metadata, + } + f.write(json.dumps(document_json) + "\n") + with (loader.config.docstore_dir / "metadata.json").open("w") as f: + json.dump(loader.metadata, f) + + artifact.add_dir( + str(loader.config.docstore_dir), + name=loader.config.docstore_dir.name, + ) + run.log_artifact(artifact) + run.finish() + return f"{entity}/{project}/{result_artifact_name}:latest" \ No newline at end of file diff --git a/src/wandbot/ingestion/typings.py b/src/wandbot/ingestion/typings.py new file mode 100644 index 0000000..37c46b3 --- /dev/null +++ b/src/wandbot/ingestion/typings.py @@ -0,0 +1,15 @@ +from typing import get_type_hints, Optional, Union, List +from wandbot.ingestion.config import DataSource, DataStoreConfig + +# Get the type hints for DataSource and DataStoreConfig +DataSourceDict = get_type_hints(DataSource) +DataStoreConfigDict = get_type_hints(DataStoreConfig) + +# Replace the types that are not JSON serializable +DataSourceDict["cache_dir"] = str +DataSourceDict["local_path"] = Optional[str] +DataSourceDict["git_id_file"] = Optional[str] +DataSourceDict["file_pattern"] = Union[str, List[str]] + +DataStoreConfigDict["data_source"] = DataSourceDict +DataStoreConfigDict["docstore_dir"] = str diff --git a/src/wandbot/ingestion/utils.py b/src/wandbot/ingestion/utils.py index 2d7399b..47430e6 100644 --- a/src/wandbot/ingestion/utils.py +++ b/src/wandbot/ingestion/utils.py @@ -9,6 +9,7 @@ - `fetch_repo_metadata`: Fetches the metadata of the git repository. - `fetch_git_repo`: Fetches the git repository. - `concatenate_cells`: Combines cells information in a readable format. +- `load_custom_dataset_configs_from_yaml`: Loads the config from a yaml file. The module also includes the following constants: - `EXTENSION_MAP`: A dictionary mapping file extensions to programming languages. @@ -36,6 +37,8 @@ import markdownify from bs4 import BeautifulSoup, Comment from git import Repo +import yaml + from wandbot.utils import get_logger @@ -51,6 +54,7 @@ def get_git_command(id_file: Path) -> str: Returns: The git command with the id file. 
""" + print(id_file) assert id_file.is_file() git_command = f"ssh -v -i /{id_file}" @@ -276,3 +280,8 @@ def clean_contents(contents: str) -> str: cleaned_document = re.sub(r"\[([^]]+)\]\([^)]+\)", r"\1", cleaned_document) return cleaned_document + +def load_custom_dataset_configs_from_yaml(file_path: str) -> list: + with open(file_path, 'r') as file: + config_list = yaml.safe_load(file) + return [next(iter(config.values())) for config in config_list] \ No newline at end of file