From 8e9161e4b0a39c9383b8f296b513b2757f3a95e2 Mon Sep 17 00:00:00 2001 From: Anish Shah Date: Wed, 22 Nov 2023 13:17:04 -0500 Subject: [PATCH] Working multidataset ingestion workflow --- src/wandbot/ingestion/__main__.py | 5 +---- src/wandbot/ingestion/custom_dataset.yaml | 2 ++ src/wandbot/ingestion/prepare_data.py | 27 +++++++++++------------ src/wandbot/ingestion/typings.py | 3 +++ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/wandbot/ingestion/__main__.py b/src/wandbot/ingestion/__main__.py index 8d5e462..2d3244a 100644 --- a/src/wandbot/ingestion/__main__.py +++ b/src/wandbot/ingestion/__main__.py @@ -15,10 +15,7 @@ def main(custom: bool, custom_dataset_config_yaml: pathlib.Path): if custom and custom_dataset_config_yaml.is_file(): configs = load_custom_dataset_configs_from_yaml(custom_dataset_config_yaml) - #TODO: Add the full list of configs as opposed to limiting to one - #TODO: Add the ability to define which dataloader to use in the config yaml itself - config = configs[0] - raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", config, "docodile") + raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", configs) else: raw_artifact = prepare_data.load(project, entity) vectorstore_artifact = vectorstores.load(project, entity, raw_artifact) diff --git a/src/wandbot/ingestion/custom_dataset.yaml b/src/wandbot/ingestion/custom_dataset.yaml index 237b7c7..6c83906 100644 --- a/src/wandbot/ingestion/custom_dataset.yaml +++ b/src/wandbot/ingestion/custom_dataset.yaml @@ -8,6 +8,7 @@ is_git_repo: true language: "en" docstore_dir: "custom_store_en" + dataloader_type: "docodile" - CustomConfig2: name: "custom_store2" data_source: @@ -18,3 +19,4 @@ is_git_repo: true language: "en" docstore_dir: "custom_store_en2" + dataloader_type: "docodile" diff --git a/src/wandbot/ingestion/prepare_data.py b/src/wandbot/ingestion/prepare_data.py index f2be746..cf2f57c 100644 --- 
a/src/wandbot/ingestion/prepare_data.py +++ b/src/wandbot/ingestion/prepare_data.py @@ -350,8 +350,8 @@ def load_custom( project: str, entity: str, result_artifact_name: str = "custom_raw_dataset", - custom_dataset_args: DataStoreConfigDict = {}, - dataset_type: str = "docodile", + #TODO: Rename + custom_datasets_args: DataStoreConfigDict = {} ): """Load and prepare data for a chatbot system. @@ -375,18 +375,17 @@ def load_custom( type="dataset", description="Raw documents for custom dataset", ) - - #TODO: Allow for an arbitrary amount of custom datasets mapped to the proper config and loader based on appropriate popped args - if dataset_type == "docodile": - CustomDataLoader = DocodileDataLoader - elif dataset_type == "code": - CustomDataLoader = CodeDataLoader - else: - raise ValueError(f"Dataset type {dataset_type} not supported") - custom_dataset_loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args)) - for loader in [ - custom_dataset_loader - ]: + + for custom_dataset_args in custom_datasets_args: + dataloader_type = custom_dataset_args.pop("dataloader_type") + if dataloader_type == "docodile": + CustomDataLoader = DocodileDataLoader + elif dataloader_type == "code": + CustomDataLoader = CodeDataLoader + else: + raise ValueError(f"Dataset type {dataloader_type} not supported") + + loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args)) loader.config.docstore_dir.mkdir(parents=True, exist_ok=True) with (loader.config.docstore_dir / "config.json").open("w") as f: diff --git a/src/wandbot/ingestion/typings.py b/src/wandbot/ingestion/typings.py index 37c46b3..d529f3f 100644 --- a/src/wandbot/ingestion/typings.py +++ b/src/wandbot/ingestion/typings.py @@ -13,3 +13,6 @@ DataStoreConfigDict["data_source"] = DataSourceDict DataStoreConfigDict["docstore_dir"] = str + +# Add additional fields to the type hints for custom fields +DataStoreConfigDict["dataloader_type"] = str \ No newline at end of file