From 8e9161e4b0a39c9383b8f296b513b2757f3a95e2 Mon Sep 17 00:00:00 2001 From: Anish Shah Date: Wed, 22 Nov 2023 13:17:04 -0500 Subject: [PATCH] Working multidataset ingestion workflow --- src/wandbot/ingestion/__main__.py | 5 +---- src/wandbot/ingestion/custom_dataset.yaml | 2 ++ src/wandbot/ingestion/prepare_data.py | 27 +++++++++++------------ src/wandbot/ingestion/typings.py | 3 +++ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/wandbot/ingestion/__main__.py b/src/wandbot/ingestion/__main__.py index 8d5e462..2d3244a 100644 --- a/src/wandbot/ingestion/__main__.py +++ b/src/wandbot/ingestion/__main__.py @@ -15,10 +15,7 @@ def main(custom: bool, custom_dataset_config_yaml: pathlib.Path): if custom and custom_dataset_config_yaml.is_file(): configs = load_custom_dataset_configs_from_yaml(custom_dataset_config_yaml) - #TODO: Add the full list of configs as opposed to limiting to one - #TODO: Add the ability to define which dataloader to use in the config yaml itself - config = configs[0] - raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", config, "docodile") + raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", configs) else: raw_artifact = prepare_data.load(project, entity) vectorstore_artifact = vectorstores.load(project, entity, raw_artifact) diff --git a/src/wandbot/ingestion/custom_dataset.yaml b/src/wandbot/ingestion/custom_dataset.yaml index 237b7c7..6c83906 100644 --- a/src/wandbot/ingestion/custom_dataset.yaml +++ b/src/wandbot/ingestion/custom_dataset.yaml @@ -8,6 +8,7 @@ is_git_repo: true language: "en" docstore_dir: "custom_store_en" + dataloader_type: "docodile" - CustomConfig2: name: "custom_store2" data_source: @@ -18,3 +19,4 @@ is_git_repo: true language: "en" docstore_dir: "custom_store_en2" + dataloader_type: "docodile" diff --git a/src/wandbot/ingestion/prepare_data.py b/src/wandbot/ingestion/prepare_data.py index f2be746..cf2f57c 100644 --- 
a/src/wandbot/ingestion/prepare_data.py +++ b/src/wandbot/ingestion/prepare_data.py @@ -350,8 +350,8 @@ def load_custom( project: str, entity: str, result_artifact_name: str = "custom_raw_dataset", - custom_dataset_args: DataStoreConfigDict = {}, - dataset_type: str = "docodile", + #TODO: Rename + custom_datasets_args: DataStoreConfigDict = {} ): """Load and prepare data for a chatbot system. @@ -375,18 +375,17 @@ def load_custom( type="dataset", description="Raw documents for custom dataset", ) - - #TODO: Allow for an arbitrary amount of custom datasets mapped to the proper config and loader based on appropriate popped args - if dataset_type == "docodile": - CustomDataLoader = DocodileDataLoader - elif dataset_type == "code": - CustomDataLoader = CodeDataLoader - else: - raise ValueError(f"Dataset type {dataset_type} not supported") - custom_dataset_loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args)) - for loader in [ - custom_dataset_loader - ]: + + for custom_dataset_args in custom_datasets_args: + dataloader_type = custom_dataset_args.pop("dataloader_type") + if dataloader_type == "docodile": + CustomDataLoader = DocodileDataLoader + elif dataloader_type == "code": + CustomDataLoader = CodeDataLoader + else: + raise ValueError(f"Dataset type {dataloader_type} not supported") + + loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args)) loader.config.docstore_dir.mkdir(parents=True, exist_ok=True) with (loader.config.docstore_dir / "config.json").open("w") as f: diff --git a/src/wandbot/ingestion/typings.py b/src/wandbot/ingestion/typings.py index 37c46b3..d529f3f 100644 --- a/src/wandbot/ingestion/typings.py +++ b/src/wandbot/ingestion/typings.py @@ -13,3 +13,6 @@ DataStoreConfigDict["data_source"] = DataSourceDict DataStoreConfigDict["docstore_dir"] = str + +# Add additional fields to the type hints for custom fields +DataStoreConfigDict["dataloader_type"] = str \ No newline at end of file