Skip to content

Commit

Permalink
Working multidataset ingestion workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
ash0ts committed Nov 22, 2023
1 parent 35bae27 commit 8e9161e
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 18 deletions.
5 changes: 1 addition & 4 deletions src/wandbot/ingestion/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@ def main(custom: bool, custom_dataset_config_yaml: pathlib.Path):

if custom and custom_dataset_config_yaml.is_file():
configs = load_custom_dataset_configs_from_yaml(custom_dataset_config_yaml)
#TODO: Add the full list of configs as opposed to limiting to one
#TODO: Add the ability to define which dataloader to use in the config yaml itself
config = configs[0]
raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", config, "docodile")
raw_artifact = prepare_data.load_custom(project, entity, "custom_raw_dataset", configs)
else:
raw_artifact = prepare_data.load(project, entity)
vectorstore_artifact = vectorstores.load(project, entity, raw_artifact)
Expand Down
2 changes: 2 additions & 0 deletions src/wandbot/ingestion/custom_dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
is_git_repo: true
language: "en"
docstore_dir: "custom_store_en"
dataloader_type: "docodile"
- CustomConfig2:
name: "custom_store2"
data_source:
Expand All @@ -18,3 +19,4 @@
is_git_repo: true
language: "en"
docstore_dir: "custom_store_en2"
dataloader_type: "docodile"
27 changes: 13 additions & 14 deletions src/wandbot/ingestion/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,8 @@ def load_custom(
project: str,
entity: str,
result_artifact_name: str = "custom_raw_dataset",
custom_dataset_args: DataStoreConfigDict = {},
dataset_type: str = "docodile",
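# TODO: Rename this parameter and update its annotation -- it is iterated below as a collection of per-dataset config dicts, not a single DataStoreConfigDict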
custom_datasets_args: DataStoreConfigDict = {}
):
"""Load and prepare data for a chatbot system.
Expand All @@ -375,18 +375,17 @@ def load_custom(
type="dataset",
description="Raw documents for custom dataset",
)

# TODO: Allow for an arbitrary number of custom datasets, each mapped to the proper config and loader based on the appropriate popped args
if dataset_type == "docodile":
CustomDataLoader = DocodileDataLoader
elif dataset_type == "code":
CustomDataLoader = CodeDataLoader
else:
raise ValueError(f"Dataset type {dataset_type} not supported")
custom_dataset_loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args))
for loader in [
custom_dataset_loader
]:

for custom_dataset_args in custom_datasets_args:
dataloader_type = custom_dataset_args.pop("dataloader_type")
if dataloader_type == "docodile":
CustomDataLoader = DocodileDataLoader
elif dataloader_type == "code":
CustomDataLoader = CodeDataLoader
else:
raise ValueError(f"Dataset type {dataloader_type} not supported")

loader = CustomDataLoader(DataStoreConfig.from_dict(custom_dataset_args))
loader.config.docstore_dir.mkdir(parents=True, exist_ok=True)

with (loader.config.docstore_dir / "config.json").open("w") as f:
Expand Down
3 changes: 3 additions & 0 deletions src/wandbot/ingestion/typings.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@

DataStoreConfigDict["data_source"] = DataSourceDict
DataStoreConfigDict["docstore_dir"] = str

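# Register type hints for the additional custom config fields.
# NOTE(review): load_custom pops "dataloader_type" from the per-dataset config dict itself -- confirm it belongs on DataSourceDict rather than DataStoreConfigDict.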
DataSourceDict["dataloader_type"] = str

0 comments on commit 8e9161e

Please sign in to comment.