From 1aa28ad7f88f1ae268d49ec477a8ee11a0239354 Mon Sep 17 00:00:00 2001
From: AstrakhantsevaAA
Date: Thu, 3 Aug 2023 16:21:07 +0200
Subject: [PATCH] black

---
 .../unstructured_data/google_drive/helpers.py |  4 +--
 sources/unstructured_data/inbox/__init__.py   |  7 ++--
 sources/unstructured_data/settings.py         |  2 +-
 sources/unstructured_data_pipeline.py         |  8 +++--
 .../test_unstructured_data_source.py          | 33 +++++++++++++------
 5 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/sources/unstructured_data/google_drive/helpers.py b/sources/unstructured_data/google_drive/helpers.py
index cb103c9cb..1ba8b8907 100644
--- a/sources/unstructured_data/google_drive/helpers.py
+++ b/sources/unstructured_data/google_drive/helpers.py
@@ -1,8 +1,8 @@
 import io
 from typing import Any
 
-from googleapiclient.errors import HttpError # type: ignore
-from googleapiclient.http import MediaIoBaseDownload # type: ignore
+from googleapiclient.errors import HttpError  # type: ignore
+from googleapiclient.http import MediaIoBaseDownload  # type: ignore
 
 
 def download_file_from_google_drive(service: Any, file_id: str, file_path: str) -> None:
diff --git a/sources/unstructured_data/inbox/__init__.py b/sources/unstructured_data/inbox/__init__.py
index 74e0246bc..62a553f33 100644
--- a/sources/unstructured_data/inbox/__init__.py
+++ b/sources/unstructured_data/inbox/__init__.py
@@ -47,7 +47,10 @@ def inbox_source(
         start_date=start_date,
     )
     if attachments:
-        return uids | get_attachments_by_uid(storage_folder_path=storage_folder_path, filter_by_mime_type=filter_by_mime_type)
+        return uids | get_attachments_by_uid(
+            storage_folder_path=storage_folder_path,
+            filter_by_mime_type=filter_by_mime_type,
+        )
     else:
         return uids | read_messages
 
@@ -163,7 +166,7 @@ def get_attachments_by_uid(
     email_account: str = dlt.secrets.value,
     password: str = dlt.secrets.value,
     include_body: bool = False,
-    filter_by_mime_type: Sequence[str] = ()
+    filter_by_mime_type: Sequence[str] = (),
 ) -> TDataItem:
     """Downloads attachments from email messages based on the provided message UIDs.
 
diff --git a/sources/unstructured_data/settings.py b/sources/unstructured_data/settings.py
index 1217016d0..e30681d1c 100644
--- a/sources/unstructured_data/settings.py
+++ b/sources/unstructured_data/settings.py
@@ -5,4 +5,4 @@
     "invoice_number": "What is the invoice number? Just return the number. If you don't know, then return None",
     "service_description": "What is the description of the service that this invoice is for? Just return the description. If you don't know, then return None",
     "phone_number": "What is the company phone number? Just return the phone number. If you don't know, then return None",
-}
\ No newline at end of file
+}
diff --git a/sources/unstructured_data_pipeline.py b/sources/unstructured_data_pipeline.py
index 2d5ba5fcf..d7aa71af4 100644
--- a/sources/unstructured_data_pipeline.py
+++ b/sources/unstructured_data_pipeline.py
@@ -40,7 +40,9 @@ def from_google_drive_to_structured() -> None:
         full_refresh=True,
     )
 
-    data_source = google_drive_source(download=True, filter_by_mime_type=("application/pdf", ))
+    data_source = google_drive_source(
+        download=True, filter_by_mime_type=("application/pdf",)
+    )
     data_resource = data_source.resources["attachments"]
 
     # run the pipeline with your parameters
@@ -65,7 +67,9 @@ def from_inbox_to_structured() -> None:
         full_refresh=True,
     )
 
-    data_source = inbox_source(attachments=True, filter_by_mime_type=("application/pdf",))
+    data_source = inbox_source(
+        attachments=True, filter_by_mime_type=("application/pdf",)
+    )
     data_resource = data_source.resources["attachments"]
     # run the pipeline with your parameters
     load_info = pipeline.run(
diff --git a/tests/unstructured_data/test_unstructured_data_source.py b/tests/unstructured_data/test_unstructured_data_source.py
index ebf93c00e..9fec0a447 100644
--- a/tests/unstructured_data/test_unstructured_data_source.py
+++ b/tests/unstructured_data/test_unstructured_data_source.py
@@ -5,9 +5,9 @@
 from dlt.extract.source import DltResource
 
 from sources.unstructured_data import unstructured_to_structured_resource
-from sources.unstructured_data.local_folder import local_folder_resource
 from sources.unstructured_data.google_drive import google_drive_source
 from sources.unstructured_data.inbox import inbox_source
+from sources.unstructured_data.local_folder import local_folder_resource
 from tests.utils import ALL_DESTINATIONS, assert_load_info
 
 
@@ -38,7 +38,9 @@ class TestUnstructuredFromLocalFolder:
     @pytest.fixture
     def data_resource(self, data_dir: str) -> DltResource:
         resource = local_folder_resource(data_dir=data_dir)
-        filtered_data_resource = resource.add_filter(lambda item: item["content_type"] == "application/pdf")
+        filtered_data_resource = resource.add_filter(
+            lambda item: item["content_type"] == "application/pdf"
+        )
         return filtered_data_resource
 
     @pytest.mark.parametrize("run_async", (False, True))
@@ -75,7 +77,9 @@ def test_content(
         queries: dict,
         data_resource: DltResource,
     ) -> None:
-        filtered_data_resource = data_resource.add_filter(lambda item: item["content_type"] == "application/pdf")
+        filtered_data_resource = data_resource.add_filter(
+            lambda item: item["content_type"] == "application/pdf"
+        )
         pipeline, _ = run_pipeline(destination_name, queries, filtered_data_resource)
         with pipeline.sql_client() as c:
             # you can use parametrized queries as well, see python dbapi
@@ -90,7 +94,12 @@
 @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
 class TestUnstructuredFromGoogleDrive:
     @pytest.fixture(scope="session")
-    def data_resource(self, tmpdir_factory, gd_folders: Sequence[str], filter_by_mime_type: Sequence[str] = ()) -> DltResource:
+    def data_resource(
+        self,
+        tmpdir_factory,
+        gd_folders: Sequence[str],
+        filter_by_mime_type: Sequence[str] = (),
+    ) -> DltResource:
         tmp_path = tmpdir_factory.mktemp("temp_data")
         source = google_drive_source(
             download=True,
@@ -99,7 +108,9 @@ def data_resource(self, tmpdir_factory, gd_folders: Sequence[str], filter_by_mim
             filter_by_mime_type=filter_by_mime_type,
         )
         resource = source.resources["attachments"]
-        filtered_data_resource = resource.add_filter(lambda item: item["content_type"] == "application/pdf")
"application/pdf") + filtered_data_resource = resource.add_filter( + lambda item: item["content_type"] == "application/pdf" + ) return filtered_data_resource @pytest.mark.parametrize("run_async", (False, True)) @@ -157,7 +168,9 @@ def data_resource(self, tmpdir_factory) -> DltResource: storage_folder_path=tmp_path, ) resource = source.resources["attachments"] - filtered_data_resource = resource.add_filter(lambda item: item["content_type"] == "application/pdf") + filtered_data_resource = resource.add_filter( + lambda item: item["content_type"] == "application/pdf" + ) return filtered_data_resource @pytest.mark.parametrize("run_async", (False, True)) @@ -202,7 +215,9 @@ def test_content( "SELECT file_path FROM unstructured_from_attachments" ) as cur: rows = list(cur.fetchall()) - assert len(rows) == 4 # 4 files were processed, other content types were skipped except pdf + assert ( + len(rows) == 4 + ) # 4 files were processed, other content types were skipped except pdf def test_incremental_loading( self, @@ -210,9 +225,7 @@ def test_incremental_loading( queries: dict, data_resource: DltResource, ) -> None: - pipeline, load_info = run_pipeline( - destination_name, queries, data_resource - ) + pipeline, load_info = run_pipeline(destination_name, queries, data_resource) # make sure all data were loaded assert_load_info(load_info) print(load_info)