From 45c6c61c75de67124870933e855b45859188dbfa Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 14:23:43 +0200 Subject: [PATCH 01/41] Add an example for post paginators --- .../docs/general-usage/http/rest-client.md | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 556dbfcac6..efeeb9e4f0 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -311,7 +311,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee - `update_request(request: Request) -> None`: Before making the next API call in `RESTClient.paginate` method, `update_request` is used to modify the request with the necessary parameters to fetch the next page (based on the current state of the paginator). For example, you can add query parameters to the request, or modify the URL. -#### Example: creating a query parameter paginator +#### Example 1: creating a query parameter paginator Suppose an API uses query parameters for pagination, incrementing an page parameter for each subsequent page, without providing direct links to next pages in its responses. E.g. `https://api.example.com/posts?page=1`, `https://api.example.com/posts?page=2`, etc. Here's how you could implement a paginator for this scheme: @@ -354,6 +354,38 @@ def get_data(): yield page ``` +:::tip +[`PageNumberPaginator`](#pagenumberpaginator) that ships with dlt does the same thing, but with more flexibility and error handling. This example is meant to demonstrate how to implement a custom paginator. For most use cases, you should use the [built-in paginators](#paginators). +::: + +#### Example 2: creating a paginator for POST requests + +Some APIs use POST requests for pagination, where the next page is fetched by sending a POST request with a cursor or other parameters in the request body. This is frequently used in "search" API endpoints or other endpoints with big payloads. Here's how you could implement a paginator for a case like this: + +```py +from dlt.sources.helpers.rest_client.paginators import BasePaginator +from dlt.sources.helpers.requests import Response, Request + +class PostBodyPaginator(BasePaginator): + def __init__(self): + super().__init__() + self.cursor = None + + def update_state(self, response: Response) -> None: + # Assuming the API returns an empty list when no more data is available + if not response.json(): + self._has_next_page = False + else: + self.cursor = response.json().get("cursor") + + def update_request(self, request: Request) -> None: + if request.json is None: + request.json = {} + + # Add the cursor to the request body + request.json["cursor"] = self.cursor +``` + ## Authentication The RESTClient supports various authentication strategies, such as bearer tokens, API keys, and HTTP basic auth, configured through the `auth` parameter of both the `RESTClient` and the `paginate()` method. 
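As a follow-up to the tip above, here is a minimal sketch of how the same query-parameter pagination could be handled with the built-in `PageNumberPaginator` instead of a custom class. The constructor arguments shown (`page_param`, `total_path`) are assumptions based on the linked paginator documentation and may differ between dlt versions:

```py
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

client = RESTClient(
    base_url="https://api.example.com",
    # Assumes the API reports the total page count under "total" in the response body;
    # adjust `total_path` (or use `maximum_page`) to match your API.
    paginator=PageNumberPaginator(page_param="page", total_path="total"),
)

@dlt.resource
def get_data():
    for page in client.paginate("/posts"):
        yield page
```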
From be03338a74f9ca1a238eed82c0034903b6b22333 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 14:26:26 +0200 Subject: [PATCH 02/41] update the example cursor key --- docs/website/docs/general-usage/http/rest-client.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index efeeb9e4f0..d3f4f379f1 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -376,7 +376,7 @@ class PostBodyPaginator(BasePaginator): if not response.json(): self._has_next_page = False else: - self.cursor = response.json().get("cursor") + self.cursor = response.json().get("next_page_cursor") def update_request(self, request: Request) -> None: if request.json is None: From 7f3f0f0808363fc0df3651555a0bb1cdd3c994b9 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 14:43:31 +0200 Subject: [PATCH 03/41] Update docs/website/docs/general-usage/http/rest-client.md Co-authored-by: VioletM --- docs/website/docs/general-usage/http/rest-client.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index d3f4f379f1..fea8c189de 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -384,6 +384,16 @@ class PostBodyPaginator(BasePaginator): # Add the cursor to the request body request.json["cursor"] = self.cursor + +client = RESTClient( + base_url="https://api.example.com", + paginator= PostBodyPaginator() +) + +@dlt.resource +def get_data(): + for page in client.paginate("/data"): + yield page ``` ## Authentication From c8f413d73d6d360069481028e012afe9ef424a00 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 14:46:01 +0200 Subject: [PATCH 04/41] Update docs/website/docs/general-usage/http/rest-client.md Co-authored-by: VioletM --- docs/website/docs/general-usage/http/rest-client.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index fea8c189de..c32a864759 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -364,6 +364,7 @@ Some APIs use POST requests for pagination, where the next page is fetched by se ```py from dlt.sources.helpers.rest_client.paginators import BasePaginator +from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.requests import Response, Request class PostBodyPaginator(BasePaginator): From 4c7fee7736ec703e4b8287ef27e8fb4d06124a70 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 14:48:36 +0200 Subject: [PATCH 05/41] Update docs/website/docs/general-usage/http/rest-client.md --- docs/website/docs/general-usage/http/rest-client.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index c32a864759..ca39046d35 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -388,7 +388,7 @@ class PostBodyPaginator(BasePaginator): client = RESTClient( base_url="https://api.example.com", - paginator= PostBodyPaginator() + paginator=PostBodyPaginator() ) @dlt.resource From 
c825759e07dc44e7e704c296a759f6438fd4b713 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 17:05:00 +0200 Subject: [PATCH 06/41] Add rest_api verified source documentation (#1308) * Add rest_api source docs * Expand rest_api documentation * Update snippets * Update string aliases * Link dlt source * Reordered code in the example and added a new section * Mention auto detection * Reorder the sentence about paginator types and instances * Elaborate on dependent resources; link the transformer docs * Link incremental loading * Update the example to use rest_api_resources * Rename github_config --- .../verified-sources/rest_api.md | 578 ++++++++++++++++++ docs/website/sidebars.js | 1 + 2 files changed, 579 insertions(+) create mode 100644 docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md new file mode 100644 index 0000000000..1f79055d06 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -0,0 +1,578 @@ +--- +title: REST API generic source +description: dlt verified source for REST APIs +keywords: [rest api, restful api] +--- +import Header from './_source-info-header.md'; + +
+ +This is a generic dlt source you can use to extract data from any REST API. It uses [declarative configuration](#source-configuration) to define the API endpoints, their [relationships](#define-resource-relationships), how to handle [pagination](#pagination), and [authentication](#authentication). + +## Setup guide + +### Initialize the verified source + +Enter the following command in your terminal: + +```sh +dlt init rest_api duckdb +``` + +[dlt init](../../reference/command-line-interface) will initialize the pipeline examples for REST API as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). + +Running `dlt init` creates the following in the current folder: +- `rest_api_pipeline.py` file with a sample pipelines definition: + - GitHub API example + - Pokemon API example +- `.dlt` folder with: + - `secrets.toml` file to store your access tokens and other sensitive information + - `config.toml` file to store the configuration settings +- `requirements.txt` file with the required dependencies + +Change the REST API source to your needs by modifying the `rest_api_pipeline.py` file. See the detailed [source configuration](#source-configuration) section below. + +:::note +For the rest of the guide, we will use the [GitHub API](https://docs.github.com/en/rest?apiVersion=2022-11-28) and [Pokemon API](https://pokeapi.co/) as example sources. +::: + +This source is based on the [RESTClient class](../../general-usage/http/rest-client.md). + +### Add credentials + +In the `.dlt` folder, you'll find a file called `secrets.toml`, where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. + +The GitHub API [requires an access token](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28) to access some of its endpoints and to increase the rate limit for the API calls. To get a GitHub token, follow the GitHub documentation on [managing your personal access tokens](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens). + +After you get the token, add it to the `secrets.toml` file: + +```toml +[sources.rest_api.github] +github_token = "your_github_token" +``` + +## Run the pipeline + +1. Install the required dependencies by running the following command: + + ```sh + pip install -r requirements.txt + ``` + +2. Run the pipeline: + + ```sh + python rest_api_pipeline.py + ``` + +3. 
Verify that everything loaded correctly by using the following command: + + ```sh + dlt pipeline rest_api show + ``` + +## Source configuration + +### Quick example + +Let's take a look at the GitHub example in `rest_api_pipeline.py` file: + +```py +from rest_api import RESTAPIConfig, rest_api_resources + +@dlt.source +def github_source(github_token=dlt.secrets.value): + config: RESTAPIConfig = { + "client": { + "base_url": "https://api.github.com/repos/dlt-hub/dlt/", + "auth": { + "token": github_token, + }, + }, + "resource_defaults": { + "primary_key": "id", + "write_disposition": "merge", + "endpoint": { + "params": { + "per_page": 100, + }, + }, + }, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "issues", + "params": { + "sort": "updated", + "direction": "desc", + "state": "open", + "since": { + "type": "incremental", + "cursor_path": "updated_at", + "initial_value": "2024-01-25T11:21:28Z", + }, + }, + }, + }, + { + "name": "issue_comments", + "endpoint": { + "path": "issues/{issue_number}/comments", + "params": { + "issue_number": { + "type": "resolve", + "resource": "issues", + "field": "number", + } + }, + }, + "include_from_parent": ["id"], + }, + ], + } + + yield from rest_api_resources(config) + +def load_github() -> None: + pipeline = dlt.pipeline( + pipeline_name="rest_api_github", + destination="duckdb", + dataset_name="rest_api_data", + ) + + load_info = pipeline.run(github_source()) + print(load_info) +``` + +The declarative resource configuration is defined in the `config` dictionary. It contains the following key components: + +1. `client`: Defines the base URL and authentication method for the API. In this case it uses token-based authentication. The token is stored in the `secrets.toml` file. + +2. `resource_defaults`: Contains default settings for all [resources](#resource-configuration). In this example, we define that all resources: + - Have `id` as the [primary key](../../general-usage/resource#define-schema) + - Use the `merge` [write disposition](../../general-usage/incremental-loading#choosing-a-write-disposition) to merge the data with the existing data in the destination. + - Send a `per_page` query parameter with each request to 100 to get more results per page. + +3. `resources`: A list of [resources](#resource-configuration) to be loaded. Here, we have two resources: `issues` and `issue_comments`, which correspond to the GitHub API endpoints for [repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and [issue comments](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments). Note that we need a in issue number to fetch comments for each issue. This number is taken from the `issues` resource. More on this in the [resource relationships](#define-resource-relationships) section. + +Let's break down the configuration in more detail. + +### Configuration structure + +:::tip +Import the `RESTAPIConfig` type from the `rest_api` module to have convenient hints in your editor/IDE and use it to define the configuration object. + +```py +from rest_api import RESTAPIConfig +``` +::: + + +The configuration object passed to the REST API Generic Source has three main elements: + +```py +config: RESTAPIConfig = { + "client": { + ... + }, + "resource_defaults": { + ... + }, + "resources": [ + ... + ], +} +``` + +#### `client` + +`client` contains the configuration to connect to the API's endpoints. 
It includes the following fields:

- `base_url` (str): The base URL of the API. This string is prepended to all endpoint paths. For example, if the base URL is `https://api.example.com/v1/`, and the endpoint path is `users`, the full URL will be `https://api.example.com/v1/users`.
- `headers` (dict, optional): Additional headers to be sent with each request.
- `auth` (optional): Authentication configuration. It can be a simple token, an `AuthConfigBase` object, or a more complex authentication method.
- `paginator` (optional): Configuration for the default pagination to be used for resources that support pagination. See the [pagination](#pagination) section for more details.

#### `resource_defaults` (optional)

`resource_defaults` contains the default values to [configure the dlt resources](#resource-configuration). This configuration is applied to all resources unless overridden by the resource-specific configuration.

For example, you can set the primary key, write disposition, and other default settings here:

```py
config = {
    "client": {
        ...
    },
    "resource_defaults": {
        "primary_key": "id",
        "write_disposition": "merge",
        "endpoint": {
            "params": {
                "per_page": 100,
            },
        },
    },
    "resources": [
        "resource1",
        {
            "name": "resource2",
            "write_disposition": "append",
            "endpoint": {
                "params": {
                    "param1": "value1",
                },
            },
        },
    ],
}
```

Above, all resources will have `primary_key` set to `id`, `resource1` will use the default `merge` write disposition, and `resource2` will override it with `append`.
Both `resource1` and `resource2` will have the `per_page` parameter set to 100.

#### `resources`

This is a list of resource configurations that define the API endpoints to be loaded. Each resource configuration can be:
- a dictionary with the [resource configuration](#resource-configuration).
- a string. In this case, the string is used both as the endpoint path and the resource name, and the resource configuration is taken from the `resource_defaults` configuration if it exists.

### Resource configuration

A resource configuration is used to define a [dlt resource](../../general-usage/resource.md) for the data to be loaded from an API endpoint. It contains the following key fields:

- `endpoint`: The endpoint configuration for the resource. It can be a string or a dict representing the endpoint settings. See the [endpoint configuration](#endpoint-configuration) section for more details.
- `write_disposition`: The write disposition for the resource.
- `primary_key`: The primary key for the resource.
- `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details.
- `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource.

You can also pass additional resource parameters that will be used to configure the dlt resource. See [dlt resource API reference](../../api_reference/extract/decorators.md#resource) for more details.

### Endpoint configuration

The endpoint configuration defines how to query the API endpoint.
Quick example:

```py
{
    "path": "issues",
    "method": "GET",
    "params": {
        "sort": "updated",
        "direction": "desc",
        "state": "open",
        "since": {
            "type": "incremental",
            "cursor_path": "updated_at",
            "initial_value": "2024-01-25T11:21:28Z",
        },
    },
    "data_selector": "results",
}
```

The fields in the endpoint configuration are:

- `path`: The path to the API endpoint.
- `method`: The HTTP method to be used. Default is `GET`.
- `params`: Query parameters to be sent with each request. For example, `sort` to order the results or `since` to specify [incremental loading](#incremental-loading). This is also used to define [resource relationships](#define-resource-relationships).
- `json`: The JSON payload to be sent with the request (for POST and PUT requests).
- `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details.
- `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details.
- `response_actions`: A list of actions that define how to process the response data.
- `incremental`: Configuration for [incremental loading](#incremental-loading).

### Pagination

The REST API source will try to automatically handle pagination for you. This works by detecting the pagination details from the first API response.

In some special cases, you may need to specify the pagination configuration explicitly.

:::note
Currently, pagination is supported only for GET requests. To handle POST requests with pagination, you need to implement a [custom paginator](../../general-usage/http/rest-client.md#custom-paginator).
:::

These are the available paginators:

| Paginator class | String Alias (`type`) | Description |
| -------------- | ------------ | ----------- |
| [JSONResponsePaginator](../../general-usage/http/rest-client.md#jsonresponsepaginator) | `json_response` | The links to the next page are in the body (JSON) of the response. |
| [HeaderLinkPaginator](../../general-usage/http/rest-client.md#headerlinkpaginator) | `header_link` | The links to the next page are in the response headers. |
| [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | `offset` | The pagination is based on an offset parameter, with the total items count either in the response body or explicitly provided. |
| [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | `page_number` | The pagination is based on a page number parameter, with the total pages count either in the response body or explicitly provided. |
| [JSONResponseCursorPaginator](../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | `cursor` | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON). |
| SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata. |
| `None` | `auto` | Explicitly specify that the source should automatically detect the pagination method. |

To specify the pagination configuration, use the `paginator` field in the [client](#client) or [endpoint](#endpoint-configuration) configurations. You may either use a dictionary with a string alias in the `type` field along with the required parameters, or use the paginator instance directly:

```py
{
    ...
    "paginator": {
        "type": "json_response",
        "next_url_path": "paging.next",
    }
}
```

Or using the paginator instance:

```py
{
    ...
+ "paginator": JSONResponsePaginator( + next_url_path="paging.next" + ), +} +``` + +This is useful when you're [implementing and using a custom paginator](../../general-usage/http/rest-client.md#custom-paginator). + +### Data selection + +The `data_selector` field in the endpoint configuration allows you to specify a JSONPath to select the data from the response. By default, the source will try to detect locations of the data automatically. + +Use this field when you need to specify the location of the data in the response explicitly. + +For example, if the API response looks like this: + +```json +{ + "posts": [ + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"} + ] +} +``` + +You can use the following endpoint configuration: + +```py +{ + "path": "posts", + "data_selector": "posts", +} +``` + +For a nested structure like this: + +```json +{ + "results": { + "posts": [ + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"} + ] + } +} +``` + +You can use the following endpoint configuration: + +```py +{ + "path": "posts", + "data_selector": "results.posts", +} +``` + +Read more about [JSONPath syntax](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) to learn how to write selectors. + + +### Authentication + +Many APIs require authentication to access their endpoints. The REST API source supports various authentication methods, such as token-based, query parameters, basic auth, etc. + +#### Quick example + +One of the most common method is token-based authentication. To authenticate with a token, you can use the `token` field in the `auth` configuration: + +```py +{ + "client": { + ... + "auth": { + "token": dlt.secrets["your_api_token"], + }, + ... + }, +} +``` + +:::warning +Make sure to store your access tokens and other sensitive information in the `secrets.toml` file and never commit it to the version control system. +::: + +Available authentication types: + +| Authentication class | String Alias (`type`) | Description | +| ------------------- | ----------- | ----------- | +| [BearTokenAuth](../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | +| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `api_key` | Basic HTTP authentication. | +| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `http_basic` | API key authentication with key defined in the query parameters or in the headers. | + +To specify the authentication configuration, use the `auth` field in the [client](#client) configuration: + +```py +{ + "client": { + "auth": { + "type": "bearer", + "token": dlt.secrets["your_api_token"], + }, + ... + }, +} +``` + +Alternatively, you can use the authentication class directly: + +```py +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + +config = { + "client": { + "auth": BearTokenAuth(dlt.secrets["your_api_token"]), + }, + ... +} +``` + +### Define resource relationships + +When you have a resource that depends on another resource, you can define the relationship using the `resolve` configuration. With it you link a path parameter in the child resource to a field in the parent resource's data. + +In the GitHub example, the `issue_comments` resource depends on the `issues` resource. 
The `issue_number` parameter in the `issue_comments` endpoint configuration is resolved from the `number` field of the `issues` resource:

```py
{
    "resources": [
        {
            "name": "issues",
            "endpoint": {
                "path": "issues",
                ...
            },
        },
        {
            "name": "issue_comments",
            "endpoint": {
                "path": "issues/{issue_number}/comments",
                "params": {
                    "issue_number": {
                        "type": "resolve",
                        "resource": "issues",
                        "field": "number",
                    }
                },
            },
            "include_from_parent": ["id"],
        },
    ],
}
```

This configuration tells the source to get issue numbers from the `issues` resource and use them to fetch comments for each issue. So if the `issues` resource yields the following data:

```json
[
    {"id": 1, "number": 123},
    {"id": 2, "number": 124},
    {"id": 3, "number": 125}
]
```

The `issue_comments` resource will make requests to the following endpoints:

- `issues/123/comments`
- `issues/124/comments`
- `issues/125/comments`

The syntax for the `resolve` field in the parameter configuration is:

```py
"<parameter_name>": {
    "type": "resolve",
    "resource": "<parent_resource_name>",
    "field": "<parent_resource_field_name>",
}
```

Under the hood, dlt handles this by using a [transformer resource](../../general-usage/resource.md#process-resources-with-dlttransformer).

#### Include fields from the parent resource

You can include data from the parent resource in the child resource by using the `include_from_parent` field in the resource configuration. For example:

```py
{
    "name": "issue_comments",
    "endpoint": {
        ...
    },
    "include_from_parent": ["id", "title", "created_at"],
}
```

This will include the `id`, `title`, and `created_at` fields from the `issues` resource in the `issue_comments` resource data. The names of the included fields will be prefixed with the parent resource name and an underscore (`_`) like so: `_issues_id`, `_issues_title`, `_issues_created_at`.

## Incremental loading

Some APIs provide a way to fetch only new or changed data (most often by using a timestamp field like `updated_at`, `created_at`, or incremental IDs).
This is called [incremental loading](../../general-usage/incremental-loading.md) and is very useful as it allows you to reduce the load time and the amount of data transferred.

When the API endpoint supports incremental loading, you can configure the source to load only the new or changed data using these two methods:

1. Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration):

    ```py
    "<parameter_name>": {
        "type": "incremental",
        "cursor_path": "<path_to_cursor_field>",
        "initial_value": "<initial_value>",
    },
    ```

    For example, in the `issues` resource configuration in the GitHub example, we have:

    ```py
    "since": {
        "type": "incremental",
        "cursor_path": "updated_at",
        "initial_value": "2024-01-25T11:21:28Z",
    },
    ```

    This configuration tells the source to create an incremental object that will keep track of the `updated_at` field in the response and use it as a value for the `since` parameter in subsequent requests.

2. Specifying the `incremental` field in the [endpoint configuration](#endpoint-configuration):

    ```py
    "incremental": {
        "start_param": "<start_parameter_name>",
        "end_param": "<end_parameter_name>",
        "cursor_path": "<path_to_cursor_field>",
        "initial_value": "<initial_value>",
        "end_value": "<end_value>",
    },
    ```

    This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading; a combined example is sketched below.

See the [incremental loading](../../general-usage/incremental-loading.md#incremental-loading-with-a-cursor-field) guide for more details.
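To make the second method more concrete, here is a minimal sketch of an endpoint configuration that uses the `incremental` field. The endpoint path and the `since`/`until` parameter names are hypothetical and only illustrate how the fields described above fit together:

```py
{
    "name": "posts",
    "endpoint": {
        "path": "posts",  # hypothetical endpoint
        "incremental": {
            # "since"/"until" are assumed query parameters of the example API
            "start_param": "since",
            "end_param": "until",
            "cursor_path": "updated_at",
            "initial_value": "2024-01-25T00:00:00Z",
            "end_value": "2024-05-14T00:00:00Z",
        },
    },
}
```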
+ +## Advanced configuration + +`rest_api_source()` function creates the [dlt source](../../general-usage/source.md) and lets you configure the following parameters: + +- `config`: The REST API configuration dictionary. +- `name`: An optional name for the source. +- `section`: An optional section name in the configuration file. +- `max_table_nesting`: Sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. +- `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. +- `schema_contract`: Schema contract settings that will be applied to this resource. +- `spec`: A specification of configuration and secret values required by the source. diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 728c3b6593..a3fe12c8fb 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -83,6 +83,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/notion', 'dlt-ecosystem/verified-sources/personio', 'dlt-ecosystem/verified-sources/pipedrive', + 'dlt-ecosystem/verified-sources/rest_api', 'dlt-ecosystem/verified-sources/salesforce', 'dlt-ecosystem/verified-sources/scrapy', 'dlt-ecosystem/verified-sources/shopify', From 511df6ee6b97bcff8e605b1bac0098814faf2dfe Mon Sep 17 00:00:00 2001 From: Maxime Lemaitre Date: Thu, 16 May 2024 10:46:59 +0200 Subject: [PATCH 07/41] Fix typo in Slack Docs (#1369) --- docs/website/docs/dlt-ecosystem/verified-sources/slack.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 970a891e60..38eda15c94 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -70,7 +70,7 @@ To get started with your data pipeline, follow these steps: [This command](../../reference/command-line-interface) will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/slack_pipeline.py) - with Google Sheets as the [source](../../general-usage/source) and + with Slack as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). 1. 
If you'd like to use a different destination, simply replace `duckdb` with the name of your From 80e78204d6a81e3c02cdd481a5caa50e30c88bf6 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 16 May 2024 10:52:25 +0200 Subject: [PATCH 08/41] Add the troubleshooting section (#1367) --- .../docs/general-usage/http/rest-client.md | 70 ++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index ca39046d35..481670ae4b 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -385,7 +385,7 @@ class PostBodyPaginator(BasePaginator): # Add the cursor to the request body request.json["cursor"] = self.cursor - + client = RESTClient( base_url="https://api.example.com", paginator=PostBodyPaginator() @@ -527,4 +527,70 @@ from dlt.sources.helpers.rest_client import paginate for page in paginate("https://api.example.com/posts"): print(page) -``` \ No newline at end of file +``` + +## Troubleshooting + +### `RESTClient.get()` and `RESTClient.post()` methods + +These methods work similarly to the [get()](https://docs.python-requests.org/en/latest/api/#requests.get) and [post()](https://docs.python-requests.org/en/latest/api/#requests.post) functions +from the Requests library. They return a [Response](https://docs.python-requests.org/en/latest/api/#requests.Response) object that contains the response data. +You can inspect the `Response` object to get the `response.status_code`, `response.headers`, and `response.content`. For example: + +```py +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + +client = RESTClient(base_url="https://api.example.com") +response = client.get("/posts", auth=BearerTokenAuth(token="your_access_token")) + +print(response.status_code) +print(response.headers) +print(response.content) +``` + +### `RESTClient.paginate()` + +Debugging `paginate()` is trickier because it's a generator function that yields [`PageData`](#pagedata) objects. Here's several ways to debug the `paginate()` method: + +1. Enable [logging](../../running-in-production/running.md#set-the-log-level-and-format) to see detailed information about the HTTP requests: + +```bash +RUNTIME__LOG_LEVEL=INFO python my_script.py +``` + +2. Use the [`PageData`](#pagedata) instance to inspect the [request](https://docs.python-requests.org/en/latest/api/#requests.Request) +and [response](https://docs.python-requests.org/en/latest/api/#requests.Response) objects: + +```py +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator + +client = RESTClient( + base_url="https://api.example.com", + paginator=JSONResponsePaginator(next_url_path="pagination.next") +) + +for page in client.paginate("/posts"): + print(page.request) + print(page.response) +``` + +3. 
Use the `hooks` parameter to add custom response handlers to the `paginate()` method: + +```py +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + +def response_hook(response, **kwargs): + print(response.status_code) + print(f"Content: {response.content}") + print(f"Request: {response.request.body}") + # Or import pdb; pdb.set_trace() to debug + +for page in client.paginate( + "/posts", + auth=BearerTokenAuth(token="your_access_token") + hooks={"response": [response_hook]} +): + print(page) +``` From 314e7a026619c5fc793ca7408581baa2d71d7e13 Mon Sep 17 00:00:00 2001 From: Sultan Iman <354868+sultaniman@users.noreply.github.com> Date: Thu, 16 May 2024 14:53:53 +0200 Subject: [PATCH 09/41] Replace weather api example with github in create a pipeline walkthrough (#1351) Co-authored-by: AstrakhantsevaAA Co-authored-by: Anton Burnashev --- .../docs/walkthroughs/create-a-pipeline.md | 142 +++++++++++------- 1 file changed, 85 insertions(+), 57 deletions(-) diff --git a/docs/website/docs/walkthroughs/create-a-pipeline.md b/docs/website/docs/walkthroughs/create-a-pipeline.md index 1d5974efbe..bba78dc6cb 100644 --- a/docs/website/docs/walkthroughs/create-a-pipeline.md +++ b/docs/website/docs/walkthroughs/create-a-pipeline.md @@ -1,31 +1,46 @@ --- title: Create a pipeline description: How to create a pipeline -keywords: [how to, create a pipeline] +keywords: [how to, create a pipeline, rest client] --- # Create a pipeline -Follow the steps below to create a [pipeline](../general-usage/glossary.md#pipeline) from the -WeatherAPI.com API to DuckDB from scratch. The same steps can be repeated for any source and -destination of your choiceβ€”use `dlt init ` and then build the pipeline for -that API instead. +This guide walks you through creating a pipeline that uses our [REST API Client](../general-usage/http/rest-client) +to connect to [DuckDB](../dlt-ecosystem/destinations/duckdb). +:::tip +We're using DuckDB as a destination here, but you can adapt the steps to any [source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) and [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/) by +using the [command](../reference/command-line-interface#dlt-init) `dlt init ` and tweaking the pipeline accordingly. +::: -Please make sure you have [installed `dlt`](../reference/installation.md) before following the +Please make sure you have [installed `dlt`](../reference/installation) before following the steps below. +## Task overview + +Imagine you want to analyze issues from a GitHub project locally. +To achieve this, you need to write code that accomplishes the following: + +1. Constructs a correct request. +2. Authenticates your request. +3. Fetches and handles paginated issue data. +4. Stores the data for analysis. + +This may sound complicated, but dlt provides a [REST API Client](../general-usage/http/rest-client) that allows you to focus more on your data rather than on managing API interactions. + + ## 1. Initialize project Create a new empty directory for your `dlt` project by running: ```sh -mkdir weatherapi_duckdb && cd weatherapi_duckdb +mkdir github_api_duckdb && cd github_api_duckdb ``` Start a `dlt` project with a pipeline template that loads data to DuckDB by running: ```sh -dlt init weatherapi duckdb +dlt init github_api duckdb ``` Install the dependencies necessary for DuckDB: @@ -34,114 +49,127 @@ Install the dependencies necessary for DuckDB: pip install -r requirements.txt ``` -## 2. Add WeatherAPI.com API credentials +## 2. 
Obtain and add API credentials from GitHub -You will need to [sign up for the WeatherAPI.com API](https://www.weatherapi.com/signup.aspx). +You will need to [sign in](https://github.com/login) to your GitHub account and create your access token via [Personal access tokens page](https://github.com/settings/tokens). -Once you do this, you should see your `API Key` at the top of your -[user page](https://www.weatherapi.com/my/). - -Copy the value of the API key into `.dlt/secrets.toml`: +Copy your new access token over to `.dlt/secrets.toml`: ```toml [sources] api_secret_key = '' ``` -The **secret name** corresponds to the **argument name** in the source function. Below `api_secret_key` [will get its value](../general-usage/credentials/configuration.md#general-usage-and-an-example) from `secrets.toml` when `weatherapi_source()` is called. + +This token will be used by `github_api_source()` to authenticate requests. + +The **secret name** corresponds to the **argument name** in the source function. +Below `api_secret_key` [will get its value](../general-usage/credentials/configuration#allow-dlt-to-pass-the-config-and-secrets-automatically) +from `secrets.toml` when `github_api_source()` is called. + ```py @dlt.source -def weatherapi_source(api_secret_key=dlt.secrets.value): - ... +def github_api_source(api_secret_key: str = dlt.secrets.value): + return github_api_resource(api_secret_key=api_secret_key) ``` -Run the `weatherapi.py` pipeline script to test that authentication headers look fine: +Run the `github_api.py` pipeline script to test that authentication headers look fine: ```sh -python3 weatherapi.py +python github_api.py ``` Your API key should be printed out to stdout along with some test data. -## 3. Request data from the WeatherAPI.com API +## 3. Request project issues from then GitHub API -Replace the definition of the `weatherapi_resource` function definition in the `weatherapi.py` -pipeline script with a call to the WeatherAPI.com API: -```py -@dlt.resource(write_disposition="append") -def weatherapi_resource(api_secret_key=dlt.secrets.value): - url = "https://api.weatherapi.com/v1/current.json" - params = { - "q": "NYC", - "key": api_secret_key - } - response = requests.get(url, params=params) - response.raise_for_status() - yield response.json() -``` +:::tip +We will use `dlt` repository as an example GitHub project https://github.com/dlt-hub/dlt, feel free to replace it with your own repository. +::: -Run the `weatherapi.py` pipeline script to test that the API call works: +Modify `github_api_resource` in `github_api.py` to request issues data from your GitHub project's API: -```sh -python3 weatherapi.py +```py +from dlt.sources.helpers.rest_client import paginate +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth +from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + +@dlt.resource(write_disposition="replace") +def github_api_resource(api_secret_key: str = dlt.secrets.value): + url = "https://api.github.com/repos/dlt-hub/dlt/issues" + + for page in paginate( + url, + auth=BearerTokenAuth(api_secret_key), + paginator=HeaderLinkPaginator(), + params={"state": "open"} + ): + yield page ``` -This should print out the weather in New York City right now. - ## 4. 
Load the data -Remove the `exit()` call from the `main` function in `weatherapi.py`, so that running the -`python3 weatherapi.py` command will now also run the pipeline: +Uncomment the commented out code in `main` function in `github_api.py`, so that running the +`python github_api.py` command will now also run the pipeline: ```py if __name__=='__main__': - # configure the pipeline with your destination details pipeline = dlt.pipeline( - pipeline_name='weatherapi', + pipeline_name='github_api_pipeline', destination='duckdb', - dataset_name='weatherapi_data' + dataset_name='github_api_data' ) # print credentials by running the resource - data = list(weatherapi_resource()) + data = list(github_api_resource()) # print the data yielded from resource print(data) # run the pipeline with your parameters - load_info = pipeline.run(weatherapi_source()) + load_info = pipeline.run(github_api_source()) # pretty print the information on data that was loaded print(load_info) ``` -Run the `weatherapi.py` pipeline script to load data into DuckDB: + +Run the `github_api.py` pipeline script to test that the API call works: ```sh -python3 weatherapi.py +python github_api.py ``` -Then this command to see that the data loaded: +This should print out JSON data containing the issues in the GitHub project. + +It also prints `load_info` object. + +Let's explore the loaded data with the [command](../reference/command-line-interface#show-tables-and-data-in-the-destination) `dlt pipeline show`. + +:::info +Make sure you have `streamlit` installed `pip install streamlit` +::: ```sh -dlt pipeline weatherapi show +dlt pipeline github_api_pipeline show ``` This will open a Streamlit app that gives you an overview of the data loaded. ## 5. Next steps -Now that you have a working pipeline, you have options for what to learn next: +With a functioning pipeline, consider exploring: +- Our [REST Client](../general-usage/http/rest-client). - [Deploy this pipeline with GitHub Actions](deploy-a-pipeline/deploy-with-github-actions), so that the data is automatically loaded on a schedule. - Transform the [loaded data](../dlt-ecosystem/transformations) with dbt or in Pandas DataFrames. -- Learn how to [run](../running-in-production/running.md), - [monitor](../running-in-production/monitoring.md), and - [alert](../running-in-production/alerting.md) when you put your pipeline in production. +- Learn how to [run](../running-in-production/running), + [monitor](../running-in-production/monitoring), and + [alert](../running-in-production/alerting) when you put your pipeline in production. - Try loading data to a different destination like - [Google BigQuery](../dlt-ecosystem/destinations/bigquery.md), - [Amazon Redshift](../dlt-ecosystem/destinations/redshift.md), or - [Postgres](../dlt-ecosystem/destinations/postgres.md). + [Google BigQuery](../dlt-ecosystem/destinations/bigquery), + [Amazon Redshift](../dlt-ecosystem/destinations/redshift), or + [Postgres](../dlt-ecosystem/destinations/postgres). 
From ca154015d8cdde3d0a0922389839463467d50ee9 Mon Sep 17 00:00:00 2001 From: Ilya Gurov Date: Thu, 16 May 2024 20:25:46 +0400 Subject: [PATCH 10/41] feat(pipeline): add an ability to auto truncate (#1292) * feat(pipeline): add an ability to auto truncate staging destination after load * lint fix * fix typo * improve tests * truncate dataset * do truncation after all the load is finished * fix the test, which already expects warnings * add docs, tests * lint fix * lint fix * fixes * fix typo * delete excess comment * fix the test * additional conditions for assert * use qualified name * lint fix * lint fix * fix tests * fix the test * fix test * if staging is not used, don't test it * test fix for clickhouse * test fix * uses with_staging_dataset correctly --------- Co-authored-by: Marcin Rudolf --- dlt/load/configuration.py | 3 ++ dlt/load/load.py | 35 ++++++++++++++++++- dlt/pipeline/pipeline.py | 1 + .../docs/running-in-production/running.md | 6 ++++ .../airflow_tests/test_airflow_wrapper.py | 12 ++++++- tests/load/pipeline/test_pipelines.py | 17 +++++++++ tests/pipeline/test_pipeline.py | 32 ++++++++++++++++- 7 files changed, 103 insertions(+), 3 deletions(-) diff --git a/dlt/load/configuration.py b/dlt/load/configuration.py index 97cf23fdfc..b3fc2fbcd4 100644 --- a/dlt/load/configuration.py +++ b/dlt/load/configuration.py @@ -15,6 +15,9 @@ class LoaderConfiguration(PoolRunnerConfiguration): raise_on_max_retries: int = 5 """When gt 0 will raise when job reaches raise_on_max_retries""" _load_storage_config: LoadStorageConfiguration = None + # if set to `True`, the staging dataset will be + # truncated after loading the data + truncate_staging_dataset: bool = False def on_resolved(self) -> None: self.pool_type = "none" if self.workers == 1 else "thread" diff --git a/dlt/load/load.py b/dlt/load/load.py index 66ddb1c308..9d898bc54d 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -53,7 +53,7 @@ LoadClientUnsupportedWriteDisposition, LoadClientUnsupportedFileFormats, ) -from dlt.load.utils import get_completed_table_chain, init_client +from dlt.load.utils import _extend_tables_with_table_chain, get_completed_table_chain, init_client class Load(Runnable[Executor], WithStepInfo[LoadMetrics, LoadInfo]): @@ -348,6 +348,8 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) ) ): job_client.complete_load(load_id) + self._maybe_trancate_staging_dataset(schema, job_client) + self.load_storage.complete_load_package(load_id, aborted) # collect package info self._loaded_packages.append(self.load_storage.get_load_package_info(load_id)) @@ -490,6 +492,37 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) + def _maybe_trancate_staging_dataset(self, schema: Schema, job_client: JobClientBase) -> None: + """ + Truncate the staging dataset if one used, + and configuration requests truncation. + + Args: + schema (Schema): Schema to use for the staging dataset. + job_client (JobClientBase): + Job client to use for the staging dataset. 
+ """ + if not ( + isinstance(job_client, WithStagingDataset) and self.config.truncate_staging_dataset + ): + return + + data_tables = schema.data_table_names() + tables = _extend_tables_with_table_chain( + schema, data_tables, data_tables, job_client.should_load_data_to_staging_dataset + ) + + try: + with self.get_destination_client(schema) as client: + with client.with_staging_dataset(): # type: ignore + client.initialize_storage(truncate_tables=tables) + + except Exception as exc: + logger.warn( + f"Staging dataset truncate failed due to the following error: {exc}" + " However, it didn't affect the data integrity." + ) + def get_step_info( self, pipeline: SupportsPipeline, diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index a2ea1936a9..53770f332d 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -554,6 +554,7 @@ def load( with signals.delayed_signals(): runner.run_pool(load_step.config, load_step) info: LoadInfo = self._get_step_info(load_step) + self.first_run = False return info except Exception as l_ex: diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md index 253a27d942..9c52f58caa 100644 --- a/docs/website/docs/running-in-production/running.md +++ b/docs/website/docs/running-in-production/running.md @@ -108,6 +108,12 @@ behind. In `config.toml`: load.delete_completed_jobs=true ``` +Also, by default, `dlt` leaves data in staging dataset, used during merge and replace load for deduplication. In order to clear it, put the following line in `config.toml`: + +```toml +load.truncate_staging_dataset=true +``` + ## Using slack to send messages `dlt` provides basic support for sending slack messages. You can configure Slack incoming hook via diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index 845800e47f..533d16c998 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -384,7 +384,17 @@ def dag_parallel(): with mock.patch("dlt.helpers.airflow_helper.logger.warn") as warn_mock: dag_def = dag_parallel() dag_def.test() - warn_mock.assert_called_once() + warn_mock.assert_has_calls( + [ + mock.call( + "The resource resource2 in task" + " mock_data_incremental_source_resource1-resource2 is using incremental loading" + " and may modify the state. Resources that modify the state should not run in" + " parallel within the single pipeline as the state will not be correctly" + " merged. Please use 'serialize' or 'parallel-isolated' modes instead." 
+ ) + ] + ) def test_parallel_isolated_run(): diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index a498b570a0..d98f335d16 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -10,6 +10,7 @@ from dlt.common.pipeline import SupportsPipeline from dlt.common.destination import Destination from dlt.common.destination.exceptions import DestinationHasFailedJobs +from dlt.common.destination.reference import WithStagingDataset from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME @@ -896,6 +897,7 @@ def test_pipeline_upfront_tables_two_loads( # use staging tables for replace os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy + os.environ["TRUNCATE_STAGING_DATASET"] = "True" pipeline = destination_config.setup_pipeline( "test_pipeline_upfront_tables_two_loads", @@ -1001,6 +1003,21 @@ def table_3(make_data=False): is True ) + job_client, _ = pipeline._get_destination_clients(schema) + + if destination_config.staging and isinstance(job_client, WithStagingDataset): + for i in range(1, 4): + with pipeline.sql_client() as client: + table_name = f"table_{i}" + + if job_client.should_load_data_to_staging_dataset( + job_client.schema.tables[table_name] + ): + with client.with_staging_dataset(staging=True): + tab_name = client.make_qualified_table_name(table_name) + with client.execute_query(f"SELECT * FROM {tab_name}") as cur: + assert len(cur.fetchall()) == 0 + # @pytest.mark.skip(reason="Finalize the test: compare some_data values to values from database") # @pytest.mark.parametrize( diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index a828de40fd..1c4383405b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -5,9 +5,9 @@ import logging import os import random +import threading from time import sleep from typing import Any, Tuple, cast -import threading from tenacity import retry_if_exception, Retrying, stop_after_attempt import pytest @@ -2230,3 +2230,33 @@ def stateful_resource(): assert len(fs_client.list_table_files("_dlt_loads")) == 2 assert len(fs_client.list_table_files("_dlt_version")) == 1 assert len(fs_client.list_table_files("_dlt_pipeline_state")) == 1 + + +@pytest.mark.parametrize("truncate", (True, False)) +def test_staging_dataset_truncate(truncate) -> None: + dlt.config["truncate_staging_dataset"] = truncate + + @dlt.resource(write_disposition="merge", merge_key="id") + def test_data(): + yield [{"field": 1, "id": 1}, {"field": 2, "id": 2}, {"field": 3, "id": 3}] + + pipeline = dlt.pipeline( + pipeline_name="test_staging_cleared", + destination="duckdb", + full_refresh=True, + ) + + info = pipeline.run(test_data, table_name="staging_cleared") + assert_load_info(info) + + with pipeline.sql_client() as client: + with client.execute_query( + f"SELECT * FROM {pipeline.dataset_name}_staging.staging_cleared" + ) as cur: + if truncate: + assert len(cur.fetchall()) == 0 + else: + assert len(cur.fetchall()) == 3 + + with client.execute_query(f"SELECT * FROM {pipeline.dataset_name}.staging_cleared") as cur: + assert len(cur.fetchall()) == 3 From 920d41a773879acd26c24445ee8fd127385c434f Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 16 May 2024 12:39:07 -0400 Subject: [PATCH 11/41] Add recommended_file_size cap to limit data writer file size (#1368) --- dlt/common/data_writers/buffered.py | 3 ++ 
dlt/common/destination/capabilities.py | 2 + dlt/destinations/impl/bigquery/__init__.py | 2 + tests/common/data_writers/utils.py | 6 +-- .../data_writers/test_buffered_writer.py | 38 ++++++++++++++++++- 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index fdd5b50111..bd32c68c49 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -55,7 +55,10 @@ def __init__( self.closed_files: List[DataWriterMetrics] = [] # all fully processed files # buffered items must be less than max items in file self.buffer_max_items = min(buffer_max_items, file_max_items or buffer_max_items) + # Explicitly configured max size supersedes destination limit self.file_max_bytes = file_max_bytes + if self.file_max_bytes is None and _caps: + self.file_max_bytes = _caps.recommended_file_size self.file_max_items = file_max_items # the open function is either gzip.open or open self.open = ( diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index e74f5a980d..089b4a1d5e 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -29,6 +29,8 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): preferred_loader_file_format: TLoaderFileFormat = None supported_loader_file_formats: Sequence[TLoaderFileFormat] = None + recommended_file_size: Optional[int] = None + """Recommended file size in bytes when writing extract/load files""" preferred_staging_file_format: Optional[TLoaderFileFormat] = None supported_staging_file_formats: Sequence[TLoaderFileFormat] = None escape_identifier: Callable[[str], str] = None diff --git a/dlt/destinations/impl/bigquery/__init__.py b/dlt/destinations/impl/bigquery/__init__.py index d33466ed5e..39322b43a0 100644 --- a/dlt/destinations/impl/bigquery/__init__.py +++ b/dlt/destinations/impl/bigquery/__init__.py @@ -12,6 +12,8 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supported_loader_file_formats = ["jsonl", "parquet"] caps.preferred_staging_file_format = "parquet" caps.supported_staging_file_formats = ["parquet", "jsonl"] + # BQ limit is 4GB but leave a large headroom since buffered writer does not preemptively check size + caps.recommended_file_size = int(1024 * 1024 * 1024) caps.escape_identifier = escape_bigquery_identifier caps.escape_literal = None caps.format_datetime_literal = format_bigquery_datetime_literal diff --git a/tests/common/data_writers/utils.py b/tests/common/data_writers/utils.py index 2cb440bde1..e6e377b7d0 100644 --- a/tests/common/data_writers/utils.py +++ b/tests/common/data_writers/utils.py @@ -1,5 +1,5 @@ import os -from typing import Type +from typing import Type, Optional from dlt.common.data_writers.buffered import BufferedDataWriter from dlt.common.data_writers.writers import TWriter, ALL_WRITERS @@ -18,8 +18,8 @@ def get_writer( writer: Type[TWriter], buffer_max_items: int = 10, - file_max_items: int = 10, - file_max_bytes: int = None, + file_max_items: Optional[int] = 10, + file_max_bytes: Optional[int] = None, disable_compression: bool = False, caps: DestinationCapabilitiesContext = None, ) -> BufferedDataWriter[TWriter]: diff --git a/tests/extract/data_writers/test_buffered_writer.py b/tests/extract/data_writers/test_buffered_writer.py index 82b81a1cd7..b6da132de9 100644 --- a/tests/extract/data_writers/test_buffered_writer.py +++ b/tests/extract/data_writers/test_buffered_writer.py @@ -2,6 +2,7 @@ import pytest import time 
from typing import Iterator, Type +from uuid import uuid4 from dlt.common.data_writers.exceptions import BufferedDataWriterClosed from dlt.common.data_writers.writers import ( @@ -11,7 +12,7 @@ JsonlWriter, ALL_WRITERS, ) -from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.destination.capabilities import TLoaderFileFormat, DestinationCapabilitiesContext from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage @@ -330,3 +331,38 @@ def test_special_write_rotates(disable_compression: bool, writer_type: Type[Data metrics = writer.import_file( "tests/extract/cases/imported.any", DataWriterMetrics("", 1, 231, 0, 0) ) + + +@pytest.mark.parametrize( + "disable_compression", [True, False], ids=["no_compression", "compression"] +) +@pytest.mark.parametrize("writer_type", ALL_OBJECT_WRITERS) +def test_rotation_on_destination_caps_recommended_file_size( + disable_compression: bool, writer_type: Type[DataWriter] +) -> None: + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.recommended_file_size = int(250 * 1024) + columns = {"id": new_column("id", "text")} + with get_writer( + writer_type, + disable_compression=disable_compression, + buffer_max_items=100, + file_max_items=None, + file_max_bytes=None, + caps=caps, + ) as writer: + for i in range(8): + # Data chunk approximately 40kb serialized + items = [{"id": str(uuid4())} for _ in range(1000)] + writer.write_data_item(items, columns) + if i < 5: + assert not writer.closed_files + + if i > 5: + # We should have written atleast 250kb by now and have rotated the file + assert len(writer.closed_files) == 1 + + # Check the files that were written are all within the recommended size + 1 chunk + assert len(writer.closed_files) == 2 + for file in writer.closed_files: + assert file.file_size < caps.recommended_file_size + 1024 * 50 From 5b0afa490112e842baa497583138bac3ce169699 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Thu, 16 May 2024 21:14:29 +0200 Subject: [PATCH 12/41] limits mssql query size to fit network buffer (#1372) --- dlt/destinations/impl/mssql/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dlt/destinations/impl/mssql/__init__.py b/dlt/destinations/impl/mssql/__init__.py index e9d9fe24fd..f7768d9238 100644 --- a/dlt/destinations/impl/mssql/__init__.py +++ b/dlt/destinations/impl/mssql/__init__.py @@ -17,7 +17,8 @@ def capabilities() -> DestinationCapabilitiesContext: # https://learn.microsoft.com/en-us/sql/sql-server/maximum-capacity-specifications-for-sql-server?view=sql-server-ver16&redirectedfrom=MSDN caps.max_identifier_length = 128 caps.max_column_identifier_length = 128 - caps.max_query_length = 4 * 1024 * 64 * 1024 + # A SQL Query can be a varchar(max) but is shown as limited to 65,536 * Network Packet + caps.max_query_length = 65536 * 10 caps.is_max_query_length_in_bytes = True caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False From cb38702398bd8ea76de1770acdc3561dc3a6bd29 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 17 May 2024 10:07:25 +0200 Subject: [PATCH 13/41] Link REST API generic source from the docs intro (#1376) --- docs/website/docs/intro.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index 776329bcf4..0374802b7d 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -32,6 +32,10 @@ The library will create or update tables, infer data types, and handle 
nested da ]}> +:::tip +Looking to use a REST API as a source? Explore our new [REST API generic source](dlt-ecosystem/verified-sources/rest_api) for a declarative way to load data. +::: + From 359ec72c61628cb24b1e401a34f0528de1a657d3 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 17 May 2024 10:08:07 +0200 Subject: [PATCH 14/41] RESTClient: docs: Fixed snippet definition (#1373) --- docs/website/docs/general-usage/http/rest-client.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 481670ae4b..3f29182044 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -555,7 +555,7 @@ Debugging `paginate()` is trickier because it's a generator function that yields 1. Enable [logging](../../running-in-production/running.md#set-the-log-level-and-format) to see detailed information about the HTTP requests: -```bash +```sh RUNTIME__LOG_LEVEL=INFO python my_script.py ``` @@ -589,7 +589,7 @@ def response_hook(response, **kwargs): for page in client.paginate( "/posts", - auth=BearerTokenAuth(token="your_access_token") + auth=BearerTokenAuth(token="your_access_token"), hooks={"response": [response_hook]} ): print(page) From e789093f94d6fea17e372bce3bae0533ee09ad9f Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 17 May 2024 20:06:41 +0200 Subject: [PATCH 15/41] allows to bubble up exceptions when standalone resource returns (#1374) --- dlt/extract/decorators.py | 15 ++++++--------- tests/extract/test_decorators.py | 11 +++++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index fac6391e01..9c4076cfa7 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -567,16 +567,13 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltResourceImpl: compat_wrapper(actual_resource_name, conf_f, sig, *args, **kwargs), incremental, ) - except InvalidResourceDataTypeFunctionNotAGenerator as gen_ex: + except InvalidResourceDataTypeFunctionNotAGenerator: # we allow an edge case: resource can return another resource - try: - # actually call the function to see if it contains DltResource - data_ = conf_f(*args, **kwargs) - if not isinstance(data_, DltResource): - raise - r = data_ # type: ignore[assignment] - except Exception: - raise gen_ex from None + # actually call the function to see if it contains DltResource + data_ = conf_f(*args, **kwargs) + if not isinstance(data_, DltResource): + raise + r = data_ # type: ignore[assignment] # consider transformer arguments bound r._args_bound = True # keep explicit args passed diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 5e85552d73..c6a675a8d3 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -880,6 +880,17 @@ def rv_resource(name: str): assert list(r) == [1, 2, 3] +def test_standalone_resource_returning_resource_exception() -> None: + @dlt.resource(standalone=True) + def rv_resource(uniq_name: str = dlt.config.value): + return dlt.resource([1, 2, 3], name=uniq_name, primary_key="value") + + # pass through of the exception in `rv_resource` when it returns, not yields + with pytest.raises(ConfigFieldMissingException) as conf_ex: + rv_resource() + assert conf_ex.value.fields == ["uniq_name"] + + def test_resource_rename_credentials_separation(): 
os.environ["SOURCES__TEST_DECORATORS__STANDALONE_SIGNATURE__SECRET_END"] = "5" assert list(standalone_signature(1)) == [1, 2, 3, 4] From 4c6f928c491fcb46b7c03a14d9f4fae4dd32c2c3 Mon Sep 17 00:00:00 2001 From: Daniel-Vetter-Coverwhale <120594412+Daniel-Vetter-Coverwhale@users.noreply.github.com> Date: Fri, 17 May 2024 15:31:51 -0400 Subject: [PATCH 16/41] fix: use .get on column in mssql destination for cases where the yaml does not contain the nullable property, like other sql destinations (#1380) --- dlt/destinations/impl/mssql/mssql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index 8de15e2bd9..6f364c8af1 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -181,7 +181,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non if c.get(h, False) is True ) column_name = self.capabilities.escape_identifier(c["name"]) - return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c['nullable'])}" + return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c.get('nullable', True))}" def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] From 7a996337021032bac4e74e5f659f2fdc45da5b64 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Mon, 20 May 2024 20:55:41 +0400 Subject: [PATCH 17/41] Make path tests Windows compatible (#1384) * make path tests windows compatible --------- Co-authored-by: Jorrit Sandbrink --- .../load/filesystem/test_filesystem_client.py | 51 ++++++++++--------- .../load/pipeline/test_filesystem_pipeline.py | 4 +- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index ca962adb16..4519f1ea83 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -1,6 +1,7 @@ import posixpath import os from unittest import mock +from pathlib import Path import pytest @@ -117,16 +118,18 @@ def test_replace_write_disposition(layout: str, default_buckets_env: str) -> Non client, _, root_path, load_id1 = load_info layout = client.config.layout # this path will be kept after replace - job_2_load_1_path = posixpath.join( - root_path, - create_path( - layout, - NORMALIZED_FILES[1], - client.schema.name, - load_id1, - load_package_timestamp=timestamp, - extra_placeholders=client.config.extra_placeholders, - ), + job_2_load_1_path = Path( + posixpath.join( + root_path, + create_path( + layout, + NORMALIZED_FILES[1], + client.schema.name, + load_id1, + load_package_timestamp=timestamp, + extra_placeholders=client.config.extra_placeholders, + ), + ) ) with perform_load( @@ -135,16 +138,18 @@ def test_replace_write_disposition(layout: str, default_buckets_env: str) -> Non client, _, root_path, load_id2 = load_info # this one we expect to be replaced with - job_1_load_2_path = posixpath.join( - root_path, - create_path( - layout, - NORMALIZED_FILES[0], - client.schema.name, - load_id2, - load_package_timestamp=timestamp, - extra_placeholders=client.config.extra_placeholders, - ), + job_1_load_2_path = Path( + posixpath.join( + root_path, + create_path( + layout, + NORMALIZED_FILES[0], + client.schema.name, + load_id2, + load_package_timestamp=timestamp, + extra_placeholders=client.config.extra_placeholders, + ), + ) ) # First file from load1 remains, second file is replaced by 
load2 @@ -159,7 +164,7 @@ def test_replace_write_disposition(layout: str, default_buckets_env: str) -> Non for f in files: if f == INIT_FILE_NAME: continue - paths.append(posixpath.join(basedir, f)) + paths.append(Path(posixpath.join(basedir, f))) ls = set(paths) assert ls == {job_2_load_1_path, job_1_load_2_path} @@ -210,7 +215,7 @@ def test_append_write_disposition(layout: str, default_buckets_env: str) -> None ) for job in jobs2 ] - expected_files = sorted([posixpath.join(root_path, fn) for fn in expected_files]) + expected_files = sorted([Path(posixpath.join(root_path, fn)) for fn in expected_files]) # type: ignore[misc] paths = [] for basedir, _dirs, files in client.fs_client.walk( @@ -222,5 +227,5 @@ def test_append_write_disposition(layout: str, default_buckets_env: str) -> None for f in files: if f == INIT_FILE_NAME: continue - paths.append(posixpath.join(basedir, f)) + paths.append(Path(posixpath.join(basedir, f))) assert list(sorted(paths)) == expected_files diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 7680bc6e90..5f24daf57f 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -301,7 +301,7 @@ def count(*args, **kwargs) -> Any: for file in files: if ".jsonl" in file: - expected_files.add(posixpath.join(basedir, file)) + expected_files.add(Path(posixpath.join(basedir, file))) for load_package in load_info.load_packages: for load_info in load_package.jobs["completed_jobs"]: # type: ignore[assignment] @@ -321,7 +321,7 @@ def count(*args, **kwargs) -> Any: full_path = posixpath.join(client.dataset_path, path) # type: ignore[attr-defined] assert client.fs_client.exists(full_path) # type: ignore[attr-defined] if ".jsonl" in full_path: - known_files.add(full_path) + known_files.add(Path(full_path)) assert expected_files == known_files assert known_files From db23c7123bec1f3b831b88314d354600b1bc8189 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 21 May 2024 12:31:04 +0200 Subject: [PATCH 18/41] Update destination-tables.md (#1386) --- docs/website/docs/general-usage/destination-tables.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/destination-tables.md b/docs/website/docs/general-usage/destination-tables.md index 8e1f771e47..4d31b8440b 100644 --- a/docs/website/docs/general-usage/destination-tables.md +++ b/docs/website/docs/general-usage/destination-tables.md @@ -74,7 +74,8 @@ pipeline = dlt.pipeline( load_info = pipeline.run(users) ``` -The result will be the same, but the table is implicitly named `users` based on the resource name. +The result will be the same, note that we do not explicitly pass `table_name="users"` to `pipeline.run`, +and the table is implicitly named `users` based on the resource name (e.g. `users()` decorated with `@dlt.resource`). 
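If it helps to see the implicit-naming behavior spelled out, here is a minimal sketch; the duckdb destination and the pipeline/dataset names are illustrative, not part of the patch above:

```py
import dlt

@dlt.resource  # no table_name given; the resource function is called "users"
def users():
    yield [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"},
    ]

pipeline = dlt.pipeline(
    pipeline_name="quick_start",
    destination="duckdb",
    dataset_name="mydata",
)

# No table_name argument here, so the data lands in a table named "users"
load_info = pipeline.run(users)
print(load_info)
```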
:::note From 5b1f5adb3de4572b8dd44305c8ffd38ae002611a Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 21 May 2024 13:18:41 +0200 Subject: [PATCH 19/41] Fix typos in docs: destination tables (#1389) --- .../docs/general-usage/destination-tables.md | 56 ++++++------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/docs/website/docs/general-usage/destination-tables.md b/docs/website/docs/general-usage/destination-tables.md index 4d31b8440b..4780d4be20 100644 --- a/docs/website/docs/general-usage/destination-tables.md +++ b/docs/website/docs/general-usage/destination-tables.md @@ -74,8 +74,7 @@ pipeline = dlt.pipeline( load_info = pipeline.run(users) ``` -The result will be the same, note that we do not explicitly pass `table_name="users"` to `pipeline.run`, -and the table is implicitly named `users` based on the resource name (e.g. `users()` decorated with `@dlt.resource`). +The result will be the same; note that we do not explicitly pass `table_name="users"` to `pipeline.run`, and the table is implicitly named `users` based on the resource name (e.g., `users()` decorated with `@dlt.resource`). :::note @@ -118,9 +117,7 @@ pipeline = dlt.pipeline( load_info = pipeline.run(data, table_name="users") ``` -Running this pipeline will create two tables in the destination, `users` and `users__pets`. The -`users` table will contain the top level data, and the `users__pets` table will contain the child -data. Here is what the tables may look like: +Running this pipeline will create two tables in the destination, `users` and `users__pets`. The `users` table will contain the top-level data, and the `users__pets` table will contain the child data. Here is what the tables may look like: **mydata.users** @@ -142,21 +139,14 @@ creating and linking children and parent tables. This is how it works: -1. Each row in all (top level and child) data tables created by `dlt` contains UNIQUE column named - `_dlt_id`. -1. Each child table contains FOREIGN KEY column `_dlt_parent_id` linking to a particular row - (`_dlt_id`) of a parent table. -1. Rows in child tables come from the lists: `dlt` stores the position of each item in the list in - `_dlt_list_idx`. -1. For tables that are loaded with the `merge` write disposition, we add a ROOT KEY column - `_dlt_root_id`, which links child table to a row in top level table. - +1. Each row in all (top level and child) data tables created by `dlt` contains a `UNIQUE` column named `_dlt_id`. +1. Each child table contains a `FOREIGN KEY` column `_dlt_parent_id` linking to a particular row (`_dlt_id`) of a parent table. +1. Rows in child tables come from the lists: `dlt` stores the position of each item in the list in `_dlt_list_idx`. +1. For tables that are loaded with the `merge` write disposition, we add a root key column `_dlt_root_id`, which links the child table to a row in the top-level table. :::note -If you define your own primary key in a child table, it will be used to link to parent table -and the `_dlt_parent_id` and `_dlt_list_idx` will not be added. `_dlt_id` is always added even in -case the primary key or other unique columns are defined. +If you define your own primary key in a child table, it will be used to link to the parent table, and the `_dlt_parent_id` and `_dlt_list_idx` will not be added. `_dlt_id` is always added even if the primary key or other unique columns are defined. ::: @@ -165,17 +155,15 @@ case the primary key or other unique columns are defined. 
During a pipeline run, dlt [normalizes both table and column names](schema.md#naming-convention) to ensure compatibility with the destination database's accepted format. All names from your source data will be transformed into snake_case and will only include alphanumeric characters. Please be aware that the names in the destination database may differ somewhat from those in your original input. ### Variant columns -If your data has inconsistent types, `dlt` will dispatch the data to several **variant columns**. For example, if you have a resource (ie json file) with a filed with name **answer** and your data contains boolean values, you will get get a column with name **answer** of type **BOOLEAN** in your destination. If for some reason, on next load you get integer value and string value in **answer**, the inconsistent data will go to **answer__v_bigint** and **answer__v_text** columns respectively. -The general naming rule for variant columns is `__v_` where `original_name` is the existing column name (with data type clash) and `type` is the name of data type stored in the variant. - +If your data has inconsistent types, `dlt` will dispatch the data to several **variant columns**. For example, if you have a resource (i.e., JSON file) with a field with name `answer` and your data contains boolean values, you will get a column with name `answer` of type `BOOLEAN` in your destination. If for some reason, on the next load, you get integer and string values in `answer`, the inconsistent data will go to `answer__v_bigint` and `answer__v_text` columns respectively. +The general naming rule for variant columns is `__v_` where `original_name` is the existing column name (with data type clash) and `type` is the name of the data type stored in the variant. ## Load Packages and Load IDs Each execution of the pipeline generates one or more load packages. A load package typically contains data retrieved from all the [resources](glossary.md#resource) of a particular [source](glossary.md#source). These packages are uniquely identified by a `load_id`. The `load_id` of a particular package is added to the top data tables -(referenced as `_dlt_load_id` column in the example above) and to the special `_dlt_loads` table with a status 0 -(when the load process is fully completed). +(referenced as `_dlt_load_id` column in the example above) and to the special `_dlt_loads` table with a status of 0 (when the load process is fully completed). To illustrate this, let's load more data into the same destination: @@ -190,8 +178,7 @@ data = [ ``` The rest of the pipeline definition remains the same. Running this pipeline will create a new load -package with a new `load_id` and add the data to the existing tables. The `users` table will now -look like this: +package with a new `load_id` and add the data to the existing tables. The `users` table will now look like this: **mydata.users** @@ -211,12 +198,12 @@ The `_dlt_loads` table will look like this: | **1234563456.12345** | quick_start | 0 | 2023-09-12 16:46:03.10662+00 | aOEb...Qekd/58= | The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them. -Many destinations do not support distributed and long-running transactions (e.g. Amazon Redshift). +Many destinations do not support distributed and long-running transactions (e.g., Amazon Redshift). In that case, the user may see the partially loaded data. It is possible to filter such data out: any row with a `load_id` that does not exist in `_dlt_loads` is not yet completed. 
The same procedure may be used to identify and delete data for packages that never got completed. -For each load, you can test and [alert](../running-in-production/alerting.md) on anomalies (e.g. +For each load, you can test and [alert](../running-in-production/alerting.md) on anomalies (e.g., no data, too much loaded to a table). There are also some useful load stats in the `Load info` tab of the [Streamlit app](../dlt-ecosystem/visualizations/exploring-the-data.md#exploring-the-data) mentioned above. @@ -232,8 +219,7 @@ Data lineage can be super relevant for architectures like the [data vault architecture](https://www.data-vault.co.uk/what-is-data-vault/) or when troubleshooting. The data vault architecture is a data warehouse that large organizations use when representing the same process across multiple systems, which adds data lineage requirements. Using the pipeline name -and `load_id` provided out of the box by `dlt`, you are able to identify the source and time of -data. +and `load_id` provided out of the box by `dlt`, you are able to identify the source and time of data. You can [save](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) complete lineage info for a particular `load_id` including a list of loaded files, error messages @@ -243,11 +229,7 @@ problems. ## Staging dataset So far we've been using the `append` write disposition in our example pipeline. This means that -each time we run the pipeline, the data is appended to the existing tables. When you use [the -merge write disposition](incremental-loading.md), dlt creates a staging database schema for -staging data. This schema is named `_staging` and contains the same tables as the -destination schema. When you run the pipeline, the data from the staging tables is loaded into the -destination tables in a single atomic transaction. +each time we run the pipeline, the data is appended to the existing tables. When you use the [merge write disposition](incremental-loading.md), dlt creates a staging database schema for staging data. This schema is named `_staging` and contains the same tables as the destination schema. When you run the pipeline, the data from the staging tables is loaded into the destination tables in a single atomic transaction. Let's illustrate this with an example. We change our pipeline to use the `merge` write disposition: @@ -271,8 +253,7 @@ load_info = pipeline.run(users) ``` Running this pipeline will create a schema in the destination database with the name `mydata_staging`. -If you inspect the tables in this schema, you will find `mydata_staging.users` table identical to the -`mydata.users` table in the previous example. +If you inspect the tables in this schema, you will find the `mydata_staging.users` table identical to the`mydata.users` table in the previous example. Here is what the tables may look like after running the pipeline: @@ -291,8 +272,7 @@ Here is what the tables may look like after running the pipeline: | 2 | Bob 2 | rX8ybgTeEmAmmA | 2345672350.98417 | | 3 | Charlie | h8lehZEvT3fASQ | 1234563456.12345 | -Notice that the `mydata.users` table now contains the data from both the previous pipeline run and -the current one. +Notice that the `mydata.users` table now contains the data from both the previous pipeline run and the current one. ## Versioned datasets @@ -323,4 +303,4 @@ load_info = pipeline.run(data, table_name="users") Every time you run this pipeline, a new schema will be created in the destination database with a datetime-based suffix. 
The data will be loaded into tables in this schema. For example, the first time you run the pipeline, the schema will be named -`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. +`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. \ No newline at end of file From 1ce556ae8503bbf567d36d92221b2a77878a5d7e Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 21 May 2024 17:30:03 +0200 Subject: [PATCH 20/41] add naming rules to contributing (#1291) * branch naming rules in contributing * add motivation to branch naming rule * formatting and typo --- CONTRIBUTING.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8a8cc37ae..85dbf37c97 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,6 +52,29 @@ We use **master** branch for hot fixes (including documentation) that needs to b On the release day, **devel** branch is merged into **master**. All releases of `dlt` happen only from the **master**. +### Branch naming rules + +We want to make sure that our git history explains in a human readable way what has been changed with which Branch or PR. To this end, we are using the following branch naming pattern (all lowercase and dashes, no underscores): + +```sh +{category}/{ticket-id}-description-of-the-branch +# example: +feat/4922-add-avro-support +``` + +#### Branch categories + +* **feat** - a new feature that is being implemented (ticket required) +* **fix** - a change that fixes a bug (ticket required) +* **exp** - an experiment where we are testing a new idea or want to demonstrate something to the team, might turn into a `feat` later (ticket encouraged) +* **test** - anything related to the tests (ticket encouraged) +* **blogs** - a new entry to our blog (ticket optional) +* **docs** - a change to our docs (ticket optional) + +#### Ticket Numbers + +We encourage you to attach your branches to a ticket, if none exists, create one and explain what you are doing. For `feat` and `fix` branches, tickets are mandatory, for `exp` and `test` branches encouraged and for `blogs` and `docs` branches optional. + ### Submitting a hotfix We'll fix critical bugs and release `dlt` out of the schedule. Follow the regular procedure, but make your PR against **master** branch. Please ping us on Slack if you do it. @@ -166,3 +189,4 @@ Once the version has been bumped, follow these steps to publish the new release - [Poetry Documentation](https://python-poetry.org/docs/) If you have any questions or need help, don't hesitate to reach out to us. We're here to help you succeed in contributing to `dlt`. Happy coding! 
+**** \ No newline at end of file From b1e0f7760920355050b9c1dd6324ece695830434 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 21 May 2024 21:28:30 +0200 Subject: [PATCH 21/41] Fix snippet linting errors (#1392) * fix snippets * fix additional mypy errors * fix another auth type checker error --- .../verified-sources/rest_api.md | 84 +++++++++++-------- .../docs/general-usage/http/rest-client.md | 4 +- .../docs/walkthroughs/create-a-pipeline.md | 2 +- 3 files changed, 50 insertions(+), 40 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 1f79055d06..0022850987 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -203,7 +203,7 @@ For example, you can set the primary key, write disposition, and other default s ```py config = { "client": { - ... + # ... }, "resource_defaults": { "primary_key": "id", @@ -216,15 +216,17 @@ config = { }, "resources": [ "resource1", - "resource2": { - "name": "resource2_name", - "write_disposition": "append", - "endpoint": { - "params": { - "param1": "value1", + { + "resource2": { + "name": "resource2_name", + "write_disposition": "append", + "endpoint": { + "params": { + "param1": "value1", + }, }, - }, - }, + } + } ], } ``` @@ -309,7 +311,7 @@ To specify the pagination configuration, use the `paginator` field in the [clien ```py { - ... + # ... "paginator": { "type": "json_links", "next_url_path": "paging.next", @@ -321,7 +323,7 @@ Or using the paginator instance: ```py { - ... + # ... "paginator": JSONResponsePaginator( next_url_path="paging.next" ), @@ -394,11 +396,11 @@ One of the most common method is token-based authentication. To authenticate wit ```py { "client": { - ... + # ... "auth": { "token": dlt.secrets["your_api_token"], }, - ... + # ... }, } ``` @@ -424,7 +426,7 @@ To specify the authentication configuration, use the `auth` field in the [client "type": "bearer", "token": dlt.secrets["your_api_token"], }, - ... + # ... }, } ``` @@ -438,7 +440,7 @@ config = { "client": { "auth": BearTokenAuth(dlt.secrets["your_api_token"]), }, - ... + # ... } ``` @@ -455,7 +457,7 @@ In the GitHub example, the `issue_comments` resource depends on the `issues` res "name": "issues", "endpoint": { "path": "issues", - ... + # ... }, }, { @@ -495,10 +497,12 @@ The `issue_comments` resource will make requests to the following endpoints: The syntax for the `resolve` field in parameter configuration is: ```py -"": { - "type": "resolve", - "resource": "", - "field": "", +{ + "": { + "type": "resolve", + "resource": "", + "field": "", + } } ``` @@ -530,21 +534,25 @@ When the API endpoint supports incremental loading, you can configure the source 1. 
Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration): ```py - "": { - "type": "incremental", - "cursor_path": "", - "initial_value": "", - }, + { + "": { + "type": "incremental", + "cursor_path": "", + "initial_value": "", + }, + } ``` For example, in the `issues` resource configuration in the GitHub example, we have: ```py - "since": { - "type": "incremental", - "cursor_path": "updated_at", - "initial_value": "2024-01-25T11:21:28Z", - }, + { + "since": { + "type": "incremental", + "cursor_path": "updated_at", + "initial_value": "2024-01-25T11:21:28Z", + }, + } ``` This configuration tells the source to create an incremental object that will keep track of the `updated_at` field in the response and use it as a value for the `since` parameter in subsequent requests. @@ -552,13 +560,15 @@ When the API endpoint supports incremental loading, you can configure the source 2. Specifying the `incremental` field in the [endpoint configuration](#endpoint-configuration): ```py - "incremental": { - "start_param": "", - "end_param": "", - "cursor_path": "", - "initial_value": "", - "end_value": "", - }, + { + "incremental": { + "start_param": "", + "end_param": "", + "cursor_path": "", + "initial_value": "", + "end_value": "", + } + } ``` This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading. diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 3f29182044..19cc95bf78 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -542,7 +542,7 @@ from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.auth import BearerTokenAuth client = RESTClient(base_url="https://api.example.com") -response = client.get("/posts", auth=BearerTokenAuth(token="your_access_token")) +response = client.get("/posts", auth=BearerTokenAuth(token="your_access_token")) # type: ignore print(response.status_code) print(response.headers) @@ -589,7 +589,7 @@ def response_hook(response, **kwargs): for page in client.paginate( "/posts", - auth=BearerTokenAuth(token="your_access_token"), + auth=BearerTokenAuth(token="your_access_token"), # type: ignore hooks={"response": [response_hook]} ): print(page) diff --git a/docs/website/docs/walkthroughs/create-a-pipeline.md b/docs/website/docs/walkthroughs/create-a-pipeline.md index bba78dc6cb..cbbbd73fc3 100644 --- a/docs/website/docs/walkthroughs/create-a-pipeline.md +++ b/docs/website/docs/walkthroughs/create-a-pipeline.md @@ -100,7 +100,7 @@ def github_api_resource(api_secret_key: str = dlt.secrets.value): for page in paginate( url, - auth=BearerTokenAuth(api_secret_key), + auth=BearerTokenAuth(api_secret_key), # type: ignore paginator=HeaderLinkPaginator(), params={"state": "open"} ): From bf92adaaa56e99d4a9a591789bf3763d1f321545 Mon Sep 17 00:00:00 2001 From: mucio Date: Thu, 23 May 2024 10:27:18 +0200 Subject: [PATCH 22/41] Added values to the data pattern of the rest_api helper (#1399) --- dlt/sources/helpers/rest_client/detector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlt/sources/helpers/rest_client/detector.py b/dlt/sources/helpers/rest_client/detector.py index 857f6bbb4e..d004ca173c 100644 --- a/dlt/sources/helpers/rest_client/detector.py +++ b/dlt/sources/helpers/rest_client/detector.py @@ -25,6 +25,7 @@ "payload", "content", "objects", + "values", ] ) From 
bab9e90306446e6ae4238b8753bab59b19c87955 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Thu, 23 May 2024 14:47:08 +0530 Subject: [PATCH 23/41] Added info about how to reorder the columns to adjust a schema (#1364) * Added info about how to reorder the columns * Updated rest_api.md with configuration examples * Update docs/website/docs/walkthroughs/adjust-a-schema.md * Updated ../website/docs/dlt-ecosystem/verified-sources/rest_api.md * fix naming convention for bigquery custom destination --------- Co-authored-by: Anton Burnashev Co-authored-by: AstrakhantsevaAA --- .../custom_destination_bigquery.py | 20 +++++++------- .../docs/walkthroughs/adjust-a-schema.md | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index ea60b9b00d..e890469263 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -5,13 +5,13 @@ keywords: [destination, credentials, example, bigquery, custom destination] --- -In this example, you'll find a Python script that demonstrates how to load to bigquey with the custom destination. +In this example, you'll find a Python script that demonstrates how to load to BigQuery with the custom destination. We'll learn how to: -- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials) -- use the [custom destination](../dlt-ecosystem/destinations/destination.md) -- Use pyarrow tables to create complex column types on bigquery -- Use bigquery `autodetect=True` for schema inference from parquet files +- Use [built-in credentials.](../general-usage/credentials/config_specs#gcp-credentials) +- Use the [custom destination.](../dlt-ecosystem/destinations/destination.md) +- Use pyarrow tables to create complex column types on BigQuery. +- Use BigQuery `autodetect=True` for schema inference from parquet files. """ @@ -38,7 +38,7 @@ def resource(url: str): # load pyarrow table with pandas table = pa.Table.from_pandas(pd.read_csv(url)) - # we add a list type column to demontrate bigquery lists + # we add a list type column to demonstrate bigquery lists table = table.append_column( "tags", pa.array( @@ -57,12 +57,12 @@ def resource(url: str): yield table -# dlt biquery custom destination +# dlt bigquery custom destination # we can use the dlt provided credentials class # to retrieve the gcp credentials from the secrets -@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) +@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case") def bigquery_insert( - items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value + items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value ) -> None: client = bigquery.Client( credentials.project_id, credentials.to_native_credentials(), location="US" @@ -74,7 +74,7 @@ def bigquery_insert( ) # since we have set the batch_size to 0, we get a filepath and can load the file directly with open(items, "rb") as f: - load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) + load_job = client.load_table_from_file(f, table, job_config=job_config) load_job.result() # Waits for the job to complete. 
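For context, a decorated custom destination like `bigquery_insert` above is passed directly as the pipeline's destination. This is only a sketch of that wiring: the pipeline name and the CSV URL are placeholders, and `resource` / `bigquery_insert` refer to the functions shown in the snippet above.

```py
import dlt

# The @dlt.destination-decorated function acts as the destination itself.
pipeline = dlt.pipeline(
    pipeline_name="csv_to_bigquery_pipeline",
    destination=bigquery_insert,
)

# Run the resource through the custom destination; the URL is a placeholder.
load_info = pipeline.run(resource(url="https://example.com/data.csv"))
print(load_info)
```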
diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md index cfe2d056b0..b0a9a9ce05 100644 --- a/docs/website/docs/walkthroughs/adjust-a-schema.md +++ b/docs/website/docs/walkthroughs/adjust-a-schema.md @@ -121,6 +121,32 @@ Do not rename the tables or columns in the yaml file. `dlt` infers those from th You can [adjust the schema](../general-usage/resource.md#adjust-schema) in Python before resource is loaded. ::: +### Reorder columns +To reorder the columns in your dataset, follow these steps: + +1. Initial Run: Execute the pipeline to obtain the import and export schemas. +1. Modify Export Schema: Adjust the column order as desired in the export schema. +1. Sync Import Schema: Ensure that these changes are mirrored in the import schema to maintain consistency. +1. Delete Dataset: Remove the existing dataset to prepare for the reload. +1. Reload Data: Reload the data. The dataset should now reflect the new column order as specified in the import YAML. + +These steps ensure that the column order in your dataset matches your specifications. + +**Another approach** to reorder columns is to use the `add_map` function. For instance, to rearrange β€˜column1’, β€˜column2’, and β€˜column3’, you can proceed as follows: + +```py +# Define the data source and reorder columns using add_map +data_source = resource().add_map(lambda row: { + 'column3': row['column3'], + 'column1': row['column1'], + 'column2': row['column2'] +}) + +# Run the pipeline +load_info = pipeline.run(data_source) +``` + +In this example, the `add_map` function reorders columns by defining a new mapping. The lambda function specifies the desired order by rearranging the key-value pairs. When the pipeline runs, the data will load with the columns in the new order. ### Load data as json instead of generating child table or columns from flattened dicts From e44984801a60beed73d32d9739b2cd5ae00cb403 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 23 May 2024 11:49:49 +0200 Subject: [PATCH 24/41] rest_api: add response_actions documentation (#1362) --- .../verified-sources/rest_api.md | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 0022850987..d5d29344de 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -282,7 +282,7 @@ The fields in the endpoint configuration are: - `json`: The JSON payload to be sent with the request (for POST and PUT requests). - `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. - `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details. -- `response_actions`: A list of actions that define how to process the response data. +- `response_actions`: A list of actions that define how to process the response data. See the [response actions](#response-actions) section for more details. - `incremental`: Configuration for [incremental loading](#incremental-loading). ### Pagination @@ -586,3 +586,33 @@ See the [incremental loading](../../general-usage/incremental-loading.md#increme - `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. 
This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. - `schema_contract`: Schema contract settings that will be applied to this resource. - `spec`: A specification of configuration and secret values required by the source. + +### Response actions + +The `response_actions` field in the endpoint configuration allows you to specify how to handle specific responses from the API based on status codes or content substrings. This is useful for handling edge cases like ignoring responses on specific conditions. + +:::caution Experimental Feature +This is an experimental feature and may change in future releases. +::: + +#### Example + +```py +{ + "path": "issues", + "response_actions": [ + {"status_code": 404, "action": "ignore"}, + {"content": "Not found", "action": "ignore"}, + {"status_code": 200, "content": "some text", "action": "ignore"}, + ], +} +``` + +In this example, the source will ignore responses with a status code of 404, responses with the content "Not found", and responses with a status code of 200 _and_ content "some text". + +**Fields:** + +- `status_code` (int, optional): The HTTP status code to match. +- `content` (str, optional): A substring to search for in the response content. +- `action` (str): The action to take when the condition is met. Currently supported actions: + - `ignore`: Ignore the response. From 19e1462e65995291be39081dfcbc0cd0dd588b6f Mon Sep 17 00:00:00 2001 From: rudolfix Date: Thu, 23 May 2024 17:04:25 +0200 Subject: [PATCH 25/41] detects a path param in the right-most path segment (#1394) --- dlt/sources/helpers/rest_client/detector.py | 8 ++++++-- .../helpers/rest_client/test_detector.py | 17 ++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/dlt/sources/helpers/rest_client/detector.py b/dlt/sources/helpers/rest_client/detector.py index d004ca173c..19a1e83a82 100644 --- a/dlt/sources/helpers/rest_client/detector.py +++ b/dlt/sources/helpers/rest_client/detector.py @@ -1,5 +1,6 @@ import re -from typing import List, Dict, Any, Tuple, Union, Optional, Callable, Iterable +from pathlib import PurePosixPath +from typing import List, Dict, Any, Tuple, Union, Callable, Iterable from urllib.parse import urlparse from requests import Response @@ -47,7 +48,10 @@ def single_entity_path(path: str) -> bool: """Checks if path ends with path param indicating that single object is returned""" - return re.search(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}/?$", path) is not None + # get last path segment + name = PurePosixPath(path).name + # alphabet for a name taken from https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.3.md#fixed-fields-6 + return re.search(r"\{([a-zA-Z0-9\.\-_]+)\}", name) is not None def matches_any_pattern(key: str, patterns: Iterable[str]) -> bool: diff --git a/tests/sources/helpers/rest_client/test_detector.py b/tests/sources/helpers/rest_client/test_detector.py index f01f9409a1..6511b472fb 100644 --- a/tests/sources/helpers/rest_client/test_detector.py +++ b/tests/sources/helpers/rest_client/test_detector.py @@ -406,16 +406,20 @@ def test_find_paginator(test_case) -> None: [ "/users/{user_id}", "/api/v1/products/{product_id}/", - # those are not valid paths - # "/api/v1/products/{product_id}//", - # "/api/v1/products/{product_id}?param1=value1", - # "/api/v1/products/{product_id}#section", - # "/api/v1/products/{product_id}/#section", + "/api/v1/products/{product_id}//", + "/api/v1/products/{product_id}?param1=value1", + 
"/api/v1/products/{product_id}#section", + "/api/v1/products/{product_id}.json", + "/api/v1/products/{product_id}.json/", + "/api/v1/products/{product_id}_data", + "/api/v1/products/{product_id}_data?param=true", "/users/{user_id}/posts/{post_id}", "/users/{user_id}/posts/{post_id}/comments/{comment_id}", "{entity}", "/{entity}", "/{user_123}", + "/users/{user-id}", + "/users/{123}", ], ) def test_single_entity_path_valid(path): @@ -430,8 +434,7 @@ def test_single_entity_path_valid(path): "/users/{user_id}/details", "/", "/{}", - "/users/{123}", - "/users/{user-id}", + "/api/v1/products/{product_id}/#section", "/users/{user id}", "/users/{user_id}/{", # Invalid ending ], From d0fdfb4ca000ca57c56134bd0fb9b9f11e90b286 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 23 May 2024 18:50:26 +0200 Subject: [PATCH 26/41] Update the tutorial to use `rest_client.paginate` for pagination (#1287) --- .../docs/tutorial/grouping-resources.md | 141 ++++++------------ .../docs/tutorial/load-data-from-an-api.md | 51 ++++++- 2 files changed, 93 insertions(+), 99 deletions(-) diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index 3a05f7940c..3ba95b7971 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -14,6 +14,9 @@ This tutorial continues the [previous](load-data-from-an-api) part. We'll use th In the previous tutorial, we loaded issues from the GitHub API. Now we'll prepare to load comments from the API as well. Here's a sample [dlt resource](../general-usage/resource) that does that: ```py +import dlt +from dlt.sources.helpers.rest_client import paginate + @dlt.resource( table_name="comments", write_disposition="merge", @@ -22,17 +25,11 @@ In the previous tutorial, we loaded issues from the GitHub API. Now we'll prepar def get_comments( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = "https://api.github.com/repos/dlt-hub/dlt/comments?per_page=100" - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/comments", + params={"per_page": 100} + ): + yield page ``` We can load this resource separately from the issues resource, however loading both issues and comments in one go is more efficient. 
To do that, we'll use the `@dlt.source` decorator on a function that returns a list of resources: @@ -47,7 +44,7 @@ def github_source(): ```py import dlt -from dlt.sources.helpers import requests +from dlt.sources.helpers.rest_client import paginate @dlt.resource( table_name="issues", @@ -57,21 +54,17 @@ from dlt.sources.helpers import requests def get_issues( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = ( - "https://api.github.com/repos/dlt-hub/dlt/issues" - f"?since={updated_at.last_value}&per_page=100" - "&sort=updated&directions=desc&state=open" - ) - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/issues", + params={ + "since": updated_at.last_value, + "per_page": 100, + "sort": "updated", + "directions": "desc", + "state": "open", + } + ): + yield page @dlt.resource( @@ -82,20 +75,14 @@ def get_issues( def get_comments( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = ( - "https://api.github.com/repos/dlt-hub/dlt/comments" - "?per_page=100" - ) - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/comments", + params={ + "since": updated_at.last_value, + "per_page": 100, + } + ): + yield page @dlt.source @@ -124,18 +111,8 @@ from dlt.sources.helpers import requests BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" def fetch_github_data(endpoint, params={}): - """Fetch data from GitHub API based on endpoint and params.""" url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate(url, params=params) @dlt.source def github_source(): @@ -164,21 +141,16 @@ For the next step we'd want to get the [number of repository clones](https://doc Let's handle this by changing our `fetch_github_data()` first: ```py -def fetch_github_data(endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on endpoint and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth +def fetch_github_data(endpoint, params={}, access_token=None): url = f"{BASE_GITHUB_URL}/{endpoint}" + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] @dlt.source def github_source(access_token): @@ -229,28 +201,7 @@ access_token = "ghp_A...3aRY" Now we can run the script and it will load the data from the `traffic/clones` endpoint: ```py -import dlt -from dlt.sources.helpers import requests - -BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" - - -def fetch_github_data(endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on endpoint 
and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - - url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] - +... @dlt.source def github_source( @@ -287,19 +238,12 @@ BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" def fetch_github_data(repo_name, endpoint, params={}, access_token=None): """Fetch data from GitHub API based on repo_name, endpoint, and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - url = BASE_GITHUB_URL.format(repo_name=repo_name) + f"/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) @dlt.source @@ -347,5 +291,6 @@ Interested in learning more? Here are some suggestions: - [Pass config and credentials into your sources and resources](../general-usage/credentials). - [Run in production: inspecting, tracing, retry policies and cleaning up](../running-in-production/running). - [Run resources in parallel, optimize buffers and local storage](../reference/performance.md) + - [Use REST API client helpers](../general-usage/http/rest-client.md) to simplify working with REST APIs. 3. Check out our [how-to guides](../walkthroughs) to get answers to some common questions. 4. Explore the [Examples](../examples) section to see how dlt can be used in real-world scenarios diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 31a2c1592d..ec6136b6d3 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -44,7 +44,7 @@ dlt pipeline github_issues show ## Append or replace your data -Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. It is very useful, for example, when you have a new folder created daily with `json` file logs, and you want to ingest them. +Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. It is very useful, for example, when you have daily data updates and you want to ingest them. To get the latest data, we'd need to run the script again. But how to do that without duplicating the data? One option is to tell `dlt` to replace the data in existing tables in the destination by using `replace` write disposition. Change the `github_issues.py` script to the following: @@ -148,6 +148,55 @@ and `updated_at.last_value` to tell GitHub to return issues updated only **after [Learn more about merge write disposition](../general-usage/incremental-loading#merge-incremental_loading). +## Using pagination helper + +In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. 
`dlt` has the built-in [REST client](../general-usage/http/rest-client.md) that simplifies API requests. We'll pick the `paginate()` helper from it for the next example. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. + +Here's how the updated script looks: + +```py +import dlt +from dlt.sources.helpers.rest_client import paginate + +@dlt.resource( + table_name="issues", + write_disposition="merge", + primary_key="id", +) +def get_issues( + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") +): + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/issues", + params={ + "since": updated_at.last_value, + "per_page": 100, + "sort": "updated", + "direction": "desc", + "state": "open", + }, + ): + yield page + +pipeline = dlt.pipeline( + pipeline_name="github_issues_merge", + destination="duckdb", + dataset_name="github_data_merge", +) +load_info = pipeline.run(get_issues) +row_counts = pipeline.last_trace.last_normalize_info + +print(row_counts) +print("------") +print(load_info) +``` + +Let's zoom in on the changes: + +1. The `while` loop that handled pagination is replaced with reading pages from the `paginate()` generator. +2. `paginate()` takes the URL of the API endpoint and optional parameters. In this case, we pass the `since` parameter to get only issues updated after the last pipeline run. +3. We're not explicitly setting up pagination, `paginate()` handles it for us. Magic! Under the hood, `paginate()` analyzes the response and detects the pagination method used by the API. Read more about pagination in the [REST client documentation](../general-usage/http/rest-client.md#paginating-api-responses). + ## Next steps Continue your journey with the [Resource Grouping and Secrets](grouping-resources) tutorial. 
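Auto-detection covers the common cases; when you prefer to pin the pagination strategy explicitly (for example, GitHub's `Link`-header style), `paginate()` also accepts a `paginator` argument. A minimal sketch reusing the tutorial's endpoint:

```py
from dlt.sources.helpers.rest_client import paginate
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

# Explicitly pin the paginator instead of relying on auto-detection.
for page in paginate(
    "https://api.github.com/repos/dlt-hub/dlt/issues",
    params={"per_page": 100, "state": "open"},
    paginator=HeaderLinkPaginator(),
):
    print(len(page))
```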
From a8a6ff7288f8639bea9e92c20573126e538215e1 Mon Sep 17 00:00:00 2001 From: Harato Daisuke <129731743+Benjamin0313@users.noreply.github.com> Date: Fri, 24 May 2024 17:51:48 +0900 Subject: [PATCH 27/41] fix command to install dlt (#1404) --- docs/website/docs/dlt-ecosystem/destinations/snowflake.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index f144da02e6..deaaff3562 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -9,7 +9,7 @@ keywords: [Snowflake, destination, data warehouse] ## Install `dlt` with Snowflake **To install the `dlt` library with Snowflake dependencies, run:** ```sh -pip install dlt[snowflake] +pip install "dlt[snowflake]" ``` ## Setup Guide From f6f583c65ef32dfa6bc4741a0db4e5e0a2b096b3 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 24 May 2024 11:28:06 +0200 Subject: [PATCH 28/41] Update rest_api.md fix auth methods table --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index d5d29344de..54edac5062 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -414,8 +414,8 @@ Available authentication types: | Authentication class | String Alias (`type`) | Description | | ------------------- | ----------- | ----------- | | [BearTokenAuth](../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | -| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `api_key` | Basic HTTP authentication. | -| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `http_basic` | API key authentication with key defined in the query parameters or in the headers. | +| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `http_basic` | Basic HTTP authentication. | +| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `api_key` | API key authentication with key defined in the query parameters or in the headers. | To specify the authentication configuration, use the `auth` field in the [client](#client) configuration: From 7c07c674b67c6fac86e0c1f4f1d2b00ebbe7e655 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Fri, 24 May 2024 12:53:36 +0200 Subject: [PATCH 29/41] add typing classifier (#1391) update maintainers --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4bd62ce03b..1a946f5e10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "dlt" version = "0.4.11" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] -maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] +maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] readme = "README.md" license = "Apache-2.0" homepage = "https://github.com/dlt-hub" @@ -13,6 +13,7 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Topic :: Software Development :: Libraries", + "Typing :: Typed", "Operating System :: MacOS :: MacOS X", "Operating System :: POSIX :: Linux", "Operating System :: Microsoft :: Windows",] From ceb229d6bbfcb6cd52545e3af800e8b720fd8c1f Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 24 May 2024 14:19:01 +0200 Subject: [PATCH 30/41] RESTClient: implement AuthConfigBase.__bool__ + update docs (#1398) * Fix AuthConfigBase so its instances always evaluate to True in bool context; change docs to suggest direct inheritance from AuthBase * Add tests --- dlt/sources/helpers/rest_client/auth.py | 6 ++- .../docs/general-usage/http/rest-client.md | 8 ++-- .../helpers/rest_client/test_client.py | 43 ++++++++++++++++++- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 37c0de3db1..020c63a195 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -38,7 +38,11 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - pass + def __bool__(self) -> bool: + # This is needed to avoid AuthConfigBase-derived classes + # which do not implement CredentialsConfiguration interface + # to be evaluated as False in requests.sessions.Session.prepare_request() + return True @configspec diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 19cc95bf78..1093428b0f 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) -For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class. +For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. 
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: ```py -from dlt.sources.helpers.rest_client.auth import AuthConfigBase +from requests.auth import AuthBase -class CustomAuth(AuthConfigBase): +class CustomAuth(AuthBase): def __init__(self, token): self.token = token diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 50defa8edb..e03879b417 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,6 +1,7 @@ import os import pytest from typing import Any, cast +from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue from dlt.sources.helpers.requests import Response, Request from dlt.sources.helpers.rest_client import RESTClient @@ -57,7 +58,6 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), - auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -183,3 +183,44 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) + + def test_custom_auth_success(self, rest_client: RESTClient): + class CustomAuthConfigBase(AuthConfigBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: Request) -> Request: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + class CustomAuthAuthBase(AuthBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: Request) -> Request: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + auth_list = [ + CustomAuthConfigBase("test-token"), + CustomAuthAuthBase("test-token"), + ] + + for auth in auth_list: + response = rest_client.get( + "/protected/posts/bearer-token", + auth=auth, + ) + + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + pages_iter = rest_client.paginate( + "/protected/posts/bearer-token", + auth=auth, + ) + + pages_list = list(pages_iter) + assert_pagination(pages_list) + + assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token" From 841c7b4634006276a8a339e9579c747e86d502b2 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 24 May 2024 18:10:27 +0200 Subject: [PATCH 31/41] Mention JSONPath in resolve docs (#1409) --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 54edac5062..98725627b9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -501,11 +501,13 @@ The syntax for the `resolve` field in parameter configuration is: "": { "type": "resolve", "resource": "", - "field": "", + "field": "", } } ``` +The `field` value can be specified as a [JSONPath](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) to select a nested field in the 
parent resource data. For example: `"field": "items[0].id"`. + Under the hood, dlt handles this by using a [transformer resource](../../general-usage/resource.md#process-resources-with-dlttransformer). #### Include fields from the parent resource From 993ac37a0b6d862543e3a1c4b3a0c0d8793cdd1e Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sat, 25 May 2024 11:19:53 +0200 Subject: [PATCH 32/41] Revert "RESTClient: implement AuthConfigBase.__bool__ + update docs (#1398)" (#1412) This reverts commit ceb229d6bbfcb6cd52545e3af800e8b720fd8c1f. --- dlt/sources/helpers/rest_client/auth.py | 6 +-- .../docs/general-usage/http/rest-client.md | 8 ++-- .../helpers/rest_client/test_client.py | 43 +------------------ 3 files changed, 6 insertions(+), 51 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 020c63a195..37c0de3db1 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -38,11 +38,7 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - def __bool__(self) -> bool: - # This is needed to avoid AuthConfigBase-derived classes - # which do not implement CredentialsConfiguration interface - # to be evaluated as False in requests.sessions.Session.prepare_request() - return True + pass @configspec diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 1093428b0f..19cc95bf78 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) -For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. +For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class. 
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: ```py -from requests.auth import AuthBase +from dlt.sources.helpers.rest_client.auth import AuthConfigBase -class CustomAuth(AuthBase): +class CustomAuth(AuthConfigBase): def __init__(self, token): self.token = token diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index e03879b417..50defa8edb 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,7 +1,6 @@ import os import pytest from typing import Any, cast -from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue from dlt.sources.helpers.requests import Response, Request from dlt.sources.helpers.rest_client import RESTClient @@ -58,6 +57,7 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), + auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -183,44 +183,3 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) - - def test_custom_auth_success(self, rest_client: RESTClient): - class CustomAuthConfigBase(AuthConfigBase): - def __init__(self, token: str): - self.token = token - - def __call__(self, request: Request) -> Request: - request.headers["Authorization"] = f"Bearer {self.token}" - return request - - class CustomAuthAuthBase(AuthBase): - def __init__(self, token: str): - self.token = token - - def __call__(self, request: Request) -> Request: - request.headers["Authorization"] = f"Bearer {self.token}" - return request - - auth_list = [ - CustomAuthConfigBase("test-token"), - CustomAuthAuthBase("test-token"), - ] - - for auth in auth_list: - response = rest_client.get( - "/protected/posts/bearer-token", - auth=auth, - ) - - assert response.status_code == 200 - assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} - - pages_iter = rest_client.paginate( - "/protected/posts/bearer-token", - auth=auth, - ) - - pages_list = list(pages_iter) - assert_pagination(pages_list) - - assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token" From 5d2d1ecb35f01dd63d1fe4d26c6efe47945ef4cd Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Sat, 25 May 2024 18:04:36 +0530 Subject: [PATCH 33/41] updated installation command in destination docs and a few others (#1410) --- docs/website/docs/dlt-ecosystem/destinations/athena.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/bigquery.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/clickhouse.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/databricks.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/dremio.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/duckdb.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/filesystem.md | 6 +++--- docs/website/docs/dlt-ecosystem/destinations/motherduck.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/mssql.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/postgres.md | 2 +- 
docs/website/docs/dlt-ecosystem/destinations/qdrant.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/redshift.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/synapse.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/weaviate.md | 2 +- docs/website/docs/dlt-ecosystem/file-formats/parquet.md | 2 +- .../data-enrichments/currency_conversion_data_enrichment.md | 2 +- .../data-enrichments/url-parser-data-enrichment.md | 2 +- .../data-enrichments/user_agent_device_data_enrichment.md | 2 +- 18 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 76491578fe..7c907664d3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -11,7 +11,7 @@ The Athena destination stores data as Parquet files in S3 buckets and creates [e ## Install dlt with Athena **To install the dlt library with Athena dependencies:** ```sh -pip install dlt[athena] +pip install "dlt[athena]" ``` ## Setup Guide @@ -30,7 +30,7 @@ First, install dependencies by running: ```sh pip install -r requirements.txt ``` -or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. +or with `pip install "dlt[athena]"`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. :::caution diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 54d5abae6d..4f99901e37 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -11,7 +11,7 @@ keywords: [bigquery, destination, data warehouse] **To install the dlt library with BigQuery dependencies:** ```sh -pip install dlt[bigquery] +pip install "dlt[bigquery]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index ea187e54eb..58551751c5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -11,7 +11,7 @@ keywords: [ clickhouse, destination, data warehouse ] **To install the DLT library with ClickHouse dependencies:** ```sh -pip install dlt[clickhouse] +pip install "dlt[clickhouse]" ``` ## Setup Guide @@ -33,7 +33,7 @@ requirements file by executing it as follows: pip install -r requirements.txt ``` -or with `pip install dlt[clickhouse]`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination. +or with `pip install "dlt[clickhouse]"`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination. ### 2. 
Setup ClickHouse database diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index b601809935..6cd5767dcb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -12,7 +12,7 @@ keywords: [Databricks, destination, data warehouse] ## Install dlt with Databricks **To install the dlt library with Databricks dependencies:** ```sh -pip install dlt[databricks] +pip install "dlt[databricks]" ``` ## Set up your Databricks workspace diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md index 0be01e8e32..546f470938 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md +++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md @@ -9,7 +9,7 @@ keywords: [dremio, iceberg, aws, glue catalog] ## Install dlt with Dremio **To install the dlt library with Dremio and s3 dependencies:** ```sh -pip install dlt[dremio,s3] +pip install "dlt[dremio,s3]" ``` ## Setup Guide @@ -28,7 +28,7 @@ First install dependencies by running: ```sh pip install -r requirements.txt ``` -or with `pip install dlt[dremio,s3]` which will install `s3fs`, `pyarrow`, and `botocore` packages. +or with `pip install "dlt[dremio,s3]"` which will install `s3fs`, `pyarrow`, and `botocore` packages. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url` which holds the uploaded parquet files. diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 22c5fd1df9..c2f6786f8d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -9,7 +9,7 @@ keywords: [duckdb, destination, data warehouse] ## Install dlt with DuckDB **To install the dlt library with DuckDB dependencies, run:** ```sh -pip install dlt[duckdb] +pip install "dlt[duckdb]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 0d719b4cfa..4c62e172d8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -6,7 +6,7 @@ The Filesystem destination stores data in remote file systems and bucket storage ## Install dlt with filesystem **To install the dlt library with filesystem dependencies:** ```sh -pip install dlt[filesystem] +pip install "dlt[filesystem]" ``` This installs `s3fs` and `botocore` packages. @@ -125,7 +125,7 @@ client_kwargs = '{"verify": "public.crt"}' ``` #### Google Storage -Run `pip install dlt[gs]` which will install the `gcfs` package. +Run `pip install "dlt[gs]"` which will install the `gcfs` package. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You'll see AWS credentials by default. @@ -148,7 +148,7 @@ if you have default google cloud credentials in your environment (i.e. on cloud Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Object Admin** role to your service account. #### Azure Blob Storage -Run `pip install dlt[az]` which will install the `adlfs` package to interface with Azure Blob Storage. +Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. 
Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials: diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index b053d29ac1..9d8c8d260b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -10,7 +10,7 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] ## Install dlt with MotherDuck **To install the dlt library with MotherDuck dependencies:** ```sh -pip install dlt[motherduck] +pip install "dlt[motherduck]" ``` :::tip diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 3e5b209aaa..1cba484f10 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -9,7 +9,7 @@ keywords: [mssql, sqlserver, destination, data warehouse] ## Install dlt with MS SQL **To install the dlt library with MS SQL dependencies, use:** ```sh -pip install dlt[mssql] +pip install "dlt[mssql]" ``` ## Setup guide @@ -38,7 +38,7 @@ pip install -r requirements.txt ``` or run: ```sh -pip install dlt[mssql] +pip install "dlt[mssql]" ``` This will install `dlt` with the `mssql` extra, which contains all the dependencies required by the SQL server client. diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index 5126272e37..ae504728c3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -9,7 +9,7 @@ keywords: [postgres, destination, data warehouse] ## Install dlt with PostgreSQL **To install the dlt library with PostgreSQL dependencies, run:** ```sh -pip install dlt[postgres] +pip install "dlt[postgres]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 1b560ad6fe..9f19007227 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -14,7 +14,7 @@ This destination helps you load data into Qdrant from [dlt resources](../../gene 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: ```sh -pip install dlt[qdrant] +pip install "dlt[qdrant]" ``` 2. Next, configure the destination in the dlt secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file: diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index 349698d201..7e0679ec6b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -9,7 +9,7 @@ keywords: [redshift, destination, data warehouse] ## Install dlt with Redshift **To install the dlt library with Redshift dependencies:** ```sh -pip install dlt[redshift] +pip install "dlt[redshift]" ``` ## Setup Guide @@ -26,7 +26,7 @@ The above command generates several files and directories, including `.dlt/secre ```sh pip install -r requirements.txt ``` -or withΒ `pip install dlt[redshift]`,Β which installs theΒ `dlt`Β library and the necessary dependencies for working with Amazon Redshift as a destination. 
+or withΒ `pip install "dlt[redshift]"`,Β which installs theΒ `dlt`Β library and the necessary dependencies for working with Amazon Redshift as a destination. ### 2. Setup Redshift cluster To load data into Redshift, you need to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index f1c43b4d54..2e936f193e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -9,7 +9,7 @@ keywords: [synapse, destination, data warehouse] ## Install dlt with Synapse **To install the dlt library with Synapse dependencies:** ```sh -pip install dlt[synapse] +pip install "dlt[synapse]" ``` ## Setup guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 1272b16c86..11d1276ceb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -14,7 +14,7 @@ This destination helps you load data into Weaviate from [dlt resources](../../ge 1. To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: ```sh -pip install dlt[weaviate] +pip install "dlt[weaviate]" ``` 2. Next, configure the destination in the dlt secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file: diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index 8944b7d5fa..414eaf2cb8 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -11,7 +11,7 @@ keywords: [parquet, file formats] To use this format, you need a `pyarrow` package. You can get this package as a `dlt` extra as well: ```sh -pip install dlt[parquet] +pip install "dlt[parquet]" ``` ## Supported Destinations diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md index f8bd179422..82297420ed 100644 --- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -247,7 +247,7 @@ API token. [destination](../../dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md index ab71d3d1d0..f2cd4a1065 100644 --- a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -231,7 +231,7 @@ need to register to use this service neither get an API key. [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. 
Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 3aadb2f982..2448d31a06 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -284,7 +284,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: From 67ccacd81708d2775d453e97c4f80a40b179eaba Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sat, 25 May 2024 15:47:51 +0200 Subject: [PATCH 34/41] Feat/1406 bumps duckdb 0.10 (#1407) * bumps duckdb to 0.10.x * updates dbt support to 1.8 --- dlt/helpers/dbt/__init__.py | 2 +- dlt/helpers/dbt/dbt_utils.py | 12 -- .../custom_destination_bigquery.py | 4 +- poetry.lock | 150 ++++++------------ pyproject.toml | 10 +- .../dbt_tests/test_runner_dbt_versions.py | 11 +- 6 files changed, 65 insertions(+), 124 deletions(-) diff --git a/dlt/helpers/dbt/__init__.py b/dlt/helpers/dbt/__init__.py index b555bcd3a9..4801dcd6b9 100644 --- a/dlt/helpers/dbt/__init__.py +++ b/dlt/helpers/dbt/__init__.py @@ -11,7 +11,7 @@ from dlt.helpers.dbt.runner import create_runner, DBTPackageRunner -DEFAULT_DBT_VERSION = ">=1.1,<1.6" +DEFAULT_DBT_VERSION = ">=1.5,<1.9" # a map of destination names to dbt package names in case they don't match the pure destination name DBT_DESTINATION_MAP = { diff --git a/dlt/helpers/dbt/dbt_utils.py b/dlt/helpers/dbt/dbt_utils.py index bf14504eaa..80774d9858 100644 --- a/dlt/helpers/dbt/dbt_utils.py +++ b/dlt/helpers/dbt/dbt_utils.py @@ -24,7 +24,6 @@ # https://stackoverflow.com/questions/48619517/call-a-click-command-from-code import dbt.logger - from dbt.events import functions from dbt.contracts import results as dbt_results except ModuleNotFoundError: raise MissingDependencyException("DBT Core", ["dbt-core"]) @@ -56,17 +55,6 @@ def set_path_wrapper(self: dbt.logger.LogManager, path: str) -> None: self._file_handler.set_path(path) _DBT_LOGGER_INITIALIZED = True - # def setup_event_logger_wrapper(log_path: str, level_override:str = None) -> None: - # global _DBT_LOGGER_INITIALIZED - - # if not _DBT_LOGGER_INITIALIZED: - # functions.setup_event_logger(log_path, level.lower()) - # # force log level as file is debug only - # # functions.this.FILE_LOG.setLevel(level) - # # functions.this.FILE_LOG.handlers[0].setLevel(level) - # _DBT_LOGGER_INITIALIZED = True - - # dbt.main.setup_event_logger = setup_event_logger_wrapper dbt.logger.LogManager.set_path = set_path_wrapper # type: ignore globs = [] diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index e890469263..380912a9a7 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -60,7 +60,9 @@ def resource(url: str): # dlt bigquery custom destination # we can use the dlt provided credentials class # to retrieve the gcp credentials from the secrets -@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case") +@dlt.destination( + 
name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case" +) def bigquery_insert( items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value ) -> None: diff --git a/poetry.lock b/poetry.lock index dcab5e1730..6159f751c4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2548,106 +2548,58 @@ dates = ["pytz (>=2019.1)"] [[package]] name = "duckdb" -version = "0.9.2" -description = "DuckDB embedded database" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "duckdb-0.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aadcea5160c586704c03a8a796c06a8afffbefefb1986601104a60cb0bfdb5ab"}, - {file = "duckdb-0.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:08215f17147ed83cbec972175d9882387366de2ed36c21cbe4add04b39a5bcb4"}, - {file = "duckdb-0.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ee6c2a8aba6850abef5e1be9dbc04b8e72a5b2c2b67f77892317a21fae868fe7"}, - {file = "duckdb-0.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ff49f3da9399900fd58b5acd0bb8bfad22c5147584ad2427a78d937e11ec9d0"}, - {file = "duckdb-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5ac5baf8597efd2bfa75f984654afcabcd698342d59b0e265a0bc6f267b3f0"}, - {file = "duckdb-0.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81c6df905589a1023a27e9712edb5b724566587ef280a0c66a7ec07c8083623b"}, - {file = "duckdb-0.9.2-cp310-cp310-win32.whl", hash = "sha256:a298cd1d821c81d0dec8a60878c4b38c1adea04a9675fb6306c8f9083bbf314d"}, - {file = "duckdb-0.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:492a69cd60b6cb4f671b51893884cdc5efc4c3b2eb76057a007d2a2295427173"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:061a9ea809811d6e3025c5de31bc40e0302cfb08c08feefa574a6491e882e7e8"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a43f93be768af39f604b7b9b48891f9177c9282a408051209101ff80f7450d8f"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ac29c8c8f56fff5a681f7bf61711ccb9325c5329e64f23cb7ff31781d7b50773"}, - {file = "duckdb-0.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b14d98d26bab139114f62ade81350a5342f60a168d94b27ed2c706838f949eda"}, - {file = "duckdb-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:796a995299878913e765b28cc2b14c8e44fae2f54ab41a9ee668c18449f5f833"}, - {file = "duckdb-0.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6cb64ccfb72c11ec9c41b3cb6181b6fd33deccceda530e94e1c362af5f810ba1"}, - {file = "duckdb-0.9.2-cp311-cp311-win32.whl", hash = "sha256:930740cb7b2cd9e79946e1d3a8f66e15dc5849d4eaeff75c8788d0983b9256a5"}, - {file = "duckdb-0.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:c28f13c45006fd525001b2011cdf91fa216530e9751779651e66edc0e446be50"}, - {file = "duckdb-0.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fbce7bbcb4ba7d99fcec84cec08db40bc0dd9342c6c11930ce708817741faeeb"}, - {file = "duckdb-0.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15a82109a9e69b1891f0999749f9e3265f550032470f51432f944a37cfdc908b"}, - {file = "duckdb-0.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9490fb9a35eb74af40db5569d90df8a04a6f09ed9a8c9caa024998c40e2506aa"}, - {file = "duckdb-0.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:696d5c6dee86c1a491ea15b74aafe34ad2b62dcd46ad7e03b1d00111ca1a8c68"}, - {file = 
"duckdb-0.9.2-cp37-cp37m-win32.whl", hash = "sha256:4f0935300bdf8b7631ddfc838f36a858c1323696d8c8a2cecbd416bddf6b0631"}, - {file = "duckdb-0.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:0aab900f7510e4d2613263865570203ddfa2631858c7eb8cbed091af6ceb597f"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7d8130ed6a0c9421b135d0743705ea95b9a745852977717504e45722c112bf7a"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:974e5de0294f88a1a837378f1f83330395801e9246f4e88ed3bfc8ada65dcbee"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4fbc297b602ef17e579bb3190c94d19c5002422b55814421a0fc11299c0c1100"}, - {file = "duckdb-0.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1dd58a0d84a424924a35b3772419f8cd78a01c626be3147e4934d7a035a8ad68"}, - {file = "duckdb-0.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11a1194a582c80dfb57565daa06141727e415ff5d17e022dc5f31888a5423d33"}, - {file = "duckdb-0.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:be45d08541002a9338e568dca67ab4f20c0277f8f58a73dfc1435c5b4297c996"}, - {file = "duckdb-0.9.2-cp38-cp38-win32.whl", hash = "sha256:dd6f88aeb7fc0bfecaca633629ff5c986ac966fe3b7dcec0b2c48632fd550ba2"}, - {file = "duckdb-0.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:28100c4a6a04e69aa0f4a6670a6d3d67a65f0337246a0c1a429f3f28f3c40b9a"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ae5bf0b6ad4278e46e933e51473b86b4b932dbc54ff097610e5b482dd125552"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e5d0bb845a80aa48ed1fd1d2d285dd352e96dc97f8efced2a7429437ccd1fe1f"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ce262d74a52500d10888110dfd6715989926ec936918c232dcbaddb78fc55b4"}, - {file = "duckdb-0.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6935240da090a7f7d2666f6d0a5e45ff85715244171ca4e6576060a7f4a1200e"}, - {file = "duckdb-0.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5cfb93e73911696a98b9479299d19cfbc21dd05bb7ab11a923a903f86b4d06e"}, - {file = "duckdb-0.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64e3bc01751f31e7572d2716c3e8da8fe785f1cdc5be329100818d223002213f"}, - {file = "duckdb-0.9.2-cp39-cp39-win32.whl", hash = "sha256:6e5b80f46487636368e31b61461940e3999986359a78660a50dfdd17dd72017c"}, - {file = "duckdb-0.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:e6142a220180dbeea4f341708bd5f9501c5c962ce7ef47c1cadf5e8810b4cb13"}, - {file = "duckdb-0.9.2.tar.gz", hash = "sha256:3843afeab7c3fc4a4c0b53686a4cc1d9cdbdadcbb468d60fef910355ecafd447"}, -] - -[[package]] -name = "duckdb" -version = "0.10.0" +version = "0.10.3" description = "DuckDB in-process database" optional = false python-versions = ">=3.7.0" files = [ - {file = "duckdb-0.10.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bd0ffb3fddef0f72a150e4d76e10942a84a1a0447d10907df1621b90d6668060"}, - {file = "duckdb-0.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f3d709d5c7c1a12b5e10d0b05fa916c670cd2b50178e3696faa0cc16048a1745"}, - {file = "duckdb-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9114aa22ec5d591a20ce5184be90f49d8e5b5348ceaab21e102c54560d07a5f8"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a37877efadf39caf7cadde0f430fedf762751b9c54750c821e2f1316705a21"}, - {file = 
"duckdb-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87cbc9e1d9c3fc9f14307bea757f99f15f46843c0ab13a6061354410824ed41f"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0bfec79fed387201550517d325dff4fad2705020bc139d936cab08b9e845662"}, - {file = "duckdb-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c5622134d2d9796b15e09de810e450859d4beb46d9b861357ec9ae40a61b775c"}, - {file = "duckdb-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:089ee8e831ccaef1b73fc89c43b661567175eed0115454880bafed5e35cda702"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a05af63747f1d7021995f0811c333dee7316cec3b06c0d3e4741b9bdb678dd21"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:072d6eba5d8a59e0069a8b5b4252fed8a21f9fe3f85a9129d186a39b3d0aea03"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a77b85668f59b919042832e4659538337f1c7f197123076c5311f1c9cf077df7"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96a666f1d2da65d03199a977aec246920920a5ea1da76b70ae02bd4fb1ffc48c"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ec76a4262b783628d26612d184834852d9c92fb203e91af789100c17e3d7173"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:009dd9d2cdbd3b061a9efbdfc79f2d1a8377bcf49f1e5f430138621f8c083a6c"}, - {file = "duckdb-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:878f06766088090dad4a2e5ee0081555242b2e8dcb29415ecc97e388cf0cf8d8"}, - {file = "duckdb-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:713ff0a1fb63a6d60f454acf67f31656549fb5d63f21ac68314e4f522daa1a89"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9c0ee450dfedfb52dd4957244e31820feef17228da31af6d052979450a80fd19"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ff79b2ea9994398b545c0d10601cd73565fbd09f8951b3d8003c7c5c0cebc7cb"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6bdf1aa71b924ef651062e6b8ff9981ad85bec89598294af8a072062c5717340"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0265bbc8216be3ced7b377ba8847128a3fc0ef99798a3c4557c1b88e3a01c23"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d418a315a07707a693bd985274c0f8c4dd77015d9ef5d8d3da4cc1942fd82e0"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2828475a292e68c71855190b818aded6bce7328f79e38c04a0c75f8f1c0ceef0"}, - {file = "duckdb-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c3aaeaae2eba97035c65f31ffdb18202c951337bf2b3d53d77ce1da8ae2ecf51"}, - {file = "duckdb-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:c51790aaaea97d8e4a58a114c371ed8d2c4e1ca7cbf29e3bdab6d8ccfc5afc1e"}, - {file = "duckdb-0.10.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8af1ae7cc77a12206b6c47ade191882cc8f49f750bb3e72bb86ac1d4fa89926a"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa4f7e8e8dc0e376aeb280b83f2584d0e25ec38985c27d19f3107b2edc4f4a97"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:28ae942a79fad913defa912b56483cd7827a4e7721f4ce4bc9025b746ecb3c89"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01b57802898091455ca2a32c1335aac1e398da77c99e8a96a1e5de09f6a0add9"}, - {file = "duckdb-0.10.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:52e1ad4a55fa153d320c367046b9500578192e01c6d04308ba8b540441736f2c"}, - {file = "duckdb-0.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:904c47d04095af745e989c853f0bfc0776913dfc40dfbd2da7afdbbb5f67fed0"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:184ae7ea5874f3b8fa51ab0f1519bdd088a0b78c32080ee272b1d137e2c8fd9c"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bd33982ecc9bac727a032d6cedced9f19033cbad56647147408891eb51a6cb37"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f59bf0949899105dd5f8864cb48139bfb78454a8c017b8258ba2b5e90acf7afc"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:395f3b18948001e35dceb48a4423d574e38656606d033eef375408b539e7b076"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b8eb2b803be7ee1df70435c33b03a4598cdaf676cd67ad782b288dcff65d781"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:31b2ddd331801064326c8e3587a4db8a31d02aef11332c168f45b3bd92effb41"}, - {file = "duckdb-0.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c8b89e76a041424b8c2026c5dc1f74b53fbbc6c6f650d563259885ab2e7d093d"}, - {file = "duckdb-0.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:79084a82f16c0a54f6bfb7ded5600400c2daa90eb0d83337d81a56924eaee5d4"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:79799b3a270dcd9070f677ba510f1e66b112df3068425691bac97c5e278929c7"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8fc394bfe3434920cdbcfbdd0ac3ba40902faa1dbda088db0ba44003a45318a"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c116605551b4abf5786243a59bcef02bd69cc51837d0c57cafaa68cdc428aa0c"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3191170c3b0a43b0c12644800326f5afdea00d5a4621d59dbbd0c1059139e140"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fee69a50eb93c72dc77e7ab1fabe0c38d21a52c5da44a86aa217081e38f9f1bd"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c5f449e87dacb16b0d145dbe65fa6fdb5a55b2b6911a46d74876e445dd395bac"}, - {file = "duckdb-0.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4487d0df221b17ea4177ad08131bc606b35f25cfadf890987833055b9d10cdf6"}, - {file = "duckdb-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:c099ae2ff8fe939fda62da81704f91e2f92ac45e48dc0e37c679c9d243d01e65"}, - {file = "duckdb-0.10.0.tar.gz", hash = "sha256:c02bcc128002aa79e3c9d89b9de25e062d1096a8793bc0d7932317b7977f6845"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b48f5f1542f1e4b184e6b4fc188f497be8b9c48127867e7d9a5f4a3e334f88b0"}, + {file = 
"duckdb-0.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e327f7a3951ea154bb56e3fef7da889e790bd9a67ca3c36afc1beb17d3feb6d6"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8b20ed67da004b4481973f4254fd79a0e5af957d2382eac8624b5c527ec48c"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d37680b8d7be04e4709db3a66c8b3eb7ceba2a5276574903528632f2b2cc2e60"}, + {file = "duckdb-0.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d34b86d6a2a6dfe8bb757f90bfe7101a3bd9e3022bf19dbddfa4b32680d26a9"}, + {file = "duckdb-0.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:73b1cb283ca0f6576dc18183fd315b4e487a545667ffebbf50b08eb4e8cdc143"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d917dde19fcec8cadcbef1f23946e85dee626ddc133e1e3f6551f15a61a03c61"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46757e0cf5f44b4cb820c48a34f339a9ccf83b43d525d44947273a585a4ed822"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:338c14d8ac53ac4aa9ec03b6f1325ecfe609ceeb72565124d489cb07f8a1e4eb"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:651fcb429602b79a3cf76b662a39e93e9c3e6650f7018258f4af344c816dab72"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3ae3c73b98b6215dab93cc9bc936b94aed55b53c34ba01dec863c5cab9f8e25"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56429b2cfe70e367fb818c2be19f59ce2f6b080c8382c4d10b4f90ba81f774e9"}, + {file = "duckdb-0.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b46c02c2e39e3676b1bb0dc7720b8aa953734de4fd1b762e6d7375fbeb1b63af"}, + {file = "duckdb-0.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:bcd460feef56575af2c2443d7394d405a164c409e9794a4d94cb5fdaa24a0ba4"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e229a7c6361afbb0d0ab29b1b398c10921263c52957aefe3ace99b0426fdb91e"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:732b1d3b6b17bf2f32ea696b9afc9e033493c5a3b783c292ca4b0ee7cc7b0e66"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5380d4db11fec5021389fb85d614680dc12757ef7c5881262742250e0b58c75"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:468a4e0c0b13c55f84972b1110060d1b0f854ffeb5900a178a775259ec1562db"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa1e7ff8d18d71defa84e79f5c86aa25d3be80d7cb7bc259a322de6d7cc72da"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed1063ed97c02e9cf2e7fd1d280de2d1e243d72268330f45344c69c7ce438a01"}, + {file = "duckdb-0.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:22f2aad5bb49c007f3bfcd3e81fdedbc16a2ae41f2915fc278724ca494128b0c"}, + {file = "duckdb-0.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:8f9e2bb00a048eb70b73a494bdc868ce7549b342f7ffec88192a78e5a4e164bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6c2fc49875b4b54e882d68703083ca6f84b27536d57d623fc872e2f502b1078"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a66c125d0c30af210f7ee599e7821c3d1a7e09208196dafbf997d4e0cfcb81ab"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99dd7a1d901149c7a276440d6e737b2777e17d2046f5efb0c06ad3b8cb066a6"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ec3bbdb209e6095d202202893763e26c17c88293b88ef986b619e6c8b6715bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:2b3dec4ef8ed355d7b7230b40950b30d0def2c387a2e8cd7efc80b9d14134ecf"}, + {file = "duckdb-0.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:04129f94fb49bba5eea22f941f0fb30337f069a04993048b59e2811f52d564bc"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d75d67024fc22c8edfd47747c8550fb3c34fb1cbcbfd567e94939ffd9c9e3ca7"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3796e9507c02d0ddbba2e84c994fae131da567ce3d9cbb4cbcd32fadc5fbb26"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:78e539d85ebd84e3e87ec44d28ad912ca4ca444fe705794e0de9be3dd5550c11"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a99b67ac674b4de32073e9bc604b9c2273d399325181ff50b436c6da17bf00a"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1209a354a763758c4017a1f6a9f9b154a83bed4458287af9f71d84664ddb86b6"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b735cea64aab39b67c136ab3a571dbf834067f8472ba2f8bf0341bc91bea820"}, + {file = "duckdb-0.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:816ffb9f758ed98eb02199d9321d592d7a32a6cb6aa31930f4337eb22cfc64e2"}, + {file = "duckdb-0.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:1631184b94c3dc38b13bce4045bf3ae7e1b0ecbfbb8771eb8d751d8ffe1b59b3"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb98c35fc8dd65043bc08a2414dd9f59c680d7e8656295b8969f3f2061f26c52"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e75c9f5b6a92b2a6816605c001d30790f6d67ce627a2b848d4d6040686efdf9"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae786eddf1c2fd003466e13393b9348a44b6061af6fe7bcb380a64cac24e7df7"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9387da7b7973707b0dea2588749660dd5dd724273222680e985a2dd36787668"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:538f943bf9fa8a3a7c4fafa05f21a69539d2c8a68e557233cbe9d989ae232899"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6930608f35025a73eb94252964f9f19dd68cf2aaa471da3982cf6694866cfa63"}, + {file = "duckdb-0.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:03bc54a9cde5490918aad82d7d2a34290e3dfb78d5b889c6626625c0f141272a"}, + {file = "duckdb-0.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:372b6e3901d85108cafe5df03c872dfb6f0dbff66165a0cf46c47246c1957aa0"}, + {file = "duckdb-0.10.3.tar.gz", hash = "sha256:c5bd84a92bc708d3a6adffe1f554b94c6e76c795826daaaf482afc3d9c636971"}, ] [[package]] @@ -9317,11 +9269,11 @@ clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyar databricks = ["databricks-sql-connector"] dbt = ["dbt-athena-community", "dbt-bigquery", "dbt-core", "dbt-databricks", "dbt-duckdb", "dbt-redshift", "dbt-snowflake"] dremio = ["pyarrow"] 
-duckdb = ["duckdb", "duckdb"] +duckdb = ["duckdb"] filesystem = ["botocore", "s3fs"] gcp = ["gcsfs", "google-cloud-bigquery", "grpcio"] gs = ["gcsfs"] -motherduck = ["duckdb", "duckdb", "pyarrow"] +motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] @@ -9335,4 +9287,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "c206bfd3eab8f0c9349398c3c0ed251490bab96254327cb800d45807f05d2997" +content-hash = "605b9b04ed3ae8b71c41eaf532d7bc8ce4f8135ef00593b5f01a82debc3e14c8" diff --git a/pyproject.toml b/pyproject.toml index 1a946f5e10..8beefe409f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,10 +58,12 @@ psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_i grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} pyarrow = {version = ">=12.0.0", optional = true} -duckdb = [ - {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, - {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} -] +duckdb = {version = ">=0.6.1,<0.11", optional = true} +# keep per-python version dependency as a reference +# duckdb = [ +# {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, +# {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} +# ] dbt-core = {version = ">=1.2.0", optional = true} dbt-redshift = {version = ">=1.2.0", optional = true} dbt-bigquery = {version = ">=1.2.0", optional = true} diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index 5b9b07fcc5..a82345d732 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -43,16 +43,13 @@ def client() -> Iterator[PostgresClient]: PACKAGE_PARAMS = [ - # ("postgres", "1.1.3"), - # ("postgres", "1.2.4"), - # ("postgres", "1.3.2"), - # ("postgres", "1.4.0"), ("postgres", "1.5.2"), ("postgres", "1.6.13"), + ("postgres", "1.8.1"), ("postgres", None), - # ("snowflake", "1.4.0"), ("snowflake", "1.5.2"), ("snowflake", "1.6.13"), + ("snowflake", "1.8.1"), ("snowflake", None), ] PACKAGE_IDS = [ @@ -82,10 +79,10 @@ def test_infer_venv_deps() -> None: # provide version ranges requirements = _create_dbt_deps(["duckdb"], dbt_version=">3") # special duckdb dependency - assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.9.2"] + assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.10.3"] # we do not validate version ranges, pip will do it and fail when creating venv requirements = _create_dbt_deps(["motherduck"], dbt_version="y") - assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.9.2"] + assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.10.3"] def test_default_profile_name() -> None: From 3dc187475546e209be7d3025f4378d43e253a5bd Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Sat, 25 May 2024 11:28:12 -0400 Subject: [PATCH 35/41] Azure service principal credentials (#1377) * Support azure service principal credentials * Add sp credentials to docs * Test resolved fsspec instance * Update filesystem config test * Test connect with service credentials * configures destination credentials * fixes toml typo --------- Co-authored-by: Marcin Rudolf --- dlt/common/configuration/specs/__init__.py | 11 +- .../configuration/specs/azure_credentials.py | 50 +++++++- 
dlt/common/storages/configuration.py | 11 +- .../dlt-ecosystem/destinations/filesystem.md | 24 +++- .../load/filesystem/test_azure_credentials.py | 110 +++++++++++++++++- .../load/filesystem/test_filesystem_common.py | 10 +- 6 files changed, 191 insertions(+), 25 deletions(-) diff --git a/dlt/common/configuration/specs/__init__.py b/dlt/common/configuration/specs/__init__.py index 9acf14bde3..f1d7d819ff 100644 --- a/dlt/common/configuration/specs/__init__.py +++ b/dlt/common/configuration/specs/__init__.py @@ -20,7 +20,13 @@ from .connection_string_credentials import ConnectionStringCredentials from .api_credentials import OAuth2Credentials from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults -from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults +from .azure_credentials import ( + AzureCredentials, + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AnyAzureCredentials, +) # backward compatibility for service account credentials @@ -51,6 +57,9 @@ "AwsCredentialsWithoutDefaults", "AzureCredentials", "AzureCredentialsWithoutDefaults", + "AzureServicePrincipalCredentials", + "AzureServicePrincipalCredentialsWithoutDefaults", + "AnyAzureCredentials", "GcpClientCredentials", "GcpClientCredentialsWithDefault", ] diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 52d33ec0d3..8b8fc259f2 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Union from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue @@ -7,10 +7,6 @@ CredentialsWithDefault, configspec, ) -from dlt.common.configuration.specs.exceptions import InvalidBoto3Session -from dlt import version - -import fsspec @configspec @@ -50,6 +46,22 @@ def on_partial(self) -> None: self.resolve() +@configspec +class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration): + azure_storage_account_name: str = None + azure_tenant_id: str = None + azure_client_id: str = None + azure_client_secret: TSecretStrValue = None + + def to_adlfs_credentials(self) -> Dict[str, Any]: + return dict( + account_name=self.azure_storage_account_name, + tenant_id=self.azure_tenant_id, + client_id=self.azure_client_id, + client_secret=self.azure_client_secret, + ) + + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): def on_partial(self) -> None: @@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: if self.has_default_credentials(): base_kwargs["anon"] = False return base_kwargs + + +@configspec +class AzureServicePrincipalCredentials( + AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault +): + def on_partial(self) -> None: + from azure.identity import DefaultAzureCredential + + self._set_default_credentials(DefaultAzureCredential()) + if self.azure_storage_account_name: + self.resolve() + + def to_adlfs_credentials(self) -> Dict[str, Any]: + base_kwargs = super().to_adlfs_credentials() + if self.has_default_credentials(): + base_kwargs["anon"] = False + return base_kwargs + + +AnyAzureCredentials = Union[ + # Credentials without defaults come first because union types are attempted in order + # and explicit config should supersede system defaults + AzureCredentialsWithoutDefaults, + 
AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentials, + AzureServicePrincipalCredentials, +] diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index a1838fab6e..6e100536af 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -10,8 +10,7 @@ GcpServiceAccountCredentials, AwsCredentials, GcpOAuthCredentials, - AzureCredentials, - AzureCredentialsWithoutDefaults, + AnyAzureCredentials, BaseConfiguration, ) from dlt.common.typing import DictStrAny @@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration): FileSystemCredentials = Union[ - AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials + AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials ] @@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration): "gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "s3": AwsCredentials, - "az": Union[AzureCredentialsWithoutDefaults, AzureCredentials], - "abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials], - "adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials], + "az": AnyAzureCredentials, + "abfs": AnyAzureCredentials, + "adl": AnyAzureCredentials, } bucket_url: str = None diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 4c62e172d8..3e2e08013c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -150,7 +150,13 @@ Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Ob #### Azure Blob Storage Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. -Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials: +Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials. + +Two forms of Azure credentials are supported: + +##### SAS token credentials + +Supply storage account name and either sas token or storage account key ```toml [destination.filesystem] @@ -168,6 +174,20 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default. Note that `azure_storage_account_name` is still required as it can't be inferred from the environment. +##### Service principal credentials + +Supply a client ID, client secret and a tenant ID for a service principal authorized to access your container + +```toml +[destination.filesystem] +bucket_url = "az://[your_container name]" # replace with your container name + +[destination.filesystem.credentials] +azure_client_id = "client_id" # please set me up! +azure_client_secret = "client_secret" +azure_tenant_id = "tenant_id" # please set me up! +``` + #### Local file system If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) @@ -458,4 +478,4 @@ managed in the regular way by the final destination you have configured. You will also notice `init` files being present in the root folder and the special `dlt` folders. 
In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. - \ No newline at end of file + diff --git a/tests/load/filesystem/test_azure_credentials.py b/tests/load/filesystem/test_azure_credentials.py index 467ba55a4f..4ee2ec46db 100644 --- a/tests/load/filesystem/test_azure_credentials.py +++ b/tests/load/filesystem/test_azure_credentials.py @@ -1,15 +1,24 @@ -from typing import Dict +from typing import Dict, Optional from urllib.parse import parse_qs +from uuid import uuid4 import pytest +import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException -from dlt.common.configuration.specs import AzureCredentials -from tests.load.utils import ALL_FILESYSTEM_DRIVERS +from dlt.common.configuration.specs import ( + AzureCredentials, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentialsWithoutDefaults, +) +from dlt.common.storages.configuration import FilesystemConfiguration +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET from tests.common.configuration.utils import environment from tests.utils import preserve_environ, autouse_test_storage +from dlt.common.storages.fsspec_filesystem import fsspec_from_config # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -18,6 +27,27 @@ pytest.skip("az filesystem driver not configured", allow_module_level=True) +@pytest.fixture +def az_service_principal_config() -> Optional[FilesystemConfiguration]: + """FS config with alternate azure credentials format if available in environment + + Working credentials of this type may be created as an app in Entra, which has + R/W/E access to the bucket (via ACL of particular container) + + """ + credentials = AzureServicePrincipalCredentialsWithoutDefaults( + azure_tenant_id=dlt.config.get("tests.az_sp_tenant_id", str), + azure_client_id=dlt.config.get("tests.az_sp_client_id", str), + azure_client_secret=dlt.config.get("tests.az_sp_client_secret", str), # type: ignore[arg-type] + azure_storage_account_name=dlt.config.get("tests.az_sp_storage_account_name", str), + ) + # + credentials = resolve_configuration(credentials, sections=("destination", "fsazureprincipal")) + cfg = FilesystemConfiguration(bucket_url=AZ_BUCKET, credentials=credentials) + + return resolve_configuration(cfg) + + def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None: environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" @@ -95,3 +125,77 @@ def test_azure_credentials_from_default(environment: Dict[str, str]) -> None: "sas_token": None, "anon": False, } + + +def test_azure_service_principal_credentials(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "fake_client_secret" + environment["CREDENTIALS__AZURE_TENANT_ID"] = "fake_tenant_id" + + config = resolve_configuration(AzureServicePrincipalCredentials()) + + assert config.azure_client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert config.azure_client_secret == 
environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + assert config.azure_tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + + assert config.to_adlfs_credentials() == { + "account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"], + "client_id": environment["CREDENTIALS__AZURE_CLIENT_ID"], + "client_secret": environment["CREDENTIALS__AZURE_CLIENT_SECRET"], + "tenant_id": environment["CREDENTIALS__AZURE_TENANT_ID"], + } + + +def test_azure_filesystem_configuration_service_principal(environment: Dict[str, str]) -> None: + """Filesystem config resolves correct credentials type""" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "asdsadas" + environment["CREDENTIALS__AZURE_TENANT_ID"] = str(uuid4()) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureServicePrincipalCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + assert fs.client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert fs.client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + + +def test_azure_filesystem_configuration_sas_token(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = ( + "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890" + ) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.sas_token == "?" 
+ environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] + assert fs.account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] + + +def test_azure_service_principal_fs_operations( + az_service_principal_config: Optional[FilesystemConfiguration], +) -> None: + """Test connecting to azure filesystem with service principal credentials""" + config = az_service_principal_config + fs, bucket = fsspec_from_config(config) + + fn = uuid4().hex + # Try some file ops to see if the credentials work + fs.touch(f"{bucket}/{fn}/{fn}") + files = fs.ls(f"{bucket}/{fn}") + assert f"{bucket}/{fn}/{fn}" in files + fs.delete(f"{bucket}/{fn}/{fn}") + fs.rmdir(f"{bucket}/{fn}") diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 3677765c9f..c069f88a15 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -12,10 +12,7 @@ from dlt.common import json, pendulum from dlt.common.configuration import resolve from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import ( - AzureCredentials, - AzureCredentialsWithoutDefaults, -) +from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id @@ -43,10 +40,7 @@ def test_filesystem_configuration() -> None: config = FilesystemConfiguration(bucket_url="az://root") assert config.protocol == "az" # print(config.resolve_credentials_type()) - assert ( - config.resolve_credentials_type() - == Union[AzureCredentialsWithoutDefaults, AzureCredentials] - ) + assert config.resolve_credentials_type() == AnyAzureCredentials assert dict(config) == { "read_only": False, "bucket_url": "az://root", From 4fcfa28da3891b4862f8ec466d33159cdde70f2e Mon Sep 17 00:00:00 2001 From: rudolfix Date: Mon, 27 May 2024 15:03:50 +0200 Subject: [PATCH 36/41] RESTClient: implement AuthConfigBase.__bool__ + update docs (#1413) * Fix AuthConfigBase so its instances always evaluate to True in bool context; change docs to suggest direct inheritance from AuthBase * Add tests * Fix formatting * uses AuthBase as auth type --------- Co-authored-by: Anton Burnashev --- dlt/sources/helpers/rest_client/auth.py | 6 ++- dlt/sources/helpers/rest_client/client.py | 14 +++--- .../docs/general-usage/http/rest-client.md | 8 ++-- .../helpers/rest_client/test_client.py | 46 ++++++++++++++++++- 4 files changed, 61 insertions(+), 13 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 37c0de3db1..020c63a195 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -38,7 +38,11 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - pass + def __bool__(self) -> bool: + # This is needed to avoid AuthConfigBase-derived classes + # which do not implement CredentialsConfiguration interface + # to be evaluated as False in requests.sessions.Session.prepare_request() + return True @configspec diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index b4b62fa849..7d1145a890 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -6,12 +6,14 @@ Any, TypeVar, Iterable, + Union, cast, ) import copy from 
urllib.parse import urlparse from requests import Session as BaseSession # noqa: I251 from requests import Response, Request +from requests.auth import AuthBase from dlt.common import jsonpath, logger @@ -41,7 +43,7 @@ def __init__( request: Request, response: Response, paginator: BasePaginator, - auth: AuthConfigBase, + auth: AuthBase, ): super().__init__(__iterable) self.request = request @@ -57,7 +59,7 @@ class RESTClient: Args: base_url (str): The base URL of the API to make requests to. headers (Optional[Dict[str, str]]): Default headers to include in all requests. - auth (Optional[AuthConfigBase]): Authentication configuration for all requests. + auth (Optional[AuthBase]): Authentication configuration for all requests. paginator (Optional[BasePaginator]): Default paginator for handling paginated responses. data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for extracting data from responses. session (BaseSession): HTTP session for making requests. @@ -69,7 +71,7 @@ def __init__( self, base_url: str, headers: Optional[Dict[str, str]] = None, - auth: Optional[AuthConfigBase] = None, + auth: Optional[AuthBase] = None, paginator: Optional[BasePaginator] = None, data_selector: Optional[jsonpath.TJsonPath] = None, session: BaseSession = None, @@ -105,7 +107,7 @@ def _create_request( method: HTTPMethod, params: Dict[str, Any], json: Optional[Dict[str, Any]] = None, - auth: Optional[AuthConfigBase] = None, + auth: Optional[AuthBase] = None, hooks: Optional[Hooks] = None, ) -> Request: parsed_url = urlparse(path) @@ -154,7 +156,7 @@ def paginate( method: HTTPMethodBasic = "GET", params: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, - auth: Optional[AuthConfigBase] = None, + auth: Optional[AuthBase] = None, paginator: Optional[BasePaginator] = None, data_selector: Optional[jsonpath.TJsonPath] = None, hooks: Optional[Hooks] = None, @@ -166,7 +168,7 @@ def paginate( method (HTTPMethodBasic): HTTP method for the request, defaults to 'get'. params (Optional[Dict[str, Any]]): URL parameters for the request. json (Optional[Dict[str, Any]]): JSON payload for the request. - auth (Optional[AuthConfigBase]): Authentication configuration for the request. + auth (Optional[AuthBase): Authentication configuration for the request. paginator (Optional[BasePaginator]): Paginator instance for handling pagination logic. data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 19cc95bf78..1093428b0f 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) -For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class. +For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. 
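Since the `auth` argument is typed as `requests`' `AuthBase`, any auth object from the Requests library itself can also be passed straight to the client, not only the classes shipped with `dlt`. A minimal sketch (the base URL, endpoint, and credentials are placeholders):

```py
from requests.auth import HTTPBasicAuth

from dlt.sources.helpers.rest_client import RESTClient

# Any AuthBase-derived instance is accepted by the auth parameter
client = RESTClient(
    base_url="https://api.example.com",
    auth=HTTPBasicAuth("username", "password"),
)

response = client.get("/protected/resource")
```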
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: ```py -from dlt.sources.helpers.rest_client.auth import AuthConfigBase +from requests.auth import AuthBase -class CustomAuth(AuthConfigBase): +class CustomAuth(AuthBase): def __init__(self, token): self.token = token diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 50defa8edb..7f03c6d167 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,8 +1,10 @@ import os import pytest from typing import Any, cast +from requests import PreparedRequest, Request +from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue -from dlt.sources.helpers.requests import Response, Request +from dlt.sources.helpers.requests import Response from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.client import Hooks from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator @@ -57,7 +59,6 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), - auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -183,3 +184,44 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) + + def test_custom_auth_success(self, rest_client: RESTClient): + class CustomAuthConfigBase(AuthConfigBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + class CustomAuthAuthBase(AuthBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + auth_list = [ + CustomAuthConfigBase("test-token"), + CustomAuthAuthBase("test-token"), + ] + + for auth in auth_list: + response = rest_client.get( + "/protected/posts/bearer-token", + auth=auth, + ) + + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + pages_iter = rest_client.paginate( + "/protected/posts/bearer-token", + auth=auth, + ) + + pages_list = list(pages_iter) + assert_pagination(pages_list) + + assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token" From 1322fbccbf2564108534535fc35ad5f645d4c508 Mon Sep 17 00:00:00 2001 From: VioletM Date: Mon, 27 May 2024 09:42:15 -0400 Subject: [PATCH 37/41] Update filesystem docs with auto mkdir config (#1416) --- .../dlt-ecosystem/destinations/filesystem.md | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 3e2e08013c..9c7d961d3a 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -21,9 +21,7 @@ pip install s3fs so pip does not fail on 
backtracking. ::: -## Setup Guide - -### 1. Initialise the dlt project +## Initialise the dlt project Let's start by initializing a new dlt project as follows: ```sh @@ -33,9 +31,9 @@ Let's start by initializing a new dlt project as follows: This command will initialize your pipeline with chess as the source and the AWS S3 filesystem as the destination. ::: -### 2. Set up bucket storage and credentials +## Set up bucket storage and credentials -#### AWS S3 +### AWS S3 The command above creates a sample `secrets.toml` and requirements file for AWS S3 bucket. You can install those dependencies by running: ```sh pip install -r requirements.txt @@ -100,7 +98,7 @@ You need to create an S3 bucket and a user who can access that bucket. `dlt` doe 5. To grab the access and secret key for the user. Go to IAM > Users and in the β€œSecurity Credentials”, click on β€œCreate Access Key”, and preferably select β€œCommand Line Interface” and create the access key. 6. Grab the β€œAccess Key” and β€œSecret Access Key” created that are to be used in "secrets.toml". -##### Using S3 compatible storage +#### Using S3 compatible storage To use an S3 compatible storage other than AWS S3 like [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: @@ -114,7 +112,7 @@ aws_secret_access_key = "please set me up!" # copy the secret access key here endpoint_url = "https://.r2.cloudflarestorage.com" # copy your endpoint URL here ``` -##### Adding Additional Configuration +#### Adding Additional Configuration To pass any additional arguments to `fsspec`, you may supply `kwargs` and `client_kwargs` in the config as a **stringified dictionary**: @@ -124,7 +122,7 @@ kwargs = '{"use_ssl": true, "auto_mkdir": true}' client_kwargs = '{"verify": "public.crt"}' ``` -#### Google Storage +### Google Storage Run `pip install "dlt[gs]"` which will install the `gcfs` package. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. @@ -147,14 +145,14 @@ if you have default google cloud credentials in your environment (i.e. on cloud Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Object Admin** role to your service account. -#### Azure Blob Storage +### Azure Blob Storage Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials. Two forms of Azure credentials are supported: -##### SAS token credentials +#### SAS token credentials Supply storage account name and either sas token or storage account key @@ -174,7 +172,7 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default. Note that `azure_storage_account_name` is still required as it can't be inferred from the environment. -##### Service principal credentials +#### Service principal credentials Supply a client ID, client secret and a tenant ID for a service principal authorized to access your container @@ -188,7 +186,7 @@ azure_client_secret = "client_secret" azure_tenant_id = "tenant_id" # please set me up! 
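# Assumption based on the AzureServicePrincipalCredentials spec above: the storage
# account name is also defined there (passed to adlfs as account_name), so set it too
azure_storage_account_name = "account_name" # please set me up!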
``` -#### Local file system +### Local file system If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) ```toml @@ -196,10 +194,24 @@ If for any reason you want to have those files in a local folder, set up the `bu bucket_url = "file:///absolute/path" # three / for an absolute path ``` -`dlt` correctly handles the native local file paths. Indeed, using the `file://` schema may be not intuitive especially for Windows users. +:::tip +For handling deeply nested layouts, consider enabling automatic directory creation for the local filesystem destination. This can be done by setting `kwargs` in `secrets.toml`: ```toml [destination.filesystem] +kwargs = '{"auto_mkdir": true}' +``` + +Or by setting environment variable: +```sh +export DESTINATION__FILESYSTEM__KWARGS = '{"auto_mkdir": true/false}' +``` +::: + +`dlt` correctly handles the native local file paths. Indeed, using the `file://` schema may be not intuitive especially for Windows users. + +```toml +[destination.unc_destination] bucket_url = 'C:\a\b\c' ``` @@ -379,18 +391,17 @@ Please note: The filesystem destination configuration supports advanced layout customization and the inclusion of additional placeholders. This can be done through `config.toml` or programmatically when initializing via a factory method. -:::tip -For handling deeply nested layouts, consider enabling automatic directory creation for the local filesystem destination. This can be done by setting `kwargs = '{"auto_mkdir": true}'` to facilitate the creation of directories automatically. -::: - #### Configuration via `config.toml` To configure the layout and placeholders using `config.toml`, use the following format: ```toml +[destination.filesystem] layout = "{table_name}/{test_placeholder}/{YYYY}-{MM}-{DD}/{ddd}/{mm}/{load_id}.{file_id}.{ext}" extra_placeholders = { "test_placeholder" = "test_value" } current_datetime="2024-04-14T00:00:00" +# for automatic directory creation in the local filesystem +kwargs = '{"auto_mkdir": true}' ``` :::note @@ -478,4 +489,4 @@ managed in the regular way by the final destination you have configured. You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. 
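Returning to the layout customization above: besides `config.toml`, the same layout options can be supplied when creating the destination programmatically. A hedged sketch, assuming the `filesystem` factory accepts `layout` and `extra_placeholders` keyword arguments mirroring the `config.toml` keys (the bucket path and placeholder value are examples):

```py
import dlt
from dlt.destinations import filesystem

# Keyword arguments assumed to mirror the config.toml keys shown above
dest = filesystem(
    bucket_url="file:///absolute/path",
    layout="{table_name}/{test_placeholder}/{YYYY}-{MM}-{DD}/{load_id}.{file_id}.{ext}",
    extra_placeholders={"test_placeholder": "test_value"},
)

pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination=dest)
```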
- + \ No newline at end of file From b94c807e9a5de13fdbbaa226d45ef3daefb86fb9 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Mon, 27 May 2024 18:02:19 +0200 Subject: [PATCH 38/41] add page to docs for openapi generator (#1417) * add a page for the openapi generator * small updates to readme * small readme updates (alena) * fixed readme (anton 2nd review) * fix relative links * small update --- .../verified-sources/openapi-generator.md | 210 ++++++++++++++++++ docs/website/sidebars.js | 1 + 2 files changed, 211 insertions(+) create mode 100644 docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md b/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md new file mode 100644 index 0000000000..a987a55b15 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md @@ -0,0 +1,210 @@ +--- +title: OpenAPI source generator +description: OpenAPI dlt source generator +keywords: [openapi, rest api, swagger, source generator, cli, rest] +--- +import Header from './_source-info-header.md'; + +# OpenAPI source generator + +
+ +Our OpenAPI source generator - `dlt-init-openapi` - generates [`dlt`](https://dlthub.com/docs) data pipelines from [OpenAPI 3.x specs](https://swagger.io/specification/) using the [rest_api verified source](./rest_api) to extract data from any REST API. If you are not familiar with the `rest_api` source, please read [rest_api](./rest_api) to learn how our `rest_api` source works. + +:::tip +We also have a cool [Google Colab example](https://colab.research.google.com/drive/1MRZvguOTZj1MlkEGzjiso8lQ_wr1MJRI?usp=sharing#scrollTo=LHGxzf1Ev_yr) that demonstrates this generator. 😎 +::: + +## Features +`dlt-init-openapi` generates code from an OpenAPI spec that you can use to extract data from a `rest_api` into any [`destination`](../destinations/) (e.g., Postgres, BigQuery, Redshift...) that `dlt` supports. dlt-init-openapi additionally executes a set of heuristics to discover information not explicitly defined in OpenAPI specs. + +Features include: + +* **[Pagination](./rest_api#pagination) discovery** for each endpoint. +* **Primary key discovery** for each entity. +* **Endpoint relationship mapping** into `dlt` [`transformers`](../../general-usage/resource#process-resources-with-dlttransformer) (e.g., /users/ -> /user/{id}). +* **Payload JSON path [data selector](./rest_api#data-selection) discovery** for results nested in the returned JSON. +* **[Authentication](./rest_api#authentication)** discovery for an API. + +## A quick example + +You will need Python 3.9 or higher installed, as well as pip. You can run `pip install dlt-init-openapi` to install the current version. + +We will create a simple example pipeline from a [PokeAPI spec](https://pokeapi.co/) in our repo. You can point to any other OpenAPI Spec instead if you prefer. + + +1. Run the generator with a URL: + ```sh + dlt-init-openapi pokemon --url https://raw.githubusercontent.com/dlt-hub/dlt-init-openapi/devel/tests/cases/e2e_specs/pokeapi.yml --global-limit 2 + ``` + +2. Alternatively, if you have a local file, you can use the --path flag: + ```sh + dlt-init-openapi pokemon --path ./my_specs/pokeapi.yml + ``` + +3. You can now pick both of the endpoints from the popup. + +4. After selecting your Pokemon endpoints and hitting Enter, your pipeline will be rendered. + +5. If you have any kind of authentication on your pipeline (this example does not), open the `.dlt/secrets.toml` and provide the credentials. You can find further settings in the `.dlt/config.toml`. + +6. Go to the created pipeline folder and run your pipeline. + ```sh + cd pokemon-pipeline + PROGRESS=enlighten python pipeline.py # we use enlighten for a nice progress bar :) + ``` + +7. Print the pipeline info to the console to see what got loaded. + ```sh + dlt pipeline pokemon_pipeline info + ``` + +8. You can now also install Streamlit to see a preview of the data; you should have loaded 40 Pokemons and their details. + ```sh + pip install pandas streamlit + dlt pipeline pokemon_pipeline show + ``` + +9. You can go to our docs at https://dlthub.com/docs to learn how to modify the generated pipeline to load to many destinations, place schema contracts on your pipeline, and many other things. + +:::note +We used the `--global-limit 2` CLI flag to limit the requests to the PokeAPI +for this example. This way, the Pokemon collection endpoint only gets queried +twice, resulting in 2 x 20 Pokemon details being rendered. +::: + +## What will be created? 
+ +When you run the `dlt-init-openapi` command above, the following files will be generated: + +```text +pokemon_pipeline/ +β”œβ”€β”€ .dlt/ +β”‚ β”œβ”€β”€ config.toml # dlt config, learn more at dlthub.com/docs +β”‚ └── secrets.toml # your secrets, only needed for APIs with auth +β”œβ”€β”€ pokemon/ +β”‚ └── __init__.py # your rest_api dictionary, learn more below +β”œβ”€β”€ rest_api/ +β”‚ └── ... # rest_api copied from our verified sources repo +β”œβ”€β”€ .gitignore +β”œβ”€β”€ pokemon_pipeline.py # your pipeline file that you can execute +β”œβ”€β”€ README.md # a list of your endpoints with some additional info +└── requirements.txt # the pip requirements for your pipeline +``` + +:::warning +If you re-generate your pipeline, you will be prompted to continue if this folder exists. If you select yes, all generated files will be overwritten. All other files you may have created will remain in this folder. In non-interactive mode you will not be asked, and the generated files will be overwritten. +::: + +## A closer look at your `rest_api` dictionary in `pokemon/__init__.py` + +This file contains the [configuration dictionary](./rest_api#source-configuration) for the rest_api source which is the main result of running this generator. For our Pokemon example, we have used an OpenAPI 3 spec that works out of the box. The result of this dictionary depends on the quality of the spec you are using, whether the API you are querying actually adheres to this spec, and whether our heuristics manage to find the right values. + +The generated dictionary will look something like this: + +```py +{ + "client": { + "base_url": base_url, + # -> the detected common paginator + "paginator": { + ... + }, + }, + # -> your two endpoints + "resources": [ + { + # -> A primary key could not be inferred from + # the spec; usual suspects such as id, pokemon_id, etc. + # are not defined. You can add one if you know. + "name": "pokemon_list", + "table_name": "pokemon", + "endpoint": { + # -> the results seem to be nested in { results: [...] } + "data_selector": "results", + "path": "/api/v2/pokemon/", + }, + }, + { + "name": "pokemon_read", + "table_name": "pokemon", + # -> A primary key *name* is assumed, as it is found in the + # url. + "primary_key": "name", + "write_disposition": "merge", + "endpoint": { + "data_selector": "$", + "path": "/api/v2/pokemon/{name}/", + "params": { + # -> your detected transformer settings + # this is a child endpoint of the pokemon_list + "name": { + "type": "resolve", + "resource": "pokemon_list", + "field": "name", + }, + }, + }, + }, + ], +} +``` + +:::info +You can edit this file to adapt the behavior of the dlt rest_api accordingly. Please read our [dlt rest_api](./rest_api) docs to learn how to configure the rest_api source and check out our detailed [Google Colab example](https://colab.research.google.com/drive/1MRZvguOTZj1MlkEGzjiso8lQ_wr1MJRI?usp=sharing#scrollTo=LHGxzf1Ev_yr). +::: + +## CLI command + +```sh +dlt-init-openapi [OPTIONS] +``` + +### Example: +```sh +dlt-init-openapi pokemon --path ./path/to/my_spec.yml --no-interactive --output-path ./my_pipeline +``` + +**Options**: + +_The only required options are either to supply a path or a URL to a spec_ + +- `--url URL`: A URL to read the OpenAPI JSON or YAML file from. +- `--path PATH`: A path to read the OpenAPI JSON or YAML file from locally. +- `--output-path PATH`: A path to render the output to. +- `--config PATH`: Path to the config file to use (see below). 
+- `--no-interactive`: Skip endpoint selection and render all paths of the OpenAPI spec. +- `--log-level`: Set the logging level for stdout output, defaults to 20 (INFO). +- `--global-limit`: Set a global limit on the generated source. +- `--update-rest-api-source`: Update the locally cached rest_api verified source. +- `--allow-openapi-2`: Allows the use of OpenAPI v2. specs. Migration of the spec to 3.0 is recommended for better results though. +- `--version`: Show the installed version of the generator and exit. +- `--help`: Show this message and exit. + +## Config options +You can pass a path to a config file with the `--config PATH` argument. To see available config values, go to https://github.com/dlt-hub/dlt-init-openapi/blob/devel/dlt_init_openapi/config.py and read the information below each field on the `Config` class. + +The config file can be supplied as JSON or YAML dictionary. For example, to change the package name, you can create a YAML file: + +```yaml +# config.yml +package_name: "other_package_name" +``` + +And use it with the config argument: + +```sh +$ dlt-init-openapi pokemon --url ... --config config.yml +``` + +## Telemetry +We track your usage of this tool similar to how we track other commands in the dlt core library. Read more about this and how to disable it [here](../../reference/telemetry). + +## Prior work +This project started as a fork of [openapi-python-client](https://github.com/openapi-generators/openapi-python-client). Pretty much all parts are heavily changed or completely replaced, but some lines of code still exist, and we like to acknowledge the many good ideas we got from the original project :) + +## Implementation notes +* OAuth Authentication currently is not natively supported. You can supply your own. +* Per endpoint authentication currently is not supported by the generator. Only the first globally set securityScheme will be applied. You can add your own per endpoint if you need to. +* Basic OpenAPI 2.0 support is implemented. We recommend updating your specs at https://editor.swagger.io before using `dlt-init-openapi`. 
\ No newline at end of file diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index a3fe12c8fb..d3d7def8fc 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -84,6 +84,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/personio', 'dlt-ecosystem/verified-sources/pipedrive', 'dlt-ecosystem/verified-sources/rest_api', + 'dlt-ecosystem/verified-sources/openapi-generator', 'dlt-ecosystem/verified-sources/salesforce', 'dlt-ecosystem/verified-sources/scrapy', 'dlt-ecosystem/verified-sources/shopify', From 441598883dfd8303f572b5e092fe01b45040658c Mon Sep 17 00:00:00 2001 From: Alexander Butler <41213451+z3z1ma@users.noreply.github.com> Date: Mon, 27 May 2024 19:40:20 +0100 Subject: [PATCH 39/41] Fix: ensure custom session can be provided to rest client (#1396) * fix: ensure custom session can be provided to rest client * fix: move request client retry to correct central req method used in all codepaths * chore: use adapter mock to replicate production code path more accurately * chore: rename session warn func and add docstring * fix: linting err * creates explicit session in rest client tests * allows custom sessions in oauth2 jwt of rest client * adds NotResolved type annotations that excludes type from resolving in configspec * fixes weaviate test --------- Co-authored-by: Marcin Rudolf --- dlt/common/configuration/__init__.py | 9 +- dlt/common/configuration/resolve.py | 6 +- .../configuration/specs/base_configuration.py | 38 ++++++++- dlt/common/destination/reference.py | 16 ++-- dlt/destinations/impl/qdrant/configuration.py | 7 +- .../impl/weaviate/configuration.py | 7 +- dlt/sources/helpers/requests/retry.py | 2 +- dlt/sources/helpers/rest_client/auth.py | 16 +++- dlt/sources/helpers/rest_client/client.py | 23 ++--- .../configuration/test_configuration.py | 55 +++++++++++- tests/load/utils.py | 6 +- tests/load/weaviate/test_weaviate_client.py | 8 +- .../helpers/rest_client/test_client.py | 20 ++++- tests/sources/helpers/test_requests.py | 84 ++++++++++--------- 14 files changed, 216 insertions(+), 81 deletions(-) diff --git a/dlt/common/configuration/__init__.py b/dlt/common/configuration/__init__.py index 8de57f7799..2abc31b17d 100644 --- a/dlt/common/configuration/__init__.py +++ b/dlt/common/configuration/__init__.py @@ -1,4 +1,10 @@ -from .specs.base_configuration import configspec, is_valid_hint, is_secret_hint, resolve_type +from .specs.base_configuration import ( + configspec, + is_valid_hint, + is_secret_hint, + resolve_type, + NotResolved, +) from .specs import known_sections from .resolve import resolve_configuration, inject_section from .inject import with_config, last_config, get_fun_spec, create_resolved_partial @@ -15,6 +21,7 @@ "configspec", "is_valid_hint", "is_secret_hint", + "NotResolved", "resolve_type", "known_sections", "resolve_configuration", diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index ebfa7b6b89..9101cfdd9c 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -8,7 +8,6 @@ StrAny, TSecretValue, get_all_types_of_class_in_union, - is_final_type, is_optional_type, is_union_type, ) @@ -21,6 +20,7 @@ is_context_inner_hint, is_base_configuration_inner_hint, is_valid_hint, + is_hint_not_resolved, ) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.specs.exceptions import NativeValueError @@ -194,7 +194,7 @@ def _resolve_config_fields( if explicit_values: explicit_value = 
explicit_values.get(key) else: - if is_final_type(hint): + if is_hint_not_resolved(hint): # for final fields default value is like explicit explicit_value = default_value else: @@ -258,7 +258,7 @@ def _resolve_config_fields( unresolved_fields[key] = traces # set resolved value in config if default_value != current_value: - if not is_final_type(hint): + if not is_hint_not_resolved(hint): # ignore final types setattr(config, key, current_value) diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 1329feae6c..006cde8dce 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -20,7 +20,7 @@ ClassVar, TypeVar, ) -from typing_extensions import get_args, get_origin, dataclass_transform +from typing_extensions import get_args, get_origin, dataclass_transform, Annotated, TypeAlias from functools import wraps if TYPE_CHECKING: @@ -29,8 +29,11 @@ TDtcField = dataclasses.Field from dlt.common.typing import ( + AnyType, TAnyClass, extract_inner_type, + is_annotated, + is_final_type, is_optional_type, is_union_type, ) @@ -48,6 +51,34 @@ _C = TypeVar("_C", bound="CredentialsConfiguration") +class NotResolved: + """Used in type annotations to indicate types that should not be resolved.""" + + def __init__(self, not_resolved: bool = True): + self.not_resolved = not_resolved + + def __bool__(self) -> bool: + return self.not_resolved + + +def is_hint_not_resolved(hint: AnyType) -> bool: + """Checks if hint should NOT be resolved. Final and types annotated like + + >>> Annotated[str, NotResolved()] + + are not resolved. + """ + if is_final_type(hint): + return True + + if is_annotated(hint): + _, *a_m = get_args(hint) + for annotation in a_m: + if isinstance(annotation, NotResolved): + return bool(annotation) + return False + + def is_base_configuration_inner_hint(inner_hint: Type[Any]) -> bool: return inspect.isclass(inner_hint) and issubclass(inner_hint, BaseConfiguration) @@ -70,6 +101,11 @@ def is_valid_hint(hint: Type[Any]) -> bool: if get_origin(hint) is ClassVar: # class vars are skipped by dataclass return True + + if is_hint_not_resolved(hint): + # all hints that are not resolved are valid + return True + hint = extract_inner_type(hint) hint = get_config_if_union_hint(hint) or hint hint = get_origin(hint) or hint diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 2ad5131e63..d4cdfb729d 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -18,8 +18,8 @@ Any, TypeVar, Generic, - Final, ) +from typing_extensions import Annotated import datetime # noqa: 251 from copy import deepcopy import inspect @@ -35,7 +35,7 @@ has_column_with_prop, get_first_column_name_with_prop, ) -from dlt.common.configuration import configspec, resolve_configuration, known_sections +from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext @@ -78,7 +78,7 @@ class StateInfo(NamedTuple): @configspec class DestinationClientConfiguration(BaseConfiguration): - destination_type: Final[str] = dataclasses.field( + destination_type: Annotated[str, NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False ) # which destination 
to load data to credentials: Optional[CredentialsConfiguration] = None @@ -103,11 +103,11 @@ def on_resolved(self) -> None: class DestinationClientDwhConfiguration(DestinationClientConfiguration): """Configuration of a destination that supports datasets/schemas""" - dataset_name: Final[str] = dataclasses.field( + dataset_name: Annotated[str, NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False - ) # dataset must be final so it is not configurable + ) # dataset cannot be resolved """dataset name in the destination to load data to, for schemas that are not default schema, it is used as dataset prefix""" - default_schema_name: Final[Optional[str]] = dataclasses.field( + default_schema_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False ) """name of default schema to be used to name effective dataset to load data to""" @@ -121,8 +121,8 @@ def _bind_dataset_name( This method is intended to be used internally. """ - self.dataset_name = dataset_name # type: ignore[misc] - self.default_schema_name = default_schema_name # type: ignore[misc] + self.dataset_name = dataset_name + self.default_schema_name = default_schema_name return self def normalize_dataset_name(self, schema: Schema) -> str: diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index d589537742..fd11cc7dcb 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -1,7 +1,8 @@ import dataclasses from typing import Optional, Final +from typing_extensions import Annotated -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, NotResolved from dlt.common.configuration.specs.base_configuration import ( BaseConfiguration, CredentialsConfiguration, @@ -55,7 +56,9 @@ class QdrantClientConfiguration(DestinationClientDwhConfiguration): dataset_separator: str = "_" # make it optional so empty dataset is allowed - dataset_name: Final[Optional[str]] = dataclasses.field(default=None, init=False, repr=False, compare=False) # type: ignore[misc] + dataset_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) # Batch size for generating embeddings embedding_batch_size: int = 32 diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index 90fb7ce5ce..1a053e41f4 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -1,8 +1,9 @@ import dataclasses from typing import Dict, Literal, Optional, Final +from typing_extensions import Annotated from urllib.parse import urlparse -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, NotResolved from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration from dlt.common.destination.reference import DestinationClientDwhConfiguration from dlt.common.utils import digest128 @@ -26,7 +27,9 @@ def __str__(self) -> str: class WeaviateClientConfiguration(DestinationClientDwhConfiguration): destination_type: Final[str] = dataclasses.field(default="weaviate", init=False, repr=False, compare=False) # type: ignore # make it optional so empty dataset is allowed - dataset_name: Optional[str] = None # type: ignore[misc] + dataset_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( + default=None, 
init=False, repr=False, compare=False + ) batch_size: int = 100 batch_workers: int = 1 diff --git a/dlt/sources/helpers/requests/retry.py b/dlt/sources/helpers/requests/retry.py index c9a813598f..3f9d7d559e 100644 --- a/dlt/sources/helpers/requests/retry.py +++ b/dlt/sources/helpers/requests/retry.py @@ -239,7 +239,7 @@ def _make_session(self) -> Session: session.mount("http://", self._adapter) session.mount("https://", self._adapter) retry = _make_retry(**self._retry_kwargs) - session.request = retry.wraps(session.request) # type: ignore[method-assign] + session.send = retry.wraps(session.send) # type: ignore[method-assign] return session @property diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 020c63a195..29e6d8c77a 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -1,4 +1,5 @@ from base64 import b64encode +import dataclasses import math from typing import ( List, @@ -12,12 +13,13 @@ Iterable, TYPE_CHECKING, ) +from typing_extensions import Annotated from requests.auth import AuthBase -from requests import PreparedRequest # noqa: I251 +from requests import PreparedRequest, Session as BaseSession # noqa: I251 from dlt.common import logger from dlt.common.exceptions import MissingDependencyException -from dlt.common.configuration.specs.base_configuration import configspec +from dlt.common.configuration.specs.base_configuration import configspec, NotResolved from dlt.common.configuration.specs import CredentialsConfiguration from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.pendulum import pendulum @@ -146,7 +148,9 @@ def __call__(self, request: PreparedRequest) -> PreparedRequest: class OAuthJWTAuth(BearerTokenAuth): """This is a form of Bearer auth, actually there's not standard way to declare it in openAPI""" - format: Final[Literal["JWT"]] = "JWT" # noqa: A003 + format: Final[Literal["JWT"]] = dataclasses.field( # noqa: A003 + default="JWT", init=False, repr=False, compare=False + ) client_id: str = None private_key: TSecretStrValue = None auth_endpoint: str = None @@ -154,11 +158,15 @@ class OAuthJWTAuth(BearerTokenAuth): headers: Optional[Dict[str, str]] = None private_key_passphrase: Optional[TSecretStrValue] = None default_token_expiration: int = 3600 + session: Annotated[BaseSession, NotResolved()] = None def __post_init__(self) -> None: self.scopes = self.scopes if isinstance(self.scopes, str) else " ".join(self.scopes) self.token = None self.token_expiry: Optional[pendulum.DateTime] = None + # use default system session is not specified + if self.session is None: + self.session = requests.client.session def __call__(self, r: PreparedRequest) -> PreparedRequest: if self.token is None or self.is_token_expired(): @@ -183,7 +191,7 @@ def obtain_token(self) -> None: logger.debug(f"Obtaining token from {self.auth_endpoint}") - response = requests.post(self.auth_endpoint, headers=self.headers, data=data) + response = self.session.post(self.auth_endpoint, headers=self.headers, data=data) response.raise_for_status() token_response = response.json() diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index 7d1145a890..dc7304f159 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -82,8 +82,9 @@ def __init__( self.auth = auth if session: - self._validate_session_raise_for_status(session) - self.session = session + # 
dlt.sources.helpers.requests.session.Session + # has raise_for_status=True by default + self.session = _warn_if_raise_for_status_and_return(session) else: self.session = Client(raise_for_status=False).session @@ -92,15 +93,6 @@ def __init__( self.data_selector = data_selector - def _validate_session_raise_for_status(self, session: BaseSession) -> None: - # dlt.sources.helpers.requests.session.Session - # has raise_for_status=True by default - if getattr(self.session, "raise_for_status", False): - logger.warning( - "The session provided has raise_for_status enabled. " - "This may cause unexpected behavior." - ) - def _create_request( self, path: str, @@ -298,3 +290,12 @@ def detect_paginator(self, response: Response, data: Any) -> BasePaginator: " instance of the paginator as some settings may not be guessed correctly." ) return paginator + + +def _warn_if_raise_for_status_and_return(session: BaseSession) -> BaseSession: + """A generic function to warn if the session has raise_for_status enabled.""" + if getattr(session, "raise_for_status", False): + logger.warning( + "The session provided has raise_for_status enabled. This may cause unexpected behavior." + ) + return session diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index 84b2d1893d..43ccdf856c 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -12,11 +12,12 @@ Optional, Type, Union, - TYPE_CHECKING, ) +from typing_extensions import Annotated from dlt.common import json, pendulum, Decimal, Wei from dlt.common.configuration.providers.provider import ConfigProvider +from dlt.common.configuration.specs.base_configuration import NotResolved, is_hint_not_resolved from dlt.common.configuration.specs.gcp_credentials import ( GcpServiceAccountCredentialsWithoutDefaults, ) @@ -917,6 +918,58 @@ def test_is_valid_hint() -> None: assert is_valid_hint(Wei) is True # any class type, except deriving from BaseConfiguration is wrong type assert is_valid_hint(ConfigFieldMissingException) is False + # but final and annotated types are not ok because they are not resolved + assert is_valid_hint(Final[ConfigFieldMissingException]) is True # type: ignore[arg-type] + assert is_valid_hint(Annotated[ConfigFieldMissingException, NotResolved()]) is True # type: ignore[arg-type] + assert is_valid_hint(Annotated[ConfigFieldMissingException, "REQ"]) is False # type: ignore[arg-type] + + +def test_is_not_resolved_hint() -> None: + assert is_hint_not_resolved(Final[ConfigFieldMissingException]) is True + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved()]) is True + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved(True)]) is True + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved(False)]) is False + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, "REQ"]) is False + assert is_hint_not_resolved(str) is False + + +def test_not_resolved_hint() -> None: + class SentinelClass: + pass + + @configspec + class OptionalNotResolveConfiguration(BaseConfiguration): + trace: Final[Optional[SentinelClass]] = None + traces: Annotated[Optional[List[SentinelClass]], NotResolved()] = None + + c = resolve.resolve_configuration(OptionalNotResolveConfiguration()) + assert c.trace is None + assert c.traces is None + + s1 = SentinelClass() + s2 = SentinelClass() + + c = resolve.resolve_configuration(OptionalNotResolveConfiguration(s1, [s2])) + assert 
c.trace is s1 + assert c.traces[0] is s2 + + @configspec + class NotResolveConfiguration(BaseConfiguration): + trace: Final[SentinelClass] = None + traces: Annotated[List[SentinelClass], NotResolved()] = None + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(NotResolveConfiguration()) + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(NotResolveConfiguration(trace=s1)) + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(NotResolveConfiguration(traces=[s2])) + + c2 = resolve.resolve_configuration(NotResolveConfiguration(s1, [s2])) + assert c2.trace is s1 + assert c2.traces[0] is s2 def test_configspec_auto_base_config_derivation() -> None: diff --git a/tests/load/utils.py b/tests/load/utils.py index 81107e83d9..c03470676f 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -574,8 +574,8 @@ def yield_client( destination = Destination.from_reference(destination_type) # create initial config dest_config: DestinationClientDwhConfiguration = None - dest_config = destination.spec() # type: ignore[assignment] - dest_config.dataset_name = dataset_name # type: ignore[misc] + dest_config = destination.spec() # type: ignore + dest_config.dataset_name = dataset_name if default_config_values is not None: # apply the values to credentials, if dict is provided it will be used as default @@ -597,7 +597,7 @@ def yield_client( staging_config = DestinationClientStagingConfiguration( bucket_url=AWS_BUCKET, )._bind_dataset_name(dataset_name=dest_config.dataset_name) - staging_config.destination_type = "filesystem" # type: ignore[misc] + staging_config.destination_type = "filesystem" staging_config.resolve() dest_config.staging_config = staging_config # type: ignore[attr-defined] diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index 11d3f13db9..8c3344f152 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -37,10 +37,10 @@ def drop_weaviate_schema() -> Iterator[None]: def get_client_instance(schema: Schema) -> WeaviateClient: - dest = weaviate(dataset_name="ClientTest" + uniq_id()) - return dest.client(schema, dest.spec()) - # with Container().injectable_context(ConfigSectionContext(sections=('destination', 'weaviate'))): - # return dest.client(schema, config) + dest = weaviate() + return dest.client( + schema, dest.spec()._bind_dataset_name(dataset_name="ClientTest" + uniq_id()) + ) @pytest.fixture(scope="function") diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 7f03c6d167..79a57d0e82 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,10 +1,11 @@ import os import pytest from typing import Any, cast -from requests import PreparedRequest, Request +from dlt.common import logger +from requests import PreparedRequest, Request, Response from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue -from dlt.sources.helpers.requests import Response +from dlt.sources.helpers.requests import Client from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.client import Hooks from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator @@ -35,6 +36,7 @@ def rest_client() -> RESTClient: return RESTClient( base_url="https://api.example.com", headers={"Accept": "application/json"}, + 
session=Client().session, ) @@ -168,6 +170,7 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): auth_endpoint="https://api.example.com/oauth/token", scopes=["read", "write"], headers={"Content-Type": "application/json"}, + session=Client().session, ) response = rest_client.get( @@ -185,6 +188,19 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): assert_pagination(list(pages_iter)) + def test_custom_session_client(self, mocker): + mocked_warning = mocker.patch.object(logger, "warning") + RESTClient( + base_url="https://api.example.com", + headers={"Accept": "application/json"}, + session=Client(raise_for_status=True).session, + ) + assert ( + mocked_warning.call_args[0][0] + == "The session provided has raise_for_status enabled. This may cause unexpected" + " behavior." + ) + def test_custom_auth_success(self, rest_client: RESTClient): class CustomAuthConfigBase(AuthConfigBase): def __init__(self, token: str): diff --git a/tests/sources/helpers/test_requests.py b/tests/sources/helpers/test_requests.py index aefdf23e77..70776a50ee 100644 --- a/tests/sources/helpers/test_requests.py +++ b/tests/sources/helpers/test_requests.py @@ -1,4 +1,4 @@ -from typing import Iterator, Type +from typing import Any, Dict, Iterator, List, Type from unittest import mock import os import random @@ -29,7 +29,7 @@ def mock_sleep() -> Iterator[mock.MagicMock]: def test_default_session_retry_settings() -> None: - retry: Retrying = Client().session.request.retry # type: ignore + retry: Retrying = Client().session.send.retry # type: ignore assert retry.stop.max_attempt_number == 5 # type: ignore assert isinstance(retry.retry, retry_any) retries = retry.retry.retries @@ -51,7 +51,7 @@ def custom_retry_cond(response, exception): respect_retry_after_header=False, ).session - retry: Retrying = session.request.retry # type: ignore + retry: Retrying = session.send.retry # type: ignore assert retry.stop.max_attempt_number == 14 # type: ignore assert isinstance(retry.retry, retry_any) retries = retry.retry.retries @@ -63,11 +63,12 @@ def custom_retry_cond(response, exception): def test_retry_on_status_all_fails(mock_sleep: mock.MagicMock) -> None: session = Client().session url = "https://example.com/data" + m = requests_mock.Adapter() + session.mount("https://", m) + m.register_uri("GET", url, status_code=503) - with requests_mock.mock(session=session) as m: - m.get(url, status_code=503) - with pytest.raises(requests.HTTPError): - session.get(url) + with pytest.raises(requests.HTTPError): + session.get(url) assert m.call_count == RunConfiguration.request_max_attempts @@ -76,6 +77,8 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None: """Test successful request after 2 retries""" session = Client().session url = "https://example.com/data" + m = requests_mock.Adapter() + session.mount("https://", m) responses = [ dict(text="error", status_code=503), @@ -83,9 +86,8 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None: dict(text="error", status_code=200), ] - with requests_mock.mock(session=session) as m: - m.get(url, responses) - resp = session.get(url) + m.register_uri("GET", url, responses) + resp = session.get(url) assert resp.status_code == 200 assert m.call_count == 3 @@ -94,11 +96,12 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None: def test_retry_on_status_without_raise_for_status(mock_sleep: mock.MagicMock) -> None: url = "https://example.com/data" session = Client(raise_for_status=False).session + m = 
requests_mock.Adapter() + session.mount("https://", m) - with requests_mock.mock(session=session) as m: - m.get(url, status_code=503) - response = session.get(url) - assert response.status_code == 503 + m.register_uri("GET", url, status_code=503) + response = session.get(url) + assert response.status_code == 503 assert m.call_count == RunConfiguration.request_max_attempts @@ -106,18 +109,19 @@ def test_retry_on_status_without_raise_for_status(mock_sleep: mock.MagicMock) -> def test_hooks_with_raise_for_statue() -> None: url = "https://example.com/data" session = Client(raise_for_status=True).session + m = requests_mock.Adapter() + session.mount("https://", m) def _no_content(resp: requests.Response, *args, **kwargs) -> requests.Response: resp.status_code = 204 resp._content = b"[]" return resp - with requests_mock.mock(session=session) as m: - m.get(url, status_code=503) - response = session.get(url, hooks={"response": _no_content}) - # we simulate empty response - assert response.status_code == 204 - assert response.json() == [] + m.register_uri("GET", url, status_code=503) + response = session.get(url, hooks={"response": _no_content}) + # we simulate empty response + assert response.status_code == 204 + assert response.json() == [] assert m.call_count == 1 @@ -130,12 +134,13 @@ def test_retry_on_exception_all_fails( exception_class: Type[Exception], mock_sleep: mock.MagicMock ) -> None: session = Client().session + m = requests_mock.Adapter() + session.mount("https://", m) url = "https://example.com/data" - with requests_mock.mock(session=session) as m: - m.get(url, exc=exception_class) - with pytest.raises(exception_class): - session.get(url) + m.register_uri("GET", url, exc=exception_class) + with pytest.raises(exception_class): + session.get(url) assert m.call_count == RunConfiguration.request_max_attempts @@ -145,12 +150,13 @@ def retry_on(response: requests.Response, exception: BaseException) -> bool: return response.text == "error" session = Client(retry_condition=retry_on).session + m = requests_mock.Adapter() + session.mount("https://", m) url = "https://example.com/data" - with requests_mock.mock(session=session) as m: - m.get(url, text="error") - response = session.get(url) - assert response.content == b"error" + m.register_uri("GET", url, text="error") + response = session.get(url) + assert response.content == b"error" assert m.call_count == RunConfiguration.request_max_attempts @@ -160,12 +166,12 @@ def retry_on(response: requests.Response, exception: BaseException) -> bool: return response.text == "error" session = Client(retry_condition=retry_on).session + m = requests_mock.Adapter() + session.mount("https://", m) url = "https://example.com/data" - responses = [dict(text="error"), dict(text="error"), dict(text="success")] - with requests_mock.mock(session=session) as m: - m.get(url, responses) - resp = session.get(url) + m.register_uri("GET", url, [dict(text="error"), dict(text="error"), dict(text="success")]) + resp = session.get(url) assert resp.text == "success" assert m.call_count == 3 @@ -174,14 +180,16 @@ def retry_on(response: requests.Response, exception: BaseException) -> bool: def test_wait_retry_after_int(mock_sleep: mock.MagicMock) -> None: session = Client(request_backoff_factor=0).session url = "https://example.com/data" - responses = [ + m = requests_mock.Adapter() + session.mount("https://", m) + m.register_uri("GET", url, text="error") + responses: List[Dict[str, Any]] = [ dict(text="error", headers={"retry-after": "4"}, status_code=429), 
dict(text="success"), ] - with requests_mock.mock(session=session) as m: - m.get(url, responses) - session.get(url) + m.register_uri("GET", url, responses) + session.get(url) mock_sleep.assert_called_once() assert 4 <= mock_sleep.call_args[0][0] <= 5 # Adds jitter up to 1s @@ -206,7 +214,7 @@ def test_init_default_client(existing_session: bool) -> None: session = default_client.session assert session.timeout == cfg["RUNTIME__REQUEST_TIMEOUT"] - retry = session.request.retry # type: ignore[attr-defined] + retry = session.send.retry # type: ignore[attr-defined] assert retry.wait.multiplier == cfg["RUNTIME__REQUEST_BACKOFF_FACTOR"] assert retry.stop.max_attempt_number == cfg["RUNTIME__REQUEST_MAX_ATTEMPTS"] assert retry.wait.max == cfg["RUNTIME__REQUEST_MAX_RETRY_DELAY"] @@ -226,7 +234,7 @@ def test_client_instance_with_config(existing_session: bool) -> None: session = client.session assert session.timeout == cfg["RUNTIME__REQUEST_TIMEOUT"] - retry = session.request.retry # type: ignore[attr-defined] + retry = session.send.retry # type: ignore[attr-defined] assert retry.wait.multiplier == cfg["RUNTIME__REQUEST_BACKOFF_FACTOR"] assert retry.stop.max_attempt_number == cfg["RUNTIME__REQUEST_MAX_ATTEMPTS"] assert retry.wait.max == cfg["RUNTIME__REQUEST_MAX_RETRY_DELAY"] From 03f82cad1aaa5f28bbb8d4499f798f1e85bab0ab Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Mon, 27 May 2024 14:43:19 -0400 Subject: [PATCH 40/41] Support partitioning hints for athena iceberg (#1403) * Support partitioning hints for athena iceberg * Escape partitioning column names * Update docs * Move athena skip marker * marks athena adapter tests essential --------- Co-authored-by: Marcin Rudolf --- dlt/destinations/adapters.py | 2 + dlt/destinations/impl/athena/athena.py | 16 +++ .../impl/athena/athena_adapter.py | 117 ++++++++++++++++++ .../docs/dlt-ecosystem/destinations/athena.md | 57 +++++++++ tests/load/athena_iceberg/__init__.py | 4 + .../athena_iceberg/test_athena_adapter.py | 69 +++++++++++ .../athena_iceberg/test_athena_iceberg.py | 3 - tests/load/pipeline/test_athena.py | 68 ++++++++++ tests/load/utils.py | 6 + 9 files changed, 339 insertions(+), 3 deletions(-) create mode 100644 dlt/destinations/impl/athena/athena_adapter.py create mode 100644 tests/load/athena_iceberg/test_athena_adapter.py diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index 554bd88924..1c3e094e19 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -5,6 +5,7 @@ from dlt.destinations.impl.bigquery import bigquery_adapter from dlt.destinations.impl.synapse import synapse_adapter from dlt.destinations.impl.clickhouse import clickhouse_adapter +from dlt.destinations.impl.athena import athena_adapter __all__ = [ "weaviate_adapter", @@ -12,4 +13,5 @@ "bigquery_adapter", "synapse_adapter", "clickhouse_adapter", + "athena_adapter", ] diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 7e1ab8fc27..8f043ba4d5 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -11,6 +11,7 @@ Callable, Iterable, Type, + cast, ) from copy import deepcopy import re @@ -69,6 +70,7 @@ from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration from dlt.destinations.type_mapping import TypeMapper from dlt.destinations import path_utils +from dlt.destinations.impl.athena.athena_adapter import PARTITION_HINT class AthenaTypeMapper(TypeMapper): @@ -405,6 +407,16 @@ def _get_column_def_sql(self, c: 
TColumnSchema, table_format: TTableFormat = Non f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" ) + def _iceberg_partition_clause(self, partition_hints: Optional[Dict[str, str]]) -> str: + if not partition_hints: + return "" + formatted_strings = [] + for column_name, template in partition_hints.items(): + formatted_strings.append( + template.format(column_name=self.sql_client.escape_ddl_identifier(column_name)) + ) + return f"PARTITIONED BY ({', '.join(formatted_strings)})" + def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: @@ -431,8 +443,12 @@ def _get_table_update_sql( sql.append(f"""ALTER TABLE {qualified_table_name} ADD COLUMNS ({columns});""") else: if is_iceberg: + partition_clause = self._iceberg_partition_clause( + cast(Optional[Dict[str, str]], table.get(PARTITION_HINT)) + ) sql.append(f"""CREATE TABLE {qualified_table_name} ({columns}) + {partition_clause} LOCATION '{location.rstrip('/')}' TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""") elif table_format == "jsonl": diff --git a/dlt/destinations/impl/athena/athena_adapter.py b/dlt/destinations/impl/athena/athena_adapter.py new file mode 100644 index 0000000000..cb600335c0 --- /dev/null +++ b/dlt/destinations/impl/athena/athena_adapter.py @@ -0,0 +1,117 @@ +from typing import Any, Optional, Dict, Protocol, Sequence, Union, Final + +from dateutil import parser + +from dlt.common.pendulum import timezone +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TColumnSchema +from dlt.destinations.utils import ensure_resource +from dlt.extract import DltResource +from dlt.extract.items import TTableHintTemplate + + +PARTITION_HINT: Final[str] = "x-athena-partition" + + +class PartitionTransformation: + template: str + """Template string of the transformation including column name placeholder. E.g. `bucket(16, {column_name})`""" + column_name: str + """Column name to apply the transformation to""" + + def __init__(self, template: str, column_name: str) -> None: + self.template = template + self.column_name = column_name + + +class athena_partition: + """Helper class to generate iceberg partition transformations + + E.g. `athena_partition.bucket(16, "id")` will return a transformation with template `bucket(16, {column_name})` + This can be correctly rendered by the athena loader with escaped column name. 
+    """
+
+    @staticmethod
+    def year(column_name: str) -> PartitionTransformation:
+        """Partition by year part of a date or timestamp column."""
+        return PartitionTransformation("year({column_name})", column_name)
+
+    @staticmethod
+    def month(column_name: str) -> PartitionTransformation:
+        """Partition by month part of a date or timestamp column."""
+        return PartitionTransformation("month({column_name})", column_name)
+
+    @staticmethod
+    def day(column_name: str) -> PartitionTransformation:
+        """Partition by day part of a date or timestamp column."""
+        return PartitionTransformation("day({column_name})", column_name)
+
+    @staticmethod
+    def hour(column_name: str) -> PartitionTransformation:
+        """Partition by hour part of a date or timestamp column."""
+        return PartitionTransformation("hour({column_name})", column_name)
+
+    @staticmethod
+    def bucket(n: int, column_name: str) -> PartitionTransformation:
+        """Partition by hashed value to n buckets."""
+        return PartitionTransformation(f"bucket({n}, {{column_name}})", column_name)
+
+    @staticmethod
+    def truncate(length: int, column_name: str) -> PartitionTransformation:
+        """Partition by value truncated to length."""
+        return PartitionTransformation(f"truncate({length}, {{column_name}})", column_name)
+
+
+def athena_adapter(
+    data: Any,
+    partition: Union[
+        str, PartitionTransformation, Sequence[Union[str, PartitionTransformation]]
+    ] = None,
+) -> DltResource:
+    """
+    Prepares data for loading into Athena.
+
+    Args:
+        data: The data to be transformed.
+            This can be raw data or an instance of DltResource.
+            If raw data is provided, the function will wrap it into a `DltResource` object.
+        partition: Column name(s) or instances of `PartitionTransformation` to partition the table by.
+            To use a transformation it's best to use the methods of the helper class `athena_partition`
+            to generate correctly escaped SQL in the loader.
+
+    Returns:
+        A `DltResource` object that is ready to be loaded into Athena.
+
+    Raises:
+        ValueError: If no `partition` hint is specified.
+
+    Examples:
+        >>> data = [{"name": "Marcel", "department": "Engineering", "date_hired": "2024-01-30"}]
+        >>> athena_adapter(data, partition=["department", athena_partition.year("date_hired"), athena_partition.bucket(8, "name")])
+        [DltResource with hints applied]
+    """
+    resource = ensure_resource(data)
+    additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {}
+
+    if partition:
+        if isinstance(partition, str) or not isinstance(partition, Sequence):
+            partition = [partition]
+
+        # Partition hint is `{column_name: template}`, e.g.
`{"department": "{column_name}", "date_hired": "year({column_name})"}`
+        # Use one dict for all hints instead of storing on column so order is preserved
+        partition_hint: Dict[str, str] = {}
+
+        for item in partition:
+            if isinstance(item, PartitionTransformation):
+                # Client will generate the final SQL string with escaped column name injected
+                partition_hint[item.column_name] = item.template
+            else:
+                # Item is the column name
+                partition_hint[item] = "{column_name}"
+
+        additional_table_hints[PARTITION_HINT] = partition_hint
+
+    if additional_table_hints:
+        resource.apply_hints(additional_table_hints=additional_table_hints)
+    else:
+        raise ValueError("A value for `partition` must be specified.")
+    return resource
diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md
index 7c907664d3..93291bfe9a 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/athena.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -161,5 +161,62 @@ aws_data_catalog="awsdatacatalog"
 
 You can choose the following file formats:
 * [parquet](../file-formats/parquet.md) is used by default
+
+## Athena adapter
+
+You can use the `athena_adapter` to add partitioning to Athena tables. This is currently only supported for Iceberg tables.
+
+Iceberg tables support a few transformation functions for partitioning. Information on all supported functions is available in the [AWS documentation](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html#querying-iceberg-creating-tables-query-editor).
+
+Use the `athena_partition` helper to generate the partitioning hints for these functions:
+
+* `athena_partition.year(column_name: str)`: Partition by year of date/datetime column.
+* `athena_partition.month(column_name: str)`: Partition by month of date/datetime column.
+* `athena_partition.day(column_name: str)`: Partition by day of date/datetime column.
+* `athena_partition.hour(column_name: str)`: Partition by hour of date/datetime column.
+* `athena_partition.bucket(n: int, column_name: str)`: Partition by hashed value to `n` buckets
+* `athena_partition.truncate(length: int, column_name: str)`: Partition by truncated value to `length` (or width for numbers)
+
+Here is an example of how to use the adapter to partition a table:
+
+```py
+from datetime import date
+
+import dlt
+from dlt.destinations.impl.athena.athena_adapter import athena_partition, athena_adapter
+
+data_items = [
+    (1, "A", date(2021, 1, 1)),
+    (2, "A", date(2021, 1, 2)),
+    (3, "A", date(2021, 1, 3)),
+    (4, "A", date(2021, 2, 1)),
+    (5, "A", date(2021, 2, 2)),
+    (6, "B", date(2021, 1, 1)),
+    (7, "B", date(2021, 1, 2)),
+    (8, "B", date(2021, 1, 3)),
+    (9, "B", date(2021, 2, 1)),
+    (10, "B", date(2021, 3, 2)),
+]
+
+@dlt.resource(table_format="iceberg")
+def partitioned_data():
+    yield [{"id": i, "category": c, "created_at": d} for i, c, d in data_items]
+
+
+# Add partitioning hints to the table
+athena_adapter(
+    partitioned_data,
+    partition=[
+        # Partition per category and month
+        "category",
+        athena_partition.month("created_at"),
+    ],
+)
+
+
+pipeline = dlt.pipeline("athena_example")
+pipeline.run(partitioned_data)
+```
+
diff --git a/tests/load/athena_iceberg/__init__.py b/tests/load/athena_iceberg/__init__.py
index e69de29bb2..56e5d539c2 100644
--- a/tests/load/athena_iceberg/__init__.py
+++ b/tests/load/athena_iceberg/__init__.py
@@ -0,0 +1,4 @@
+from tests.utils import skip_if_not_active
+
+
+skip_if_not_active("athena")
diff --git a/tests/load/athena_iceberg/test_athena_adapter.py b/tests/load/athena_iceberg/test_athena_adapter.py
new file mode 100644
index 0000000000..3144eb9cc9
--- /dev/null
+++ b/tests/load/athena_iceberg/test_athena_adapter.py
@@ -0,0 +1,69 @@
+import pytest
+
+import dlt
+from dlt.destinations import filesystem
+from dlt.destinations.impl.athena.athena_adapter import athena_adapter, athena_partition
+
+# mark all tests as essential, do not remove
+pytestmark = pytest.mark.essential
+
+
+def test_iceberg_partition_hints():
+    """Create a table with athena partition hints and check that the SQL is generated correctly."""
+
+    @dlt.resource(table_format="iceberg")
+    def partitioned_table():
+        yield {
+            "product_id": 1,
+            "name": "product 1",
+            "created_at": "2021-01-01T00:00:00Z",
+            "category": "category 1",
+            "price": 100.0,
+            "quantity": 10,
+        }
+
+    @dlt.resource(table_format="iceberg")
+    def not_partitioned_table():
+        yield {"a": 1, "b": 2}
+
+    athena_adapter(
+        partitioned_table,
+        partition=[
+            "category",
+            athena_partition.month("created_at"),
+            athena_partition.bucket(10, "product_id"),
+            athena_partition.truncate(2, "name"),
+        ],
+    )
+
+    pipeline = dlt.pipeline(
+        "athena_test",
+        destination="athena",
+        staging=filesystem("s3://not-a-real-bucket"),
+        full_refresh=True,
+    )
+
+    pipeline.extract([partitioned_table, not_partitioned_table])
+    pipeline.normalize()
+
+    with pipeline._sql_job_client(pipeline.default_schema) as client:
+        sql_partitioned = client._get_table_update_sql(
+            "partitioned_table",
+            list(pipeline.default_schema.tables["partitioned_table"]["columns"].values()),
+            False,
+        )[0]
+        sql_not_partitioned = client._get_table_update_sql(
+            "not_partitioned_table",
+            list(pipeline.default_schema.tables["not_partitioned_table"]["columns"].values()),
+            False,
+        )[0]
+
+    # Partition clause is generated with original order
+    expected_clause = (
+        "PARTITIONED BY (`category`, month(`created_at`), bucket(10, `product_id`), truncate(2,"
+        " `name`))"
+    )
+    assert expected_clause in sql_partitioned
+
+    # No partition 
clause otherwise
+    assert "PARTITIONED BY" not in sql_not_partitioned
diff --git a/tests/load/athena_iceberg/test_athena_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py
index dbcdc5c23e..d3bb9eb5f5 100644
--- a/tests/load/athena_iceberg/test_athena_iceberg.py
+++ b/tests/load/athena_iceberg/test_athena_iceberg.py
@@ -11,14 +11,11 @@
 from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration
 
-from tests.utils import skip_if_not_active
 from dlt.destinations.exceptions import DatabaseTerminalException
 
 # mark all tests as essential, do not remove
 pytestmark = pytest.mark.essential
 
-skip_if_not_active("athena")
-
 
 def test_iceberg() -> None:
     """
diff --git a/tests/load/pipeline/test_athena.py b/tests/load/pipeline/test_athena.py
index 8c034a066b..a5bb6efc0d 100644
--- a/tests/load/pipeline/test_athena.py
+++ b/tests/load/pipeline/test_athena.py
@@ -9,6 +9,8 @@
 from tests.pipeline.utils import assert_load_info, load_table_counts
 from tests.pipeline.utils import load_table_counts
 from dlt.destinations.exceptions import CantExtractTablePrefix
+from dlt.destinations.impl.athena.athena_adapter import athena_partition, athena_adapter
+from dlt.destinations.fs_client import FSClientBase
 
 from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration
 from tests.load.utils import (
@@ -231,3 +233,69 @@ def test_athena_file_layouts(destination_config: DestinationTestConfiguration, l
         pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]
     )
     assert table_counts == {"items1": 3, "items2": 7}
+
+
+@pytest.mark.parametrize(
+    "destination_config",
+    destinations_configs(default_sql_configs=True, subset=["athena"], force_iceberg=True),
+    ids=lambda x: x.name,
+)
+def test_athena_partitioned_iceberg_table(destination_config: DestinationTestConfiguration):
+    """Load an iceberg table with partition hints and verify partitions are created correctly."""
+    pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True)
+
+    data_items = [
+        (1, "A", datetime.date.fromisoformat("2021-01-01")),
+        (2, "A", datetime.date.fromisoformat("2021-01-02")),
+        (3, "A", datetime.date.fromisoformat("2021-01-03")),
+        (4, "A", datetime.date.fromisoformat("2021-02-01")),
+        (5, "A", datetime.date.fromisoformat("2021-02-02")),
+        (6, "B", datetime.date.fromisoformat("2021-01-01")),
+        (7, "B", datetime.date.fromisoformat("2021-01-02")),
+        (8, "B", datetime.date.fromisoformat("2021-01-03")),
+        (9, "B", datetime.date.fromisoformat("2021-02-01")),
+        (10, "B", datetime.date.fromisoformat("2021-03-02")),
+    ]
+
+    @dlt.resource(table_format="iceberg")
+    def partitioned_table():
+        yield [{"id": i, "category": c, "created_at": d} for i, c, d in data_items]
+
+    athena_adapter(
+        partitioned_table,
+        partition=[
+            "category",
+            athena_partition.month("created_at"),
+        ],
+    )
+
+    info = pipeline.run(partitioned_table)
+    assert_load_info(info)
+
+    # Get partitions from metadata
+    with pipeline.sql_client() as sql_client:
+        tbl_name = sql_client.make_qualified_table_name("partitioned_table$partitions")
+        rows = sql_client.execute_sql(f"SELECT partition FROM {tbl_name}")
+        partition_keys = {r[0] for r in rows}
+
+        data_rows = sql_client.execute_sql(
+            "SELECT id, category, created_at FROM"
+            f" {sql_client.make_qualified_table_name('partitioned_table')}"
+        )
+        # data_rows = [(i, c, d.toisoformat()) for i, c, d in data_rows]
+
+    # All data is in table
+    assert len(data_rows) == len(data_items)
+    assert set(data_rows) == set(data_items)
+
+    # Compare 
with expected partitions + # Months are number of months since epoch + expected_partitions = { + "{category=A, created_at_month=612}", + "{category=A, created_at_month=613}", + "{category=B, created_at_month=612}", + "{category=B, created_at_month=613}", + "{category=B, created_at_month=614}", + } + + assert partition_keys == expected_partitions diff --git a/tests/load/utils.py b/tests/load/utils.py index c03470676f..e6b860c723 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -180,6 +180,7 @@ def destinations_configs( file_format: Union[TLoaderFileFormat, Sequence[TLoaderFileFormat]] = None, supports_merge: Optional[bool] = None, supports_dbt: Optional[bool] = None, + force_iceberg: Optional[bool] = None, ) -> List[DestinationTestConfiguration]: # sanity check for item in subset: @@ -495,6 +496,11 @@ def destinations_configs( conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS ] + if force_iceberg is not None: + destination_configs = [ + conf for conf in destination_configs if conf.force_iceberg is force_iceberg + ] + return destination_configs From 01b874992757c72ed7858020c0d10412b53529f7 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 27 May 2024 20:56:30 +0200 Subject: [PATCH 41/41] bumps dlt to 0.4.12 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8beefe409f..cc18c37353 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.4.11" +version = "0.4.12" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ]