Skip to content

Commit

Permalink
Merge pull request #1418 from dlt-hub/devel
Browse files Browse the repository at this point in the history
master merge for 0.4.12 release
  • Loading branch information
rudolfix authored May 28, 2024
2 parents 9225803 + 6b5952d commit 252fa92
Show file tree
Hide file tree
Showing 83 changed files with 1,711 additions and 542 deletions.
24 changes: 24 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,29 @@ We use **master** branch for hot fixes (including documentation) that need to be

On the release day, **devel** branch is merged into **master**. All releases of `dlt` happen only from the **master**.

### Branch naming rules

We want to make sure that our git history explains in a human readable way what has been changed with which branch or PR. To this end, we are using the following branch naming pattern (all lowercase and dashes, no underscores):

```sh
{category}/{ticket-id}-description-of-the-branch
# example:
feat/4922-add-avro-support
```

#### Branch categories

* **feat** - a new feature that is being implemented (ticket required)
* **fix** - a change that fixes a bug (ticket required)
* **exp** - an experiment where we are testing a new idea or want to demonstrate something to the team, might turn into a `feat` later (ticket encouraged)
* **test** - anything related to the tests (ticket encouraged)
* **blogs** - a new entry to our blog (ticket optional)
* **docs** - a change to our docs (ticket optional)

#### Ticket Numbers

We encourage you to attach your branches to a ticket; if none exists, create one and explain what you are doing. For `feat` and `fix` branches, tickets are mandatory; for `exp` and `test` branches they are encouraged; and for `blogs` and `docs` branches they are optional.

### Submitting a hotfix
We'll fix critical bugs and release `dlt` outside of the regular schedule. Follow the regular procedure, but make your PR against the **master** branch. Please ping us on Slack if you do it.

Expand Down Expand Up @@ -166,3 +189,4 @@ Once the version has been bumped, follow these steps to publish the new release
- [Poetry Documentation](https://python-poetry.org/docs/)

If you have any questions or need help, don't hesitate to reach out to us. We're here to help you succeed in contributing to `dlt`. Happy coding!
****
9 changes: 8 additions & 1 deletion dlt/common/configuration/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from .specs.base_configuration import configspec, is_valid_hint, is_secret_hint, resolve_type
from .specs.base_configuration import (
configspec,
is_valid_hint,
is_secret_hint,
resolve_type,
NotResolved,
)
from .specs import known_sections
from .resolve import resolve_configuration, inject_section
from .inject import with_config, last_config, get_fun_spec, create_resolved_partial
Expand All @@ -15,6 +21,7 @@
"configspec",
"is_valid_hint",
"is_secret_hint",
"NotResolved",
"resolve_type",
"known_sections",
"resolve_configuration",
Expand Down
6 changes: 3 additions & 3 deletions dlt/common/configuration/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
StrAny,
TSecretValue,
get_all_types_of_class_in_union,
is_final_type,
is_optional_type,
is_union_type,
)
Expand All @@ -21,6 +20,7 @@
is_context_inner_hint,
is_base_configuration_inner_hint,
is_valid_hint,
is_hint_not_resolved,
)
from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
from dlt.common.configuration.specs.exceptions import NativeValueError
Expand Down Expand Up @@ -194,7 +194,7 @@ def _resolve_config_fields(
if explicit_values:
explicit_value = explicit_values.get(key)
else:
if is_final_type(hint):
if is_hint_not_resolved(hint):
# for final fields default value is like explicit
explicit_value = default_value
else:
Expand Down Expand Up @@ -258,7 +258,7 @@ def _resolve_config_fields(
unresolved_fields[key] = traces
# set resolved value in config
if default_value != current_value:
if not is_final_type(hint):
if not is_hint_not_resolved(hint):
# ignore final types
setattr(config, key, current_value)

Expand Down
11 changes: 10 additions & 1 deletion dlt/common/configuration/specs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
from .connection_string_credentials import ConnectionStringCredentials
from .api_credentials import OAuth2Credentials
from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults
from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults
from .azure_credentials import (
AzureCredentials,
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentials,
AzureServicePrincipalCredentialsWithoutDefaults,
AnyAzureCredentials,
)


# backward compatibility for service account credentials
Expand Down Expand Up @@ -51,6 +57,9 @@
"AwsCredentialsWithoutDefaults",
"AzureCredentials",
"AzureCredentialsWithoutDefaults",
"AzureServicePrincipalCredentials",
"AzureServicePrincipalCredentialsWithoutDefaults",
"AnyAzureCredentials",
"GcpClientCredentials",
"GcpClientCredentialsWithDefault",
]
50 changes: 45 additions & 5 deletions dlt/common/configuration/specs/azure_credentials.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Union

from dlt.common.pendulum import pendulum
from dlt.common.typing import TSecretStrValue
Expand All @@ -7,10 +7,6 @@
CredentialsWithDefault,
configspec,
)
from dlt.common.configuration.specs.exceptions import InvalidBoto3Session
from dlt import version

import fsspec


@configspec
Expand Down Expand Up @@ -50,6 +46,22 @@ def on_partial(self) -> None:
self.resolve()


@configspec
class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration):
    """Azure service principal credentials resolved from explicit configuration only.

    Carries the storage account plus the tenant/client id/secret triple used for a
    service principal login. No ambient (system default) credentials are consulted —
    see `AzureServicePrincipalCredentials` for the variant with defaults.
    """

    # target storage account; also part of the adlfs connection kwargs
    azure_storage_account_name: str = None
    azure_tenant_id: str = None
    azure_client_id: str = None
    # secret-typed so it is masked/treated as sensitive by the config system
    azure_client_secret: TSecretStrValue = None

    def to_adlfs_credentials(self) -> Dict[str, Any]:
        """Return the credential fields as keyword arguments accepted by adlfs."""
        return {
            "account_name": self.azure_storage_account_name,
            "tenant_id": self.azure_tenant_id,
            "client_id": self.azure_client_id,
            "client_secret": self.azure_client_secret,
        }


@configspec
class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault):
def on_partial(self) -> None:
Expand All @@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]:
if self.has_default_credentials():
base_kwargs["anon"] = False
return base_kwargs


@configspec
class AzureServicePrincipalCredentials(
    AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault
):
    """Service principal credentials that fall back to the ambient Azure identity."""

    def on_partial(self) -> None:
        # only partially configured: remember the machine/environment default
        # credential and finish resolution if at least the account name is known
        from azure.identity import DefaultAzureCredential

        self._set_default_credentials(DefaultAzureCredential())
        if self.azure_storage_account_name:
            self.resolve()

    def to_adlfs_credentials(self) -> Dict[str, Any]:
        creds = super().to_adlfs_credentials()
        if self.has_default_credentials():
            # default credentials are present: disable anonymous access explicitly
            creds["anon"] = False
        return creds


# Union covering every supported Azure credential spec (key/SAS based and
# service principal based, each with and without system-default fallback).
AnyAzureCredentials = Union[
    # Credentials without defaults come first because union types are attempted in order
    # and explicit config should supersede system defaults
    AzureCredentialsWithoutDefaults,
    AzureServicePrincipalCredentialsWithoutDefaults,
    AzureCredentials,
    AzureServicePrincipalCredentials,
]
38 changes: 37 additions & 1 deletion dlt/common/configuration/specs/base_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
ClassVar,
TypeVar,
)
from typing_extensions import get_args, get_origin, dataclass_transform
from typing_extensions import get_args, get_origin, dataclass_transform, Annotated, TypeAlias
from functools import wraps

if TYPE_CHECKING:
Expand All @@ -29,8 +29,11 @@
TDtcField = dataclasses.Field

from dlt.common.typing import (
AnyType,
TAnyClass,
extract_inner_type,
is_annotated,
is_final_type,
is_optional_type,
is_union_type,
)
Expand All @@ -48,6 +51,34 @@
_C = TypeVar("_C", bound="CredentialsConfiguration")


class NotResolved:
    """Annotation marker excluding a hint from config resolution.

    Attach as `Annotated[T, NotResolved()]`; the marker's truth value decides
    whether the field is skipped by the resolver.
    """

    def __init__(self, not_resolved: bool = True):
        # keep the flag so the marker can be toggled off via NotResolved(False)
        self.not_resolved = not_resolved

    def __bool__(self) -> bool:
        # the marker participates directly in boolean checks
        return self.not_resolved


def is_hint_not_resolved(hint: AnyType) -> bool:
    """Checks if hint should NOT be resolved. Final and types annotated like
    >>> Annotated[str, NotResolved()]
    are not resolved.
    """
    if is_final_type(hint):
        return True

    if not is_annotated(hint):
        return False

    # inspect the annotation metadata; the first NotResolved marker decides
    _, *metadata = get_args(hint)
    return next(
        (bool(marker) for marker in metadata if isinstance(marker, NotResolved)),
        False,
    )


def is_base_configuration_inner_hint(inner_hint: Type[Any]) -> bool:
    """True when `inner_hint` is a class derived from BaseConfiguration."""
    if not inspect.isclass(inner_hint):
        # non-class hints (unions, generics, plain values) can never qualify
        return False
    return issubclass(inner_hint, BaseConfiguration)

Expand All @@ -70,6 +101,11 @@ def is_valid_hint(hint: Type[Any]) -> bool:
if get_origin(hint) is ClassVar:
# class vars are skipped by dataclass
return True

if is_hint_not_resolved(hint):
# all hints that are not resolved are valid
return True

hint = extract_inner_type(hint)
hint = get_config_if_union_hint(hint) or hint
hint = get_origin(hint) or hint
Expand Down
3 changes: 3 additions & 0 deletions dlt/common/data_writers/buffered.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ def __init__(
self.closed_files: List[DataWriterMetrics] = [] # all fully processed files
# buffered items must be less than max items in file
self.buffer_max_items = min(buffer_max_items, file_max_items or buffer_max_items)
# Explicitly configured max size supersedes destination limit
self.file_max_bytes = file_max_bytes
if self.file_max_bytes is None and _caps:
self.file_max_bytes = _caps.recommended_file_size
self.file_max_items = file_max_items
# the open function is either gzip.open or open
self.open = (
Expand Down
2 changes: 2 additions & 0 deletions dlt/common/destination/capabilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class DestinationCapabilitiesContext(ContainerInjectableContext):

preferred_loader_file_format: TLoaderFileFormat = None
supported_loader_file_formats: Sequence[TLoaderFileFormat] = None
recommended_file_size: Optional[int] = None
"""Recommended file size in bytes when writing extract/load files"""
preferred_staging_file_format: Optional[TLoaderFileFormat] = None
supported_staging_file_formats: Sequence[TLoaderFileFormat] = None
escape_identifier: Callable[[str], str] = None
Expand Down
16 changes: 8 additions & 8 deletions dlt/common/destination/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
Any,
TypeVar,
Generic,
Final,
)
from typing_extensions import Annotated
import datetime # noqa: 251
from copy import deepcopy
import inspect
Expand All @@ -35,7 +35,7 @@
has_column_with_prop,
get_first_column_name_with_prop,
)
from dlt.common.configuration import configspec, resolve_configuration, known_sections
from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved
from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration
from dlt.common.configuration.accessors import config
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
Expand Down Expand Up @@ -78,7 +78,7 @@ class StateInfo(NamedTuple):

@configspec
class DestinationClientConfiguration(BaseConfiguration):
destination_type: Final[str] = dataclasses.field(
destination_type: Annotated[str, NotResolved()] = dataclasses.field(
default=None, init=False, repr=False, compare=False
) # which destination to load data to
credentials: Optional[CredentialsConfiguration] = None
Expand All @@ -103,11 +103,11 @@ def on_resolved(self) -> None:
class DestinationClientDwhConfiguration(DestinationClientConfiguration):
"""Configuration of a destination that supports datasets/schemas"""

dataset_name: Final[str] = dataclasses.field(
dataset_name: Annotated[str, NotResolved()] = dataclasses.field(
default=None, init=False, repr=False, compare=False
) # dataset must be final so it is not configurable
) # dataset cannot be resolved
"""dataset name in the destination to load data to, for schemas that are not default schema, it is used as dataset prefix"""
default_schema_name: Final[Optional[str]] = dataclasses.field(
default_schema_name: Annotated[Optional[str], NotResolved()] = dataclasses.field(
default=None, init=False, repr=False, compare=False
)
"""name of default schema to be used to name effective dataset to load data to"""
Expand All @@ -121,8 +121,8 @@ def _bind_dataset_name(
This method is intended to be used internally.
"""
self.dataset_name = dataset_name # type: ignore[misc]
self.default_schema_name = default_schema_name # type: ignore[misc]
self.dataset_name = dataset_name
self.default_schema_name = default_schema_name
return self

def normalize_dataset_name(self, schema: Schema) -> str:
Expand Down
11 changes: 5 additions & 6 deletions dlt/common/storages/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
GcpServiceAccountCredentials,
AwsCredentials,
GcpOAuthCredentials,
AzureCredentials,
AzureCredentialsWithoutDefaults,
AnyAzureCredentials,
BaseConfiguration,
)
from dlt.common.typing import DictStrAny
Expand Down Expand Up @@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration):


FileSystemCredentials = Union[
AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials
AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials
]


Expand All @@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration):
"gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"s3": AwsCredentials,
"az": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"az": AnyAzureCredentials,
"abfs": AnyAzureCredentials,
"adl": AnyAzureCredentials,
}

bucket_url: str = None
Expand Down
2 changes: 2 additions & 0 deletions dlt/destinations/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
from dlt.destinations.impl.bigquery import bigquery_adapter
from dlt.destinations.impl.synapse import synapse_adapter
from dlt.destinations.impl.clickhouse import clickhouse_adapter
from dlt.destinations.impl.athena import athena_adapter

# Public API of this module: the per-destination adapter helpers re-exported here.
__all__ = [
    "weaviate_adapter",
    "qdrant_adapter",
    "bigquery_adapter",
    "synapse_adapter",
    "clickhouse_adapter",
    "athena_adapter",
]
Loading

0 comments on commit 252fa92

Please sign in to comment.