-
Notifications
You must be signed in to change notification settings - Fork 159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
incremental scd2
with merge_key
#1818
Changes from 6 commits
a3a29ac
aa62983
bcbd0c7
70d036d
5cfc5c8
54dfe14
c28f8ba
7d1aad9
a0aa99c
65838e4
16e52cd
c826afc
ae7cd00
5888a07
8ecf7c6
1035ffa
c5be436
21ce4f8
84c101b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -231,15 +231,22 @@ class TWriteDispositionDict(TypedDict): | |
disposition: TWriteDisposition | ||
|
||
|
||
class TMergeDispositionDict(TWriteDispositionDict, total=False): | ||
class TMergeDispositionDict(TWriteDispositionDict): | ||
strategy: Optional[TLoaderMergeStrategy] | ||
|
||
|
||
class TScd2StrategyDict(TMergeDispositionDict, total=False): | ||
validity_column_names: Optional[List[str]] | ||
active_record_timestamp: Optional[TAnyDateTime] | ||
boundary_timestamp: Optional[TAnyDateTime] | ||
row_version_column_name: Optional[str] | ||
retire_if_absent: Optional[bool] | ||
natural_key: Optional[str] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is no longer needed, is it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whoops. Removed now. |
||
|
||
|
||
TWriteDispositionConfig = Union[TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict] | ||
TWriteDispositionConfig = Union[ | ||
TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict, TScd2StrategyDict | ||
] | ||
|
||
|
||
class _TTableSchemaBase(TTableProcessingHints, total=False): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
TTableSchemaColumns, | ||
TWriteDispositionConfig, | ||
TMergeDispositionDict, | ||
TScd2StrategyDict, | ||
TAnySchemaColumns, | ||
TTableFormat, | ||
TSchemaContract, | ||
|
@@ -452,10 +453,19 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: | |
md_dict: TMergeDispositionDict = dict_.pop("write_disposition") | ||
if merge_strategy := md_dict.get("strategy"): | ||
dict_["x-merge-strategy"] = merge_strategy | ||
if "boundary_timestamp" in md_dict: | ||
dict_["x-boundary-timestamp"] = md_dict["boundary_timestamp"] | ||
# add columns for `scd2` merge strategy | ||
|
||
if merge_strategy == "scd2": | ||
md_dict = cast(TScd2StrategyDict, md_dict) | ||
if "boundary_timestamp" in md_dict: | ||
dict_["x-boundary-timestamp"] = md_dict["boundary_timestamp"] | ||
if "retire_if_absent" in md_dict: | ||
dict_["x-retire-if-absent"] = md_dict["retire_if_absent"] | ||
if "natural_key" in md_dict: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also no longer needed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also removed |
||
nk = md_dict["natural_key"] | ||
if nk in dict_["columns"]: | ||
dict_["columns"][nk]["x-natural-key"] = True | ||
else: | ||
dict_["columns"][nk] = {"name": nk, "x-natural-key": True} | ||
if md_dict.get("validity_column_names") is None: | ||
from_, to = DEFAULT_VALIDITY_COLUMN_NAMES | ||
else: | ||
|
@@ -523,13 +533,25 @@ def validate_write_disposition_hint(wd: TTableHintTemplate[TWriteDispositionConf | |
f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""" | ||
) | ||
|
||
for ts in ("active_record_timestamp", "boundary_timestamp"): | ||
if ts == "active_record_timestamp" and wd.get("active_record_timestamp") is None: | ||
continue # None is allowed for active_record_timestamp | ||
if ts in wd: | ||
try: | ||
ensure_pendulum_datetime(wd[ts]) # type: ignore[literal-required] | ||
except Exception: | ||
raise ValueError( | ||
f'could not parse `{ts}` value "{wd[ts]}"' # type: ignore[literal-required] | ||
) | ||
if wd.get("strategy") == "scd2": | ||
wd = cast(TScd2StrategyDict, wd) | ||
for ts in ("active_record_timestamp", "boundary_timestamp"): | ||
if ( | ||
ts == "active_record_timestamp" | ||
and wd.get("active_record_timestamp") is None | ||
): | ||
continue # None is allowed for active_record_timestamp | ||
if ts in wd: | ||
try: | ||
ensure_pendulum_datetime(wd[ts]) # type: ignore[literal-required] | ||
except Exception: | ||
raise ValueError( | ||
f'could not parse `{ts}` value "{wd[ts]}"' # type: ignore[literal-required] | ||
) | ||
|
||
if ( | ||
"retire_if_absent" in wd | ||
and not wd["retire_if_absent"] | ||
and "natural_key" not in wd | ||
): | ||
raise ValueError("`natural_key` is required when `retire_if_absent=False`") |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,13 +9,11 @@ | |
from dlt.common.typing import TAnyDateTime | ||
from dlt.common.pendulum import pendulum | ||
from dlt.common.pipeline import LoadInfo | ||
from dlt.common.schema.exceptions import ColumnNameConflictException | ||
from dlt.common.schema.typing import DEFAULT_VALIDITY_COLUMN_NAMES | ||
from dlt.common.normalizers.json.relational import DataItemNormalizer | ||
from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention | ||
from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision | ||
from dlt.extract.resource import DltResource | ||
from dlt.pipeline.exceptions import PipelineStepFailed | ||
|
||
from tests.cases import arrow_table_all_data_types | ||
from tests.load.utils import ( | ||
|
@@ -717,6 +715,102 @@ def r(data): | |
) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"destination_config", | ||
destinations_configs(default_sql_configs=True, subset=["duckdb"]), | ||
ids=lambda x: x.name, | ||
) | ||
def test_retire_if_absent( | ||
destination_config: DestinationTestConfiguration, | ||
) -> None: | ||
p = destination_config.setup_pipeline("abstract", dev_mode=True) | ||
|
||
@dlt.resource( | ||
table_name="dim_test", | ||
write_disposition={ | ||
"disposition": "merge", | ||
"strategy": "scd2", | ||
"retire_if_absent": False, | ||
"natural_key": "nk", | ||
}, | ||
) | ||
def r(data): | ||
yield data | ||
|
||
# load 1 — initial load | ||
dim_snap = [ | ||
{"nk": 1, "foo": "foo"}, | ||
{"nk": 2, "foo": "foo"}, | ||
] | ||
info = p.run(r(dim_snap), **destination_config.run_kwargs) | ||
assert_load_info(info) | ||
assert load_table_counts(p, "dim_test")["dim_test"] == 2 | ||
_, to = DEFAULT_VALIDITY_COLUMN_NAMES | ||
# both records should be active (i.e. not retired) | ||
assert [row[to] for row in get_table(p, "dim_test")] == [None, None] | ||
|
||
# load 2 — natural key 2 is absent, natural key 1 is unchanged | ||
dim_snap = [ | ||
{"nk": 1, "foo": "foo"}, | ||
] | ||
info = p.run(r(dim_snap), **destination_config.run_kwargs) | ||
assert_load_info(info) | ||
assert load_table_counts(p, "dim_test")["dim_test"] == 2 | ||
# both records should still be active | ||
assert [row[to] for row in get_table(p, "dim_test")] == [None, None] | ||
|
||
# load 3 — natural key 2 is absent, natural key 1 has changed | ||
dim_snap = [ | ||
{"nk": 1, "foo": "bar"}, | ||
] | ||
info = p.run(r(dim_snap), **destination_config.run_kwargs) | ||
assert_load_info(info) | ||
assert load_table_counts(p, "dim_test")["dim_test"] == 3 | ||
boundary_ts = get_load_package_created_at(p, info) | ||
# natural key 1 should now have two records (one retired, one active) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should also test here that when the row with natural key 1 reverts back to the first version (and has the same row hash as it had in the first load) it will still create a new entry. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
actual = [{k: v for k, v in row.items() if k in ("nk", to)} for row in get_table(p, "dim_test")] | ||
expected = [{"nk": 1, to: boundary_ts}, {"nk": 1, to: None}, {"nk": 2, to: None}] | ||
assert_records_as_set(actual, expected) # type: ignore[arg-type] | ||
|
||
# now test various configs | ||
|
||
with pytest.raises(ValueError): | ||
# should raise because `natural_key` is required when `retire_if_absent=False` | ||
r.apply_hints( | ||
write_disposition={ | ||
"disposition": "merge", | ||
"strategy": "scd2", | ||
"retire_if_absent": False, | ||
} | ||
) | ||
|
||
# `retire_if_absent=True` does not require `natural_key` | ||
r.apply_hints( | ||
write_disposition={ | ||
"disposition": "merge", | ||
"strategy": "scd2", | ||
"retire_if_absent": True, | ||
} | ||
) | ||
assert r.compute_table_schema()["x-retire-if-absent"] # type: ignore[typeddict-item] | ||
|
||
# user-provided hints for `natural_key` column should be respected | ||
r.apply_hints( | ||
columns={"nk": {"x-foo": "foo"}}, # type: ignore[typeddict-unknown-key] | ||
write_disposition={ | ||
"disposition": "merge", | ||
"strategy": "scd2", | ||
"retire_if_absent": False, | ||
"natural_key": "nk", | ||
}, | ||
) | ||
assert r.compute_table_schema()["columns"]["nk"] == { | ||
"x-foo": "foo", | ||
"name": "nk", | ||
"x-natural-key": True, | ||
} | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"destination_config", | ||
destinations_configs(default_sql_configs=True, subset=["duckdb"]), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe call this "retire_absent_rows"? then it is a bit more clear just from the naming what it does. wdyt?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes! Changed it.