Feat/1492 extend timestamp config (#1669)
* feat: add timezone flag to configure timestamp data

* fix: delete timezone init

* test: add duckdb timestamps with timezone

* test: fix resource hints for timestamp

* test: correct duckdb timestamps

* test: timezone tests for parquet files

* exp: add notebook with timestamp exploration

* test: refactor timestamp tests

* test: simplified tests and extended experiments

* exp: timestamp exp for duckdb and parquet

* fix: add pyarrow reflection for timezone flag

* fix lint errors

* fix: CI/CD move tests pyarrow module

* fix: pyarrow timezone defaults true

* refactor: typemapper signatures

* fix: duckdb timestamp config

* docs: updated duckdb.md timestamps

* fix: revert duckdb timestamp defaults

* fix: restore duckdb timestamp default

* fix: duckdb timestamp mapper

* fix: delete notebook

* docs: added timestamp and timezone section

* refactor: duckdb precision exception message

* feat: postgres timestamp timezone config

* fix: postgres timestamp precision

* fix: postgres timezone false case

* feat: add snowflake timezone and precision flag

* test: postgres invalid timestamp precision

* test: unified timestamp invalid precision

* test: unified column flag timezone

* chore: add warn log for unsupported timezone or precision flag

* docs: timezone and precision flags for timestamps

* fix: none case error

* docs: add duckdb default precision

* fix: typing errors

* rebase: formatted files from upstream devel

* fix: warning message and reference TODO

* test: delete duplicated input_data array

* docs: moved timestamp config to data types section

* fix: lint and format

* fix: lint local errors
donotpush authored and willi-mueller committed Sep 2, 2024
1 parent d848f1d commit 7c803f0
Showing 21 changed files with 536 additions and 96 deletions.
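
Taken together, the commits above let a timestamp column carry `timezone` and `precision` hints that each destination maps to a concrete database type (or warns about when it cannot honor them). A minimal end-to-end sketch, assuming the standard dlt column-hint syntax; the resource and pipeline names are invented:

```python
import dlt

@dlt.resource(columns={"ts": {"data_type": "timestamp", "timezone": False}})
def readings():
    # "ts" should land as a timezone-naive timestamp at the destination
    yield {"id": 1, "ts": "2024-09-02T10:00:00"}

pipeline = dlt.pipeline(pipeline_name="tz_demo", destination="duckdb")
pipeline.run(readings())
```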
9 changes: 8 additions & 1 deletion dlt/common/libs/pyarrow.py
```diff
@@ -54,7 +54,10 @@ def get_py_arrow_datatype(
     elif column_type == "bool":
         return pyarrow.bool_()
     elif column_type == "timestamp":
-        return get_py_arrow_timestamp(column.get("precision") or caps.timestamp_precision, tz)
+        # sets timezone to None when timezone hint is false
+        timezone = tz if column.get("timezone", True) else None
+        precision = column.get("precision") or caps.timestamp_precision
+        return get_py_arrow_timestamp(precision, timezone)
     elif column_type == "bigint":
         return get_pyarrow_int(column.get("precision"))
     elif column_type == "binary":
@@ -139,6 +142,10 @@ def get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType:
             precision = 6
         else:
             precision = 9
+
+        if dtype.tz is None:
+            return dict(data_type="timestamp", precision=precision, timezone=False)
+
         return dict(data_type="timestamp", precision=precision)
     elif pyarrow.types.is_date(dtype):
         return dict(data_type="date")
```
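
For orientation, a sketch of the two mappings the hunks above implement, written against plain pyarrow (the `"us"` unit and `"UTC"` zone are illustrative stand-ins for whatever the destination capabilities supply):

```python
import pyarrow as pa

# forward direction: timezone hint False -> naive Arrow type, otherwise tz-aware
naive = pa.timestamp("us")            # column with {"timezone": False}
aware = pa.timestamp("us", tz="UTC")  # default (hint absent or True)

# reflection direction: a naive dtype now yields timezone=False in the column type
assert naive.tz is None and aware.tz == "UTC"
```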
1 change: 1 addition & 0 deletions dlt/common/schema/typing.py
```diff
@@ -94,6 +94,7 @@ class TColumnType(TypedDict, total=False):
     data_type: Optional[TDataType]
     precision: Optional[int]
     scale: Optional[int]
+    timezone: Optional[bool]


 class TColumnSchemaBase(TColumnType, total=False):
```
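
A column type carrying the new flag is simply a dict conforming to this TypedDict, e.g.:

```python
from dlt.common.schema.typing import TColumnType

col: TColumnType = {"data_type": "timestamp", "precision": 3, "timezone": False}
```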
14 changes: 7 additions & 7 deletions dlt/destinations/impl/athena/athena.py
```diff
@@ -104,9 +104,9 @@ class AthenaTypeMapper(TypeMapper):
     def __init__(self, capabilities: DestinationCapabilitiesContext):
         super().__init__(capabilities)

-    def to_db_integer_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
-    ) -> str:
+    def to_db_integer_type(self, column: TColumnSchema, table: TTableSchema = None) -> str:
+        precision = column.get("precision")
+        table_format = table.get("table_format")
         if precision is None:
             return "bigint"
         if precision <= 8:
@@ -403,9 +403,9 @@ def _from_db_type(
     ) -> TColumnType:
         return self.type_mapper.from_db_type(hive_t, precision, scale)

-    def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, c: TColumnSchema, table: TTableSchema = None) -> str:
         return (
-            f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}"
+            f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table)}"
         )

     def _iceberg_partition_clause(self, partition_hints: Optional[Dict[str, str]]) -> str:
@@ -429,9 +429,9 @@ def _get_table_update_sql(
         # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries
         # or if we are in iceberg mode, we create iceberg tables for all tables
         table = self.prepare_load_table(table_name, self.in_staging_mode)
-        table_format = table.get("table_format")

         is_iceberg = self._is_iceberg_table(table) or table.get("write_disposition", None) == "skip"
-        columns = ", ".join([self._get_column_def_sql(c, table_format) for c in new_columns])
+        columns = ", ".join([self._get_column_def_sql(c, table) for c in new_columns])

         # create unique tag for iceberg table so it is never recreated in the same folder
         # athena requires some kind of special cleaning (or that is a bug) so we cannot refresh
```
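
The same signature migration repeats in every destination mapper below: instead of pre-extracted `precision`/`table_format` scalars, the methods now receive the whole column (and table) schema and read hints themselves, so a new hint like `timezone` flows through without further signature changes. A condensed sketch of the pattern, not the actual dlt base class:

```python
from typing import Optional, TypedDict

class Column(TypedDict, total=False):  # stand-in for dlt's TColumnSchema
    name: str
    precision: Optional[int]
    timezone: Optional[bool]

def to_db_integer_type(column: Column) -> str:
    # the mapper digs hints out of the schema object itself
    precision = column.get("precision")
    if precision is None or precision > 32:
        return "bigint"
    return "int"
```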
8 changes: 4 additions & 4 deletions dlt/destinations/impl/bigquery/bigquery.py
```diff
@@ -90,9 +90,9 @@ class BigQueryTypeMapper(TypeMapper):
         "TIME": "time",
     }

-    def to_db_decimal_type(self, precision: Optional[int], scale: Optional[int]) -> str:
+    def to_db_decimal_type(self, column: TColumnSchema) -> str:
         # Use BigQuery's BIGNUMERIC for large precision decimals
-        precision, scale = self.decimal_precision(precision, scale)
+        precision, scale = self.decimal_precision(column.get("precision"), column.get("scale"))
         if precision > 38 or scale > 9:
             return "BIGNUMERIC(%i,%i)" % (precision, scale)
         return "NUMERIC(%i,%i)" % (precision, scale)
@@ -417,10 +417,10 @@ def _get_info_schema_columns_query(

         return query, folded_table_names

-    def _get_column_def_sql(self, column: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, column: TColumnSchema, table: TTableSchema = None) -> str:
         name = self.sql_client.escape_column_name(column["name"])
         column_def_sql = (
-            f"{name} {self.type_mapper.to_db_type(column, table_format)} {self._gen_not_null(column.get('nullable', True))}"
+            f"{name} {self.type_mapper.to_db_type(column, table)} {self._gen_not_null(column.get('nullable', True))}"
         )
         if column.get(ROUND_HALF_EVEN_HINT, False):
             column_def_sql += " OPTIONS (rounding_mode='ROUND_HALF_EVEN')"
```
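
The decimal rule above is easy to sanity-check by hand; NUMERIC covers up to 38 digits of precision with scale up to 9, and anything larger is emitted as BIGNUMERIC:

```python
def bq_decimal_type(precision: int, scale: int) -> str:
    # mirrors the NUMERIC/BIGNUMERIC cutoff from the hunk above
    if precision > 38 or scale > 9:
        return "BIGNUMERIC(%i,%i)" % (precision, scale)
    return "NUMERIC(%i,%i)" % (precision, scale)

assert bq_decimal_type(38, 9) == "NUMERIC(38,9)"
assert bq_decimal_type(76, 38) == "BIGNUMERIC(76,38)"
```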
6 changes: 3 additions & 3 deletions dlt/destinations/impl/clickhouse/clickhouse.py
```diff
@@ -293,7 +293,7 @@ def _create_merge_followup_jobs(
     ) -> List[FollowupJobRequest]:
         return [ClickHouseMergeJob.from_table_chain(table_chain, self.sql_client)]

-    def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, c: TColumnSchema, table: TTableSchema = None) -> str:
         # Build column definition.
         # The primary key and sort order definition is defined outside column specification.
         hints_ = " ".join(
@@ -307,9 +307,9 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
         # Alter table statements only accept `Nullable` modifiers.
         # JSON type isn't nullable in ClickHouse.
         type_with_nullability_modifier = (
-            f"Nullable({self.type_mapper.to_db_type(c)})"
+            f"Nullable({self.type_mapper.to_db_type(c, table)})"
             if c.get("nullable", True)
-            else self.type_mapper.to_db_type(c)
+            else self.type_mapper.to_db_type(c, table)
         )

         return (
```
15 changes: 8 additions & 7 deletions dlt/destinations/impl/databricks/databricks.py
```diff
@@ -68,9 +68,8 @@ class DatabricksTypeMapper(TypeMapper):
         "wei": "DECIMAL(%i,%i)",
     }

-    def to_db_integer_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
-    ) -> str:
+    def to_db_integer_type(self, column: TColumnSchema, table: TTableSchema = None) -> str:
+        precision = column.get("precision")
         if precision is None:
             return "BIGINT"
         if precision <= 8:
@@ -323,10 +322,12 @@ def _create_merge_followup_jobs(
         return [DatabricksMergeJob.from_table_chain(table_chain, self.sql_client)]

     def _make_add_column_sql(
-        self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None
+        self, new_columns: Sequence[TColumnSchema], table: TTableSchema = None
     ) -> List[str]:
         # Override because databricks requires multiple columns in a single ADD COLUMN clause
-        return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c) for c in new_columns)]
+        return [
+            "ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c, table) for c in new_columns)
+        ]

     def _get_table_update_sql(
         self,
@@ -351,10 +352,10 @@ def _from_db_type(
     ) -> TColumnType:
         return self.type_mapper.from_db_type(bq_t, precision, scale)

-    def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, c: TColumnSchema, table: TTableSchema = None) -> str:
         name = self.sql_client.escape_column_name(c["name"])
         return (
-            f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}"
+            f"{name} {self.type_mapper.to_db_type(c, table)} {self._gen_not_null(c.get('nullable', True))}"
         )

     def _get_storage_table_query_columns(self) -> List[str]:
```
12 changes: 8 additions & 4 deletions dlt/destinations/impl/dremio/dremio.py
```diff
@@ -195,10 +195,10 @@ def _from_db_type(
     ) -> TColumnType:
         return self.type_mapper.from_db_type(bq_t, precision, scale)

-    def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, c: TColumnSchema, table: TTableSchema = None) -> str:
         name = self.sql_client.escape_column_name(c["name"])
         return (
-            f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}"
+            f"{name} {self.type_mapper.to_db_type(c, table)} {self._gen_not_null(c.get('nullable', True))}"
         )

     def _create_merge_followup_jobs(
@@ -207,9 +207,13 @@ def _create_merge_followup_jobs(
     ) -> List[FollowupJobRequest]:
         return [DremioMergeJob.from_table_chain(table_chain, self.sql_client)]

     def _make_add_column_sql(
-        self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None
+        self, new_columns: Sequence[TColumnSchema], table: TTableSchema = None
     ) -> List[str]:
-        return ["ADD COLUMNS (" + ", ".join(self._get_column_def_sql(c) for c in new_columns) + ")"]
+        return [
+            "ADD COLUMNS ("
+            + ", ".join(self._get_column_def_sql(c, table) for c in new_columns)
+            + ")"
+        ]

     def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool:
         return self.config.truncate_tables_on_staging_destination_before_load
```
43 changes: 31 additions & 12 deletions dlt/destinations/impl/duckdb/duck.py
```diff
@@ -62,9 +62,8 @@ class DuckDbTypeMapper(TypeMapper):
         "TIMESTAMP_NS": "timestamp",
     }

-    def to_db_integer_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
-    ) -> str:
+    def to_db_integer_type(self, column: TColumnSchema, table: TTableSchema = None) -> str:
+        precision = column.get("precision")
         if precision is None:
             return "BIGINT"
         # Precision is number of bits
@@ -83,19 +82,39 @@ def to_db_integer_type(
         )

     def to_db_datetime_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
+        self,
+        column: TColumnSchema,
+        table: TTableSchema = None,
     ) -> str:
+        column_name = column.get("name")
+        table_name = table.get("name")
+        timezone = column.get("timezone")
+        precision = column.get("precision")
+
+        if timezone and precision is not None:
+            raise TerminalValueError(
+                f"DuckDB does not support both timezone and precision for column '{column_name}' in"
+                f" table '{table_name}'. To resolve this issue, either set timezone to False or"
+                " None, or use the default precision."
+            )
+
+        if timezone:
+            return "TIMESTAMP WITH TIME ZONE"
+        elif timezone is not None:  # condition for when timezone is False given that none is falsy
+            return "TIMESTAMP"
+
         if precision is None or precision == 6:
-            return super().to_db_datetime_type(precision, table_format)
-        if precision == 0:
+            return None
+        elif precision == 0:
             return "TIMESTAMP_S"
-        if precision == 3:
+        elif precision == 3:
             return "TIMESTAMP_MS"
-        if precision == 9:
+        elif precision == 9:
             return "TIMESTAMP_NS"

         raise TerminalValueError(
-            f"timestamp with {precision} decimals after seconds cannot be mapped into duckdb"
-            " TIMESTAMP type"
+            f"DuckDB does not support precision '{precision}' for '{column_name}' in table"
+            f" '{table_name}'"
         )

     def from_db_type(
@@ -162,15 +181,15 @@ def create_load_job(
             job = DuckDbCopyJob(file_path)
         return job

-    def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, c: TColumnSchema, table: TTableSchema = None) -> str:
         hints_str = " ".join(
             self.active_hints.get(h, "")
             for h in self.active_hints.keys()
             if c.get(h, False) is True
         )
         column_name = self.sql_client.escape_column_name(c["name"])
         return (
-            f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}"
+            f"{column_name} {self.type_mapper.to_db_type(c, table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}"
         )

     def _from_db_type(
```
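
Read together with the mapper above, the per-hint outcomes for DuckDB work out as follows; a hedged sketch using standard dlt column hints (column names invented, and the default type assumes DuckDB's capability settings):

```python
import dlt

@dlt.resource(columns={
    "ts_default": {"data_type": "timestamp"},                   # TIMESTAMP WITH TIME ZONE
    "ts_naive": {"data_type": "timestamp", "timezone": False},  # TIMESTAMP
    "ts_millis": {"data_type": "timestamp", "precision": 3},    # TIMESTAMP_MS
    # {"timezone": True, "precision": 3} would raise TerminalValueError
})
def events():
    yield {
        "ts_default": "2024-09-02T10:00:00+00:00",
        "ts_naive": "2024-09-02T10:00:00",
        "ts_millis": "2024-09-02T10:00:00.123",
    }
```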
24 changes: 15 additions & 9 deletions dlt/destinations/impl/lancedb/lancedb_client.py
```diff
@@ -41,7 +41,7 @@
     LoadJob,
 )
 from dlt.common.pendulum import timedelta
-from dlt.common.schema import Schema, TTableSchema, TSchemaTables
+from dlt.common.schema import Schema, TTableSchema, TSchemaTables, TColumnSchema
 from dlt.common.schema.typing import (
     TColumnType,
     TTableFormat,
@@ -105,21 +105,27 @@ class LanceDBTypeMapper(TypeMapper):
         pa.date32(): "date",
     }

-    def to_db_decimal_type(
-        self, precision: Optional[int], scale: Optional[int]
-    ) -> pa.Decimal128Type:
-        precision, scale = self.decimal_precision(precision, scale)
+    def to_db_decimal_type(self, column: TColumnSchema) -> pa.Decimal128Type:
+        precision, scale = self.decimal_precision(column.get("precision"), column.get("scale"))
         return pa.decimal128(precision, scale)

     def to_db_datetime_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
+        self,
+        column: TColumnSchema,
+        table: TTableSchema = None,
     ) -> pa.TimestampType:
+        column_name = column.get("name")
+        timezone = column.get("timezone")
+        precision = column.get("precision")
+        if timezone is not None or precision is not None:
+            logger.warning(
+                "LanceDB does not currently support column flags for timezone or precision."
+                f" These flags were used in column '{column_name}'."
+            )
         unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision]
         return pa.timestamp(unit, "UTC")

-    def to_db_time_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
-    ) -> pa.Time64Type:
+    def to_db_time_type(self, column: TColumnSchema, table: TTableSchema = None) -> pa.Time64Type:
         unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision]
         return pa.time64(unit)
```
15 changes: 6 additions & 9 deletions dlt/destinations/impl/mssql/mssql.py
```diff
@@ -59,9 +59,8 @@ class MsSqlTypeMapper(TypeMapper):
         "int": "bigint",
     }

-    def to_db_integer_type(
-        self, precision: Optional[int], table_format: TTableFormat = None
-    ) -> str:
+    def to_db_integer_type(self, column: TColumnSchema, table: TTableSchema = None) -> str:
+        precision = column.get("precision")
         if precision is None:
             return "bigint"
         if precision <= 8:
@@ -166,20 +165,18 @@ def _create_merge_followup_jobs(
         return [MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)]

     def _make_add_column_sql(
-        self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None
+        self, new_columns: Sequence[TColumnSchema], table: TTableSchema = None
     ) -> List[str]:
         # Override because mssql requires multiple columns in a single ADD COLUMN clause
-        return [
-            "ADD \n" + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns)
-        ]
+        return ["ADD \n" + ",\n".join(self._get_column_def_sql(c, table) for c in new_columns)]

-    def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
+    def _get_column_def_sql(self, c: TColumnSchema, table: TTableSchema = None) -> str:
         sc_type = c["data_type"]
         if sc_type == "text" and c.get("unique"):
             # MSSQL does not allow index on large TEXT columns
             db_type = "nvarchar(%i)" % (c.get("precision") or 900)
         else:
-            db_type = self.type_mapper.to_db_type(c)
+            db_type = self.type_mapper.to_db_type(c, table)

         hints_str = " ".join(
             self.active_hints.get(h, "")
```
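
One MSSQL-specific wrinkle above: a `text` column marked `unique` must be indexable, so it is emitted as a bounded `nvarchar` rather than the usual large text type. The rule in isolation (the 900 fallback comes straight from the hunk):

```python
from typing import Optional

def mssql_unique_text_type(precision: Optional[int] = None) -> str:
    # SQL Server cannot index unbounded text columns, so cap the length
    return "nvarchar(%i)" % (precision or 900)

assert mssql_unique_text_type() == "nvarchar(900)"
assert mssql_unique_text_type(255) == "nvarchar(255)"
```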
(Diff truncated; the remaining 11 changed files are not shown.)