diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py index 329132f495..8f6a192bb0 100644 --- a/dlt/destinations/impl/lancedb/configuration.py +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -59,6 +59,7 @@ class LanceDBClientOptions(BaseConfiguration): "sentence-transformers", "huggingface", "colbert", + "ollama", ] @@ -92,8 +93,6 @@ class LanceDBClientConfiguration(DestinationClientDwhConfiguration): Make sure it corresponds with the associated embedding model's dimensionality.""" vector_field_name: str = "vector" """Name of the special field to store the vector embeddings.""" - id_field_name: str = "id__" - """Name of the special field to manage deduplication.""" sentinel_table_name: str = "dltSentinelTable" """Name of the sentinel table that encapsulates datasets. Since LanceDB has no concept of schemas, this table serves as a proxy to group related dlt tables together.""" diff --git a/dlt/destinations/impl/lancedb/factory.py b/dlt/destinations/impl/lancedb/factory.py index 8ce2217007..d0d22ed3fb 100644 --- a/dlt/destinations/impl/lancedb/factory.py +++ b/dlt/destinations/impl/lancedb/factory.py @@ -26,8 +26,8 @@ class lancedb(Destination[LanceDBClientConfiguration, "LanceDBClient"]): def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] + caps.preferred_loader_file_format = "parquet" + caps.supported_loader_file_formats = ["parquet", "reference"] caps.type_mapper = LanceDBTypeMapper caps.max_identifier_length = 200 @@ -42,6 +42,10 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.timestamp_precision = 6 caps.supported_replace_strategies = ["truncate-and-insert"] + caps.recommended_file_size = 128_000_000 + + caps.supported_merge_strategies = ["upsert"] + return caps @property diff --git a/dlt/destinations/impl/lancedb/lancedb_adapter.py b/dlt/destinations/impl/lancedb/lancedb_adapter.py index 99d5ef43c6..8f4fbb091d 100644 --- a/dlt/destinations/impl/lancedb/lancedb_adapter.py +++ b/dlt/destinations/impl/lancedb/lancedb_adapter.py @@ -1,16 +1,20 @@ -from typing import Any +from typing import Any, Dict from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.destinations.utils import get_resource_for_adapter from dlt.extract import DltResource +from dlt.extract.items import TTableHintTemplate VECTORIZE_HINT = "x-lancedb-embed" +NO_REMOVE_ORPHANS_HINT = "x-lancedb-remove-orphans" def lancedb_adapter( data: Any, embed: TColumnNames = None, + merge_key: TColumnNames = None, + no_remove_orphans: bool = False, ) -> DltResource: """Prepares data for the LanceDB destination by specifying which columns should be embedded. @@ -20,6 +24,10 @@ def lancedb_adapter( object. embed (TColumnNames, optional): Specify columns to generate embeddings for. It can be a single column name as a string, or a list of column names. + merge_key (TColumnNames, optional): Specify columns to merge on. + It can be a single column name as a string, or a list of column names. + no_remove_orphans (bool): Specify whether to remove orphaned records in child + tables with no parent records after merges to maintain referential integrity. Returns: DltResource: A resource with applied LanceDB-specific hints. 
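Editor's note: the adapter signature above adds `merge_key` and `no_remove_orphans` alongside `embed`. A minimal usage sketch, not part of this patch, with illustrative resource, pipeline, and column names:

```py
import dlt
from dlt.destinations.impl.lancedb.lancedb_adapter import lancedb_adapter


@dlt.resource(
    primary_key=["doc_id", "chunk_id"],
    write_disposition={"disposition": "merge", "strategy": "upsert"},
)
def document_chunks():
    # Hypothetical chunked documents: "text" is embedded, "doc_id" is the canonical document key.
    yield [
        {"doc_id": 1, "chunk_id": 1, "text": "first chunk"},
        {"doc_id": 1, "chunk_id": 2, "text": "second chunk"},
    ]


pipeline = dlt.pipeline("rag_pipeline", destination="lancedb", dataset_name="docs")
pipeline.run(lancedb_adapter(document_chunks, embed="text", merge_key="doc_id"))
```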
@@ -34,6 +42,7 @@ def lancedb_adapter( """ resource = get_resource_for_adapter(data) + additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {} column_hints: TTableSchemaColumns = {} if embed: @@ -50,9 +59,28 @@ def lancedb_adapter( VECTORIZE_HINT: True, # type: ignore[misc] } - if not column_hints: - raise ValueError("A value for 'embed' must be specified.") + if merge_key: + if isinstance(merge_key, str): + merge_key = [merge_key] + if not isinstance(merge_key, list): + raise ValueError( + "'merge_key' must be a list of column names or a single column name as a string." + ) + + for column_name in merge_key: + column_hints[column_name] = { + "name": column_name, + "merge_key": True, + } + + additional_table_hints[NO_REMOVE_ORPHANS_HINT] = no_remove_orphans + + if column_hints or additional_table_hints: + resource.apply_hints(columns=column_hints, additional_table_hints=additional_table_hints) else: - resource.apply_hints(columns=column_hints) + raise ValueError( + "You must provide at least either the 'embed' or 'merge_key' or 'no_remove_orphans'" + " argument if using the adapter." + ) return resource diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py index ffa556797e..86ef36c045 100644 --- a/dlt/destinations/impl/lancedb/lancedb_client.py +++ b/dlt/destinations/impl/lancedb/lancedb_client.py @@ -1,4 +1,3 @@ -import uuid from types import TracebackType from typing import ( List, @@ -12,15 +11,17 @@ Dict, Sequence, TYPE_CHECKING, + Set, ) -from dlt.common.destination.capabilities import DataTypeMapper import lancedb # type: ignore +import lancedb.table # type: ignore import pyarrow as pa +import pyarrow.parquet as pq from lancedb import DBConnection +from lancedb.common import DATA # type: ignore from lancedb.embeddings import EmbeddingFunctionRegistry, TextEmbeddingFunction # type: ignore from lancedb.query import LanceQueryBuilder # type: ignore -from lancedb.table import Table # type: ignore from numpy import ndarray from pyarrow import Array, ChunkedArray, ArrowInvalid @@ -39,53 +40,143 @@ StorageSchemaInfo, StateInfo, LoadJob, + HasFollowupJobs, + FollowupJobRequest, ) from dlt.common.pendulum import timedelta from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import ( - C_DLT_LOAD_ID, + TColumnType, TTableSchemaColumns, TWriteDisposition, + TColumnSchema, + TTableSchema, ) from dlt.common.schema.utils import get_columns_names_with_prop -from dlt.common.storages import FileStorage -from dlt.common.typing import DictStrAny +from dlt.common.storages import FileStorage, LoadJobInfo, ParsedLoadJobFileName from dlt.destinations.impl.lancedb.configuration import ( LanceDBClientConfiguration, ) from dlt.destinations.impl.lancedb.exceptions import ( lancedb_error, ) -from dlt.destinations.impl.lancedb.lancedb_adapter import VECTORIZE_HINT +from dlt.destinations.impl.lancedb.lancedb_adapter import ( + VECTORIZE_HINT, + NO_REMOVE_ORPHANS_HINT, +) from dlt.destinations.impl.lancedb.schema import ( make_arrow_field_schema, make_arrow_table_schema, TArrowSchema, NULL_SCHEMA, TArrowField, + arrow_datatype_to_fusion_datatype, + TTableLineage, + TableJob, ) from dlt.destinations.impl.lancedb.utils import ( - list_merge_identifiers, - generate_uuid, set_non_standard_providers_environment_variables, + EMPTY_STRING_PLACEHOLDER, + fill_empty_source_column_values_with_placeholder, + get_canonical_vector_database_doc_id_merge_key, + create_filter_condition, ) +from dlt.destinations.job_impl import
ReferenceFollowupJobRequest +from dlt.destinations.type_mapping import TypeMapperImpl if TYPE_CHECKING: NDArray = ndarray[Any, Any] else: NDArray = ndarray +TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"} +UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()} +BATCH_PROCESS_CHUNK_SIZE = 10_000 EMPTY_STRING_PLACEHOLDER = "0uEoDNBpQUBwsxKbmxxB" -def upload_batch( - records: List[DictStrAny], +class LanceDBTypeMapper(TypeMapperImpl): + sct_to_unbound_dbt = { + "text": pa.string(), + "double": pa.float64(), + "bool": pa.bool_(), + "bigint": pa.int64(), + "binary": pa.binary(), + "date": pa.date32(), + "json": pa.string(), + } + + sct_to_dbt = {} + + dbt_to_sct = { + pa.string(): "text", + pa.float64(): "double", + pa.bool_(): "bool", + pa.int64(): "bigint", + pa.binary(): "binary", + pa.date32(): "date", + } + + def to_db_decimal_type(self, column: TColumnSchema) -> pa.Decimal128Type: + precision, scale = self.decimal_precision(column.get("precision"), column.get("scale")) + return pa.decimal128(precision, scale) + + def to_db_datetime_type( + self, + column: TColumnSchema, + table: TTableSchema = None, + ) -> pa.TimestampType: + column_name = column.get("name") + timezone = column.get("timezone") + precision = column.get("precision") + if timezone is not None or precision is not None: + logger.warning( + "LanceDB does not currently support column flags for timezone or precision." + f" These flags were used in column '{column_name}'." + ) + unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision] + return pa.timestamp(unit, "UTC") + + def to_db_time_type(self, column: TColumnSchema, table: TTableSchema = None) -> pa.Time64Type: + unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision] + return pa.time64(unit) + + def from_db_type( + self, + db_type: pa.DataType, + precision: Optional[int] = None, + scale: Optional[int] = None, + ) -> TColumnType: + if isinstance(db_type, pa.TimestampType): + return dict( + data_type="timestamp", + precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit], + scale=scale, + ) + if isinstance(db_type, pa.Time64Type): + return dict( + data_type="time", + precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit], + scale=scale, + ) + if isinstance(db_type, pa.Decimal128Type): + precision, scale = db_type.precision, db_type.scale + if (precision, scale) == self.capabilities.wei_precision: + return cast(TColumnType, dict(data_type="wei")) + return dict(data_type="decimal", precision=precision, scale=scale) + return super().from_db_type(cast(str, db_type), precision, scale) # type: ignore + + +def write_records( + records: DATA, /, *, db_client: DBConnection, table_name: str, - write_disposition: TWriteDisposition, - id_field_name: Optional[str] = None, + write_disposition: Optional[TWriteDisposition] = "append", + merge_key: Optional[str] = None, + remove_orphans: Optional[bool] = False, + filter_condition: Optional[str] = None, ) -> None: """Inserts records into a LanceDB table with automatic embedding computation. @@ -93,8 +184,11 @@ def upload_batch( records: The data to be inserted as payload. db_client: The LanceDB client connection. table_name: The name of the table to insert into. - id_field_name: The name of the ID field for update/merge operations. + merge_key: Keys for update/merge operations. write_disposition: The write disposition - one of 'skip', 'append', 'replace', 'merge'. 
+ remove_orphans (bool): Whether to remove orphans after insertion or not (only merge disposition). + filter_condition (str): If None, then all such rows will be deleted. + Otherwise, the condition will be used as an SQL filter to limit what rows are deleted. Raises: ValueError: If the write disposition is unsupported, or `id_field_name` is not @@ -110,16 +204,17 @@ def upload_batch( ) from e try: - if write_disposition in ("append", "skip"): + if write_disposition in ("append", "skip", "replace"): tbl.add(records) - elif write_disposition == "replace": - tbl.add(records, mode="overwrite") elif write_disposition == "merge": - if not id_field_name: - raise ValueError("To perform a merge update, 'id_field_name' must be specified.") - tbl.merge_insert( - id_field_name - ).when_matched_update_all().when_not_matched_insert_all().execute(records) + if remove_orphans: + tbl.merge_insert(merge_key).when_not_matched_by_source_delete( + filter_condition + ).execute(records) + else: + tbl.merge_insert( + merge_key + ).when_matched_update_all().when_not_matched_insert_all().execute(records) else: raise DestinationTerminalException( f"Unsupported write disposition {write_disposition} for LanceDB Destination - batch" @@ -135,6 +230,8 @@ class LanceDBClient(JobClientBase, WithStateSync): """LanceDB destination handler.""" model_func: TextEmbeddingFunction + """The embedder callback used for each chunk.""" + dataset_name: str def __init__( self, @@ -152,6 +249,7 @@ def __init__( self.registry = EmbeddingFunctionRegistry.get_instance() self.type_mapper = self.capabilities.get_type_mapper() self.sentinel_table_name = config.sentinel_table_name + self.dataset_name = self.config.normalize_dataset_name(self.schema) embedding_model_provider = self.config.embedding_model_provider @@ -169,11 +267,6 @@ def __init__( ) self.vector_field_name = self.config.vector_field_name - self.id_field_name = self.config.id_field_name - - @property - def dataset_name(self) -> str: - return self.config.normalize_dataset_name(self.schema) @property def sentinel_table(self) -> str: @@ -187,7 +280,7 @@ def make_qualified_table_name(self, table_name: str) -> str: ) def get_table_schema(self, table_name: str) -> TArrowSchema: - schema_table: Table = self.db_client.open_table(table_name) + schema_table: "lancedb.table.Table" = self.db_client.open_table(table_name) schema_table.checkout_latest() schema = schema_table.schema return cast( @@ -196,7 +289,9 @@ def get_table_schema(self, table_name: str) -> TArrowSchema: ) @lancedb_error - def create_table(self, table_name: str, schema: TArrowSchema, mode: str = "create") -> Table: + def create_table( + self, table_name: str, schema: TArrowSchema, mode: str = "create" + ) -> "lancedb.table.Table": """Create a LanceDB Table from the provided LanceModel or PyArrow schema. Args: @@ -230,7 +325,7 @@ def query_table( Returns: A LanceDB query builder. """ - query_table: Table = self.db_client.open_table(table_name) + query_table: "lancedb.table.Table" = self.db_client.open_table(table_name) query_table.checkout_latest() return query_table.search(query=query) @@ -255,7 +350,7 @@ def drop_storage(self) -> None: Deletes all tables in the dataset and all data, as well as sentinel table associated with them. - If the dataset name was not provided, it deletes all the tables in the current schema. + If the dataset name wasn't provided, it deletes all the tables in the current schema. 
""" for table_name in self._get_table_names(): self.db_client.drop_table(table_name) @@ -282,7 +377,7 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: def is_storage_initialized(self) -> bool: return self.table_exists(self.sentinel_table) - def _create_sentinel_table(self) -> Table: + def _create_sentinel_table(self) -> "lancedb.table.Table": """Create an empty table to indicate that the storage is initialized.""" return self.create_table(schema=NULL_SCHEMA, table_name=self.sentinel_table) @@ -325,7 +420,7 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] try: fq_table_name = self.make_qualified_table_name(table_name) - table: Table = self.db_client.open_table(fq_table_name) + table: "lancedb.table.Table" = self.db_client.open_table(fq_table_name) table.checkout_latest() arrow_schema: TArrowSchema = table.schema except FileNotFoundError: @@ -341,34 +436,33 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] return True, table_schema @lancedb_error - def add_table_fields( - self, table_name: str, field_schemas: List[TArrowField] - ) -> Optional[Table]: - """Add multiple fields to the LanceDB table at once. + def extend_lancedb_table_schema(self, table_name: str, field_schemas: List[pa.Field]) -> None: + """Extend LanceDB table schema with empty columns. Args: - table_name: The name of the table to create the fields on. - field_schemas: The list of fields to create. + table_name: The name of the table to create the fields on. + field_schemas: The list of PyArrow Fields to create in the target LanceDB table. """ - table: Table = self.db_client.open_table(table_name) + table: "lancedb.table.Table" = self.db_client.open_table(table_name) table.checkout_latest() - arrow_table = table.to_arrow() - # Check if any of the new fields already exist in the table. - existing_fields = set(arrow_table.schema.names) - new_fields = [field for field in field_schemas if field.name not in existing_fields] - - if not new_fields: - # All fields already present, skip. - return None + try: + # Use DataFusion SQL syntax to alter fields without loading data into client memory. + # Now, the most efficient way to modify column values is in LanceDB. + new_fields = { + field.name: f"CAST(NULL AS {arrow_datatype_to_fusion_datatype(field.type)})" + for field in field_schemas + } + table.add_columns(new_fields) - null_arrays = [pa.nulls(len(arrow_table), type=field.type) for field in new_fields] + # Make new columns nullable in the Arrow schema. + # Necessary because the Datafusion SQL API doesn't set new columns as nullable by default. + for field in field_schemas: + table.alter_columns({"path": field.name, "nullable": field.nullable}) - for field, null_array in zip(new_fields, null_arrays): - arrow_table = arrow_table.append_column(field, null_array) + # TODO: Update method below doesn't work for bulk NULL assignments, raise with LanceDB developers. + # table.update(values={field.name: None}) - try: - return self.db_client.create_table(table_name, arrow_table, mode="overwrite") except OSError: # Error occurred while creating the table, skip. 
return None @@ -376,36 +470,31 @@ def add_table_fields( def _execute_schema_update(self, only_tables: Iterable[str]) -> None: for table_name in only_tables or self.schema.tables: exists, existing_columns = self.get_storage_table(table_name) - new_columns = self.schema.get_new_table_columns( + new_columns: List[TColumnSchema] = self.schema.get_new_table_columns( table_name, existing_columns, self.capabilities.generates_case_sensitive_identifiers(), ) - embedding_fields: List[str] = get_columns_names_with_prop( - self.schema.get_table(table_name), VECTORIZE_HINT - ) logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") - if len(new_columns) > 0: + if new_columns: if exists: field_schemas: List[TArrowField] = [ make_arrow_field_schema(column["name"], column, self.type_mapper) for column in new_columns ] fq_table_name = self.make_qualified_table_name(table_name) - self.add_table_fields(fq_table_name, field_schemas) + self.extend_lancedb_table_schema(fq_table_name, field_schemas) else: if table_name not in self.schema.dlt_table_names(): embedding_fields = get_columns_names_with_prop( self.schema.get_table(table_name=table_name), VECTORIZE_HINT ) vector_field_name = self.vector_field_name - id_field_name = self.id_field_name embedding_model_func = self.model_func embedding_model_dimensions = self.config.embedding_model_dimensions else: embedding_fields = None vector_field_name = None - id_field_name = None embedding_model_func = None embedding_model_dimensions = None @@ -417,7 +506,6 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: embedding_model_func=embedding_model_func, embedding_model_dimensions=embedding_model_dimensions, vector_field_name=vector_field_name, - id_field_name=id_field_name, ) fq_table_name = self.make_qualified_table_name(table_name) self.create_table(fq_table_name, table_schema) @@ -446,7 +534,8 @@ def update_schema_in_storage(self) -> None: write_disposition = self.schema.get_table(self.schema.version_table_name).get( "write_disposition" ) - upload_batch( + + write_records( records, db_client=self.db_client, table_name=fq_version_table_name, @@ -459,15 +548,17 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: fq_state_table_name = self.make_qualified_table_name(self.schema.state_table_name) fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name) - state_table_: Table = self.db_client.open_table(fq_state_table_name) + state_table_: "lancedb.table.Table" = self.db_client.open_table(fq_state_table_name) state_table_.checkout_latest() - loads_table_: Table = self.db_client.open_table(fq_loads_table_name) + loads_table_: "lancedb.table.Table" = self.db_client.open_table(fq_loads_table_name) loads_table_.checkout_latest() # normalize property names p_load_id = self.schema.naming.normalize_identifier("load_id") - p_dlt_load_id = self.schema.naming.normalize_identifier(C_DLT_LOAD_ID) + p_dlt_load_id = self.schema.naming.normalize_identifier( + self.schema.data_item_normalizer.c_dlt_load_id # type: ignore[attr-defined] + ) p_pipeline_name = self.schema.naming.normalize_identifier("pipeline_name") p_status = self.schema.naming.normalize_identifier("status") p_version = self.schema.naming.normalize_identifier("version") @@ -508,7 +599,7 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: fq_version_table_name = 
self.make_qualified_table_name(self.schema.version_table_name) - version_table: Table = self.db_client.open_table(fq_version_table_name) + version_table: "lancedb.table.Table" = self.db_client.open_table(fq_version_table_name) version_table.checkout_latest() p_version_hash = self.schema.naming.normalize_identifier("version_hash") p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") @@ -524,8 +615,6 @@ def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaI ) ).to_list() - # LanceDB's ORDER BY clause doesn't seem to work. - # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341 most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0] return StorageSchemaInfo( version_hash=most_recent_schema[p_version_hash], @@ -543,7 +632,7 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage.""" fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) - version_table: Table = self.db_client.open_table(fq_version_table_name) + version_table: "lancedb.table.Table" = self.db_client.open_table(fq_version_table_name) version_table.checkout_latest() p_version_hash = self.schema.naming.normalize_identifier("version_hash") p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") @@ -559,8 +648,6 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: ) ).to_list() - # LanceDB's ORDER BY clause doesn't seem to work. - # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341 most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0] return StorageSchemaInfo( version_hash=most_recent_schema[p_version_hash], @@ -592,16 +679,14 @@ def complete_load(self, load_id: str) -> None: self.schema.naming.normalize_identifier("schema_name"): self.schema.name, self.schema.naming.normalize_identifier("status"): 0, self.schema.naming.normalize_identifier("inserted_at"): str(pendulum.now()), - self.schema.naming.normalize_identifier( - "schema_version_hash" - ): None, # Payload schema must match the target schema. + self.schema.naming.normalize_identifier("schema_version_hash"): None, } ] fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name) write_disposition = self.schema.get_table(self.schema.loads_table_name).get( "write_disposition" ) - upload_batch( + write_records( records, db_client=self.db_client, table_name=fq_loads_table_name, @@ -611,80 +696,158 @@ def complete_load(self, load_id: str) -> None: def create_load_job( self, table: PreparedTableSchema, file_path: str, load_id: str, restore: bool = False ) -> LoadJob: - return LanceDBLoadJob( - file_path=file_path, - type_mapper=self.type_mapper, - model_func=self.model_func, - fq_table_name=self.make_qualified_table_name(table["name"]), + if ReferenceFollowupJobRequest.is_reference_job(file_path): + return LanceDBRemoveOrphansJob(file_path) + else: + return LanceDBLoadJob(file_path, table) + + def create_table_chain_completed_followup_jobs( + self, + table_chain: Sequence[TTableSchema], + completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, + ) -> List[FollowupJobRequest]: + jobs = super().create_table_chain_completed_followup_jobs( + table_chain, completed_table_chain_jobs # type: ignore[arg-type] ) + # Orphan removal is only supported for upsert strategy because we need a deterministic key hash. 
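# Editor's note: illustrative sketch, not part of this patch. The followup job requested here
# funnels into write_records() above, which issues one of two LanceDB merge_insert shapes.
# Assumptions: an already-open table `tbl`, an Arrow payload `records`, and key column "_dlt_id".
import lancedb.table
import pyarrow as pa


def _merge_sketch(tbl: "lancedb.table.Table", records: pa.Table, filter_condition: str) -> None:
    # upsert: update rows whose "_dlt_id" matches the payload, insert the rest
    tbl.merge_insert(
        "_dlt_id"
    ).when_matched_update_all().when_not_matched_insert_all().execute(records)

    # orphan removal: delete target rows absent from the payload, limited to `filter_condition`
    tbl.merge_insert("_dlt_id").when_not_matched_by_source_delete(filter_condition).execute(
        records
    )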
+ first_table_in_chain = table_chain[0] + if first_table_in_chain.get( + "write_disposition" + ) == "merge" and not first_table_in_chain.get(NO_REMOVE_ORPHANS_HINT): + all_job_paths_ordered = [ + job.file_path + for table in table_chain + for job in completed_table_chain_jobs + if job.job_file_info.table_name == table.get("name") + ] + root_table_file_name = FileStorage.get_file_name_from_file_path( + all_job_paths_ordered[0] + ) + jobs.append(ReferenceFollowupJobRequest(root_table_file_name, all_job_paths_ordered)) + return jobs def table_exists(self, table_name: str) -> bool: return table_name in self.db_client.table_names() -class LanceDBLoadJob(RunnableLoadJob): +class LanceDBLoadJob(RunnableLoadJob, HasFollowupJobs): arrow_schema: TArrowSchema def __init__( self, file_path: str, - type_mapper: DataTypeMapper, - model_func: TextEmbeddingFunction, - fq_table_name: str, + table_schema: TTableSchema, ) -> None: super().__init__(file_path) - self._type_mapper = type_mapper - self._fq_table_name: str = fq_table_name - self._model_func = model_func self._job_client: "LanceDBClient" = None + self._table_schema: TTableSchema = table_schema def run(self) -> None: - self._db_client: DBConnection = self._job_client.db_client - self._embedding_model_func: TextEmbeddingFunction = self._model_func - self._embedding_model_dimensions: int = self._job_client.config.embedding_model_dimensions - self._id_field_name: str = self._job_client.config.id_field_name - - unique_identifiers: Sequence[str] = list_merge_identifiers(self._load_table) + db_client: DBConnection = self._job_client.db_client + fq_table_name: str = self._job_client.make_qualified_table_name(self._table_schema["name"]) write_disposition: TWriteDisposition = cast( TWriteDisposition, self._load_table.get("write_disposition", "append") ) - with FileStorage.open_zipsafe_ro(self._file_path) as f: - records: List[DictStrAny] = [json.loads(line) for line in f] + with FileStorage.open_zipsafe_ro(self._file_path, mode="rb") as f: + arrow_table: pa.Table = pq.read_table(f) # Replace empty strings with placeholder string if OpenAI is used. # https://github.com/lancedb/lancedb/issues/1577#issuecomment-2318104218. if (self._job_client.config.embedding_model_provider == "openai") and ( source_columns := get_columns_names_with_prop(self._load_table, VECTORIZE_HINT) ): - records = [ - { - k: EMPTY_STRING_PLACEHOLDER if k in source_columns and v in ("", None) else v - for k, v in record.items() - } - for record in records - ] - - if self._load_table not in self._schema.dlt_tables(): - for record in records: - # Add reserved ID fields. - uuid_id = ( - generate_uuid(record, unique_identifiers, self._fq_table_name) - if unique_identifiers - else str(uuid.uuid4()) - ) - record.update({self._id_field_name: uuid_id}) + arrow_table = fill_empty_source_column_values_with_placeholder( + arrow_table, source_columns, EMPTY_STRING_PLACEHOLDER + ) - # LanceDB expects all fields in the target arrow table to be present in the data payload. - # We add and set these missing fields, that are fields not present in the target schema, to NULL. - missing_fields = set(self._load_table["columns"]) - set(record) - for field in missing_fields: - record[field] = None + # We need upsert merge's deterministic _dlt_id to perform orphan removal. + # Hence, we require at least a primary key on the root table if the merge disposition is chosen. + if ( + (self._load_table not in self._schema.dlt_table_names()) + and not self._load_table.get("parent") # Is root table. 
+ and (write_disposition == "merge") + and (not get_columns_names_with_prop(self._load_table, "primary_key")) + ): + raise DestinationTerminalException( + "LanceDB's write disposition requires at least one explicit primary key." + ) - upload_batch( - records, - db_client=self._db_client, - table_name=self._fq_table_name, + dlt_id = self._schema.naming.normalize_identifier( + self._schema.data_item_normalizer.c_dlt_id # type: ignore[attr-defined] + ) + write_records( + arrow_table, + db_client=db_client, + table_name=fq_table_name, write_disposition=write_disposition, - id_field_name=self._id_field_name, + merge_key=dlt_id, ) + + +class LanceDBRemoveOrphansJob(RunnableLoadJob): + orphaned_ids: Set[str] + + def __init__( + self, + file_path: str, + ) -> None: + super().__init__(file_path) + self._job_client: "LanceDBClient" = None + self.references = ReferenceFollowupJobRequest.resolve_references(file_path) + + def run(self) -> None: + dlt_load_id = self._schema.naming.normalize_identifier( + self._schema.data_item_normalizer.c_dlt_load_id # type: ignore[attr-defined] + ) + dlt_id = self._schema.naming.normalize_identifier( + self._schema.data_item_normalizer.c_dlt_id # type: ignore[attr-defined] + ) + dlt_root_id = self._schema.naming.normalize_identifier( + self._schema.data_item_normalizer.c_dlt_root_id # type: ignore[attr-defined] + ) + + db_client: DBConnection = self._job_client.db_client + table_lineage: TTableLineage = [ + TableJob( + table_schema=self._schema.get_table( + ParsedLoadJobFileName.parse(file_path_).table_name + ), + table_name=ParsedLoadJobFileName.parse(file_path_).table_name, + file_path=file_path_, + ) + for file_path_ in self.references + ] + + for job in table_lineage: + target_is_root_table = "parent" not in job.table_schema + fq_table_name = self._job_client.make_qualified_table_name(job.table_name) + file_path = job.file_path + with FileStorage.open_zipsafe_ro(file_path, mode="rb") as f: + payload_arrow_table: pa.Table = pq.read_table(f) + + if target_is_root_table: + canonical_doc_id_field = get_canonical_vector_database_doc_id_merge_key( + job.table_schema + ) + filter_condition = create_filter_condition( + canonical_doc_id_field, payload_arrow_table[canonical_doc_id_field] + ) + merge_key = dlt_load_id + + else: + filter_condition = create_filter_condition( + dlt_root_id, + payload_arrow_table[dlt_root_id], + ) + merge_key = dlt_id + + write_records( + payload_arrow_table, + db_client=db_client, + table_name=fq_table_name, + write_disposition="merge", + merge_key=merge_key, + remove_orphans=True, + filter_condition=filter_condition, + ) diff --git a/dlt/destinations/impl/lancedb/schema.py b/dlt/destinations/impl/lancedb/schema.py index 27c6fb33a1..25dfbc840a 100644 --- a/dlt/destinations/impl/lancedb/schema.py +++ b/dlt/destinations/impl/lancedb/schema.py @@ -1,6 +1,5 @@ """Utilities for creating arrow schemas from table schemas.""" - -from dlt.common.json import json +from collections import namedtuple from typing import ( List, cast, @@ -11,17 +10,19 @@ from lancedb.embeddings import TextEmbeddingFunction # type: ignore from typing_extensions import TypeAlias +from dlt.common.destination.capabilities import DataTypeMapper +from dlt.common.json import json from dlt.common.schema import Schema, TColumnSchema from dlt.common.typing import DictStrAny -from dlt.common.destination.capabilities import DataTypeMapper - TArrowSchema: TypeAlias = pa.Schema TArrowDataType: TypeAlias = pa.DataType TArrowField: TypeAlias = pa.Field NULL_SCHEMA: TArrowSchema = 
pa.schema([]) """Empty pyarrow Schema with no fields.""" +TableJob = namedtuple("TableJob", ["table_schema", "table_name", "file_path"]) +TTableLineage: TypeAlias = List[TableJob] def arrow_schema_to_dict(schema: TArrowSchema) -> DictStrAny: @@ -42,7 +43,6 @@ def make_arrow_table_schema( table_name: str, schema: Schema, type_mapper: DataTypeMapper, - id_field_name: Optional[str] = None, vector_field_name: Optional[str] = None, embedding_fields: Optional[List[str]] = None, embedding_model_func: Optional[TextEmbeddingFunction] = None, @@ -51,9 +51,6 @@ def make_arrow_table_schema( """Creates a PyArrow schema from a dlt schema.""" arrow_schema: List[TArrowField] = [] - if id_field_name: - arrow_schema.append(pa.field(id_field_name, pa.string())) - if embedding_fields: # User's provided dimension config, if provided, takes precedence. vec_size = embedding_model_dimensions or embedding_model_func.ndims() @@ -83,3 +80,22 @@ def make_arrow_table_schema( metadata["embedding_functions"] = json.dumps(embedding_functions).encode("utf-8") return pa.schema(arrow_schema, metadata=metadata) + + +def arrow_datatype_to_fusion_datatype(arrow_type: TArrowSchema) -> str: + type_map = { + pa.bool_(): "BOOLEAN", + pa.int64(): "BIGINT", + pa.float64(): "DOUBLE", + pa.utf8(): "STRING", + pa.binary(): "BYTEA", + pa.date32(): "DATE", + } + + if isinstance(arrow_type, pa.Decimal128Type): + return f"DECIMAL({arrow_type.precision}, {arrow_type.scale})" + + if isinstance(arrow_type, pa.TimestampType): + return "TIMESTAMP" + + return type_map.get(arrow_type, "UNKNOWN") diff --git a/dlt/destinations/impl/lancedb/utils.py b/dlt/destinations/impl/lancedb/utils.py index aeacd4d34b..f07f2754d2 100644 --- a/dlt/destinations/impl/lancedb/utils.py +++ b/dlt/destinations/impl/lancedb/utils.py @@ -1,13 +1,15 @@ import os -import uuid -from typing import Sequence, Union, Dict +from typing import Union, Dict, List +import pyarrow as pa + +from dlt.common import logger +from dlt.common.destination.exceptions import DestinationTerminalException from dlt.common.schema import TTableSchema from dlt.common.schema.utils import get_columns_names_with_prop -from dlt.common.typing import DictStrAny from dlt.destinations.impl.lancedb.configuration import TEmbeddingProvider - +EMPTY_STRING_PLACEHOLDER = "0uEoDNBpQUBwsxKbmxxB" PROVIDER_ENVIRONMENT_VARIABLES_MAP: Dict[TEmbeddingProvider, str] = { "cohere": "COHERE_API_KEY", "gemini-text": "GOOGLE_API_KEY", @@ -16,40 +18,65 @@ } -def generate_uuid(data: DictStrAny, unique_identifiers: Sequence[str], table_name: str) -> str: - """Generates deterministic UUID - used for deduplication. +def set_non_standard_providers_environment_variables( + embedding_model_provider: TEmbeddingProvider, api_key: Union[str, None] +) -> None: + if embedding_model_provider in PROVIDER_ENVIRONMENT_VARIABLES_MAP: + os.environ[PROVIDER_ENVIRONMENT_VARIABLES_MAP[embedding_model_provider]] = api_key or "" - Args: - data (Dict[str, Any]): Arbitrary data to generate UUID for. - unique_identifiers (Sequence[str]): A list of unique identifiers. - table_name (str): LanceDB table name. - Returns: - str: A string representation of the generated UUID. 
- """ - data_id = "_".join(str(data[key]) for key in unique_identifiers) - return str(uuid.uuid5(uuid.NAMESPACE_DNS, table_name + data_id)) +def get_canonical_vector_database_doc_id_merge_key( + load_table: TTableSchema, +) -> str: + if merge_key := get_columns_names_with_prop(load_table, "merge_key"): + if len(merge_key) > 1: + raise DestinationTerminalException( + "You cannot specify multiple merge keys with LanceDB orphan remove enabled:" + f" {merge_key}" + ) + else: + return merge_key[0] + elif primary_key := get_columns_names_with_prop(load_table, "primary_key"): + # No merge key defined, warn and assume the first element of the primary key is `doc_id`. + logger.warning( + "Merge strategy selected without defined merge key - using the first element of the" + f" primary key ({primary_key}) as merge key." + ) + return primary_key[0] + else: + raise DestinationTerminalException( + "You must specify at least a primary key in order to perform orphan removal." + ) -def list_merge_identifiers(table_schema: TTableSchema) -> Sequence[str]: - """Returns a list of merge keys for a table used for either merging or deduplication. +def fill_empty_source_column_values_with_placeholder( + table: pa.Table, source_columns: List[str], placeholder: str +) -> pa.Table: + """ + Replaces empty strings and null values in the specified source columns of an Arrow table with a placeholder string. Args: - table_schema (TTableSchema): a dlt table schema. + table (pa.Table): The input Arrow table. + source_columns (List[str]): A list of column names to replace empty strings and null values in. + placeholder (str): The placeholder string to use for replacement. Returns: - Sequence[str]: A list of unique column identifiers. + pa.Table: The modified Arrow table with empty strings and null values replaced in the specified columns. 
""" - if table_schema.get("write_disposition") == "merge": - primary_keys = get_columns_names_with_prop(table_schema, "primary_key") - merge_keys = get_columns_names_with_prop(table_schema, "merge_key") - if join_keys := list(set(primary_keys + merge_keys)): - return join_keys - return get_columns_names_with_prop(table_schema, "unique") + for col_name in source_columns: + column = table[col_name] + filled_column = pa.compute.fill_null(column, fill_value=placeholder) + new_column = pa.compute.replace_substring_regex( + filled_column, pattern=r"^$", replacement=placeholder + ) + table = table.set_column(table.column_names.index(col_name), col_name, new_column) + return table -def set_non_standard_providers_environment_variables( - embedding_model_provider: TEmbeddingProvider, api_key: Union[str, None] -) -> None: - if embedding_model_provider in PROVIDER_ENVIRONMENT_VARIABLES_MAP: - os.environ[PROVIDER_ENVIRONMENT_VARIABLES_MAP[embedding_model_provider]] = api_key or "" +def create_filter_condition(field_name: str, array: pa.Array) -> str: + def format_value(element: Union[str, int, float, pa.Scalar]) -> str: + if isinstance(element, pa.Scalar): + element = element.as_py() + return "'" + element.replace("'", "''") + "'" if isinstance(element, str) else str(element) + + return f"{field_name} IN ({', '.join(map(format_value, array))})" diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 2407d2db62..a2f6312e98 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -78,7 +78,6 @@ def from_table_chain( job = cls(file_info.file_name()) job._save_text_file("\n".join(sql)) except Exception as e: - # raise exception with some context raise SqlJobCreationException(e, table_chain) from e return job diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md index 083d196aea..0ed21ad43e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md @@ -179,19 +179,61 @@ info = pipeline.run( ### Merge -The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination based on a unique identifier. +The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination based on a unique identifier. The LanceDB destination merge write disposition only supports upsert strategy. This updates existing records and inserts new ones based on a unique identifier. + +You can specify the merge disposition, primary key, and merge key either in a resource or adapter: + +```py +@dlt.resource( + primary_key=["doc_id", "chunk_id"], + merge_key=["doc_id"], + write_disposition={"disposition": "merge", "strategy": "upsert"}, +) +def my_rag_docs( + data: List[DictStrAny], +) -> Generator[List[DictStrAny], None, None]: + yield data +``` + +Or: + +```py +pipeline.run( + lancedb_adapter( + my_new_rag_docs, + merge_key="doc_id" + ), + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key=["doc_id", "chunk_id"], +) +``` + +The `primary_key` uniquely identifies each record, typically comprising a document ID and a chunk ID. +The `merge_key`, which cannot be compound, should correspond to the canonical `doc_id` used in vector databases and represent the document identifier in your data model. +It must be the first element of the `primary_key`. 
+This `merge_key` is crucial for document identification and orphan removal during merge operations. +This structure ensures proper record identification and maintains consistency with vector database concepts. + + +#### Orphan Removal + +LanceDB **automatically removes orphaned chunks** when updating or deleting parent documents during a merge operation. To disable this feature: ```py pipeline.run( lancedb_adapter( movies, embed="title", + no_remove_orphans=True # Disable with the `no_remove_orphans` flag. ), - write_disposition="merge", - primary_key="id", + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key=["doc_id", "chunk_id"], ) ``` +Note: While it's possible to omit the `merge_key` for brevity (in which case it is assumed to be the first entry of `primary_key`), +explicitly specifying both is recommended for clarity. + ### Append This is the default disposition. It will append the data to the existing data in the destination. @@ -200,7 +242,6 @@ This is the default disposition. It will append the data to the existing data in - `dataset_separator`: The character used to separate the dataset name from table names. Defaults to "___". - `vector_field_name`: The name of the special field to store vector embeddings. Defaults to "vector". -- `id_field_name`: The name of the special field used for deduplication and merging. Defaults to "id__". - `max_retries`: The maximum number of retries for embedding operations. Set to 0 to disable retries. Defaults to 3. ## dbt support diff --git a/poetry.lock b/poetry.lock index 12c0d75d1e..34f32de996 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "about-time" @@ -2167,32 +2167,33 @@ typing-extensions = ">=3.10.0" [[package]] name = "databricks-sql-connector" -version = "3.3.0" +version = "2.9.6" description = "Databricks SQL Connector for Python" optional = true -python-versions = "<4.0.0,>=3.8.0" +python-versions = "<4.0.0,>=3.7.1" files = [ - {file = "databricks_sql_connector-3.3.0-py3-none-any.whl", hash = "sha256:55ee5a4a11291bf91a235ac76e41b419ddd66a9a321065a8bfaf119acbb26d6b"}, - {file = "databricks_sql_connector-3.3.0.tar.gz", hash = "sha256:19e82965da4c86574adfe9f788c17b4494d98eb8075ba4fd4306573d2edbf194"}, + {file = "databricks_sql_connector-2.9.6-py3-none-any.whl", hash = "sha256:d830abf86e71d2eb83c6a7b7264d6c03926a8a83cec58541ddd6b83d693bde8f"}, + {file = "databricks_sql_connector-2.9.6.tar.gz", hash = "sha256:e55f5b8ede8ae6c6f31416a4cf6352f0ac019bf6875896c668c7574ceaf6e813"}, ] [package.dependencies] +alembic = ">=1.0.11,<2.0.0" lz4 = ">=4.0.2,<5.0.0" numpy = [ - {version = ">=1.16.6,<2.0.0", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, - {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.11\""}, + {version = ">=1.16.6", markers = "python_version >= \"3.7\" and python_version < \"3.11\""}, + {version = ">=1.23.4", markers = "python_version >= \"3.11\""}, ] oauthlib = ">=3.1.0,<4.0.0" openpyxl = ">=3.0.10,<4.0.0" -pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""} -pyarrow = ">=14.0.1,<17" +pandas = {version = ">=1.2.5,<3.0.0", markers = "python_version >= \"3.8\""} +pyarrow = [ + {version = ">=6.0.0", markers = "python_version >= \"3.7\" and python_version < \"3.11\""}, + {version = ">=10.0.1", markers = "python_version >= \"3.11\""}, +] requests = ">=2.18.1,<3.0.0" -thrift = ">=0.16.0,<0.21.0" -urllib3 = ">=1.26" - -[package.extras] -alembic = ["alembic (>=1.0.11,<2.0.0)", "sqlalchemy (>=2.0.21)"] -sqlalchemy = ["sqlalchemy (>=2.0.21)"] +sqlalchemy = ">=1.3.24,<2.0.0" +thrift = ">=0.16.0,<0.17.0" +urllib3 = ">=1.0" [[package]] name = "dbt-athena-community" @@ -3788,6 +3789,106 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = 
"google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, 
+ {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = 
"google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = "sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -6737,52 +6838,55 @@ files = [ [[package]] name = "pyarrow" -version = "16.1.0" +version = "17.0.0" description = "Python 
library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, - {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, - {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, - {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, - {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, - {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, - {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, - {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + 
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = 
"pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, ] [package.dependencies] numpy = ">=1.16.6" +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyasn1" version = "0.5.0" @@ -9829,4 +9933,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "985bb75a9579b44a5f9fd029ade1cc77455b544f2e18f9741b1d0d89bd188537" +content-hash = "6101cae0864d80307ae6d5f33ea263ce8e6d9f86e6e06d317c3d301818aa442e" diff --git a/pyproject.toml b/pyproject.toml index 1fa045bb07..c7b5f37a3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -235,7 +235,7 @@ dbt-duckdb = ">=1.2.0" pymongo = ">=4.3.3" pandas = ">2" alive-progress = ">=3.0.1" -pyarrow = ">=14.0.0" +pyarrow = ">=17.0.0" psycopg2-binary = ">=2.9" lancedb = { version = ">=0.8.2", markers = "python_version >= '3.9'", allow-prereleases = true } openai = ">=1.45" diff --git a/tests/load/lancedb/test_merge.py b/tests/load/lancedb/test_merge.py new file mode 100644 index 0000000000..f04c846df7 --- /dev/null +++ b/tests/load/lancedb/test_merge.py @@ -0,0 +1,425 @@ +from typing import Iterator, List, Generator, Any + +import numpy as np +import pandas as pd +import pytest +from lancedb.table import Table # type: ignore +from pandas import DataFrame +from pandas.testing import assert_frame_equal + +import dlt +from dlt.common.typing import DictStrAny, DictStrStr +from dlt.common.utils import uniq_id +from dlt.destinations.impl.lancedb.lancedb_adapter import ( + lancedb_adapter, +) +from tests.load.lancedb.utils import chunk_document +from tests.load.utils import ( + drop_active_pipeline_data, + sequence_generator, +) +from tests.pipeline.utils import ( + assert_load_info, +) + + +# Mark all tests as essential, don't remove. +pytestmark = pytest.mark.essential + + +@pytest.fixture(autouse=True) +def drop_lancedb_data() -> Iterator[None]: + yield + drop_active_pipeline_data() + + +def test_lancedb_remove_nested_orphaned_records() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_lancedb_remove_orphaned_records", + destination="lancedb", + dataset_name=f"test_lancedb_remove_orphaned_records_{uniq_id()}", + dev_mode=True, + ) + + @dlt.resource( + table_name="parent", + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="id", + merge_key="id", + ) + def identity_resource( + data: List[DictStrAny], + ) -> Generator[List[DictStrAny], None, None]: + yield data + + run_1 = [ + { + "id": 1, + "child": [ + {"bar": 1, "grandchild": [{"baz": 1}, {"baz": 2}]}, + {"bar": 2, "grandchild": [{"baz": 3}]}, + ], + }, + {"id": 2, "child": [{"bar": 3, "grandchild": [{"baz": 4}]}]}, + { + "id": 3, + "child": [ + {"bar": 10, "grandchild": [{"baz": 5}]}, + {"bar": 11, "grandchild": [{"baz": 6}, {"baz": 7}]}, + ], + }, + ] + info = pipeline.run(identity_resource(run_1)) + assert_load_info(info) + + run_2 = [ + { + "id": 1, + "child": [{"bar": 1, "grandchild": [{"baz": 1}]}], + }, # Removes bar_2, baz_2 and baz_3. + { + "id": 2, + "child": [{"bar": 4, "grandchild": [{"baz": 8}]}], + }, # Removes bar_3, baz_4. 
+ ] + info = pipeline.run(identity_resource(run_2)) + assert_load_info(info) + + with pipeline.destination_client() as client: + expected_parent_data = pd.DataFrame( + data=[ + {"id": 1}, + {"id": 2}, + {"id": 3}, + ] + ) + + expected_child_data = pd.DataFrame( + data=[ + {"bar": 1}, + {"bar": 4}, + {"bar": 10}, + {"bar": 11}, + ] + ) + + expected_grandchild_data = pd.DataFrame( + data=[ + {"baz": 1}, + {"baz": 8}, + {"baz": 5}, + {"baz": 6}, + {"baz": 7}, + ] + ) + + parent_table_name = client.make_qualified_table_name("parent") # type: ignore[attr-defined] + child_table_name = client.make_qualified_table_name("parent__child") # type: ignore[attr-defined] + grandchild_table_name = client.make_qualified_table_name( # type: ignore[attr-defined] + "parent__child__grandchild" + ) + + parent_tbl = client.db_client.open_table(parent_table_name) # type: ignore[attr-defined] + child_tbl = client.db_client.open_table(child_table_name) # type: ignore[attr-defined] + grandchild_tbl = client.db_client.open_table(grandchild_table_name) # type: ignore[attr-defined] + + actual_parent_df = parent_tbl.to_pandas().sort_values(by="id").reset_index(drop=True) + actual_child_df = child_tbl.to_pandas().sort_values(by="bar").reset_index(drop=True) + actual_grandchild_df = ( + grandchild_tbl.to_pandas().sort_values(by="baz").reset_index(drop=True) + ) + + expected_parent_data = expected_parent_data.sort_values(by="id").reset_index(drop=True) + expected_child_data = expected_child_data.sort_values(by="bar").reset_index(drop=True) + expected_grandchild_data = expected_grandchild_data.sort_values(by="baz").reset_index( + drop=True + ) + + assert_frame_equal(actual_parent_df[["id"]], expected_parent_data) + assert_frame_equal(actual_child_df[["bar"]], expected_child_data) + assert_frame_equal(actual_grandchild_df[["baz"]], expected_grandchild_data) + + +def test_lancedb_remove_orphaned_records_root_table() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_lancedb_remove_orphaned_records_root_table", + destination="lancedb", + dataset_name=f"test_lancedb_remove_orphaned_records_root_table_{uniq_id()}", + dev_mode=True, + ) + + @dlt.resource( + table_name="root", + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key=["doc_id", "chunk_hash"], + merge_key=["doc_id"], + ) + def identity_resource( + data: List[DictStrAny], + ) -> Generator[List[DictStrAny], None, None]: + yield data + + lancedb_adapter(identity_resource) + + run_1 = [ + {"doc_id": 1, "chunk_hash": "1a"}, + {"doc_id": 2, "chunk_hash": "2a"}, + {"doc_id": 2, "chunk_hash": "2b"}, + {"doc_id": 2, "chunk_hash": "2c"}, + {"doc_id": 3, "chunk_hash": "3a"}, + {"doc_id": 3, "chunk_hash": "3b"}, + ] + info = pipeline.run(identity_resource(run_1)) + assert_load_info(info) + + run_2 = [ + {"doc_id": 2, "chunk_hash": "2d"}, + {"doc_id": 2, "chunk_hash": "2e"}, + {"doc_id": 3, "chunk_hash": "3b"}, + ] + info = pipeline.run(identity_resource(run_2)) + assert_load_info(info) + + with pipeline.destination_client() as client: + expected_root_table_df = ( + pd.DataFrame( + data=[ + {"doc_id": 1, "chunk_hash": "1a"}, + {"doc_id": 2, "chunk_hash": "2d"}, + {"doc_id": 2, "chunk_hash": "2e"}, + {"doc_id": 3, "chunk_hash": "3b"}, + ] + ) + .sort_values(by=["doc_id", "chunk_hash"]) + .reset_index(drop=True) + ) + + root_table_name = client.make_qualified_table_name("root") # type: ignore[attr-defined] + tbl = client.db_client.open_table(root_table_name) # type: ignore[attr-defined] + + actual_root_df: DataFrame = ( + 
tbl.to_pandas().sort_values(by=["doc_id", "chunk_hash"]).reset_index(drop=True) + )[["doc_id", "chunk_hash"]] + + assert_frame_equal(actual_root_df, expected_root_table_df) + + +def test_lancedb_remove_orphaned_records_root_table_string_doc_id() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_lancedb_remove_orphaned_records_root_table", + destination="lancedb", + dataset_name=f"test_lancedb_remove_orphaned_records_root_table_{uniq_id()}", + dev_mode=True, + ) + + @dlt.resource( + table_name="root", + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key=["doc_id", "chunk_hash"], + merge_key=["doc_id"], + ) + def identity_resource( + data: List[DictStrAny], + ) -> Generator[List[DictStrAny], None, None]: + yield data + + lancedb_adapter(identity_resource) + + run_1 = [ + {"doc_id": "A", "chunk_hash": "1a"}, + {"doc_id": "B", "chunk_hash": "2a"}, + {"doc_id": "B", "chunk_hash": "2b"}, + {"doc_id": "B", "chunk_hash": "2c"}, + {"doc_id": "C", "chunk_hash": "3a"}, + {"doc_id": "C", "chunk_hash": "3b"}, + ] + info = pipeline.run(identity_resource(run_1)) + assert_load_info(info) + + run_2 = [ + {"doc_id": "B", "chunk_hash": "2d"}, + {"doc_id": "B", "chunk_hash": "2e"}, + {"doc_id": "C", "chunk_hash": "3b"}, + ] + info = pipeline.run(identity_resource(run_2)) + assert_load_info(info) + + with pipeline.destination_client() as client: + expected_root_table_df = ( + pd.DataFrame( + data=[ + {"doc_id": "A", "chunk_hash": "1a"}, + {"doc_id": "B", "chunk_hash": "2d"}, + {"doc_id": "B", "chunk_hash": "2e"}, + {"doc_id": "C", "chunk_hash": "3b"}, + ] + ) + .sort_values(by=["doc_id", "chunk_hash"]) + .reset_index(drop=True) + ) + + root_table_name = client.make_qualified_table_name("root") # type: ignore[attr-defined] + tbl = client.db_client.open_table(root_table_name) # type: ignore[attr-defined] + + actual_root_df: DataFrame = ( + tbl.to_pandas().sort_values(by=["doc_id", "chunk_hash"]).reset_index(drop=True) + )[["doc_id", "chunk_hash"]] + + assert_frame_equal(actual_root_df, expected_root_table_df) + + +def test_lancedb_root_table_remove_orphaned_records_with_real_embeddings() -> None: + @dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + table_name="document", + primary_key=["doc_id", "chunk"], + merge_key="doc_id", + ) + def documents(docs: List[DictStrAny]) -> Generator[DictStrAny, None, None]: + for doc in docs: + doc_id = doc["doc_id"] + for chunk in chunk_document(doc["text"]): + yield {"doc_id": doc_id, "doc_text": doc["text"], "chunk": chunk} + + @dlt.source() + def documents_source( + docs: List[DictStrAny], + ) -> Any: + return documents(docs) + + lancedb_adapter( + documents, + embed=["chunk"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_lancedb_remove_orphaned_records_with_embeddings", + destination="lancedb", + dataset_name=f"test_lancedb_remove_orphaned_records_{uniq_id()}", + dev_mode=True, + ) + + initial_docs = [ + { + "text": ( + "This is the first document. It contains some text that will be chunked and" + " embedded. (I don't want to be seen in updated run's embedding chunk texts btw)" + ), + "doc_id": 1, + }, + { + "text": "Here's another document. 
It's a bit different from the first one.", + "doc_id": 2, + }, + ] + + info = pipeline.run(documents_source(initial_docs)) + assert_load_info(info) + + updated_docs = [ + { + "text": "This is the first document, but it has been updated with new content.", + "doc_id": 1, + }, + { + "text": "This is a completely new document that wasn't in the initial set.", + "doc_id": 3, + }, + ] + + info = pipeline.run(documents_source(updated_docs)) + assert_load_info(info) + + with pipeline.destination_client() as client: + embeddings_table_name = client.make_qualified_table_name("document") # type: ignore[attr-defined] + tbl: Table = client.db_client.open_table(embeddings_table_name) # type: ignore[attr-defined] + df = tbl.to_pandas() + + # Check (non-empty) embeddings as present, and that orphaned embeddings have been discarded. + assert len(df) == 21 + assert "vector" in df.columns + for _, vector in enumerate(df["vector"]): + assert isinstance(vector, np.ndarray) + assert vector.size > 0 + + +def test_lancedb_compound_merge_key_root_table() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_lancedb_compound_merge_key", + destination="lancedb", + dataset_name=f"test_lancedb_remove_orphaned_records_root_table_{uniq_id()}", + dev_mode=True, + ) + + @dlt.resource( + table_name="root", + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key=["doc_id", "chunk_hash"], + merge_key=["doc_id", "chunk_hash"], + ) + def identity_resource( + data: List[DictStrAny], + ) -> Generator[List[DictStrAny], None, None]: + yield data + + lancedb_adapter(identity_resource, no_remove_orphans=True) + + run_1 = [ + {"doc_id": 1, "chunk_hash": "a", "foo": "bar"}, + {"doc_id": 1, "chunk_hash": "b", "foo": "coo"}, + ] + info = pipeline.run(identity_resource(run_1)) + assert_load_info(info) + + run_2 = [ + {"doc_id": 1, "chunk_hash": "a", "foo": "aat"}, + {"doc_id": 1, "chunk_hash": "c", "foo": "loot"}, + ] + info = pipeline.run(identity_resource(run_2)) + assert_load_info(info) + + with pipeline.destination_client() as client: + expected_root_table_df = ( + pd.DataFrame( + data=[ + {"doc_id": 1, "chunk_hash": "a", "foo": "aat"}, + {"doc_id": 1, "chunk_hash": "b", "foo": "coo"}, + {"doc_id": 1, "chunk_hash": "c", "foo": "loot"}, + ] + ) + .sort_values(by=["doc_id", "chunk_hash", "foo"]) + .reset_index(drop=True) + ) + + root_table_name = client.make_qualified_table_name("root") # type: ignore[attr-defined] + tbl = client.db_client.open_table(root_table_name) # type: ignore[attr-defined] + + actual_root_df: DataFrame = ( + tbl.to_pandas().sort_values(by=["doc_id", "chunk_hash", "foo"]).reset_index(drop=True) + )[["doc_id", "chunk_hash", "foo"]] + + assert_frame_equal(actual_root_df, expected_root_table_df) + + +def test_must_provide_at_least_primary_key_on_merge_disposition() -> None: + """We need upsert merge's deterministic _dlt_id to perform orphan removal. + Hence, we require at least the primary key required (raises exception if missing). 
+ Specify a merge key for custom orphan identification.""" + generator_instance1 = sequence_generator() + + @dlt.resource(write_disposition={"disposition": "merge", "strategy": "upsert"}) + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + pipeline = dlt.pipeline( + pipeline_name="test_must_provide_both_primary_and_merge_key_on_merge_disposition", + destination="lancedb", + dataset_name=( + f"test_must_provide_both_primary_and_merge_key_on_merge_disposition{uniq_id()}" + ), + ) + with pytest.raises(Exception): + load_info = pipeline.run( + some_data(), + ) + assert_load_info(load_info) diff --git a/tests/load/lancedb/test_pipeline.py b/tests/load/lancedb/test_pipeline.py index 3dc2a999d4..7d320ee83c 100644 --- a/tests/load/lancedb/test_pipeline.py +++ b/tests/load/lancedb/test_pipeline.py @@ -1,25 +1,30 @@ import multiprocessing -from typing import Iterator, Generator, Any, List, Mapping +import os +from typing import Iterator, Generator, Any, List +from typing import Mapping +from typing import Union, Dict import pytest -import lancedb # type: ignore -from lancedb import DBConnection +from lancedb import DBConnection # type: ignore from lancedb.embeddings import EmbeddingFunctionRegistry # type: ignore +from lancedb.table import Table # type: ignore import dlt from dlt.common import json -from dlt.common.typing import DictStrStr, DictStrAny -from dlt.common.utils import uniq_id +from dlt.common.typing import DictStrAny +from dlt.common.typing import DictStrStr +from dlt.common.utils import uniq_id, digest128 from dlt.destinations.impl.lancedb.lancedb_adapter import ( lancedb_adapter, VECTORIZE_HINT, ) from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient -from tests.load.lancedb.utils import assert_table +from dlt.extract import DltResource +from tests.load.lancedb.utils import assert_table, chunk_document, mock_embed from tests.load.utils import sequence_generator, drop_active_pipeline_data from tests.pipeline.utils import assert_load_info -# Mark all tests as essential, do not remove. +# Mark all tests as essential, don't remove. 
pytestmark = pytest.mark.essential @@ -49,6 +54,18 @@ def some_data() -> Generator[DictStrStr, Any, None]: "x-lancedb-embed": True, } + lancedb_adapter( + some_data, + merge_key=["content"], + ) + + assert some_data.columns["content"] == { # type: ignore + "name": "content", + "data_type": "text", + "x-lancedb-embed": True, + "merge_key": True, + } + def test_basic_state_and_schema() -> None: generator_instance1 = sequence_generator() @@ -118,14 +135,13 @@ def some_data() -> Generator[DictStrStr, Any, None]: def test_explicit_append() -> None: - """Append should work even when the primary key is specified.""" data = [ {"doc_id": 1, "content": "1"}, {"doc_id": 2, "content": "2"}, {"doc_id": 3, "content": "3"}, ] - @dlt.resource(primary_key="doc_id") + @dlt.resource() def some_data() -> Generator[List[DictStrAny], Any, None]: yield data @@ -142,6 +158,7 @@ def some_data() -> Generator[List[DictStrAny], Any, None]: info = pipeline.run( some_data(), ) + assert_load_info(info) assert_table(pipeline, "some_data", items=data) @@ -156,25 +173,22 @@ def some_data() -> Generator[List[DictStrAny], Any, None]: def test_pipeline_replace() -> None: - generator_instance1 = sequence_generator() - generator_instance2 = sequence_generator() + os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "2" + os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "2" + + generator_instance1, generator_instance2 = (sequence_generator(), sequence_generator()) @dlt.resource def some_data() -> Generator[DictStrStr, Any, None]: yield from next(generator_instance1) - lancedb_adapter( - some_data, - embed=["content"], - ) - uid = uniq_id() pipeline = dlt.pipeline( pipeline_name="test_pipeline_replace", destination="lancedb", dataset_name="test_pipeline_replace_dataset" - + uid, # lancedb doesn't mandate any name normalization + + uid, # Lancedb doesn't mandate any name normalization. ) info = pipeline.run( @@ -263,23 +277,11 @@ def test_pipeline_merge() -> None: }, ] - @dlt.resource(primary_key="doc_id") + @dlt.resource(primary_key=["doc_id"]) def movies_data() -> Any: yield data - @dlt.resource(primary_key="doc_id", merge_key=["merge_id", "title"]) - def movies_data_explicit_merge_keys() -> Any: - yield data - - lancedb_adapter( - movies_data, - embed=["description"], - ) - - lancedb_adapter( - movies_data_explicit_merge_keys, - embed=["description"], - ) + lancedb_adapter(movies_data, embed=["description"], no_remove_orphans=True) pipeline = dlt.pipeline( pipeline_name="movies", @@ -288,7 +290,7 @@ def movies_data_explicit_merge_keys() -> Any: ) info = pipeline.run( movies_data(), - write_disposition="merge", + write_disposition={"disposition": "merge", "strategy": "upsert"}, dataset_name=f"MoviesDataset{uniq_id()}", ) assert_load_info(info) @@ -299,26 +301,11 @@ def movies_data_explicit_merge_keys() -> Any: info = pipeline.run( movies_data(), - write_disposition="merge", + write_disposition={"disposition": "merge", "strategy": "upsert"}, ) assert_load_info(info) assert_table(pipeline, "movies_data", items=data) - info = pipeline.run( - movies_data(), - write_disposition="merge", - ) - assert_load_info(info) - assert_table(pipeline, "movies_data", items=data) - - # Test with explicit merge keys. 
- info = pipeline.run( - movies_data_explicit_merge_keys(), - write_disposition="merge", - ) - assert_load_info(info) - assert_table(pipeline, "movies_data_explicit_merge_keys", items=data) - def test_pipeline_with_schema_evolution() -> None: data = [ @@ -388,9 +375,9 @@ def test_merge_github_nested() -> None: data = json.load(f) info = pipe.run( - lancedb_adapter(data[:17], embed=["title", "body"]), + lancedb_adapter(data[:17], embed=["title", "body"], no_remove_orphans=True), table_name="issues", - write_disposition="merge", + write_disposition={"disposition": "merge", "strategy": "upsert"}, primary_key="id", ) assert_load_info(info) @@ -426,18 +413,116 @@ def test_merge_github_nested() -> None: def test_empty_dataset_allowed() -> None: # dataset_name is optional so dataset name won't be autogenerated when not explicitly passed. pipe = dlt.pipeline(destination="lancedb", dev_mode=True) - client: LanceDBClient = pipe.destination_client() # type: ignore[assignment] assert pipe.dataset_name is None info = pipe.run(lancedb_adapter(["context", "created", "not a stop word"], embed=["value"])) # Dataset in load info is empty. assert info.dataset_name is None - client = pipe.destination_client() # type: ignore[assignment] - assert client.dataset_name is None - assert client.sentinel_table == "dltSentinelTable" + client = pipe.destination_client() + assert client.dataset_name is None # type: ignore + assert client.sentinel_table == "dltSentinelTable" # type: ignore assert_table(pipe, "content", expected_items_count=3) +def test_lancedb_remove_nested_orphaned_records_with_chunks() -> None: + @dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + table_name="document", + primary_key=["doc_id"], + merge_key=["doc_id"], + ) + def documents(docs: List[DictStrAny]) -> Generator[DictStrAny, None, None]: + for doc in docs: + doc_id = doc["doc_id"] + chunks = chunk_document(doc["text"]) + embeddings = [ + { + "chunk_hash": digest128(chunk), + "chunk_text": chunk, + "embedding": mock_embed(), + } + for chunk in chunks + ] + yield {"doc_id": doc_id, "doc_text": doc["text"], "embeddings": embeddings} + + @dlt.source(max_table_nesting=1) + def documents_source( + docs: List[DictStrAny], + ) -> Union[Generator[Dict[str, Any], None, None], DltResource]: + return documents(docs) + + pipeline = dlt.pipeline( + pipeline_name="chunked_docs", + destination="lancedb", + dataset_name="chunked_documents", + dev_mode=True, + ) + + initial_docs = [ + { + "text": ( + "This is the first document. It contains some text that will be chunked and" + " embedded. (I don't want to be seen in updated run's embedding chunk texts btw)" + ), + "doc_id": 1, + }, + { + "text": "Here's another document. It's a bit different from the first one.", + "doc_id": 2, + }, + ] + + info = pipeline.run(documents_source(initial_docs)) + assert_load_info(info) + + updated_docs = [ + { + "text": "This is the first document, but it has been updated with new content.", + "doc_id": 1, + }, + { + "text": "This is a completely new document that wasn't in the initial set.", + "doc_id": 3, + }, + ] + + info = pipeline.run(documents_source(updated_docs)) + assert_load_info(info) + + with pipeline.destination_client() as client: + # Orphaned chunks/documents must have been discarded. + # Shouldn't contain any text from `initial_docs' where doc_id=1. + expected_text = { + "Here's ano", + "ther docum", + "ent. 
It's ", + "a bit diff", + "erent from", + " the first", + " one.", + "This is th", + "e first do", + "cument, bu", + "t it has b", + "een update", + "d with new", + " content.", + "This is a ", + "completely", + " new docum", + "ent that w", + "asn't in t", + "he initial", + " set.", + } + + embeddings_table_name = client.make_qualified_table_name("document__embeddings") # type: ignore[attr-defined] + + tbl: Table = client.db_client.open_table(embeddings_table_name) # type: ignore[attr-defined] + df = tbl.to_pandas() + assert set(df["chunk_text"]) == expected_text + + search_data = [ {"text": "Frodo was a happy puppy"}, {"text": "There are several kittens playing"}, diff --git a/tests/load/lancedb/test_utils.py b/tests/load/lancedb/test_utils.py new file mode 100644 index 0000000000..2f517aac8e --- /dev/null +++ b/tests/load/lancedb/test_utils.py @@ -0,0 +1,32 @@ +import pyarrow as pa +import pytest + +from dlt.destinations.impl.lancedb.utils import fill_empty_source_column_values_with_placeholder + + +# Mark all tests as essential, don't remove. +pytestmark = pytest.mark.essential + + +def test_fill_empty_source_column_values_with_placeholder() -> None: + data = [ + pa.array(["", "hello", ""]), + pa.array(["hello", None, ""]), + pa.array([1, 2, 3]), + pa.array(["world", "", "arrow"]), + ] + table = pa.Table.from_arrays(data, names=["A", "B", "C", "D"]) + + source_columns = ["A", "B"] + placeholder = "placeholder" + + new_table = fill_empty_source_column_values_with_placeholder(table, source_columns, placeholder) + + expected_data = [ + pa.array(["placeholder", "hello", "placeholder"]), + pa.array(["hello", "placeholder", "placeholder"]), + pa.array([1, 2, 3]), + pa.array(["world", "", "arrow"]), + ] + expected_table = pa.Table.from_arrays(expected_data, names=["A", "B", "C", "D"]) + assert new_table.equals(expected_table) diff --git a/tests/load/lancedb/utils.py b/tests/load/lancedb/utils.py index 7431e895b7..30430fe076 100644 --- a/tests/load/lancedb/utils.py +++ b/tests/load/lancedb/utils.py @@ -40,7 +40,7 @@ def assert_table( exists = client.table_exists(qualified_table_name) assert exists - records = client.db_client.open_table(qualified_table_name).search().limit(50).to_list() + records = client.db_client.open_table(qualified_table_name).search().limit(0).to_list() if expected_items_count is not None: assert expected_items_count == len(records) @@ -51,7 +51,6 @@ def assert_table( drop_keys = [ "_dlt_id", "_dlt_load_id", - dlt.config.get("destination.lancedb.credentials.id_field_name", str) or "id__", dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector", ] objects_without_dlt_or_special_keys = [ @@ -72,3 +71,13 @@ def generate_embeddings( def ndims(self) -> int: return 2 + + +def mock_embed( + dim: int = 10, +) -> str: + return str(np.random.random_sample(dim)) + + +def chunk_document(doc: str, chunk_size: int = 10) -> List[str]: + return [doc[i : i + chunk_size] for i in range(0, len(doc), chunk_size)] diff --git a/tests/load/utils.py b/tests/load/utils.py index 19601f2cf1..2abd37ce9e 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -989,7 +989,7 @@ def prepare_load_package( def sequence_generator() -> Generator[List[Dict[str, str]], None, None]: count = 1 while True: - yield [{"content": str(count + i)} for i in range(3)] + yield [{"content": str(count + i)} for i in range(2000)] count += 3