disable_null_filter -> filter_nan

lancedb · Oct 10, 2024 · 82672fa · 82672fa
1 parent 46031e6
commit 82672fa
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 13 deletions.
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -1447,7 +1447,7 @@ def create_index(
         ivf_centroids_file: Optional[str] = None,
         precomputed_partition_dataset: Optional[str] = None,
         storage_options: Optional[Dict[str, str]] = None,
-        disable_null_filter: bool = False,
+        filter_nan: bool = True,
         **kwargs,
     ) -> LanceDataset:
         """Create index on column.
@@ -1504,10 +1504,10 @@ def create_index(
         storage_options : optional, dict
             Extra options that make sense for a particular storage connection. This is
             used to store connection parameters like credentials, endpoint, etc.
-        disable_null_filter: bool
-            Defaults to false. True is UNSAFE. Disable the null filter used for nullable
-            columns. Will crash if any nulls are present in the column. Obtains a small
-            speed boost.
+        filter_nan: bool
+            Defaults to True. When True, a null filter is applied to nullable
+            columns. False is UNSAFE: it disables that filter for a small speed
+            boost, and will cause a crash if any null/nan values are present.
         kwargs :
             Parameters passed to the index building process.
 
@@ -1707,7 +1707,7 @@ def create_index(
                     num_partitions,
                     metric,
                     accelerator,
-                    disable_null_filter,
+                    filter_nan=filter_nan,
                 )
                 timers["ivf_train:end"] = time.time()
                 ivf_train_time = timers["ivf_train:end"] - timers["ivf_train:start"]
@@ -1723,7 +1723,7 @@ def create_index(
                     kmeans,
                     batch_size=20480,
                     num_sub_vectors=num_sub_vectors_cur,
-                    disable_null_filter=disable_null_filter,
+                    filter_nan=filter_nan,
                 )
                 timers["ivf_assign:end"] = time.time()
                 ivf_assign_time = timers["ivf_assign:end"] - timers["ivf_assign:start"]

diff --git a/python/python/lance/vector.py b/python/python/lance/vector.py
@@ -201,7 +201,7 @@ def train_ivf_centroids_on_accelerator(
     *,
     sample_rate: int = 256,
     max_iters: int = 50,
-    disable_null_filter: bool = False,
+    filter_nan: bool = True,
 ) -> (np.ndarray, Any):
     """Use accelerator (GPU or MPS) to train kmeans."""
     if isinstance(accelerator, str) and (
@@ -220,7 +220,7 @@ def train_ivf_centroids_on_accelerator(
 
     k = int(k)
 
-    if dataset.schema.field(column).nullable and not disable_null_filter:
+    if dataset.schema.field(column).nullable and filter_nan:
         filt = f"{column} is not null"
     else:
         filt = None
@@ -417,7 +417,7 @@ def compute_partitions(
     dst_dataset_uri: Optional[Union[str, Path]] = None,
     allow_cuda_tf32: bool = True,
     num_sub_vectors: Optional[int] = None,
-    disable_null_filter: bool = False,
+    filter_nan: bool = True,
 ) -> str:
     """Compute partitions for each row using GPU kmeans and spill to disk.
 
@@ -446,9 +446,11 @@ def compute_partitions(
 
     num_rows = dataset.count_rows()
 
-    null_filter = f"{column} is not null"
-    if disable_null_filter:
-        null_filter = None
+    if dataset.schema.field(column).nullable and filter_nan:
+        filt = f"{column} is not null"
+    else:
+        filt = None
+
     torch_ds = TorchDataset(
         dataset,
         batch_size=batch_size,