Skip to content

Commit

Permalink
disable_null_filter -> filter_nan
Browse files Browse the repository at this point in the history
  • Loading branch information
jacketsj committed Oct 10, 2024
1 parent 46031e6 commit 82672fa
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 13 deletions.
14 changes: 7 additions & 7 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1447,7 +1447,7 @@ def create_index(
ivf_centroids_file: Optional[str] = None,
precomputed_partition_dataset: Optional[str] = None,
storage_options: Optional[Dict[str, str]] = None,
disable_null_filter: bool = False,
filter_nan: bool = True,
**kwargs,
) -> LanceDataset:
"""Create index on column.
Expand Down Expand Up @@ -1504,10 +1504,10 @@ def create_index(
storage_options : optional, dict
Extra options that make sense for a particular storage connection. This is
used to store connection parameters like credentials, endpoint, etc.
disable_null_filter: bool
Defaults to false. True is UNSAFE. Disable the null filter used for nullable
columns. Will crash if any nulls are present in the column. Obtains a small
speed boost.
filter_nan: bool
Defaults to True. When True, null/nan values in a nullable column are filtered
out. Setting this to False is UNSAFE: it disables that null filter for a small
speed boost, but will cause a crash if any null/nan values are present.
kwargs :
Parameters passed to the index building process.
Expand Down Expand Up @@ -1707,7 +1707,7 @@ def create_index(
num_partitions,
metric,
accelerator,
disable_null_filter,
filter_nan=filter_nan,
)
timers["ivf_train:end"] = time.time()
ivf_train_time = timers["ivf_train:end"] - timers["ivf_train:start"]
Expand All @@ -1723,7 +1723,7 @@ def create_index(
kmeans,
batch_size=20480,
num_sub_vectors=num_sub_vectors_cur,
disable_null_filter=disable_null_filter,
filter_nan=filter_nan,
)
timers["ivf_assign:end"] = time.time()
ivf_assign_time = timers["ivf_assign:end"] - timers["ivf_assign:start"]
Expand Down
14 changes: 8 additions & 6 deletions python/python/lance/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def train_ivf_centroids_on_accelerator(
*,
sample_rate: int = 256,
max_iters: int = 50,
disable_null_filter: bool = False,
filter_nan: bool = True,
) -> (np.ndarray, Any):
"""Use accelerator (GPU or MPS) to train kmeans."""
if isinstance(accelerator, str) and (
Expand All @@ -220,7 +220,7 @@ def train_ivf_centroids_on_accelerator(

k = int(k)

if dataset.schema.field(column).nullable and not disable_null_filter:
if dataset.schema.field(column).nullable and filter_nan:
filt = f"{column} is not null"
else:
filt = None
Expand Down Expand Up @@ -417,7 +417,7 @@ def compute_partitions(
dst_dataset_uri: Optional[Union[str, Path]] = None,
allow_cuda_tf32: bool = True,
num_sub_vectors: Optional[int] = None,
disable_null_filter: bool = False,
filter_nan: bool = True,
) -> str:
"""Compute partitions for each row using GPU kmeans and spill to disk.
Expand Down Expand Up @@ -446,9 +446,11 @@ def compute_partitions(

num_rows = dataset.count_rows()

null_filter = f"{column} is not null"
if disable_null_filter:
null_filter = None
if dataset.schema.field(column).nullable and filter_nan:
filt = f"{column} is not null"
else:
filt = None

torch_ds = TorchDataset(
dataset,
batch_size=batch_size,
Expand Down

0 comments on commit 82672fa

Please sign in to comment.