feat: add a python API for reading arrays of binary/large_binary as file objects (#2944)

Many APIs for working with binary data expect file objects. This PR adds
a small utility to convert a binary array into an iterable of file
objects.

This will get more sophisticated in future releases, where the input can
be a column or row IDs instead of a binary array, allowing for a
scan-on-demand workflow that will be more memory efficient for extremely
large blobs.
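
To make the intended usage concrete, here is a minimal sketch (illustrative only, not part of the commit) that wraps an in-memory pyarrow binary array with BlobColumn and reads each element as a file object; the array contents and variable names are invented for the example:

import pyarrow as pa

from lance import BlobColumn

# Build a small large_binary array standing in for real blob data.
arr = pa.array([b"first blob", b"second blob"], type=pa.large_binary())

for fobj in BlobColumn(arr):
    # Each element arrives as a readable, seekable binary file object
    # (an io.BytesIO), so it can be handed to APIs expecting file-like input.
    print(fobj.read(5))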
westonpace authored Oct 3, 2024
1 parent a386edd commit 19d5a82
Showing 3 changed files with 68 additions and 2 deletions.
6 changes: 4 additions & 2 deletions python/python/lance/__init__.py
@@ -5,6 +5,7 @@

from typing import TYPE_CHECKING, Dict, Optional, Union

from .blob import BlobColumn
from .dataset import (
LanceDataset,
LanceOperation,
@@ -29,7 +30,10 @@


__all__ = [
"BlobColumn",
"FragmentMetadata",
"LanceDataset",
"LanceFragment",
"LanceOperation",
"LanceScanner",
"MergeInsertBuilder",
@@ -38,8 +42,6 @@
"schema_to_json",
"json_to_schema",
"dataset",
"FragmentMetadata",
"LanceFragment",
"batch_udf",
]

37 changes: 37 additions & 0 deletions python/python/lance/blob.py
@@ -0,0 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import io
from typing import IO, Iterator, Optional, Union

import pyarrow as pa


class BlobIterator:
    def __init__(self, binary_iter: Iterator[pa.BinaryScalar]):
        self.binary_iter = binary_iter

    def __next__(self) -> Optional[IO[bytes]]:
        value = next(self.binary_iter)
        if value is None:
            return None
        return io.BytesIO(value.as_py())


class BlobColumn:
    def __init__(self, blob_column: Union[pa.Array, pa.ChunkedArray]):
        if not isinstance(blob_column, (pa.Array, pa.ChunkedArray)):
            raise ValueError(
                "Expected a pyarrow.Array or pyarrow.ChunkedArray, "
                f"got {type(blob_column)}"
            )

        if not pa.types.is_large_binary(blob_column.type) and not pa.types.is_binary(
            blob_column.type
        ):
            raise ValueError(f"Expected a binary array, got {blob_column.type}")

        self.blob_column = blob_column

    def __iter__(self) -> Iterator[IO[bytes]]:
        return BlobIterator(iter(self.blob_column))
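
As a hedged sketch of how this utility might be combined with a Lance dataset (the dataset path and column name below are assumptions, not part of this change), one could materialize a binary column and stream each value through a consumer that expects file objects; note that, per the commit message, the whole column is read into memory today:

import hashlib

import lance
from lance import BlobColumn

# Assumed example dataset with a binary column named "payload".
ds = lance.dataset("./blobs.lance")
table = ds.to_table(columns=["payload"])  # materializes the column in memory

for fobj in BlobColumn(table["payload"]):
    # Consume each blob as a file object, reading in 1 MiB chunks.
    digest = hashlib.sha256()
    while chunk := fobj.read(1024 * 1024):
        digest.update(chunk)
    print(digest.hexdigest())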
27 changes: 27 additions & 0 deletions python/python/tests/test_blob.py
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import pyarrow as pa
import pytest
from lance import BlobColumn


def test_blob_read_from_binary():
    values = [b"foo", b"bar", b"baz"]
    data = pa.table(
        {
            "bin": pa.array(values, type=pa.binary()),
            "largebin": pa.array(values, type=pa.large_binary()),
        }
    )

    for col_name in ["bin", "largebin"]:
        blobs = BlobColumn(data.column(col_name))
        for i, f in enumerate(blobs):
            assert f.read() in values[i]


def test_blob_reject_invalid_col():
    values = pa.array([1, 2, 3])
    with pytest.raises(ValueError, match="Expected a binary array"):
        BlobColumn(values)
