feat: add a python API for reading arrays of binary/large_binary as file objects (#2944)

Many APIs for working with binary data expect file objects. This PR adds
a small utility to convert a binary array into an iterable of file
objects.

This will get more sophisticated in future releases, where the input can
be a column or row IDs instead of a binary array, allowing for a
scan-on-demand workflow that will be more memory efficient for extremely
large blobs.
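
To make the intended usage concrete, here is a minimal sketch (illustrative only, not part of the commit) that wraps an in-memory pyarrow binary array with BlobColumn and reads each element as a file object; the array contents and variable names are invented for the example:

import pyarrow as pa

from lance import BlobColumn

# Build a small large_binary array standing in for real blob data.
arr = pa.array([b"first blob", b"second blob"], type=pa.large_binary())

for fobj in BlobColumn(arr):
    # Each element arrives as a readable, seekable binary file object
    # (an io.BytesIO), so it can be handed to APIs expecting file-like input.
    print(fobj.read(5))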
westonpace authored Oct 3, 2024
1 parent a386edd commit 19d5a82
Showing 3 changed files with 68 additions and 2 deletions.
6 changes: 4 additions & 2 deletions python/python/lance/__init__.py
@@ -5,6 +5,7 @@

from typing import TYPE_CHECKING, Dict, Optional, Union

from .blob import BlobColumn
from .dataset import (
LanceDataset,
LanceOperation,
@@ -29,7 +30,10 @@


__all__ = [
"BlobColumn",
"FragmentMetadata",
"LanceDataset",
"LanceFragment",
"LanceOperation",
"LanceScanner",
"MergeInsertBuilder",
@@ -38,8 +42,6 @@
"schema_to_json",
"json_to_schema",
"dataset",
"FragmentMetadata",
"LanceFragment",
"batch_udf",
]

37 changes: 37 additions & 0 deletions python/python/lance/blob.py
@@ -0,0 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import io
from typing import IO, Iterator, Optional, Union

import pyarrow as pa


class BlobIterator:
    def __init__(self, binary_iter: Iterator[pa.BinaryScalar]):
        self.binary_iter = binary_iter

    def __next__(self) -> Optional[IO[bytes]]:
        value = next(self.binary_iter)
        if value is None:
            return None
        return io.BytesIO(value.as_py())


class BlobColumn:
    def __init__(self, blob_column: Union[pa.Array, pa.ChunkedArray]):
        if not isinstance(blob_column, (pa.Array, pa.ChunkedArray)):
            raise ValueError(
                "Expected a pyarrow.Array or pyarrow.ChunkedArray, "
                f"got {type(blob_column)}"
            )

        if not pa.types.is_large_binary(blob_column.type) and not pa.types.is_binary(
            blob_column.type
        ):
            raise ValueError(f"Expected a binary array, got {blob_column.type}")

        self.blob_column = blob_column

    def __iter__(self) -> Iterator[IO[bytes]]:
        return BlobIterator(iter(self.blob_column))
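
As a hedged sketch of how this utility might be combined with a Lance dataset (the dataset path and column name below are assumptions, not part of this change), one could materialize a binary column and stream each value through a consumer that expects file objects; note that, per the commit message, the whole column is read into memory today:

import hashlib

import lance
from lance import BlobColumn

# Assumed example dataset with a binary column named "payload".
ds = lance.dataset("./blobs.lance")
table = ds.to_table(columns=["payload"])  # materializes the column in memory

for fobj in BlobColumn(table["payload"]):
    # Consume each blob as a file object, reading in 1 MiB chunks.
    digest = hashlib.sha256()
    while chunk := fobj.read(1024 * 1024):
        digest.update(chunk)
    print(digest.hexdigest())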
27 changes: 27 additions & 0 deletions python/python/tests/test_blob.py
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import pyarrow as pa
import pytest
from lance import BlobColumn


def test_blob_read_from_binary():
    values = [b"foo", b"bar", b"baz"]
    data = pa.table(
        {
            "bin": pa.array(values, type=pa.binary()),
            "largebin": pa.array(values, type=pa.large_binary()),
        }
    )

    for col_name in ["bin", "largebin"]:
        blobs = BlobColumn(data.column(col_name))
        for i, f in enumerate(blobs):
            assert f.read() in values[i]


def test_blob_reject_invalid_col():
    values = pa.array([1, 2, 3])
    with pytest.raises(ValueError, match="Expected a binary array"):
        BlobColumn(values)
