Skip to content

Commit

Permalink
feat: support large types for image ext types (#1944)
Browse files Browse the repository at this point in the history
Closes #1942
  • Loading branch information
wjones127 authored Feb 13, 2024
1 parent fd3bf5f commit 4873151
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 41 deletions.
81 changes: 56 additions & 25 deletions python/python/lance/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,29 @@
]


def _is_pyarrow_string_type(t: pa.DataType) -> bool:
    """Report whether ``t`` is an Arrow ``string`` or ``large_string`` type."""
    # TODO: allow string_view once available?
    if pa.types.is_string(t):
        return True
    return pa.types.is_large_string(t)


def _is_pyarrow_binary_type(t: pa.DataType) -> bool:
    """Report whether ``t`` is an Arrow ``binary`` or ``large_binary`` type."""
    # TODO: allow binary_view once available?
    if pa.types.is_binary(t):
        return True
    return pa.types.is_large_binary(t)


class ImageURIType(pa.ExtensionType):
def __init__(self):
pa.ExtensionType.__init__(self, pa.string(), "lance.arrow.image_uri")
def __init__(self, storage_type: pa.DataType = pa.string()):
# TODO: allow string_view once available?
if not _is_pyarrow_string_type(storage_type):
raise ValueError("storage_type must be a string type")
pa.ExtensionType.__init__(self, storage_type, "lance.arrow.image_uri")

def __arrow_ext_serialize__(self):
return b""

@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
return ImageURIType()
return ImageURIType(storage_type)

def __arrow_ext_class__(self):
return ImageURIArray
Expand All @@ -62,16 +75,18 @@ def __arrow_ext_scalar_class__(self):


class EncodedImageType(pa.ExtensionType):
def __init__(self):
def __init__(self, storage_type: pa.DataType = pa.binary()):
# TODO: use pa.BinaryView once available?
pa.ExtensionType.__init__(self, pa.binary(), "lance.arrow.encoded_image")
if not _is_pyarrow_binary_type(storage_type):
raise ValueError("storage_type must be a binary type")
pa.ExtensionType.__init__(self, storage_type, "lance.arrow.encoded_image")

def __arrow_ext_serialize__(self):
return b""

@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
return EncodedImageType()
return EncodedImageType(storage_type)

def __arrow_ext_class__(self):
return EncodedImageArray
Expand All @@ -81,7 +96,7 @@ def __arrow_ext_scalar_class__(self):


class FixedShapeImageTensorType(pa.ExtensionType):
def __init__(self, arrow_type, shape):
def __init__(self, arrow_type: pa.DataType, shape):
self.shape = shape
self.arrow_type = arrow_type
assert len(shape) > 0
Expand Down Expand Up @@ -140,10 +155,10 @@ def from_array(cls, images):
Union[ImageURIArray, EncodedImageArray, FixedShapeImageTensorArray]
"""

if isinstance(images, pa.StringArray):
return pa.ExtensionArray.from_storage(ImageURIType(), images)
elif isinstance(images, pa.BinaryArray):
return pa.ExtensionArray.from_storage(EncodedImageType(), images)
if isinstance(images, (pa.StringArray, pa.LargeStringArray)):
return pa.ExtensionArray.from_storage(ImageURIType(images.type), images)
elif isinstance(images, (pa.BinaryArray, pa.LargeBinaryArray)):
return pa.ExtensionArray.from_storage(EncodedImageType(images.type), images)
elif isinstance(images, pa.FixedShapeTensorArray):
shape = images.type.shape
value_type = images.type.value_type
Expand All @@ -169,13 +184,16 @@ class ImageURIArray(ImageArray):
"""

@classmethod
def from_uris(cls, uris: Union[pa.StringArray, Iterable[Union[str, Path]]]):
def from_uris(
cls,
uris: Union[pa.StringArray, pa.LargeStringArray, Iterable[Union[str, Path]]],
):
"""
Create an ImageURIArray from a pa.StringArray or an iterable.
Create an ImageURIArray from an array or iterable of URIs (such as a list).
Parameters
----------
uris : Union[pa.StringArray, Iterable[Union[str, Path]]]
uris : Union[pa.StringArray, pa.LargeStringArray, Iterable[Union[str, Path]]]
Returns
-------
Expand All @@ -189,18 +207,25 @@ def from_uris(cls, uris: Union[pa.StringArray, Iterable[Union[str, Path]]]):
<lance.arrow.ImageURIArray object at 0x...>
['file::///tmp/1.png']
"""

if not isinstance(uris, pa.StringArray):
if isinstance(uris, (pa.StringArray, pa.LargeStringArray)):
pass
elif isinstance(uris, Iterable):
uris = pa.array((str(uri) for uri in uris), type=pa.string())
else:
raise TypeError("Cannot build a ImageURIArray from {}".format(type(uris)))

return cls.from_storage(ImageURIType(), uris)
return cls.from_storage(ImageURIType(uris.type), uris)

def read_uris(self):
def read_uris(self, storage_type=pa.binary()) -> "EncodedImageArray":
"""
Read the images from the URIs into memory and return an EncodedImageArray
Parameters
----------
storage_type : pa.DataType, optional
The storage type to use for the encoded images. Default is pa.binary().
To support arrays with more than 2GiB of data, use pa.large_binary().
Returns
-------
EncodedImageArray
Expand Down Expand Up @@ -244,7 +269,7 @@ def download(url):
images.append(f.read())

return EncodedImageArray.from_storage(
EncodedImageType(), pa.array(images, type=pa.binary())
EncodedImageType(storage_type), pa.array(images, type=storage_type)
)


Expand Down Expand Up @@ -292,15 +317,18 @@ def tensorflow_metadata_decoder(images):
)

def to_tensor(
self, decoder: Optional[Callable[[pa.BinaryArray], np.ndarray]] = None
self,
decoder: Optional[
Callable[[Union[pa.BinaryArray, pa.LargeBinaryArray]], np.ndarray]
] = None,
):
"""
Decode encoded images and return a FixedShapeImageTensorArray
Parameters
----------
decoder : Callable[pa.binary()], optional
A function that takes a pa.binary() and returns a numpy.ndarray
A function that takes a binary array and returns a numpy.ndarray
or pa.fixed_shape_tensor. If not provided, will attempt to use
tensorflow and then pillow decoder in that order.
Expand Down Expand Up @@ -401,14 +429,17 @@ def to_numpy(self):
tensor_array = pa.ExtensionArray.from_storage(ext_type, self.storage)
return tensor_array.to_numpy_ndarray()

def to_encoded(self, encoder=None):
def to_encoded(self, encoder=None, storage_type=pa.binary()) -> "EncodedImageArray":
"""
Encode FixedShapeImageTensorArray to PNG bytes and return an EncodedImageArray.
Parameters
----------
encoder : Callable[np.ndarray], optional
An encoder function that takes a numpy.ndarray and returns an encoded image.
storage_type : pa.DataType, optional
The storage type to use for the encoded images. Default is pa.binary().
To support arrays with more than 2GiB of data, use pa.large_binary().
Returns
-------
Expand Down Expand Up @@ -439,15 +470,15 @@ def pillow_encoder(x):
with io.BytesIO() as buf:
Image.fromarray(y).save(buf, format="PNG")
encoded_images.append(buf.getvalue())
return pa.array(encoded_images, type=pa.binary())
return pa.array(encoded_images, type=storage_type)

def tensorflow_encoder(x):
import tensorflow as tf

encoded_images = (
tf.io.encode_png(y).numpy() for y in tf.convert_to_tensor(x)
)
return pa.array(encoded_images, type=pa.binary())
return pa.array(encoded_images, type=storage_type)

if not encoder:
encoders = (
Expand All @@ -468,7 +499,7 @@ def tensorflow_encoder(x):
)

return EncodedImageArray.from_storage(
EncodedImageType(), encoder(self.to_numpy())
EncodedImageType(storage_type), encoder(self.to_numpy())
)


Expand Down
43 changes: 27 additions & 16 deletions python/python/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
from pathlib import Path

Expand Down Expand Up @@ -198,31 +199,41 @@ def test_roundtrip_take_ext_types(tmp_path: Path):
]


def test_image_arrays(tmp_path: Path):
import os
from pathlib import Path

tf = pytest.importorskip("tensorflow")

n = 10
png_uris = [
"file://" + os.path.join(os.path.dirname(__file__), "images/1.png"),
] * n

@pytest.fixture
def png_uris():
    """Provide ten copies of the ``file://`` URI of the ``images/1.png`` test image."""
    image_path = os.path.join(os.path.dirname(__file__), "images/1.png")
    uris = ["file://" + image_path] * 10
    if os.name == "nt":
        # On Windows every entry is round-tripped through pathlib.Path.
        uris = [str(Path(uri)) for uri in uris]
    return uris


def test_image_uri_arrays(tmp_path: Path, png_uris):
    """Check URI and encoded image arrays for regular and large storage types.

    Parameters
    ----------
    tmp_path : Path
        pytest temporary directory fixture (unused by the assertions below).
    png_uris : list of str
        URIs supplied by the ``png_uris`` fixture.
    """
    # ImageURIArray must accept a plain list as well as string/large_string arrays.
    from_list = ImageURIArray.from_uris(png_uris)
    from_pyarrow = ImageURIArray.from_uris(pa.array(png_uris, pa.string()))
    from_pyarrow_large = ImageURIArray.from_uris(pa.array(png_uris, pa.large_string()))
    for arr in [from_list, from_pyarrow, from_pyarrow_large]:
        assert arr.to_pylist() == png_uris

    assert ImageArray.from_array(png_uris).to_pylist() == png_uris
    assert (
        ImageArray.from_array(pa.array(png_uris, pa.string())).to_pylist() == png_uris
    )
    image_array = ImageURIArray.from_uris(png_uris)
    assert ImageArray.from_array(image_array) == image_array

    encoded_image_array = image_array.read_uris()
    # Compare against the fixture length: no local ``n`` exists in this test.
    assert len(ImageArray.from_array(encoded_image_array.storage)) == len(png_uris)
    assert ImageArray.from_array(encoded_image_array) == encoded_image_array

    # Casting the storage to large_binary must still yield the same image bytes.
    large_array = encoded_image_array.storage.cast(pa.large_binary())
    large_array = ImageArray.from_array(large_array)
    assert large_array.to_pylist() == encoded_image_array.to_pylist()


def test_image_tensor_arrays(tmp_path: Path, png_uris):
tf = pytest.importorskip("tensorflow")

n = 10

encoded_image_array = ImageURIArray.from_uris(png_uris).read_uris()

tensor_image_array = encoded_image_array.to_tensor()
fixed_shape_tensor_array = pa.ExtensionArray.from_storage(
pa.fixed_shape_tensor(
Expand Down

0 comments on commit 4873151

Please sign in to comment.