Skip to content

Commit

Permalink
[v3] Array.append (#2413)
Browse files Browse the repository at this point in the history
* feature(array): implement Array.append

changes the Array.resize to be an inplace operation

* better error message

* no more warn

* style: pre-commit fixes

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
jhamman and pre-commit-ci[bot] authored Oct 23, 2024
1 parent 8a33df7 commit 6ce0526
Show file tree
Hide file tree
Showing 3 changed files with 302 additions and 19 deletions.
130 changes: 112 additions & 18 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
from asyncio import gather
from dataclasses import dataclass, field, replace
from dataclasses import dataclass, field
from itertools import starmap
from logging import getLogger
from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
Expand Down Expand Up @@ -1104,15 +1104,15 @@ async def setitem(
)
return await self._set_selection(indexer, value, prototype=prototype)

async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self:
async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None:
new_shape = parse_shapelike(new_shape)
assert len(new_shape) == len(self.metadata.shape)
new_metadata = self.metadata.update_shape(new_shape)

# Remove all chunks outside of the new shape
old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))

if delete_outside_chunks:
# Remove all chunks outside of the new shape
old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))

async def _delete_key(key: str) -> None:
await (self.store_path / key).delete()
Expand All @@ -1128,7 +1128,63 @@ async def _delete_key(key: str) -> None:

# Write new metadata
await self._save_metadata(new_metadata)
return replace(self, metadata=new_metadata)

# Update metadata (in place)
object.__setattr__(self, "metadata", new_metadata)

async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
    """Append `data` to `axis`.

    The array is resized in place along `axis` and `data` is written into
    the newly created region.

    Parameters
    ----------
    data : array-like
        Data to be appended. Objects without a ``shape`` attribute are
        converted with ``np.asanyarray``.
    axis : int
        Axis along which to append. Negative values count from the last
        axis.

    Returns
    -------
    new_shape : tuple
        Shape of the array after the append.

    Raises
    ------
    ValueError
        If `axis` is out of bounds for this array, or if `data` does not
        have the same number of dimensions and the same size as this array
        in every dimension other than `axis`.

    Notes
    -----
    The size of all dimensions other than `axis` must match between this
    array and `data`.
    """
    # ensure data is array-like
    if not hasattr(data, "shape"):
        data = np.asanyarray(data)

    # remember old shape; it is needed after the resize to locate the new region
    old_shape = tuple(self.shape)
    data_shape = tuple(data.shape)
    ndim = len(old_shape)

    # Normalise the axis up front. Without this, a negative or out-of-range
    # axis matches no dimension: the array would not grow and the existing
    # contents would be silently overwritten by `data`.
    if not -ndim <= axis < ndim:
        raise ValueError(f"axis {axis} is out of bounds for array of dimension {ndim}")
    if axis < 0:
        axis += ndim

    self_shape_preserved = old_shape[:axis] + old_shape[axis + 1 :]
    data_shape_preserved = data_shape[:axis] + data_shape[axis + 1 :]
    if len(data_shape) != ndim or self_shape_preserved != data_shape_preserved:
        raise ValueError(
            "shape of data to append is not compatible with the array. "
            f"The shape of the data is {data_shape} "
            f"and the shape of the array is {old_shape}. "
            "All dimensions must match except for the dimension being "
            f"appended (axis {axis})."
        )

    # determine new shape: only `axis` grows, by the extent of `data` along it
    new_shape = (
        old_shape[:axis] + (old_shape[axis] + data_shape[axis],) + old_shape[axis + 1 :]
    )

    # resize
    await self.resize(new_shape)

    # store data in the region that the resize just added
    append_selection = tuple(
        slice(old_shape[i], new_shape[i]) if i == axis else slice(None) for i in range(ndim)
    )
    await self.setitem(append_selection, data)

    return new_shape

async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
# metadata.attributes is "frozen" so we simply clear and update the dict
Expand All @@ -1147,7 +1203,8 @@ async def info(self) -> None:
raise NotImplementedError


@dataclass(frozen=True)
# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
@dataclass(frozen=False)
class Array:
"""Instantiate an array from an initialized store."""

Expand Down Expand Up @@ -1297,6 +1354,11 @@ def shape(self) -> ChunkCoords:
"""
return self._async_array.shape

@shape.setter
def shape(self, value: ChunkCoords) -> None:
    """Set the shape of the array.

    Assigning to ``shape`` delegates to :meth:`resize` with `value` as the
    new shape.
    """
    self.resize(new_shape=value)

@property
def chunks(self) -> ChunkCoords:
"""Returns a tuple of integers describing the length of each dimension of a chunk of the array.
Expand Down Expand Up @@ -2754,18 +2816,18 @@ def blocks(self) -> BlockIndex:
:func:`set_block_selection` for documentation and examples."""
return BlockIndex(self)

def resize(self, new_shape: ChunkCoords) -> Array:
def resize(self, new_shape: ShapeLike) -> None:
"""
Change the shape of the array by growing or shrinking one or more
dimensions.
This method does not modify the original Array object. Instead, it returns a new Array
with the specified shape.
Parameters
----------
new_shape : tuple
New shape of the array.
Notes
-----
When resizing an array, the data are not rearranged in any way.
If one or more dimensions are shrunk, any chunks falling outside the
new array shape will be deleted from the underlying store.
However, it is noteworthy that the chunks partially falling inside the new array
Expand All @@ -2778,7 +2840,6 @@ def resize(self, new_shape: ChunkCoords) -> Array:
>>> import zarr
>>> z = zarr.zeros(shape=(10000, 10000),
>>> chunk_shape=(1000, 1000),
>>> store=StorePath(MemoryStore(mode="w")),
>>> dtype="i4",)
>>> z.shape
(10000, 10000)
Expand All @@ -2791,10 +2852,43 @@ def resize(self, new_shape: ChunkCoords) -> Array:
>>> z2.shape
(50, 50)
"""
resized = sync(self._async_array.resize(new_shape))
# TODO: remove this cast when type inference improves
_resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized)
return type(self)(_resized)
sync(self._async_array.resize(new_shape))

def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
    """Append `data` to `axis`.

    Synchronous wrapper: the work is delegated to the ``append`` method of
    the underlying async array and run to completion with ``sync``.

    Parameters
    ----------
    data : array-like
        Data to be appended.
    axis : int
        Axis along which to append.

    Returns
    -------
    new_shape : tuple

    Notes
    -----
    The size of all dimensions other than `axis` must match between this
    array and `data`.

    Examples
    --------
    >>> import numpy as np
    >>> import zarr
    >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
    >>> z = zarr.array(a, chunks=(1000, 100))
    >>> z.shape
    (10000, 1000)
    >>> z.append(a)
    (20000, 1000)
    >>> z.append(np.vstack([a, a]), axis=1)
    (20000, 2000)
    >>> z.shape
    (20000, 2000)
    """
    pending = self._async_array.append(data, axis=axis)
    return sync(pending)

def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
# TODO: remove this cast when type inference improves
Expand Down
188 changes: 188 additions & 0 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,194 @@ def test_update_attrs(zarr_format: int) -> None:
assert arr2.attrs["foo"] == "bar"


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_resize_1d(store: MemoryStore, zarr_format: int) -> None:
    """Grow and shrink a 1-D array in place, then resize it via the shape setter."""
    i4 = np.dtype("i4")
    source = np.arange(105, dtype="i4")
    arr = zarr.create(
        shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format
    )
    arr[:] = source

    def check_layout(expected_shape: tuple[int, ...]) -> None:
        # Shape and dtype must agree for both the array and a full read of it;
        # the chunk shape must never change across resizes.
        assert expected_shape == arr.shape
        assert expected_shape == arr[:].shape
        assert i4 == arr.dtype
        assert i4 == arr[:].dtype
        assert (10,) == arr.chunks

    # freshly written array
    check_layout((105,))
    np.testing.assert_array_equal(source, arr[:])

    # grow: the original data is kept, the new region reads as fill value
    arr.resize(205)
    check_layout((205,))
    np.testing.assert_array_equal(source, arr[:105])
    np.testing.assert_array_equal(np.zeros(100, dtype="i4"), arr[105:])

    # shrink: only the leading part of the original data remains
    arr.resize(55)
    check_layout((55,))
    np.testing.assert_array_equal(source[:55], arr[:])

    # via shape setter
    target = (105,)
    arr.shape = target
    assert target == arr.shape
    assert target == arr[:].shape


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_resize_2d(store: MemoryStore, zarr_format: int) -> None:
    """Grow and shrink a 2-D array in place, then resize it via the shape setter."""
    i4 = np.dtype("i4")
    source = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    arr = zarr.create(
        shape=(105, 105),
        chunks=(10, 10),
        dtype="i4",
        fill_value=0,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = source

    def check_layout(expected_shape: tuple[int, ...]) -> None:
        # Shape and dtype must agree for both the array and a full read of it;
        # the chunk shape must never change across resizes.
        assert expected_shape == arr.shape
        assert expected_shape == arr[:].shape
        assert i4 == arr.dtype
        assert i4 == arr[:].dtype
        assert (10, 10) == arr.chunks

    # freshly written array
    check_layout((105, 105))
    np.testing.assert_array_equal(source, arr[:])

    # grow both dimensions: old data stays put, new regions read as fill value
    arr.resize((205, 205))
    check_layout((205, 205))
    np.testing.assert_array_equal(source, arr[:105, :105])
    np.testing.assert_array_equal(np.zeros((100, 205), dtype="i4"), arr[105:, :])
    np.testing.assert_array_equal(np.zeros((205, 100), dtype="i4"), arr[:, 105:])

    # shrink both dimensions
    arr.resize((55, 55))
    check_layout((55, 55))
    np.testing.assert_array_equal(source[:55, :55], arr[:])

    # shrink the second dimension down to a single column
    arr.resize((55, 1))
    check_layout((55, 1))
    np.testing.assert_array_equal(source[:55, :1], arr[:])

    # shrink the first dimension and grow the second one again
    arr.resize((1, 55))
    check_layout((1, 55))
    np.testing.assert_array_equal(source[:1, :10], arr[:, :10])
    np.testing.assert_array_equal(np.zeros((1, 55 - 10), dtype="i4"), arr[:, 10:55])

    # via shape setter
    target = (105, 105)
    arr.shape = target
    assert target == arr.shape
    assert target == arr[:].shape


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_1d(store: MemoryStore, zarr_format: int) -> None:
    """Append an ndarray and then a plain list to a 1-D array."""
    initial = np.arange(105)
    arr = zarr.create(
        shape=initial.shape, chunks=10, dtype=initial.dtype, store=store, zarr_format=zarr_format
    )
    arr[:] = initial

    def check_matches(expected: np.ndarray) -> None:
        # The stored array must mirror `expected` while keeping its chunking.
        assert expected.shape == arr.shape
        assert expected.dtype == arr.dtype
        assert (10,) == arr.chunks
        np.testing.assert_array_equal(expected, arr[:])

    check_matches(initial)

    # append an ndarray
    extra = np.arange(105, 205)
    after_first = np.append(initial, extra)
    assert arr.shape == (105,)
    arr.append(extra)
    check_matches(after_first)

    # check append handles array-like
    as_list = [1, 2, 3]
    after_second = np.append(after_first, as_list)
    arr.append(as_list)
    check_matches(after_second)


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_2d(store: MemoryStore, zarr_format: int) -> None:
    """Append a block of rows to a 2-D array along the default axis."""
    initial = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    arr = zarr.create(
        shape=initial.shape,
        chunks=(10, 10),
        dtype=initial.dtype,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = initial

    def check_matches(expected: np.ndarray) -> None:
        # The stored array must mirror `expected` while keeping its chunking.
        assert expected.shape == arr.shape
        assert expected.dtype == arr.dtype
        assert (10, 10) == arr.chunks
        stored = arr[:]
        np.testing.assert_array_equal(expected, stored)

    check_matches(initial)

    extra = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105))
    combined = np.append(initial, extra, axis=0)
    arr.append(extra)
    check_matches(combined)


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None:
    """Append a block of columns to a 2-D array by passing ``axis=1``."""
    initial = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    arr = zarr.create(
        shape=initial.shape,
        chunks=(10, 10),
        dtype=initial.dtype,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = initial

    def check_matches(expected: np.ndarray) -> None:
        # The stored array must mirror `expected` while keeping its chunking.
        assert expected.shape == arr.shape
        assert expected.dtype == arr.dtype
        assert (10, 10) == arr.chunks
        np.testing.assert_array_equal(expected, arr[:])

    check_matches(initial)

    extra = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105))
    combined = np.append(initial, extra, axis=1)
    arr.append(extra, axis=1)
    check_matches(combined)


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None:
    """Appending 2-D data to a 1-D array must raise ``ValueError``."""
    flat = np.arange(100)
    arr = zarr.create(
        shape=flat.shape, chunks=10, dtype=flat.dtype, store=store, zarr_format=zarr_format
    )
    arr[:] = flat

    # same number of elements, but an incompatible number of dimensions
    square = flat.reshape(10, 10)
    with pytest.raises(ValueError):
        arr.append(square)


@pytest.mark.parametrize("order", ["C", "F", None])
@pytest.mark.parametrize("zarr_format", [2, 3])
@pytest.mark.parametrize("store", ["memory"], indirect=True)
Expand Down
Loading

0 comments on commit 6ce0526

Please sign in to comment.