diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index bdafa33f6..8c4d797e9 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -2,7 +2,7 @@
 
 import json
 from asyncio import gather
-from dataclasses import dataclass, field, replace
+from dataclasses import dataclass, field
 from itertools import starmap
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
@@ -1104,15 +1104,15 @@ async def setitem(
         )
         return await self._set_selection(indexer, value, prototype=prototype)
 
-    async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self:
+    async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None:
+        new_shape = parse_shapelike(new_shape)
         assert len(new_shape) == len(self.metadata.shape)
         new_metadata = self.metadata.update_shape(new_shape)
 
-        # Remove all chunks outside of the new shape
-        old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
-        new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))
-
         if delete_outside_chunks:
+            # Remove all chunks outside of the new shape
+            old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
+            new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))
 
             async def _delete_key(key: str) -> None:
                 await (self.store_path / key).delete()
@@ -1128,7 +1128,75 @@ async def _delete_key(key: str) -> None:
 
         # Write new metadata
         await self._save_metadata(new_metadata)
-        return replace(self, metadata=new_metadata)
+
+        # Update metadata (in place)
+        object.__setattr__(self, "metadata", new_metadata)
+
+    async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
+        """Append `data` to `axis`.
+
+        Parameters
+        ----------
+        data : array-like
+            Data to be appended.
+        axis : int
+            Axis along which to append. Negative values count from the last axis.
+
+        Returns
+        -------
+        new_shape : tuple
+
+        Raises
+        ------
+        ValueError
+            If `axis` is out of bounds, or if the shape of `data` does not match
+            this array in every dimension other than `axis`.
+
+        Notes
+        -----
+        The size of all dimensions other than `axis` must match between this
+        array and `data`.
+        """
+        # ensure data is array-like
+        if not hasattr(data, "shape"):
+            data = np.asanyarray(data)
+
+        # normalize axis so that negative values index from the last dimension
+        ndim = len(self.shape)
+        if not -ndim <= axis < ndim:
+            raise ValueError(f"axis {axis} is out of bounds for array of dimension {ndim}")
+        axis %= ndim
+
+        self_shape_preserved = tuple(s for i, s in enumerate(self.shape) if i != axis)
+        data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis)
+        if len(data.shape) != ndim or self_shape_preserved != data_shape_preserved:
+            raise ValueError(
+                "shape of data to append is not compatible with the array. "
+                f"The shape of the data is {tuple(data.shape)} "
+                f"and the shape of the array is {self.shape}. "
+                "All dimensions must match except for the dimension being "
+                "appended."
+            )
+        # remember old shape
+        old_shape = self.shape
+
+        # determine new shape
+        new_shape = tuple(
+            self.shape[i] if i != axis else self.shape[i] + data.shape[i]
+            for i in range(len(self.shape))
+        )
+
+        # resize
+        await self.resize(new_shape)
+
+        # store data
+        append_selection = tuple(
+            slice(None) if i != axis else slice(old_shape[i], new_shape[i])
+            for i in range(len(self.shape))
+        )
+        await self.setitem(append_selection, data)
+
+        return new_shape
 
     async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
         # metadata.attributes is "frozen" so we simply clear and update the dict
@@ -1147,7 +1215,8 @@ async def info(self) -> None:
         raise NotImplementedError
 
 
-@dataclass(frozen=True)
+# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
+@dataclass(frozen=False)
 class Array:
     """Instantiate an array from an initialized store."""
 
@@ -1297,6 +1366,11 @@ def shape(self) -> ChunkCoords:
         """
         return self._async_array.shape
 
+    @shape.setter
+    def shape(self, value: ChunkCoords) -> None:
+        """Sets the shape of the array by calling resize."""
+        self.resize(value)
+
     @property
     def chunks(self) -> ChunkCoords:
         """Returns a tuple of integers describing the length of each dimension of a chunk of the array.
@@ -2754,18 +2828,18 @@ def blocks(self) -> BlockIndex:
         :func:`set_block_selection` for documentation and examples."""
         return BlockIndex(self)
 
-    def resize(self, new_shape: ChunkCoords) -> Array:
+    def resize(self, new_shape: ShapeLike) -> None:
         """
         Change the shape of the array by growing or shrinking one or more
         dimensions.
 
-        This method does not modify the original Array object. Instead, it returns a new Array
-        with the specified shape.
+        Parameters
+        ----------
+        new_shape : tuple
+            New shape of the array.
 
         Notes
         -----
-        When resizing an array, the data are not rearranged in any way.
-
         If one or more dimensions are shrunk, any chunks falling outside the
         new array shape will be deleted from the underlying store.
         However, it is noteworthy that the chunks partially falling inside the new array
@@ -2778,7 +2852,6 @@ def resize(self, new_shape: ChunkCoords) -> Array:
         >>> import zarr
         >>> z = zarr.zeros(shape=(10000, 10000),
         >>>                chunk_shape=(1000, 1000),
-        >>>                store=StorePath(MemoryStore(mode="w")),
         >>>                dtype="i4",)
         >>> z.shape
         (10000, 10000)
@@ -2791,10 +2864,49 @@ def resize(self, new_shape: ChunkCoords) -> Array:
         >>> z2.shape
         (50, 50)
         """
-        resized = sync(self._async_array.resize(new_shape))
-        # TODO: remove this cast when type inference improves
-        _resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized)
-        return type(self)(_resized)
+        sync(self._async_array.resize(new_shape))
+
+    def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
+        """Append `data` to `axis`.
+
+        Parameters
+        ----------
+        data : array-like
+            Data to be appended.
+        axis : int
+            Axis along which to append. Negative values count from the last axis.
+
+        Returns
+        -------
+        new_shape : tuple
+
+        Raises
+        ------
+        ValueError
+            If `axis` is out of bounds, or if the shape of `data` does not match
+            this array in every dimension other than `axis`.
+
+        Notes
+        -----
+        The size of all dimensions other than `axis` must match between this
+        array and `data`.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import zarr
+        >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
+        >>> z = zarr.array(a, chunks=(1000, 100))
+        >>> z.shape
+        (10000, 1000)
+        >>> z.append(a)
+        (20000, 1000)
+        >>> z.append(np.vstack([a, a]), axis=1)
+        (20000, 2000)
+        >>> z.shape
+        (20000, 2000)
+        """
+        return sync(self._async_array.append(data, axis=axis))
 
     def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
         # TODO: remove this cast when type inference improves
diff --git a/tests/test_array.py b/tests/test_array.py
index f182cb1a1..ae8e7f99c 100644
--- a/tests/test_array.py
+++ b/tests/test_array.py
@@ -419,6 +419,227 @@ def test_update_attrs(zarr_format: int) -> None:
     assert arr2.attrs["foo"] == "bar"
 
 
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_resize_1d(store: MemoryStore, zarr_format: int) -> None:
+    z = zarr.create(
+        shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format
+    )
+    a = np.arange(105, dtype="i4")
+    z[:] = a
+    assert (105,) == z.shape
+    assert (105,) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10,) == z.chunks
+    np.testing.assert_array_equal(a, z[:])
+
+    z.resize(205)
+    assert (205,) == z.shape
+    assert (205,) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10,) == z.chunks
+    np.testing.assert_array_equal(a, z[:105])
+    np.testing.assert_array_equal(np.zeros(100, dtype="i4"), z[105:])
+
+    z.resize(55)
+    assert (55,) == z.shape
+    assert (55,) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10,) == z.chunks
+    np.testing.assert_array_equal(a[:55], z[:])
+
+    # via shape setter
+    new_shape = (105,)
+    z.shape = new_shape
+    assert new_shape == z.shape
+    assert new_shape == z[:].shape
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_resize_2d(store: MemoryStore, zarr_format: int) -> None:
+    z = zarr.create(
+        shape=(105, 105),
+        chunks=(10, 10),
+        dtype="i4",
+        fill_value=0,
+        store=store,
+        zarr_format=zarr_format,
+    )
+    a = np.arange(105 * 105, dtype="i4").reshape((105, 105))
+    z[:] = a
+    assert (105, 105) == z.shape
+    assert (105, 105) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(a, z[:])
+
+    z.resize((205, 205))
+    assert (205, 205) == z.shape
+    assert (205, 205) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(a, z[:105, :105])
+    np.testing.assert_array_equal(np.zeros((100, 205), dtype="i4"), z[105:, :])
+    np.testing.assert_array_equal(np.zeros((205, 100), dtype="i4"), z[:, 105:])
+
+    z.resize((55, 55))
+    assert (55, 55) == z.shape
+    assert (55, 55) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(a[:55, :55], z[:])
+
+    z.resize((55, 1))
+    assert (55, 1) == z.shape
+    assert (55, 1) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(a[:55, :1], z[:])
+
+    z.resize((1, 55))
+    assert (1, 55) == z.shape
+    assert (1, 55) == z[:].shape
+    assert np.dtype("i4") == z.dtype
+    assert np.dtype("i4") == z[:].dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(a[:1, :10], z[:, :10])
+    np.testing.assert_array_equal(np.zeros((1, 55 - 10), dtype="i4"), z[:, 10:55])
+
+    # via shape setter
+    new_shape = (105, 105)
+    z.shape = new_shape
+    assert new_shape == z.shape
+    assert new_shape == z[:].shape
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_append_1d(store: MemoryStore, zarr_format: int) -> None:
+    a = np.arange(105)
+    z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format)
+    z[:] = a
+    assert a.shape == z.shape
+    assert a.dtype == z.dtype
+    assert (10,) == z.chunks
+    np.testing.assert_array_equal(a, z[:])
+
+    b = np.arange(105, 205)
+    e = np.append(a, b)
+    assert z.shape == (105,)
+    z.append(b)
+    assert e.shape == z.shape
+    assert e.dtype == z.dtype
+    assert (10,) == z.chunks
+    np.testing.assert_array_equal(e, z[:])
+
+    # check append handles array-like
+    c = [1, 2, 3]
+    f = np.append(e, c)
+    z.append(c)
+    assert f.shape == z.shape
+    assert f.dtype == z.dtype
+    assert (10,) == z.chunks
+    np.testing.assert_array_equal(f, z[:])
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_append_2d(store: MemoryStore, zarr_format: int) -> None:
+    a = np.arange(105 * 105, dtype="i4").reshape((105, 105))
+    z = zarr.create(
+        shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format
+    )
+    z[:] = a
+    assert a.shape == z.shape
+    assert a.dtype == z.dtype
+    assert (10, 10) == z.chunks
+    actual = z[:]
+    np.testing.assert_array_equal(a, actual)
+
+    b = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105))
+    e = np.append(a, b, axis=0)
+    z.append(b)
+    assert e.shape == z.shape
+    assert e.dtype == z.dtype
+    assert (10, 10) == z.chunks
+    actual = z[:]
+    np.testing.assert_array_equal(e, actual)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None:
+    a = np.arange(105 * 105, dtype="i4").reshape((105, 105))
+    z = zarr.create(
+        shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format
+    )
+    z[:] = a
+    assert a.shape == z.shape
+    assert a.dtype == z.dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(a, z[:])
+
+    b = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105))
+    e = np.append(a, b, axis=1)
+    z.append(b, axis=1)
+    assert e.shape == z.shape
+    assert e.dtype == z.dtype
+    assert (10, 10) == z.chunks
+    np.testing.assert_array_equal(e, z[:])
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None:
+    a = np.arange(100)
+    z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format)
+    z[:] = a
+    b = a.reshape(10, 10)
+    with pytest.raises(ValueError):
+        z.append(b)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_append_negative_axis(store: MemoryStore, zarr_format: int) -> None:
+    a = np.arange(105 * 105, dtype="i4").reshape((105, 105))
+    z = zarr.create(
+        shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format
+    )
+    z[:] = a
+
+    # a negative axis counts from the last dimension, as in numpy
+    e = np.append(a, a, axis=-1)
+    new_shape = z.append(a, axis=-1)
+    assert e.shape == new_shape
+    assert e.shape == z.shape
+    np.testing.assert_array_equal(e, z[:])
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=True)
+@pytest.mark.parametrize("zarr_format", [2, 3])
+@pytest.mark.parametrize("axis", [-3, 2])
+def test_append_bad_axis(store: MemoryStore, zarr_format: int, axis: int) -> None:
+    a = np.arange(100).reshape(10, 10)
+    z = zarr.create(
+        shape=a.shape, chunks=(5, 5), dtype=a.dtype, store=store, zarr_format=zarr_format
+    )
+    z[:] = a
+    with pytest.raises(ValueError):
+        z.append(a, axis=axis)
+    # a rejected append must leave the array untouched
+    assert a.shape == z.shape
+    np.testing.assert_array_equal(a, z[:])
+
+
 @pytest.mark.parametrize("order", ["C", "F", None])
 @pytest.mark.parametrize("zarr_format", [2, 3])
 @pytest.mark.parametrize("store", ["memory"], indirect=True)
diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py
index 7a5fb979a..0f2f89291 100644
--- a/tests/test_codecs/test_codecs.py
+++ b/tests/test_codecs/test_codecs.py
@@ -371,8 +371,9 @@ async def test_resize(store: Store) -> None:
     assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None
     assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is not None
 
-    a = await a.resize((10, 12))
+    await a.resize((10, 12))
     assert a.metadata.shape == (10, 12)
+    assert a.shape == (10, 12)
     assert await store.get(f"{path}/0.0", prototype=default_buffer_prototype()) is not None
     assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None
     assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None