diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 680433565..f80e505f6 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -16,6 +16,7 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _warn_write_empty_chunks_kwarg, ) from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata @@ -724,7 +725,6 @@ async def create( read_only: bool | None = None, object_codec: Codec | None = None, # TODO: type has changed dimension_separator: Literal[".", "/"] | None = None, - write_empty_chunks: bool = False, # TODO: default has changed zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # TODO: need type @@ -794,17 +794,6 @@ async def create( .. versionadded:: 2.8 - write_empty_chunks : bool, optional - If True (default), all chunks will be stored regardless of their - contents. If False, each chunk is compared to the array's fill value - prior to storing. If a chunk is uniformly equal to the fill value, then - that chunk is not be stored, and the store entry for that chunk's key - is deleted. This setting enables sparser storage, as only chunks with - non-fill-value data are stored, at the expense of overhead associated - with checking the data of each chunk. - - .. versionadded:: 2.11 - zarr_format : {2, 3, None}, optional The zarr format to use when saving. meta_array : array-like, optional @@ -856,8 +845,12 @@ async def create( RuntimeWarning, stacklevel=2, ) - if write_empty_chunks: - warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2) + + if "write_empty_chunks" in kwargs: + # warn users if the write_empty_chunks kwarg was used + write_empty_chunks = kwargs.pop("write_empty_chunks") + _warn_write_empty_chunks_kwarg(write_empty_chunks) + if meta_array is not None: warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2) @@ -1058,6 +1051,11 @@ async def open_array( zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) + if "write_empty_chunks" in kwargs: + # warn users if the write_empty_chunks kwarg was used + write_empty_chunks = kwargs.pop("write_empty_chunks") + _warn_write_empty_chunks_kwarg(write_empty_chunks) + try: return await AsyncArray.open(store_path, zarr_format=zarr_format) except FileNotFoundError: diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 038a2eeac..485fb0a59 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -331,6 +331,7 @@ async def write_batch( value: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: + write_empty_chunks = config.get("array.write_empty_chunks") == True # noqa: E712 if self.supports_partial_encode: await self.encode_partial_batch( [ @@ -360,7 +361,7 @@ async def _read_key( _read_key, config.get("async.concurrency"), ) - chunk_array_batch = await self.decode_batch( + chunk_array_decoded = await self.decode_batch( [ (chunk_bytes, chunk_spec) for chunk_bytes, (_, chunk_spec, _, _) in zip( @@ -369,23 +370,25 @@ async def _read_key( ], ) - chunk_array_batch = [ + chunk_array_merged = [ self._merge_chunk_array( chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes ) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_batch, batch_info, strict=False - ) - ] - - chunk_array_batch = [ - None - if chunk_array is None or chunk_array.all_equal(chunk_spec.fill_value) - else chunk_array - for chunk_array, (_, chunk_spec, _, _) in zip( - chunk_array_batch, batch_info, strict=False + chunk_array_decoded, batch_info, strict=False ) ] + chunk_array_batch: list[NDBuffer | None] = [] + for chunk_array, (_, chunk_spec, _, _) in zip( + chunk_array_merged, batch_info, strict=False + ): + if chunk_array is None: + chunk_array_batch.append(None) # type: ignore[unreachable] + else: + if not write_empty_chunks and chunk_array.all_equal(chunk_spec.fill_value): + chunk_array_batch.append(None) + else: + chunk_array_batch.append(chunk_array) chunk_bytes_batch = await self.encode_batch( [ diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index f3f49b0d5..54401e5d2 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -3,6 +3,7 @@ import asyncio import functools import operator +import warnings from collections.abc import Iterable, Mapping from enum import Enum from itertools import starmap @@ -166,3 +167,12 @@ def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: else: return _STRING_DTYPE return np.dtype(dtype) + + +def _warn_write_empty_chunks_kwarg(write_empty_chunks: bool) -> None: + msg = ( + f"The `write_empty_chunks` keyword argument was provided to this function with a value of {write_empty_chunks}." + "This keyword argument has no effect. To control whether empty chunks are written to" + " storage, change the 'array.write_empty_chunks' configuration variable." + ) + warnings.warn(msg, RuntimeWarning, stacklevel=2) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index ec2c8c47a..fabba7038 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -42,7 +42,7 @@ def reset(self) -> None: defaults=[ { "default_zarr_version": 3, - "array": {"order": "C"}, + "array": {"order": "C", "write_empty_chunks": False}, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2, diff --git a/tests/test_api.py b/tests/test_api.py index 5b62e3a2f..8525c982d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -48,6 +48,21 @@ def test_create_array(memory_store: Store) -> None: assert z.chunks == (40,) +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: + """ + Test that using the `write_empty_chunks` kwarg on array access will raise a warning. + """ + match = f"The `write_empty_chunks` keyword argument was provided to this function with a value of {write_empty_chunks}." + with pytest.warns(RuntimeWarning, match=match): + _ = zarr.array( + data=np.arange(10), shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks + ) + + with pytest.warns(RuntimeWarning, match=match): + _ = zarr.create(shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks) + + @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) @pytest.mark.parametrize("node_type", ["array", "group"]) def test_open_normalized_path( diff --git a/tests/test_array.py b/tests/test_array.py index f182cb1a1..c740c2d4e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -11,6 +11,7 @@ from zarr.core.array import chunks_initialized from zarr.core.buffer.cpu import NDBuffer from zarr.core.common import JSON, MemoryOrder, ZarrFormat +from zarr.core.config import config from zarr.core.group import AsyncGroup from zarr.core.indexing import ceildiv from zarr.core.sync import sync @@ -436,3 +437,39 @@ def test_array_create_order( assert vals.flags.f_contiguous else: raise AssertionError + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +@pytest.mark.parametrize("fill_value", [0, 5]) +def test_write_empty_chunks( + zarr_format: ZarrFormat, store: MemoryStore, write_empty_chunks: bool, fill_value: int +) -> None: + """ + Check that the write_empty_chunks value of the config is applied correctly. We expect that + when write_empty_chunks is True, writing chunks equal to the fill value will result in + those chunks appearing in the store. + + When write_empty_chunks is False, writing chunks that are equal to the fill value will result in + those chunks not being present in the store. In particular, they should be deleted if they were + already present. + """ + arr = Array.create( + store=store, + shape=(2,), + zarr_format=zarr_format, + dtype="i4", + fill_value=fill_value, + chunk_shape=(1,), + ) + + # initialize the store with some non-fill value chunks + arr[:] = fill_value + 1 + assert arr.nchunks_initialized == arr.nchunks + + with config.set({"array.write_empty_chunks": write_empty_chunks}): + arr[:] = fill_value + if not write_empty_chunks: + assert arr.nchunks_initialized == 0 + else: + assert arr.nchunks_initialized == arr.nchunks diff --git a/tests/test_config.py b/tests/test_config.py index c4cf794c5..b20b43327 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -40,7 +40,7 @@ def test_config_defaults_set() -> None: assert config.defaults == [ { "default_zarr_version": 3, - "array": {"order": "C"}, + "array": {"order": "C", "write_empty_chunks": False}, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, "json_indent": 2,