zarr-developers · jstriebel · Nov 17, 2021 · Nov 17, 2021 · Nov 17, 2021 · Nov 18, 2021
diff --git a/chunking_test.py b/chunking_test.py
@@ -0,0 +1,24 @@
+import json
+import os
+
+import zarr
+
+# Location of the on-disk test array; used for creation, listing and reopening.
+STORE_PATH = "data/chunking_test.zarr"
+
+# Create an uncompressed 20x3 array with 3x3 chunks and a shards setting of (2, 2).
+store = zarr.DirectoryStore(STORE_PATH)
+z = zarr.zeros(
+    (20, 3),
+    chunks=(3, 3),
+    shards=(2, 2),
+    store=store,
+    overwrite=True,
+    compressor=None,
+)
+
+# Fill the whole array, then overwrite a few individual cells.
+special_cells = {(15, 1): 389, (19, 2): 1, (0, 1): -4.2}
+z[...] = 42
+for cell, cell_value in special_cells.items():
+    z[cell] = cell_value
+
+# Show what ended up on disk and check that the shards setting was persisted.
+print("ONDISK", sorted(os.listdir(STORE_PATH)))
+array_meta = json.loads(store[".zarray"].decode())
+assert array_meta["shards"] == [2, 2]
+
+print("STORE", list(store))
+print("CHUNKSTORE (SHARDED)", list(z.chunk_store))
+
+# Reopen the array from disk and verify the metadata and the written values.
+z_reopened = zarr.open(STORE_PATH)
+assert z_reopened.shards == (2, 2)
+for cell, cell_value in special_cells.items():
+    assert z_reopened[cell] == cell_value
+assert z_reopened[0, 0] == 42
diff --git a/zarr/_storage/sharded_store.py b/zarr/_storage/sharded_store.py
@@ -0,0 +1,109 @@
+from functools import reduce
+from itertools import product
+from typing import Any, Iterable, Iterator, Optional, Tuple
+
+import numpy as np
+
+from zarr._storage.store import BaseStore, Store
+from zarr.storage import StoreLike, array_meta_key, attrs_key, group_meta_key
+
+
+def _cum_prod(x: Iterable[int]) -> Iterator[int]:
+    """Yield the exclusive running product of ``x``.
+
+    The first yielded value is always 1; every following value is the product
+    of all items of ``x`` seen so far, excluding the last item of ``x``.
+    For ``(a, b, c)`` this yields ``1, a, a * b``. An empty or single-item
+    input yields just ``1``.
+
+    Any iterable is accepted (not only sequences that support slicing).
+    """
+    items = iter(x)
+    prod = 1
+    yield prod
+    try:
+        pending = next(items)
+    except StopIteration:
+        return
+    # Only multiply an item in once a successor is known, so the last item
+    # never contributes to the output.
+    for upcoming in items:
+        prod *= pending
+        yield prod
+        pending = upcoming
+
+
+class ShardedStore(Store):
+    """Store wrapper that packs several chunks into one value ("shard") of the wrapped store.
+
+    Chunk keys are translated into a shard key plus a position inside that
+    shard. A shard is the concatenation of a fixed number of equally sized
+    chunks, so only uncompressed, fixed-length data is supported for now.
+
+    This class should not be used directly,
+    but is added to an Array as a wrapper when needed automatically.
+    """
+
+    def __init__(
+        self,
+        store: StoreLike,
+        shards: Tuple[int, ...],
+        dimension_separator: str,
+        are_chunks_compressed: bool,
+        dtype: np.dtype,
+        fill_value: Any,
+        chunk_size: int,
+    ) -> None:
+        """Wrap ``store`` so that chunk keys are mapped onto shards.
+
+        Parameters
+        ----------
+        store
+            The wrapped store that holds one value per shard.
+        shards
+            Number of chunks per shard along each dimension.
+        dimension_separator
+            Separator between the per-dimension parts of a chunk key.
+        are_chunks_compressed
+            Whether chunks are compressed; must be False for now.
+        dtype
+            Data type used to serialize ``fill_value``; must not be object.
+        fill_value
+            Value used to pre-fill newly created shards; None is treated as 0.
+        chunk_size
+            Number of items per chunk; the serialized fill value is repeated
+            this many times to form one empty chunk.
+        """
+        self._store: BaseStore = BaseStore._ensure_store(store)
+        self._shards = shards
+        # The stride of the first dimension is 1, i.e. the first dimension
+        # varies fastest inside a shard.
+        self._shard_strides = tuple(_cum_prod(shards))
+        self._num_chunks_per_shard = reduce(lambda x, y: x*y, shards, 1)
+        self._dimension_separator = dimension_separator
+        # TODO: add jumptable for compressed data
+        chunk_has_constant_size = not are_chunks_compressed and dtype != object
+        assert chunk_has_constant_size, "Currently only uncompressed, fixed-length data can be used."
+        self._chunk_has_constant_size = chunk_has_constant_size
+        if chunk_has_constant_size:
+            # Only a missing fill value falls back to 0; other falsy values
+            # (e.g. empty byte strings) must be serialized as given.
+            effective_fill_value = 0 if fill_value is None else fill_value
+            binary_fill_value = np.full(1, fill_value=effective_fill_value, dtype=dtype).tobytes()
+            self._fill_chunk = binary_fill_value * chunk_size
+        else:
+            self._fill_chunk = None
+
+        # TODO: add warnings for ineffective reads/writes:
+        # * warn if partial reads are not available
+        # * optionally warn on unaligned writes if no partial writes are available
+
+    def __key_to_sharded__(self, key: str) -> Tuple[str, int]:
+        """Translate a chunk key into ``(shard_key, index_of_chunk_in_shard)``.
+
+        Per dimension, the chunk coordinate is divided by the number of chunks
+        per shard: the quotient is the shard coordinate, the remainder the
+        position inside the shard. The in-shard position is flattened into a
+        single index using the shard strides.
+        """
+        # TODO: allow to be in a group (aka only use last parts for dimensions)
+        subkeys = map(int, key.split(self._dimension_separator))
+
+        shard_tuple, index_tuple = zip(*((subkey // shard_i, subkey % shard_i) for subkey, shard_i in zip(subkeys, self._shards)))
+        shard_key = self._dimension_separator.join(map(str, shard_tuple))
+        index = sum(i * j for i, j in zip(index_tuple, self._shard_strides))
+        return shard_key, index
+
+    def __get_chunk_slice__(self, shard_key: str, shard_index: int) -> slice:
+        """Return the byte range of chunk number ``shard_index`` within its shard."""
+        # TODO: here we would use the jumptable for compression, which uses shard_key
+        chunk_length = len(self._fill_chunk)
+        start = shard_index * chunk_length
+        return slice(start, start + chunk_length)
+
+    def __getitem__(self, key: str) -> bytes:
+        """Return the bytes of the chunk stored under ``key``.
+
+        Raises
+        ------
+        KeyError
+            If the shard containing the chunk does not exist in the wrapped store.
+        """
+        shard_key, shard_index = self.__key_to_sharded__(key)
+        chunk_slice = self.__get_chunk_slice__(shard_key, shard_index)
+        # TODO use partial reads if available
+        try:
+            full_shard_value = self._store[shard_key]
+        except KeyError as err:
+            # Report the key the caller asked for, not the internal shard key.
+            raise KeyError(key) from err
+        return full_shard_value[chunk_slice]
+
+    def __setitem__(self, key: str, value: bytes) -> None:
+        """Write ``value`` as the chunk ``key`` by rewriting the whole shard.
+
+        Raises
+        ------
+        ValueError
+            If ``value`` does not have exactly the size of one chunk. Writing
+            it anyway would resize the shard and shift all following chunks.
+        """
+        shard_key, shard_index = self.__key_to_sharded__(key)
+        chunk_slice = self.__get_chunk_slice__(shard_key, shard_index)
+        expected_size = chunk_slice.stop - chunk_slice.start
+        actual_size = memoryview(value).nbytes
+        if actual_size != expected_size:
+            raise ValueError(
+                f"Chunk {key!r} has {actual_size} bytes, "
+                f"but sharded chunks must have exactly {expected_size} bytes."
+            )
+        try:
+            full_shard_value = bytearray(self._store[shard_key])
+        except KeyError:
+            # The shard does not exist yet: start from a shard of fill chunks.
+            full_shard_value = bytearray(self._fill_chunk * self._num_chunks_per_shard)
+        # TODO use partial writes if available
+        full_shard_value[chunk_slice] = value
+        self._store[shard_key] = full_shard_value
+
+    def __delitem__(self, key) -> None:
+        """Deleting chunks is not supported yet; always raises NotImplementedError."""
+        # TODO not implemented yet
+        # For uncompressed chunks, deleting the "last" chunk might need to be detected.
+        raise NotImplementedError("Deletion is not yet implemented")
+
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over metadata keys and over all chunk keys covered by existing shards."""
+        special_suffixes = (array_meta_key, group_meta_key, attrs_key)
+        for shard_key in self._store.__iter__():
+            if shard_key.endswith(special_suffixes):
+                # Special keys such as ".zarray" are passed on as-is
+                yield shard_key
+            else:
+                # For each shard key in the wrapped store, all corresponding chunks are yielded.
+                # TODO: For compressed chunks we might yield only the actually contained chunks by reading the jumptables.
+                # TODO: allow to be in a group (aka only use last parts for dimensions)
+                subkeys = tuple(map(int, shard_key.split(self._dimension_separator)))
+                for offset in product(*(range(i) for i in self._shards)):
+                    original_key = (subkeys_i * shards_i + offset_i for subkeys_i, offset_i, shards_i in zip(subkeys, offset, self._shards))
+                    yield self._dimension_separator.join(map(str, original_key))
+
+    def __len__(self) -> int:
+        """Return the number of keys produced by iterating over this store."""
+        return sum(1 for _ in self)
+
+    # TODO: For efficient reads and writes, we need to implement
+    # getitems, setitems & delitems
+    # and combine writes/reads/deletions to the same shard.
diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py
@@ -110,6 +110,7 @@ def _ensure_store(store: Any):
 
 
 class Store(BaseStore):
+    # TODO: document methods which allow optimizations, e.g. delitems, setitems, getitems, listdir, …
     """Abstract store class used by implementations following the Zarr v2 spec.
 
     Adds public `listdir`, `rename`, and `rmdir` methods on top of BaseStore.

diff --git a/zarr/core.py b/zarr/core.py
@@ -5,11 +5,13 @@
 import operator
 import re
 from functools import reduce
+from typing import Optional, Tuple
 
 import numpy as np
 from numcodecs.compat import ensure_bytes, ensure_ndarray
 
 from collections.abc import MutableMapping
+from zarr._storage.sharded_store import ShardedStore
 
 from zarr.attrs import Attributes
 from zarr.codecs import AsType, get_codec
@@ -191,6 +193,9 @@ def __init__(
         self._oindex = OIndex(self)
         self._vindex = VIndex(self)
 
+        # the sharded store is only initialized when needed
+        self._cached_sharded_store = None
+
     def _load_metadata(self):
         """(Re)load metadata from store."""
         if self._synchronizer is None:
@@ -213,6 +218,7 @@ def _load_metadata_nosync(self):
             self._meta = meta
             self._shape = meta['shape']
             self._chunks = meta['chunks']
+            self._shards = meta.get('shards')
             self._dtype = meta['dtype']
             self._fill_value = meta['fill_value']
             self._order = meta['order']
@@ -262,9 +268,11 @@ def _flush_metadata_nosync(self):
             filters_config = [f.get_config() for f in self._filters]
         else:
             filters_config = None
+        # Possible (unrelated) bug:
+        # should the dimension_separator also be included in this dict?
         meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype,
                     compressor=compressor_config, fill_value=self._fill_value,
-                    order=self._order, filters=filters_config)
+                    order=self._order, filters=filters_config, shards=self._shards)
         mkey = self._key_prefix + array_meta_key
         self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta)
 
@@ -309,9 +317,23 @@ def read_only(self, value):
     def chunk_store(self):
         """A MutableMapping providing the underlying storage for array chunks."""
         if self._chunk_store is None:
-            return self._store
+            chunk_store = self._store
+        else:
+            chunk_store = self._chunk_store
+        if self._shards is None:
+            return chunk_store
         else:
-            return self._chunk_store
+            if self._cached_sharded_store is None:
+                self._cached_sharded_store = ShardedStore(
+                    chunk_store,
+                    shards=self._shards,
+                    dimension_separator=self._dimension_separator,
+                    are_chunks_compressed=self._compressor is not None,
+                    dtype=self._dtype,
+                    fill_value=self._fill_value or 0,
+                    chunk_size=reduce(operator.mul, self._chunks, 1),
+                )
+            return self._cached_sharded_store
 
     @property
     def shape(self):
@@ -327,11 +349,17 @@ def shape(self, value):
         self.resize(value)
 
     @property
-    def chunks(self):
+    def chunks(self) -> Optional[Tuple[int, ...]]:
         """A tuple of integers describing the length of each dimension of a
-        chunk of the array."""
+        chunk of the array, or None."""
         return self._chunks
 
+    @property
+    def shards(self):
+        """A tuple of integers describing the number of chunks in each shard
+        of the array, or None if the array metadata does not define shards."""
+        return self._shards
+
     @property
     def dtype(self):
         """The NumPy data type."""
@@ -1899,7 +1927,7 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection,
             and hasattr(self._compressor, "decode_partial")
             and not fields
             and self.dtype != object
-            and hasattr(self.chunk_store, "getitems")
+            and hasattr(self.chunk_store, "getitems")  # TODO: this should rather check for read_block or similar
         ):
             partial_read_decode = True
             cdatas = {
@@ -2236,6 +2264,7 @@ def digest(self, hashname="sha1"):
 
         h = hashlib.new(hashname)
 
+        # TODO: operate on shards here if available:
         for i in itertools.product(*[range(s) for s in self.cdata_shape]):
             h.update(self.chunk_store.get(self._chunk_key(i), b""))
 
@@ -2362,6 +2391,7 @@ def _resize_nosync(self, *args):
                 except KeyError:
                     # chunk not initialized
                     pass
+        # TODO: collect all chunks to delete and use _chunk_delitems
 
     def append(self, data, axis=0):
         """Append `data` to `axis`.

diff --git a/zarr/creation.py b/zarr/creation.py
@@ -1,3 +1,4 @@
+from typing import Optional, Tuple, Union
 from warnings import warn
 
 import numpy as np
@@ -19,7 +20,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',
            fill_value=0, order='C', store=None, synchronizer=None,
            overwrite=False, path=None, chunk_store=None, filters=None,
            cache_metadata=True, cache_attrs=True, read_only=False,
-           object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs):
+           object_codec=None, dimension_separator=None, write_empty_chunks=True,
+           shards: Union[int, Tuple[int, ...], None]=None, **kwargs):
     """Create an array.
 
     Parameters
@@ -145,7 +147,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
     init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
                fill_value=fill_value, order=order, overwrite=overwrite, path=path,
                chunk_store=chunk_store, filters=filters, object_codec=object_codec,
-               dimension_separator=dimension_separator)
+               dimension_separator=dimension_separator, shards=shards)
 
     # instantiate array
     z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer,

diff --git a/zarr/meta.py b/zarr/meta.py
@@ -51,6 +51,7 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A
                 object_codec = None
 
             dimension_separator = meta.get("dimension_separator", None)
+            shards = meta.get("shards", None)
             fill_value = cls.decode_fill_value(meta['fill_value'], dtype, object_codec)
             meta = dict(
                 zarr_format=meta["zarr_format"],
@@ -64,6 +65,8 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A
             )
             if dimension_separator:
                 meta['dimension_separator'] = dimension_separator
+            if shards:
+                meta['shards'] = tuple(shards)
         except Exception as e:
             raise MetadataError("error decoding metadata") from e
         else:
@@ -77,6 +80,7 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes:
             dtype, sdshape = dtype.subdtype
 
         dimension_separator = meta.get("dimension_separator")
+        shards = meta.get("shards")
         if dtype.hasobject:
             import numcodecs
             object_codec = numcodecs.get_codec(meta['filters'][0])
@@ -96,8 +100,8 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes:
         if dimension_separator:
             meta['dimension_separator'] = dimension_separator
 
-        if dimension_separator:
-            meta["dimension_separator"] = dimension_separator
+        if shards:
+            meta['shards'] = shards
 
         return json_dumps(meta)
 

diff --git a/zarr/storage.py b/zarr/storage.py
@@ -54,7 +54,7 @@
 from zarr.util import (buffer_size, json_loads, nolock, normalize_chunks,
                        normalize_dimension_separator,
                        normalize_dtype, normalize_fill_value, normalize_order,
-                       normalize_shape, normalize_storage_path, retry_call)
+                       normalize_shape, normalize_shards, normalize_storage_path, retry_call)
 
 from zarr._storage.absstore import ABSStore  # noqa: F401
 from zarr._storage.store import (_listdir_from_keys,
@@ -236,6 +236,7 @@ def init_array(
     filters=None,
     object_codec=None,
     dimension_separator=None,
+    shards: Union[int, Tuple[int, ...], None]=None,
 ):
     """Initialize an array store with the given configuration. Note that this is a low-level
     function and there should be no need to call this directly from user code.
@@ -353,7 +354,8 @@ def init_array(
                          order=order, overwrite=overwrite, path=path,
                          chunk_store=chunk_store, filters=filters,
                          object_codec=object_codec,
-                         dimension_separator=dimension_separator)
+                         dimension_separator=dimension_separator,
+                         shards=shards)
 
 
 def _init_array_metadata(
@@ -370,6 +372,7 @@ def _init_array_metadata(
     filters=None,
     object_codec=None,
     dimension_separator=None,
+    shards:Union[int, Tuple[int, ...], None] = None,
 ):
 
     # guard conditions
@@ -388,6 +391,7 @@ def _init_array_metadata(
     shape = normalize_shape(shape) + dtype.shape
     dtype = dtype.base
     chunks = normalize_chunks(chunks, shape, dtype.itemsize)
+    shards = normalize_shards(shards, shape)
     order = normalize_order(order)
     fill_value = normalize_fill_value(fill_value, dtype)
 
@@ -445,6 +449,8 @@ def _init_array_metadata(
                 compressor=compressor_config, fill_value=fill_value,
                 order=order, filters=filters_config,
                 dimension_separator=dimension_separator)
+    if shards is not None:
+        meta["shards"] = shards
     key = _path_to_prefix(path) + array_meta_key
     if hasattr(store, '_metadata_class'):
         store[key] = store._metadata_class.encode_array_metadata(meta)  # type: ignore