Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow creating references for empty archival datasets #260

Merged
merged 22 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,17 @@ New Features
- Load scalar variables by default. (:pull:`205`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.

- Support empty files (:pull:`260`)
By `Justus Magin <https://github.com/keewis>`_.

Breaking changes
~~~~~~~~~~~~~~~~

- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass
`pydantic.BaseModel` (:pull:`210`)
- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`)
- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`)

Deprecations
~~~~~~~~~~~~
Expand All @@ -55,7 +58,7 @@ Bug fixes
Documentation
~~~~~~~~~~~~~

- Adds virtualizarr + coiled serverless example notebook (:pull`223`)
- Adds virtualizarr + coiled serverless example notebook (:pull:`223`)
By `Raphael Hagen <https://github.com/norlandrhagen>`_.


Expand Down
12 changes: 8 additions & 4 deletions virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class ChunkManifest:
_offsets: np.ndarray[Any, np.dtype[np.uint64]]
_lengths: np.ndarray[Any, np.dtype[np.uint64]]

def __init__(self, entries: dict) -> None:
def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:
"""
Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges.

Expand All @@ -105,13 +105,14 @@ def __init__(self, entries: dict) -> None:
"0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
}
"""
if shape is None and not entries:
raise ValueError("need a chunk grid shape if no chunks given")

# TODO do some input validation here first?
validate_chunk_keys(entries.keys())

# TODO should we actually optionally pass chunk grid shape in,
# in case there are not enough chunks to give correct idea of full shape?
shape = get_chunk_grid_shape(entries.keys())
if shape is None:
shape = get_chunk_grid_shape(entries.keys())

# Initializing to empty implies that entries with path='' are treated as missing chunks
paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype
Expand Down Expand Up @@ -386,6 +387,9 @@ def get_ndim_from_key(key: str) -> int:


def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]):
if not chunk_keys:
return

# Check if all keys have the correct form
for key in chunk_keys:
if not re.match(_CHUNK_KEY, key):
Expand Down
15 changes: 14 additions & 1 deletion virtualizarr/readers/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
KerchunkStoreRefs,
)
from virtualizarr.utils import _FsspecFSFromFilepath
from virtualizarr.zarr import ZArray, ZAttrs
from virtualizarr.zarr import ZArray, ZAttrs, ceildiv


# TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers...
Expand Down Expand Up @@ -230,6 +230,13 @@ def dataset_from_kerchunk_refs(
return vds


def determine_chunk_grid_shape(zarray):
return tuple(
ceildiv(length, chunksize)
for length, chunksize in zip(zarray.shape, zarray.chunks)
)


def variable_from_kerchunk_refs(
refs: KerchunkStoreRefs, var_name: str, virtual_array_class
) -> Variable:
Expand All @@ -242,6 +249,12 @@ def variable_from_kerchunk_refs(
if chunk_dict:
manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict)
varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
elif len(zarray.shape) != 0:
keewis marked this conversation as resolved.
Show resolved Hide resolved
# empty variables don't have physical chunks, but zarray shows that the variable
# is at least 1D
shape = determine_chunk_grid_shape(zarray)
manifest = ChunkManifest(entries={}, shape=shape)
varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
else:
# This means we encountered a scalar variable of dimension 0,
# very likely that it actually has no numeric value and its only purpose
Expand Down
108 changes: 108 additions & 0 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,38 @@ def test_broadcast_any_shape(self, shape, chunks, target_shape):
for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape):
assert len_chunk <= len_arr

@pytest.mark.parametrize(
"shape, chunks, grid_shape, target_shape",
[
((1,), (1,), (1,), (3,)),
((2,), (1,), (2,), (2,)),
((3,), (2,), (2,), (5, 4, 3)),
((3, 1), (2, 1), (2, 1), (2, 3, 4)),
],
)
def test_broadcast_empty(self, shape, chunks, grid_shape, target_shape):
zarray = ZArray(
chunks=chunks,
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
order="C",
shape=shape,
zarr_format=2,
)
manifest = ChunkManifest(entries={}, shape=grid_shape)
marr = ManifestArray(zarray, manifest)

expanded = np.broadcast_to(marr, shape=target_shape)
assert expanded.shape == target_shape
assert len(expanded.chunks) == expanded.ndim
assert all(
len_chunk <= len_arr
for len_arr, len_chunk in zip(expanded.shape, expanded.chunks)
)
assert expanded.manifest.dict() == {}
Comment on lines +232 to +236
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really nice test!



# TODO we really need some kind of fixtures to generate useful example data
# The hard part is having an alternative way to get to the expected result of concatenation
Expand Down Expand Up @@ -250,6 +282,44 @@ def test_concat(self):
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format

def test_concat_empty(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 1, 10),
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
order="C",
shape=(5, 1, 20),
zarr_format=2,
)

chunks_dict1 = {}
manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 1, 2))
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)

result = np.concatenate([marr1, marr2], axis=1)

assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
assert result.zarray.fill_value == zarray.fill_value
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format


class TestStack:
def test_stack(self):
Expand Down Expand Up @@ -295,6 +365,44 @@ def test_stack(self):
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format

def test_stack_empty(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 10),
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
order="C",
shape=(5, 20),
zarr_format=2,
)

chunks_dict1 = {}
manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 2))
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)

result = np.stack([marr1, marr2], axis=1)

assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
assert result.zarray.fill_value == zarray.fill_value
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format


def test_refuse_combine():
# TODO test refusing to concatenate arrays that have conflicting shapes / chunk sizes
Expand Down
8 changes: 8 additions & 0 deletions virtualizarr/tests/test_manifests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ def test_create_manifest(self):
manifest = ChunkManifest(entries=chunks)
assert manifest.dict() == chunks

chunks = {}
manifest = ChunkManifest(entries=chunks, shape=(2, 2))
assert manifest.dict() == chunks

def test_create_manifest_empty_missing_shape(self):
with pytest.raises(ValueError, match="chunk grid shape if no chunks"):
ChunkManifest(entries={})

def test_invalid_chunk_entries(self):
chunks = {
"0.0.0": {"path": "s3://bucket/foo.nc"},
Expand Down
24 changes: 22 additions & 2 deletions virtualizarr/tests/test_readers/test_kerchunk.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import ujson

from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.kerchunk import (
Expand Down Expand Up @@ -45,8 +46,6 @@ def test_dataset_from_df_refs():


def test_dataset_from_df_refs_with_filters():
import ujson

filters = [{"elementsize": 4, "id": "shuffle"}, {"id": "zlib", "level": 4}]
zarray = {
"chunks": [2, 3],
Expand All @@ -62,3 +61,24 @@ def test_dataset_from_df_refs_with_filters():
ds = dataset_from_kerchunk_refs(ds_refs)
da = ds["a"]
assert da.data.zarray.filters == filters


def test_dataset_from_kerchunk_refs_empty_chunk_manifest():
zarray = {
"chunks": [50, 100],
"compressor": None,
"dtype": "<i8",
"fill_value": 100,
"filters": None,
"order": "C",
"shape": [100, 200],
"zarr_format": 2,
}
refs = gen_ds_refs(zarray=ujson.dumps(zarray))
del refs["refs"]["a/0.0"]

ds = dataset_from_kerchunk_refs(refs)
assert "a" in ds.variables
assert isinstance(ds["a"].data, ManifestArray)
assert ds["a"].sizes == {"x": 100, "y": 200}
assert ds["a"].chunksizes == {"x": 50, "y": 100}
29 changes: 29 additions & 0 deletions virtualizarr/tests/test_writers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,35 @@ def test_accessor_to_kerchunk_dict(self):
result_ds_refs = ds.virtualize.to_kerchunk(format="dict")
assert result_ds_refs == expected_ds_refs

def test_accessor_to_kerchunk_dict_empty(self):
manifest = ChunkManifest(entries={}, shape=(1, 1))
arr = ManifestArray(
chunkmanifest=manifest,
zarray=dict(
shape=(2, 3),
dtype=np.dtype("<i8"),
chunks=(2, 3),
compressor=None,
filters=None,
fill_value=np.nan,
order="C",
),
)
ds = Dataset({"a": (["x", "y"], arr)})

expected_ds_refs = {
"version": 1,
"refs": {
".zgroup": '{"zarr_format":2}',
".zattrs": "{}",
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
},
}

result_ds_refs = ds.virtualize.to_kerchunk(format="dict")
assert result_ds_refs == expected_ds_refs

def test_accessor_to_kerchunk_json(self, tmp_path):
import ujson

Expand Down
Loading