Skip to content

Commit

Permalink
Support groups with HDF5 reader.
Browse files Browse the repository at this point in the history
  • Loading branch information
sharkinsspatial committed Oct 10, 2024
1 parent 77f1689 commit 42c653a
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 11 deletions.
3 changes: 2 additions & 1 deletion virtualizarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ def open_virtual_dataset(

virtual_vars = virtual_vars_from_hdf(
path=filepath,
group=group,
drop_variables=drop_variables + loadable_variables,
reader_options=reader_options,
)
ds_attrs = attrs_from_root_group(
path=filepath, reader_options=reader_options
path=filepath, reader_options=reader_options, group=group
)
coord_names = ds_attrs.pop("coordinates", [])
# we currently read every other filetype using kerchunk's various file format backends
Expand Down
22 changes: 18 additions & 4 deletions virtualizarr/readers/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab

def virtual_vars_from_hdf(
path: str,
group: Optional[str] = None,
drop_variables: Optional[List[str]] = None,
reader_options: Optional[dict] = {
"storage_options": {"key": "", "secret": "", "anon": True}
Expand All @@ -220,11 +221,17 @@ def virtual_vars_from_hdf(
filepath=path, reader_options=reader_options
)
f = h5py.File(open_file, mode="r")
if group:
g = f[group]
if not isinstance(g, h5py.Group):
raise ValueError("The provided group is not an HDF group")
else:
g = f
variables = {}
for key in f.keys():
for key in g.keys():
if key not in drop_variables:
if isinstance(f[key], h5py.Dataset):
variable = _dataset_to_variable(path, f[key])
if isinstance(g[key], h5py.Dataset):
variable = _dataset_to_variable(path, g[key])
if variable is not None:
variables[key] = variable
else:
Expand All @@ -235,6 +242,7 @@ def virtual_vars_from_hdf(

def attrs_from_root_group(
path: str,
group: Optional[str] = None,
reader_options: Optional[dict] = {
"storage_options": {"key": "", "secret": "", "anon": True}
},
Expand All @@ -243,5 +251,11 @@ def attrs_from_root_group(
filepath=path, reader_options=reader_options
)
f = h5py.File(open_file, mode="r")
attrs = _extract_attrs(f)
if group:
g = f[group]
if not isinstance(g, h5py.Group):
raise ValueError("The provided group is not an HDF group")
else:
g = f
attrs = _extract_attrs(g)
return attrs
5 changes: 3 additions & 2 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,9 @@ def test_explicit_filetype(self, netcdf4_file):
open_virtual_dataset(netcdf4_file, filetype="grib")

def test_group_kwarg(self, hdf5_groups_file):
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
with pytest.raises(NotImplementedError, match="Nested groups"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
with pytest.raises(KeyError, match="doesn't exist"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")

vars_to_load = ["air", "time"]
Expand All @@ -321,6 +321,7 @@ def test_open_virtual_dataset_passes_expected_args(
open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
args = {
"path": netcdf4_file,
"group": None,
"drop_variables": [],
"reader_options": reader_options,
}
Expand Down
15 changes: 14 additions & 1 deletion virtualizarr/tests/test_readers/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,20 @@ def root_attributes_hdf5_file(tmpdir):
def group_hdf5_file(tmpdir):
filepath = f"{tmpdir}/group.nc"
f = h5py.File(filepath, "w")
f.create_group("group")
g = f.create_group("group")
data = np.random.random((10, 10))
g.create_dataset("data", data=data)
return filepath


@pytest.fixture
def nested_group_hdf5_file(tmpdir):
    """Write an HDF5 file containing a group with a dataset and a nested group.

    The file holds ``/group/data`` (a 10x10 array of random floats) and an
    empty group ``/group/nested_group``.

    Returns:
        The path of the written file, as a string.
    """
    filepath = f"{tmpdir}/nested_group.nc"
    # Close the handle before handing out the path: this guarantees the
    # contents are flushed to disk and that no write handle is still open
    # when a test re-opens the file for reading.
    with h5py.File(filepath, "w") as f:
        g = f.create_group("group")
        data = np.random.random((10, 10))
        g.create_dataset("data", data=data)
        g.create_group("nested_group")
    return filepath


Expand Down
16 changes: 13 additions & 3 deletions virtualizarr/tests/test_readers/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,20 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
assert len(variables) == 3

def test_groups_not_implemented(self, group_hdf5_file):
def test_nested_groups_not_implemented(self, nested_group_hdf5_file):
with pytest.raises(NotImplementedError):
virtual_vars_from_hdf(group_hdf5_file)
virtual_vars_from_hdf(path=nested_group_hdf5_file, group="group")

def test_drop_variables(self, multiple_datasets_hdf5_file):
variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"])
variables = virtual_vars_from_hdf(
path=multiple_datasets_hdf5_file, drop_variables=["data2"]
)
assert "data2" not in variables.keys()

def test_dataset_in_group(self, group_hdf5_file):
    """Reading variables from the ``"group"`` group yields exactly one variable."""
    assert len(virtual_vars_from_hdf(path=group_hdf5_file, group="group")) == 1

def test_non_group_error(self, group_hdf5_file):
    """Passing ``"group/data"`` as the group must raise ``ValueError``."""
    # presumably this path names a dataset rather than a group in the
    # fixture file; verify against the group_hdf5_file fixture
    non_group_path = "group/data"
    with pytest.raises(ValueError):
        virtual_vars_from_hdf(path=group_hdf5_file, group=non_group_path)

0 comments on commit 42c653a

Please sign in to comment.