Integrate and test Zarr backend (#513)
* setup for zarr

* fix imports

* fix test class names; add more zarr

* changelog

* control when io is returned

* debugs

* debugs

* debugs

* debugs

* debugs

* down to only one

* remove f strings automatically

* remove some

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* override mystery

* fix streaming path

* Update tests/test_inspector.py

Co-authored-by: Steph Prince <[email protected]>

* PR suggestions

* adjust docstring

* add Zarr to caching

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* rollback caching

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Ben Dichter <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Steph Prince <[email protected]>
4 people authored Sep 27, 2024
1 parent ea18bf6 commit 0a27c83
Showing 19 changed files with 373 additions and 145 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,9 @@
### Deprecation (API)
* The `inspect_nwb` method has been removed. Please use `inspect_nwbfile` or `inspect_nwbfile_object` instead. [#505](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/505)

### New Features
* Added Zarr support. [#513](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/513)

### Improvements
* Removed the `robust_ros3_read` utility helper. [#506](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/506)
* Simplified the `nwbinspector.testing` configuration framework. [#509](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/509)
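A minimal sketch of exercising the new Zarr support from the Python API; the file name below is hypothetical and assumes an NWB file written with the hdmf-zarr backend.

```python
# Sketch only: inspecting a (hypothetical) Zarr-backed NWB file.
from nwbinspector import inspect_nwbfile

for message in inspect_nwbfile(nwbfile_path="sub-001_ses-01_behavior.nwb.zarr"):
    print(message.importance, message.check_function_name, message.message)
```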
5 changes: 2 additions & 3 deletions MANIFEST.in
@@ -1,5 +1,4 @@
graft tests
global-exclude *.py[cod]
include src/nwbinspector/internal_configs/dandi.inspector_config.yaml
include src/nwbinspector/config.schema.json
include requirements.txt
include src/nwbinspector/_internal_configs/dandi.inspector_config.yaml
include src/nwbinspector/_internal_configs/config.schema.json
8 changes: 8 additions & 0 deletions src/nwbinspector/__init__.py
@@ -26,6 +26,10 @@
# Still keeping the legacy magic version attribute requested by some users
__version__ = importlib.metadata.version(distribution_name="nwbinspector")

# Note: this is not exposed at this outer level, but is used here to trigger the automatic submodule import
# (otherwise someone would have to import nwbinspector.testing explicitly)
from .testing import check_streaming_tests_enabled # noqa: F401

__all__ = [
"available_checks",
"default_check_registry",
@@ -51,4 +55,8 @@
"FormatterOptions",
"organize_messages",
"__version__",
# Public submodules
"checks",
"testing",
"utils",
]
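A short sketch of the effect of the explicit submodule import above; it assumes only what the diff shows, namely that `nwbinspector.testing` is pulled in during package initialization.

```python
# Sketch: the testing submodule is reachable as an attribute without an
# explicit "import nwbinspector.testing", because __init__.py imports it.
import nwbinspector

print(nwbinspector.__version__)
print(nwbinspector.testing.check_streaming_tests_enabled)  # imported in __init__.py above
```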
7 changes: 5 additions & 2 deletions src/nwbinspector/_configuration.py
@@ -13,12 +13,15 @@
from . import available_checks
from ._registration import Importance

INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
INTERNAL_CONFIGS = dict(
dandi=Path(__file__).parent / "_internal_configs" / "dandi.inspector_config.yaml",
)


def validate_config(config: dict):
"""Validate an instance of configuration against the official schema."""
with open(file=Path(__file__).parent / "config.schema.json", mode="r") as fp:
config_schema_file_path = Path(__file__).parent / "_internal_configs" / "config.schema.json"
with open(file=config_schema_file_path, mode="r") as fp:
schema = json.load(fp=fp)
jsonschema.validate(instance=config, schema=schema)

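A sketch of validating the bundled DANDI config against the relocated schema; it assumes PyYAML is installed and imports from the private `_configuration` module shown above.

```python
# Sketch: load the packaged DANDI inspector config and validate it against the
# JSON schema now living under _internal_configs.
import yaml

from nwbinspector._configuration import INTERNAL_CONFIGS, validate_config

with open(INTERNAL_CONFIGS["dandi"], mode="r") as stream:
    dandi_config = yaml.safe_load(stream)

validate_config(config=dandi_config)  # raises jsonschema.ValidationError on a malformed config
```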
File renamed without changes.
79 changes: 39 additions & 40 deletions src/nwbinspector/_nwb_inspection.py
@@ -12,8 +12,10 @@
from natsort import natsorted
from tqdm import tqdm

from . import available_checks, configure_checks
from . import available_checks
from ._configuration import configure_checks
from ._registration import Importance, InspectorMessage
from .tools._read_nwbfile import read_nwbfile
from .utils import (
FilePathType,
OptionalListOfStrings,
@@ -127,7 +129,7 @@ def inspect_all(
progress_bar_options = dict(position=0, leave=False)

if in_path.is_dir():
nwbfiles = list(in_path.rglob("*.nwb"))
nwbfiles = list(in_path.rglob("*.nwb*"))

# Remove any macOS sidecar files
nwbfiles = [nwbfile for nwbfile in nwbfiles if not nwbfile.name.startswith("._")]
@@ -141,17 +143,16 @@
# Manual identifier check over all files in the folder path
identifiers = defaultdict(list)
for nwbfile_path in nwbfiles:
with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
try:
nwbfile = io.read()
identifiers[nwbfile.identifier].append(nwbfile_path)
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)
try:
nwbfile = read_nwbfile(nwbfile_path=nwbfile_path)
identifiers[nwbfile.identifier].append(nwbfile_path)
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)

if len(identifiers) != len(nwbfiles):
for identifier, nwbfiles_with_identifier in identifiers.items():
@@ -198,7 +199,7 @@ def inspect_all(
yield message
else:
for nwbfile_path in nwbfiles_iterable:
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks):
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate):
yield message


@@ -237,7 +238,7 @@ def inspect_nwbfile(
config : dict
Dictionary valid against our JSON configuration schema.
Can specify a mapping of importance levels and list of check functions whose importance you wish to change.
Typically loaded via json.load from a valid .json file
Typically loaded via `json.load` from a valid .json file.
ignore: list, optional
Names of functions to skip.
select: list, optional
@@ -267,10 +268,12 @@
filterwarnings(action="ignore", message="No cached namespaces found in .*")
filterwarnings(action="ignore", message="Ignoring cached namespace .*")

if not skip_validate:
validation_error_list, _ = pynwb.validate(paths=[nwbfile_path])
for validation_namespace_errors in validation_error_list:
for validation_error in validation_namespace_errors:
try:
in_memory_nwbfile, io = read_nwbfile(nwbfile_path=nwbfile_path, return_io=True)

if not skip_validate:
validation_errors = pynwb.validate(io=io)
for validation_error in validation_errors:
yield InspectorMessage(
message=validation_error.reason,
importance=Importance.PYNWB_VALIDATION,
Expand All @@ -279,27 +282,23 @@ def inspect_nwbfile(
file_path=nwbfile_path,
)

with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
try:
in_memory_nwbfile = io.read()

for inspector_message in inspect_nwbfile_object(
nwbfile_object=in_memory_nwbfile,
checks=checks,
config=config,
ignore=ignore,
select=select,
importance_threshold=importance_threshold,
):
inspector_message.file_path = nwbfile_path
yield inspector_message
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)
for inspector_message in inspect_nwbfile_object(
nwbfile_object=in_memory_nwbfile,
checks=checks,
config=config,
ignore=ignore,
select=select,
importance_threshold=importance_threshold,
):
inspector_message.file_path = nwbfile_path
yield inspector_message
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)


# TODO: deprecate once subject types and dandi schemas have been extended
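A sketch of the folder-level behavior after this change: the glob now matches both `.nwb` and `.nwb.zarr` entries, and validation skipping is forwarded per file. The directory name and keyword argument names are assumptions based on the surrounding diff.

```python
# Sketch: inspect a folder containing a mix of HDF5 (.nwb) and Zarr (.nwb.zarr) files.
from nwbinspector import inspect_all

for message in inspect_all(path="my_dataset_directory", skip_validate=True):
    print(message.file_path, message.message)
```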
12 changes: 8 additions & 4 deletions src/nwbinspector/_registration.py
@@ -4,6 +4,7 @@
from typing import Optional

import h5py
import zarr
from pynwb import NWBFile
from pynwb.ecephys import Device, ElectrodeGroup
from pynwb.file import Subject
@@ -93,7 +94,7 @@ def _auto_parse(check_function, obj, result: Optional[InspectorMessage] = None):


def _parse_location(neurodata_object) -> Optional[str]:
"""Grab the object location from a h5py.Dataset or a container content that is an h5py.Dataset object."""
"""Grab the object location from a dataset or a container content that is an dataset object."""
known_locations = {
NWBFile: "/",
Subject: "/general/subject",
@@ -105,13 +106,16 @@ def _parse_location(neurodata_object) -> Optional[str]:
for key, val in known_locations.items():
if isinstance(neurodata_object, key):
return val
"""Infer the human-readable path of the object within an NWBFile by tracing its parents."""

# Infer the human-readable path of the object within an NWBFile by tracing its parents
if neurodata_object.parent is None:
return "/"
# Best solution: object is or has a HDF5 Dataset
if isinstance(neurodata_object, h5py.Dataset):
if isinstance(neurodata_object, (h5py.Dataset, zarr.Array)):
return neurodata_object.name
else:
for field in neurodata_object.fields.values():
for field_name, field in neurodata_object.fields.items():
if isinstance(field, h5py.Dataset):
return field.parent.name
elif isinstance(field, zarr.Array):
return field.name.removesuffix(f"/{field_name}")
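A small illustration of the path logic above: an `h5py.Dataset` field resolves to its parent group's name, while a `zarr.Array` field's `.name` already ends with the field name, which is stripped with `removesuffix`. The path below is illustrative only.

```python
# Illustrative only: how the zarr branch recovers the container location.
zarr_array_name = "/acquisition/TestTimeSeries/data"  # e.g. zarr.Array.name for the "data" field
field_name = "data"
print(zarr_array_name.removesuffix(f"/{field_name}"))  # -> /acquisition/TestTimeSeries
```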
23 changes: 19 additions & 4 deletions src/nwbinspector/checks/_nwb_containers.py
@@ -3,6 +3,7 @@
import os

import h5py
import zarr
from pynwb import NWBContainer

from .._registration import Importance, InspectorMessage, Severity, register_check
@@ -19,9 +20,16 @@ def check_large_dataset_compression(nwb_container: NWBContainer, gb_lower_bound:
Best Practice: :ref:`best_practice_compression`
"""
for field in getattr(nwb_container, "fields", dict()).values():
if not isinstance(field, h5py.Dataset):
if not isinstance(field, (h5py.Dataset, zarr.Array)):
continue
if field.compression is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:

compression_indicator = None
if isinstance(field, h5py.Dataset):
compression_indicator = field.compression
elif isinstance(field, zarr.Array):
compression_indicator = field.compressor

if compression_indicator is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:
return InspectorMessage(
severity=Severity.HIGH,
message=f"{os.path.split(field.name)[1]} is a large uncompressed dataset! Please enable compression.",
@@ -44,10 +52,17 @@ def check_small_dataset_compression(
Best Practice: :ref:`best_practice_compression`
"""
for field in getattr(nwb_container, "fields", dict()).values():
if not isinstance(field, h5py.Dataset):
if not isinstance(field, (h5py.Dataset, zarr.Array)):
continue

compression_indicator = None
if isinstance(field, h5py.Dataset):
compression_indicator = field.compression
elif isinstance(field, zarr.Array):
compression_indicator = field.compressor

if (
field.compression is None
compression_indicator is None
and mb_lower_bound * 1e6 < field.size * field.dtype.itemsize < gb_upper_bound * 1e9
):
if field.size * field.dtype.itemsize > gb_severity_threshold * 1e9:
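A standalone sketch of the backend-neutral compression probe used in both checks: h5py datasets expose `.compression`, while zarr arrays expose `.compressor`. It assumes zarr v2 semantics (as pinned by hdmf-zarr at the time) and writes nothing to disk.

```python
# Sketch: the two attributes the checks above consult to decide "is this compressed?".
import h5py
import numpy as np
import zarr

with h5py.File("example.h5", mode="w", driver="core", backing_store=False) as file:
    dataset = file.create_dataset(name="data", data=np.zeros(shape=(10,)), compression="gzip")
    print(dataset.compression)  # -> "gzip" (None would mean uncompressed)

array = zarr.zeros(shape=(10,))
print(array.compressor)  # -> the default Blosc compressor (None would mean uncompressed)
```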
3 changes: 2 additions & 1 deletion src/nwbinspector/tools/__init__.py
@@ -1,8 +1,9 @@
from ._dandi import get_s3_urls_and_dandi_paths
from ._nwb import all_of_type, get_nwbfile_path_from_internal_object
from ._read_nwbfile import read_nwbfile
from ._read_nwbfile import BACKEND_IO_CLASSES, read_nwbfile

__all__ = [
"BACKEND_IO_CLASSES",
"get_s3_urls_and_dandi_paths",
"all_of_type",
"get_nwbfile_path_from_internal_object",
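A one-liner sketch of the newly public mapping; its contents are whatever the two IO classes resolve to at import time.

```python
# Sketch: BACKEND_IO_CLASSES is now exported from nwbinspector.tools.
from nwbinspector.tools import BACKEND_IO_CLASSES

print(sorted(BACKEND_IO_CLASSES))  # -> ['hdf5', 'zarr']
```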
39 changes: 23 additions & 16 deletions src/nwbinspector/tools/_read_nwbfile.py
@@ -5,17 +5,14 @@
from warnings import filterwarnings

import h5py
from hdmf.backends.io import HDMFIO
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

_BACKEND_IO_CLASSES = dict(hdf5=NWBHDF5IO)

try:
from hdmf_zarr import NWBZarrIO

_BACKEND_IO_CLASSES.update(zarr=NWBZarrIO)
except ModuleNotFoundError as exception:
if str(exception) != "No module named 'hdmf_zarr'": # not the exception we're looking for, so re-raise
raise exception
BACKEND_IO_CLASSES = dict(
hdf5=NWBHDF5IO,
zarr=NWBZarrIO,
)


def _get_method(path: str):
@@ -47,11 +44,11 @@ def _get_backend(path: str, method: Literal["local", "fsspec", "ros3"]):
if method == "fsspec":
fs = _init_fsspec(path=path)
with fs.open(path=path, mode="rb") as file:
for backend_name, backend_class in _BACKEND_IO_CLASSES.items():
for backend_name, backend_class in BACKEND_IO_CLASSES.items():
if backend_class.can_read(path=file):
possible_backends.append(backend_name)
else:
for backend_name, backend_class in _BACKEND_IO_CLASSES.items():
for backend_name, backend_class in BACKEND_IO_CLASSES.items():
if backend_class.can_read(path):
possible_backends.append(backend_name)

@@ -69,7 +66,8 @@ def read_nwbfile(
nwbfile_path: Union[str, Path],
method: Optional[Literal["local", "fsspec", "ros3"]] = None,
backend: Optional[Literal["hdf5", "zarr"]] = None,
) -> NWBFile:
return_io: bool = False,
) -> Union[NWBFile, tuple[NWBFile, HDMFIO]]:
"""
Read an NWB file using the specified (or auto-detected) method and specified (or auto-detected) backend.
@@ -85,10 +83,16 @@
backend : "hdf5", "zarr", or None (default)
Type of backend used to write the file.
The default auto-detects the type of the file.
return_io : bool, default: False
Whether to return the HDMFIO object used to open the file.
Returns
-------
pynwb.NWBFile
nwbfile : pynwb.NWBFile
The in-memory NWBFile object.
io : hdmf.backends.io.HDMFIO, optional
Only passed if `return_io` is True.
The initialized HDMFIO object used to read the file.
"""
nwbfile_path = str(nwbfile_path) # If pathlib.Path, cast to str; if already str, no harm done

@@ -109,7 +113,7 @@
)

backend = backend or _get_backend(nwbfile_path, method)
if method == "local" and not _BACKEND_IO_CLASSES[ # Temporary until .can_read() is able to work on streamed bytes
if method == "local" and not BACKEND_IO_CLASSES[ # Temporary until .can_read() is able to work on streamed bytes
backend
].can_read(path=nwbfile_path):
raise IOError(f"The chosen backend ({backend}) is unable to read the file! Please select a different backend.")
@@ -127,7 +131,10 @@
io_kwargs.update(path=nwbfile_path)
if method == "ros3":
io_kwargs.update(driver="ros3")
io = _BACKEND_IO_CLASSES[backend](**io_kwargs)
io = BACKEND_IO_CLASSES[backend](**io_kwargs)
nwbfile = io.read()

return nwbfile
if return_io:
return (nwbfile, io)
else: # Note: do not be concerned about io object closing due to garbage collection here
return nwbfile # (it is attached as an attribute to the NWBFile object)
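A sketch of the new `return_io` behavior; the file paths are hypothetical.

```python
# Sketch: keep the IO handle when you need to close it explicitly.
from nwbinspector.tools import read_nwbfile

nwbfile, io = read_nwbfile(nwbfile_path="example.nwb.zarr", return_io=True)
try:
    print(nwbfile.identifier)
finally:
    io.close()  # explicit close instead of relying on garbage collection

nwbfile = read_nwbfile(nwbfile_path="example.nwb")  # default: NWBFile only, as before
```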
3 changes: 2 additions & 1 deletion src/nwbinspector/utils/_utils.py
@@ -10,6 +10,7 @@

import h5py
import numpy as np
import zarr
from hdmf.backends.hdf5.h5_utils import H5Dataset
from numpy.typing import ArrayLike
from packaging import version
@@ -25,7 +26,7 @@

@lru_cache(maxsize=MAX_CACHE_ITEMS)
def _cache_data_retrieval_command(
data: h5py.Dataset, reduced_selection: tuple[tuple[Optional[int], Optional[int], Optional[int]]]
data: Union[h5py.Dataset, zarr.Array], reduced_selection: tuple[tuple[Optional[int], Optional[int], Optional[int]]]
) -> np.ndarray:
"""LRU caching for _cache_data_selection cannot be applied to list inputs; this expects the tuple or Dataset."""
selection = tuple([slice(*reduced_slice) for reduced_slice in reduced_selection]) # reconstitute the slices
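A tiny illustration of the slice reconstitution performed inside the cached retrieval helper above; the tuples are illustrative values.

```python
# Illustrative only: reduced (start, stop, step) tuples are rebuilt into slices.
reduced_selection = ((0, 100, 2), (None, 5, None))
selection = tuple(slice(*reduced_slice) for reduced_slice in reduced_selection)
print(selection)  # -> (slice(0, 100, 2), slice(None, 5, None))
```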
File renamed without changes.