Integrate and test Zarr backend (#513)
* setup for zarr

* fix imports

* fix test class names; add more zarr

* changelog

* control when io is returned

* debugs

* debugs

* debugs

* debugs

* debugs

* down to only one

* remove f strings automatically

* remove some

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* override mystery

* fix streaming path

* Update tests/test_inspector.py

Co-authored-by: Steph Prince <[email protected]>

* PR suggestions

* adjust docstring

* add Zarr to caching

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* rollback caching

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Ben Dichter <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Steph Prince <[email protected]>
4 people authored Sep 27, 2024
1 parent ea18bf6 commit 0a27c83
Showing 19 changed files with 373 additions and 145 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,9 @@
### Deprecation (API)
* The `inspect_nwb` method has been removed. Please use `inspect_nwbfile` or `inspect_nwbfile_object` instead. [#505](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/505)

### New Features
* Added Zarr support. [#513](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/513)

### Improvements
* Removed the `robust_ros3_read` utility helper. [#506](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/506)
* Simplified the `nwbinspector.testing` configuration framework. [#509](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/509)
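A minimal sketch of exercising the new Zarr support from the Python API; the file name below is hypothetical and assumes an NWB file written with the hdmf-zarr backend.

```python
# Sketch only: inspecting a (hypothetical) Zarr-backed NWB file.
from nwbinspector import inspect_nwbfile

for message in inspect_nwbfile(nwbfile_path="sub-001_ses-01_behavior.nwb.zarr"):
    print(message.importance, message.check_function_name, message.message)
```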
5 changes: 2 additions & 3 deletions MANIFEST.in
@@ -1,5 +1,4 @@
graft tests
global-exclude *.py[cod]
include src/nwbinspector/internal_configs/dandi.inspector_config.yaml
include src/nwbinspector/config.schema.json
include requirements.txt
include src/nwbinspector/_internal_configs/dandi.inspector_config.yaml
include src/nwbinspector/_internal_configs/config.schema.json
8 changes: 8 additions & 0 deletions src/nwbinspector/__init__.py
@@ -26,6 +26,10 @@
# Still keeping the legacy magic version attribute requested by some users
__version__ = importlib.metadata.version(distribution_name="nwbinspector")

# Note: this is not exposed at this outer level, but is used here to trigger the automatic submodule import
# (otherwise someone would have to import nwbinspector.testing explicitly)
from .testing import check_streaming_tests_enabled # noqa: F401

__all__ = [
"available_checks",
"default_check_registry",
@@ -51,4 +55,8 @@
"FormatterOptions",
"organize_messages",
"__version__",
# Public submodules
"checks",
"testing",
"utils",
]
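A short sketch of the effect of the explicit submodule import above; it assumes only what the diff shows, namely that `nwbinspector.testing` is pulled in during package initialization.

```python
# Sketch: the testing submodule is reachable as an attribute without an
# explicit "import nwbinspector.testing", because __init__.py imports it.
import nwbinspector

print(nwbinspector.__version__)
print(nwbinspector.testing.check_streaming_tests_enabled)  # imported in __init__.py above
```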
7 changes: 5 additions & 2 deletions src/nwbinspector/_configuration.py
@@ -13,12 +13,15 @@
from . import available_checks
from ._registration import Importance

INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
INTERNAL_CONFIGS = dict(
dandi=Path(__file__).parent / "_internal_configs" / "dandi.inspector_config.yaml",
)


def validate_config(config: dict):
"""Validate an instance of configuration against the official schema."""
with open(file=Path(__file__).parent / "config.schema.json", mode="r") as fp:
config_schema_file_path = Path(__file__).parent / "_internal_configs" / "config.schema.json"
with open(file=config_schema_file_path, mode="r") as fp:
schema = json.load(fp=fp)
jsonschema.validate(instance=config, schema=schema)

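A sketch of validating the bundled DANDI config against the relocated schema; it assumes PyYAML is installed and imports from the private `_configuration` module shown above.

```python
# Sketch: load the packaged DANDI inspector config and validate it against the
# JSON schema now living under _internal_configs.
import yaml

from nwbinspector._configuration import INTERNAL_CONFIGS, validate_config

with open(INTERNAL_CONFIGS["dandi"], mode="r") as stream:
    dandi_config = yaml.safe_load(stream)

validate_config(config=dandi_config)  # raises jsonschema.ValidationError on a malformed config
```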
File renamed without changes.
79 changes: 39 additions & 40 deletions src/nwbinspector/_nwb_inspection.py
@@ -12,8 +12,10 @@
from natsort import natsorted
from tqdm import tqdm

from . import available_checks, configure_checks
from . import available_checks
from ._configuration import configure_checks
from ._registration import Importance, InspectorMessage
from .tools._read_nwbfile import read_nwbfile
from .utils import (
FilePathType,
OptionalListOfStrings,
@@ -127,7 +129,7 @@ def inspect_all(
progress_bar_options = dict(position=0, leave=False)

if in_path.is_dir():
nwbfiles = list(in_path.rglob("*.nwb"))
nwbfiles = list(in_path.rglob("*.nwb*"))

# Remove any macOS sidecar files
nwbfiles = [nwbfile for nwbfile in nwbfiles if not nwbfile.name.startswith("._")]
@@ -141,17 +143,16 @@
# Manual identifier check over all files in the folder path
identifiers = defaultdict(list)
for nwbfile_path in nwbfiles:
with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
try:
nwbfile = io.read()
identifiers[nwbfile.identifier].append(nwbfile_path)
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)
try:
nwbfile = read_nwbfile(nwbfile_path=nwbfile_path)
identifiers[nwbfile.identifier].append(nwbfile_path)
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)

if len(identifiers) != len(nwbfiles):
for identifier, nwbfiles_with_identifier in identifiers.items():
@@ -198,7 +199,7 @@ def inspect_all(
yield message
else:
for nwbfile_path in nwbfiles_iterable:
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks):
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate):
yield message


@@ -237,7 +238,7 @@ def inspect_nwbfile(
config : dict
Dictionary valid against our JSON configuration schema.
Can specify a mapping of importance levels and list of check functions whose importance you wish to change.
Typically loaded via json.load from a valid .json file
Typically loaded via `json.load` from a valid .json file.
ignore: list, optional
Names of functions to skip.
select: list, optional
@@ -267,10 +268,12 @@
filterwarnings(action="ignore", message="No cached namespaces found in .*")
filterwarnings(action="ignore", message="Ignoring cached namespace .*")

if not skip_validate:
validation_error_list, _ = pynwb.validate(paths=[nwbfile_path])
for validation_namespace_errors in validation_error_list:
for validation_error in validation_namespace_errors:
try:
in_memory_nwbfile, io = read_nwbfile(nwbfile_path=nwbfile_path, return_io=True)

if not skip_validate:
validation_errors = pynwb.validate(io=io)
for validation_error in validation_errors:
yield InspectorMessage(
message=validation_error.reason,
importance=Importance.PYNWB_VALIDATION,
Expand All @@ -279,27 +282,23 @@ def inspect_nwbfile(
file_path=nwbfile_path,
)

with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
try:
in_memory_nwbfile = io.read()

for inspector_message in inspect_nwbfile_object(
nwbfile_object=in_memory_nwbfile,
checks=checks,
config=config,
ignore=ignore,
select=select,
importance_threshold=importance_threshold,
):
inspector_message.file_path = nwbfile_path
yield inspector_message
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)
for inspector_message in inspect_nwbfile_object(
nwbfile_object=in_memory_nwbfile,
checks=checks,
config=config,
ignore=ignore,
select=select,
importance_threshold=importance_threshold,
):
inspector_message.file_path = nwbfile_path
yield inspector_message
except Exception as exception:
yield InspectorMessage(
message=traceback.format_exc(),
importance=Importance.ERROR,
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
file_path=nwbfile_path,
)


# TODO: deprecate once subject types and dandi schemas have been extended
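A sketch of the folder-level behavior after this change: the glob now matches both `.nwb` and `.nwb.zarr` entries, and validation skipping is forwarded per file. The directory name and keyword argument names are assumptions based on the surrounding diff.

```python
# Sketch: inspect a folder containing a mix of HDF5 (.nwb) and Zarr (.nwb.zarr) files.
from nwbinspector import inspect_all

for message in inspect_all(path="my_dataset_directory", skip_validate=True):
    print(message.file_path, message.message)
```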
12 changes: 8 additions & 4 deletions src/nwbinspector/_registration.py
@@ -4,6 +4,7 @@
from typing import Optional

import h5py
import zarr
from pynwb import NWBFile
from pynwb.ecephys import Device, ElectrodeGroup
from pynwb.file import Subject
@@ -93,7 +94,7 @@ def _auto_parse(check_function, obj, result: Optional[InspectorMessage] = None):


def _parse_location(neurodata_object) -> Optional[str]:
"""Grab the object location from a h5py.Dataset or a container content that is an h5py.Dataset object."""
"""Grab the object location from a dataset or a container content that is an dataset object."""
known_locations = {
NWBFile: "/",
Subject: "/general/subject",
@@ -105,13 +106,16 @@ def _parse_location(neurodata_object) -> Optional[str]:
for key, val in known_locations.items():
if isinstance(neurodata_object, key):
return val
"""Infer the human-readable path of the object within an NWBFile by tracing its parents."""

# Infer the human-readable path of the object within an NWBFile by tracing its parents
if neurodata_object.parent is None:
return "/"
# Best solution: object is or has a HDF5 Dataset
if isinstance(neurodata_object, h5py.Dataset):
if isinstance(neurodata_object, (h5py.Dataset, zarr.Array)):
return neurodata_object.name
else:
for field in neurodata_object.fields.values():
for field_name, field in neurodata_object.fields.items():
if isinstance(field, h5py.Dataset):
return field.parent.name
elif isinstance(field, zarr.Array):
return field.name.removesuffix(f"/{field_name}")
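A small illustration of the path logic above: an `h5py.Dataset` field resolves to its parent group's name, while a `zarr.Array` field's `.name` already ends with the field name, which is stripped with `removesuffix`. The path below is illustrative only.

```python
# Illustrative only: how the zarr branch recovers the container location.
zarr_array_name = "/acquisition/TestTimeSeries/data"  # e.g. zarr.Array.name for the "data" field
field_name = "data"
print(zarr_array_name.removesuffix(f"/{field_name}"))  # -> /acquisition/TestTimeSeries
```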
23 changes: 19 additions & 4 deletions src/nwbinspector/checks/_nwb_containers.py
@@ -3,6 +3,7 @@
import os

import h5py
import zarr
from pynwb import NWBContainer

from .._registration import Importance, InspectorMessage, Severity, register_check
@@ -19,9 +20,16 @@ def check_large_dataset_compression(nwb_container: NWBContainer, gb_lower_bound:
Best Practice: :ref:`best_practice_compression`
"""
for field in getattr(nwb_container, "fields", dict()).values():
if not isinstance(field, h5py.Dataset):
if not isinstance(field, (h5py.Dataset, zarr.Array)):
continue
if field.compression is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:

compression_indicator = None
if isinstance(field, h5py.Dataset):
compression_indicator = field.compression
elif isinstance(field, zarr.Array):
compression_indicator = field.compressor

if compression_indicator is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:
return InspectorMessage(
severity=Severity.HIGH,
message=f"{os.path.split(field.name)[1]} is a large uncompressed dataset! Please enable compression.",
@@ -44,10 +52,17 @@ def check_small_dataset_compression(
Best Practice: :ref:`best_practice_compression`
"""
for field in getattr(nwb_container, "fields", dict()).values():
if not isinstance(field, h5py.Dataset):
if not isinstance(field, (h5py.Dataset, zarr.Array)):
continue

compression_indicator = None
if isinstance(field, h5py.Dataset):
compression_indicator = field.compression
elif isinstance(field, zarr.Array):
compression_indicator = field.compressor

if (
field.compression is None
compression_indicator is None
and mb_lower_bound * 1e6 < field.size * field.dtype.itemsize < gb_upper_bound * 1e9
):
if field.size * field.dtype.itemsize > gb_severity_threshold * 1e9:
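A standalone sketch of the backend-neutral compression probe used in both checks: h5py datasets expose `.compression`, while zarr arrays expose `.compressor`. It assumes zarr v2 semantics (as pinned by hdmf-zarr at the time) and writes nothing to disk.

```python
# Sketch: the two attributes the checks above consult to decide "is this compressed?".
import h5py
import numpy as np
import zarr

with h5py.File("example.h5", mode="w", driver="core", backing_store=False) as file:
    dataset = file.create_dataset(name="data", data=np.zeros(shape=(10,)), compression="gzip")
    print(dataset.compression)  # -> "gzip" (None would mean uncompressed)

array = zarr.zeros(shape=(10,))
print(array.compressor)  # -> the default Blosc compressor (None would mean uncompressed)
```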
3 changes: 2 additions & 1 deletion src/nwbinspector/tools/__init__.py
@@ -1,8 +1,9 @@
from ._dandi import get_s3_urls_and_dandi_paths
from ._nwb import all_of_type, get_nwbfile_path_from_internal_object
from ._read_nwbfile import read_nwbfile
from ._read_nwbfile import BACKEND_IO_CLASSES, read_nwbfile

__all__ = [
"BACKEND_IO_CLASSES",
"get_s3_urls_and_dandi_paths",
"all_of_type",
"get_nwbfile_path_from_internal_object",
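A one-liner sketch of the newly public mapping; its contents are whatever the two IO classes resolve to at import time.

```python
# Sketch: BACKEND_IO_CLASSES is now exported from nwbinspector.tools.
from nwbinspector.tools import BACKEND_IO_CLASSES

print(sorted(BACKEND_IO_CLASSES))  # -> ['hdf5', 'zarr']
```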
39 changes: 23 additions & 16 deletions src/nwbinspector/tools/_read_nwbfile.py
@@ -5,17 +5,14 @@
from warnings import filterwarnings

import h5py
from hdmf.backends.io import HDMFIO
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

_BACKEND_IO_CLASSES = dict(hdf5=NWBHDF5IO)

try:
from hdmf_zarr import NWBZarrIO

_BACKEND_IO_CLASSES.update(zarr=NWBZarrIO)
except ModuleNotFoundError as exception:
if str(exception) != "No module named 'hdmf_zarr'": # not the exception we're looking for, so re-raise
raise exception
BACKEND_IO_CLASSES = dict(
hdf5=NWBHDF5IO,
zarr=NWBZarrIO,
)


def _get_method(path: str):
@@ -47,11 +44,11 @@ def _get_backend(path: str, method: Literal["local", "fsspec", "ros3"]):
if method == "fsspec":
fs = _init_fsspec(path=path)
with fs.open(path=path, mode="rb") as file:
for backend_name, backend_class in _BACKEND_IO_CLASSES.items():
for backend_name, backend_class in BACKEND_IO_CLASSES.items():
if backend_class.can_read(path=file):
possible_backends.append(backend_name)
else:
for backend_name, backend_class in _BACKEND_IO_CLASSES.items():
for backend_name, backend_class in BACKEND_IO_CLASSES.items():
if backend_class.can_read(path):
possible_backends.append(backend_name)

@@ -69,7 +66,8 @@ def read_nwbfile(
nwbfile_path: Union[str, Path],
method: Optional[Literal["local", "fsspec", "ros3"]] = None,
backend: Optional[Literal["hdf5", "zarr"]] = None,
) -> NWBFile:
return_io: bool = False,
) -> Union[NWBFile, tuple[NWBFile, HDMFIO]]:
"""
Read an NWB file using the specified (or auto-detected) method and specified (or auto-detected) backend.
@@ -85,10 +83,16 @@
backend : "hdf5", "zarr", or None (default)
Type of backend used to write the file.
The default auto-detects the type of the file.
return_io : bool, default: False
Whether to return the HDMFIO object used to open the file.
Returns
-------
pynwb.NWBFile
nwbfile : pynwb.NWBFile
The in-memory NWBFile object.
io : hdmf.backends.io.HDMFIO, optional
Only passed if `return_io` is True.
The initialized HDMFIO object used to read the file.
"""
nwbfile_path = str(nwbfile_path) # If pathlib.Path, cast to str; if already str, no harm done

@@ -109,7 +113,7 @@
)

backend = backend or _get_backend(nwbfile_path, method)
if method == "local" and not _BACKEND_IO_CLASSES[ # Temporary until .can_read() is able to work on streamed bytes
if method == "local" and not BACKEND_IO_CLASSES[ # Temporary until .can_read() is able to work on streamed bytes
backend
].can_read(path=nwbfile_path):
raise IOError(f"The chosen backend ({backend}) is unable to read the file! Please select a different backend.")
@@ -127,7 +131,10 @@
io_kwargs.update(path=nwbfile_path)
if method == "ros3":
io_kwargs.update(driver="ros3")
io = _BACKEND_IO_CLASSES[backend](**io_kwargs)
io = BACKEND_IO_CLASSES[backend](**io_kwargs)
nwbfile = io.read()

return nwbfile
if return_io:
return (nwbfile, io)
else: # Note: do not be concerned about io object closing due to garbage collection here
return nwbfile # (it is attached as an attribute to the NWBFile object)
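A sketch of the new `return_io` behavior; the file paths are hypothetical.

```python
# Sketch: keep the IO handle when you need to close it explicitly.
from nwbinspector.tools import read_nwbfile

nwbfile, io = read_nwbfile(nwbfile_path="example.nwb.zarr", return_io=True)
try:
    print(nwbfile.identifier)
finally:
    io.close()  # explicit close instead of relying on garbage collection

nwbfile = read_nwbfile(nwbfile_path="example.nwb")  # default: NWBFile only, as before
```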
3 changes: 2 additions & 1 deletion src/nwbinspector/utils/_utils.py
@@ -10,6 +10,7 @@

import h5py
import numpy as np
import zarr
from hdmf.backends.hdf5.h5_utils import H5Dataset
from numpy.typing import ArrayLike
from packaging import version
@@ -25,7 +26,7 @@

@lru_cache(maxsize=MAX_CACHE_ITEMS)
def _cache_data_retrieval_command(
data: h5py.Dataset, reduced_selection: tuple[tuple[Optional[int], Optional[int], Optional[int]]]
data: Union[h5py.Dataset, zarr.Array], reduced_selection: tuple[tuple[Optional[int], Optional[int], Optional[int]]]
) -> np.ndarray:
"""LRU caching for _cache_data_selection cannot be applied to list inputs; this expects the tuple or Dataset."""
selection = tuple([slice(*reduced_slice) for reduced_slice in reduced_selection]) # reconstitute the slices
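A tiny illustration of the slice reconstitution performed inside the cached retrieval helper above; the tuples are illustrative values.

```python
# Illustrative only: reduced (start, stop, step) tuples are rebuilt into slices.
reduced_selection = ((0, 100, 2), (None, 5, None))
selection = tuple(slice(*reduced_slice) for reduced_slice in reduced_selection)
print(selection)  # -> (slice(0, 100, 2), slice(None, 5, None))
```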
File renamed without changes.