Skip to content

Commit

Permalink
Skip tests that require kerchunk (#259)
Browse files Browse the repository at this point in the history
* try making ujson an optional dep

* skip all tests which require kerchunk

* add new CI job

* rename git workflow

* move numcodecs import inside

* add numcodecs to CI env

* make ujson required

* unskip tests for parsing in-memory kerchunk dicts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add ujson to CI environment

* in-memory roundtrip doesn't require kerchunk

* move in-memory kerchunk roundtrip test to test_integration.py

* remove now-empty test_kerchunk.py file

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
TomNicholas and pre-commit-ci[bot] authored Oct 17, 2024
1 parent ec8e465 commit e6407e0
Show file tree
Hide file tree
Showing 17 changed files with 183 additions and 62 deletions.
60 changes: 60 additions & 0 deletions .github/workflows/min-deps.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
name: min-deps

on:
push:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
pull_request:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:

test:
name: ${{ matrix.python-version }}-build
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@v4

- name: Setup micromamba
uses: mamba-org/setup-micromamba@v1
with:
environment-file: ci/min-deps.yml
cache-environment: true
create-args: >-
python=${{matrix.python-version}}
- name: Install virtualizarr
run: |
python -m pip install -e . --no-deps
- name: Conda list information
run: |
conda env list
conda list
- name: Running Tests
run: |
python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose
- name: Upload code coverage to Codecov
uses: codecov/[email protected]
with:
file: ./coverage.xml
flags: unittests
env_vars: OS,PYTHON
name: codecov-umbrella
fail_ci_if_error: false
26 changes: 26 additions & 0 deletions ci/min-deps.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: virtualizarr-min-deps
channels:
- conda-forge
- nodefaults
dependencies:
- h5netcdf
- h5py
- hdf5
- netcdf4
- xarray>=2024.6.0
- numpy>=2.0.0
- numcodecs
- packaging
- ujson
- universal_pathlib
# Testing
- codecov
- pre-commit
- mypy
- ruff
- pandas-stubs
- pytest-mypy
- pytest-cov
- pytest
- pooch
- fsspec
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,21 @@ requires-python = ">=3.10"
dynamic = ["version"]
dependencies = [
"xarray>=2024.06.0",
"kerchunk>=0.2.5",
"h5netcdf",
"numpy>=2.0.0",
"ujson",
"packaging",
"universal-pathlib",
"numcodecs",
"ujson",
]

[project.optional-dependencies]
test = [
"codecov",
"fastparquet",
"fsspec",
"h5netcdf",
"h5py",
"kerchunk>=0.2.5",
"mypy",
"netcdf4",
"pandas-stubs",
Expand Down
3 changes: 2 additions & 1 deletion virtualizarr/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
overload,
)

import ujson # type: ignore
from xarray import Dataset, register_dataset_accessor

from virtualizarr.manifests import ManifestArray
Expand Down Expand Up @@ -91,6 +90,8 @@ def to_kerchunk(
if format == "dict":
return refs
elif format == "json":
import ujson

if filepath is None:
raise ValueError("Filepath must be provided when format is 'json'")

Expand Down
3 changes: 2 additions & 1 deletion virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast

import numpy as np
from upath import UPath

from virtualizarr.types import ChunkKey

Expand Down Expand Up @@ -41,6 +40,8 @@ class ChunkEntry:
def from_kerchunk(
cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
) -> "ChunkEntry":
from upath import UPath

if len(path_and_byte_range_info) == 1:
path = path_and_byte_range_info[0]
offset = 0
Expand Down
3 changes: 2 additions & 1 deletion virtualizarr/readers/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from pathlib import Path
from typing import Any, MutableMapping, Optional, cast

import ujson # type: ignore
from xarray import Dataset
from xarray.core.indexes import Index
from xarray.core.variable import Variable
Expand Down Expand Up @@ -300,6 +299,8 @@ def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs:
"""
Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 .
"""
import ujson

sanitized = d.copy()
for k, v in d.items():
if k.startswith("."):
Expand Down
2 changes: 2 additions & 0 deletions virtualizarr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def _importorskip(


has_astropy, requires_astropy = _importorskip("astropy")
has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
has_s3fs, requires_s3fs = _importorskip("s3fs")
has_scipy, requires_scipy = _importorskip("scipy")
has_tifffile, requires_tifffile = _importorskip("tifffile")


Expand Down
19 changes: 17 additions & 2 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from collections.abc import Mapping
from unittest.mock import patch

import fsspec
import numpy as np
import pytest
import xarray as xr
Expand All @@ -13,9 +12,17 @@
from virtualizarr.backend import FileType
from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.kerchunk import _automatically_determine_filetype
from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs
from virtualizarr.tests import (
has_astropy,
has_tifffile,
network,
requires_kerchunk,
requires_s3fs,
requires_scipy,
)


@requires_scipy
def test_automatically_determine_filetype_netcdf3_netcdf4():
# test the NetCDF3 vs NetCDF4 automatic file type selection

Expand Down Expand Up @@ -75,6 +82,7 @@ def test_FileType():
FileType(None)


@requires_kerchunk
class TestOpenVirtualDatasetIndexes:
def test_no_indexes(self, netcdf4_file):
vds = open_virtual_dataset(netcdf4_file, indexes={})
Expand Down Expand Up @@ -105,6 +113,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
return True


@requires_kerchunk
def test_cftime_index(tmpdir):
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
Expand All @@ -130,6 +139,7 @@ def test_cftime_index(tmpdir):
assert vds.attrs == ds.attrs


@requires_kerchunk
class TestOpenVirtualDatasetAttrs:
def test_drop_array_dimensions(self, netcdf4_file):
# regression test for GH issue #150
Expand Down Expand Up @@ -237,6 +247,8 @@ def test_read_from_url(self, filetype, url):
assert isinstance(vds, xr.Dataset)

def test_virtualizarr_vs_local_nisar(self):
import fsspec

# Open group directly from locally cached file with xarray
url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
tmpfile = fsspec.open_local(
Expand Down Expand Up @@ -266,6 +278,7 @@ def test_virtualizarr_vs_local_nisar(self):
xrt.assert_equal(dsXR, dsV)


@requires_kerchunk
class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
vars_to_load = ["air", "time"]
Expand Down Expand Up @@ -338,6 +351,7 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
assert vds.scalar.attrs == {"scalar": "true"}


@requires_kerchunk
@pytest.mark.parametrize(
"reference_format",
["json", "parquet", "invalid"],
Expand Down Expand Up @@ -395,6 +409,7 @@ def test_open_virtual_dataset_existing_kerchunk_refs(
assert set(vds.variables) == set(netcdf4_virtual_dataset.variables)


@requires_kerchunk
def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
# For now, we raise a NotImplementedError if we read existing references that have inlined data
# https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932
Expand Down
48 changes: 46 additions & 2 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,53 @@
import xarray.testing as xrt

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests.array import ManifestArray
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.readers.kerchunk import (
dataset_from_kerchunk_refs,
find_var_names,
)
from virtualizarr.tests import requires_kerchunk
from virtualizarr.zarr import ZArray


def test_kerchunk_roundtrip_in_memory_no_concat():
# Set up example xarray dataset
chunks_dict = {
"0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.1": {"path": "foo.nc", "offset": 200, "length": 100},
}
manifest = ChunkManifest(entries=chunks_dict)
marr = ManifestArray(
zarray=dict(
shape=(2, 4),
dtype=np.dtype("<i8"),
chunks=(2, 2),
compressor=None,
filters=None,
fill_value=np.nan,
order="C",
),
chunkmanifest=manifest,
)
ds = xr.Dataset({"a": (["x", "y"], marr)})

# Use accessor to write it out to kerchunk reference dict
ds_refs = ds.virtualize.to_kerchunk(format="dict")

# Use dataset_from_kerchunk_refs to reconstruct the dataset
roundtrip = dataset_from_kerchunk_refs(ds_refs)

# Assert equal to original dataset
xrt.assert_equal(roundtrip, ds)


def test_no_duplicates_find_var_names():
"""Verify that we get a deduplicated list of var names"""
ref_dict = {"refs": {"x/something": {}, "x/otherthing": {}}}
assert len(find_var_names(ref_dict)) == 1


@requires_kerchunk
@pytest.mark.parametrize(
"inline_threshold, vars_to_inline",
[
Expand Down Expand Up @@ -45,6 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
assert refs["refs"]["time/0"] == expected["refs"]["time/0"]


@requires_kerchunk
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
class TestKerchunkRoundtrip:
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
Expand Down Expand Up @@ -212,6 +255,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
assert roundtrip.a.attrs == ds.a.attrs


@requires_kerchunk
def test_open_scalar_variable(tmpdir):
# regression test for GH issue #100

Expand Down
46 changes: 0 additions & 46 deletions virtualizarr/tests/test_kerchunk.py

This file was deleted.

Loading

0 comments on commit e6407e0

Please sign in to comment.