From be1e0f9bdaad5b5dbbab86f5cc1da6e095d92dc9 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 11:35:38 +0900 Subject: [PATCH 1/4] Remove unvetted DataTree methods As [discussed](https://docs.google.com/presentation/d/1zBjEsihBhK_U972jxHwaAZBbzS1-hd3aDLnO9uu2Ob4/edit#slide=id.g3087b787633_13_0) in the last DataTree meeting, this PR deletes the many Dataset methods that were copied onto DataTree without unit tests, along with a few that are not implemented properly yet, e.g., 1. Arithmetic methods were removed, because `DataTree + Dataset` should probably raise an error. 2. Indexing and aggregation methods were removed, because these should allow for dimensions that are missing only on some nodes. 3. The untested `map_over_subtree_inplace` and `render` methods were removed. 4. A few other methods (e.g., `merge` and `plot`) that were only implemented by raising `NotImplementedError` are entirely removed instead. --- xarray/core/datatree.py | 58 ----------------------------------- xarray/tests/test_datatree.py | 35 ++++++++++++++------- 2 files changed, 24 insertions(+), 69 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 52d44bec96f..bc04b855a4c 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -24,12 +24,6 @@ check_isomorphic, map_over_subtree, ) -from xarray.core.datatree_ops import ( - DataTreeArithmeticMixin, - MappedDatasetMethodsMixin, - MappedDataWithCoords, -) -from xarray.core.datatree_render import RenderDataTree from xarray.core.formatting import datatree_repr, dims_and_coords_repr from xarray.core.formatting_html import ( datatree_repr as datatree_repr_html, @@ -404,9 +398,6 @@ def map( # type: ignore[override] class DataTree( NamedNode["DataTree"], - MappedDatasetMethodsMixin, - MappedDataWithCoords, - DataTreeArithmeticMixin, TreeAttrAccessMixin, Mapping[str, "DataArray | DataTree"], ): @@ -1413,34 +1404,6 @@ def map_over_subtree( # TODO fix this typing error return 
map_over_subtree(func)(self, *args, **kwargs) - def map_over_subtree_inplace( - self, - func: Callable, - *args: Iterable[Any], - **kwargs: Any, - ) -> None: - """ - Apply a function to every dataset in this subtree, updating data in place. - - Parameters - ---------- - func : callable - Function to apply to datasets with signature: - `func(node.dataset, *args, **kwargs) -> Dataset`. - - Function will not be applied to any nodes without datasets, - *args : tuple, optional - Positional arguments passed on to `func`. - **kwargs : Any - Keyword arguments passed on to `func`. - """ - - # TODO if func fails on some node then the previous nodes will still have been updated... - - for node in self.subtree: - if node.has_data: - node.dataset = func(node.dataset, *args, **kwargs) - def pipe( self, func: Callable | tuple[Callable, str], *args: Any, **kwargs: Any ) -> Any: @@ -1501,26 +1464,8 @@ def pipe( args = (self,) + args return func(*args, **kwargs) - def render(self): - """Print tree structure, including any data stored at each node.""" - for pre, fill, node in RenderDataTree(self): - print(f"{pre}DataTree('{self.name}')") - for ds_line in repr(node.dataset)[1:]: - print(f"{fill}{ds_line}") - - def merge(self, datatree: DataTree) -> DataTree: - """Merge all the leaves of a second DataTree into this one.""" - raise NotImplementedError - - def merge_child_nodes(self, *paths, new_path: T_Path) -> DataTree: - """Merge a set of child nodes into a single new node.""" - raise NotImplementedError - # TODO some kind of .collapse() or .flatten() method to merge a subtree - def to_dataarray(self) -> DataArray: - return self.dataset.to_dataarray() - @property def groups(self): """Return all netCDF4 groups in the tree, given as a tuple of path-like strings.""" @@ -1655,6 +1600,3 @@ def to_zarr( compute=compute, **kwargs, ) - - def plot(self): - raise NotImplementedError diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 30934f83c63..bd0b6c34e7d 100644 
--- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -668,10 +668,11 @@ def test_modify(self): actual.coords["x"] = ("x", [-1]) assert_identical(actual, dt) # should not be modified - actual = dt.copy() - del actual.coords["b"] - expected = dt.reset_coords("b", drop=True) - assert_identical(expected, actual) + # TODO: re-enable after implementing reset_coords() + # actual = dt.copy() + # del actual.coords["b"] + # expected = dt.reset_coords("b", drop=True) + # assert_identical(expected, actual) with pytest.raises(KeyError): del dt.coords["not_found"] @@ -679,14 +680,15 @@ def test_modify(self): with pytest.raises(KeyError): del dt.coords["foo"] - actual = dt.copy(deep=True) - actual.coords.update({"c": 11}) - expected = dt.assign_coords({"c": 11}) - assert_identical(expected, actual) + # TODO: re-enable after implementing assign_coords() + # actual = dt.copy(deep=True) + # actual.coords.update({"c": 11}) + # expected = dt.assign_coords({"c": 11}) + # assert_identical(expected, actual) - # regression test for GH3746 - del actual.coords["x"] - assert "x" not in actual.xindexes + # # regression test for GH3746 + # del actual.coords["x"] + # assert "x" not in actual.xindexes # test that constructors can also handle the `DataTreeCoordinates` object ds2 = Dataset(coords=dt.coords) @@ -968,6 +970,7 @@ def test_ipython_key_completions(self, create_test_datatree): var_keys = list(dt.variables.keys()) assert all(var_key in key_completions for var_key in var_keys) + @pytest.mark.xfail(reason="sel not implemented yet") def test_operation_with_attrs_but_no_data(self): # tests bug from xarray-datatree GH262 xs = xr.Dataset({"testvar": xr.DataArray(np.ones((2, 3)))}) @@ -1557,6 +1560,7 @@ def test_filter(self): class TestDSMethodInheritance: + @pytest.mark.xfail(reason="isel not implemented yet") def test_dataset_method(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}) dt = DataTree.from_dict( @@ -1576,6 +1580,7 @@ def test_dataset_method(self): result = 
dt.isel(x=1) assert_equal(result, expected) + @pytest.mark.xfail(reason="reduce methods not implemented yet") def test_reduce_method(self): ds = xr.Dataset({"a": ("x", [False, True, False])}) dt = DataTree.from_dict({"/": ds, "/results": ds}) @@ -1585,6 +1590,7 @@ def test_reduce_method(self): result = dt.any() assert_equal(result, expected) + @pytest.mark.xfail(reason="reduce methods not implemented yet") def test_nan_reduce_method(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}) dt = DataTree.from_dict({"/": ds, "/results": ds}) @@ -1594,6 +1600,7 @@ def test_nan_reduce_method(self): result = dt.mean() assert_equal(result, expected) + @pytest.mark.xfail(reason="cum methods not implemented yet") def test_cum_method(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}) dt = DataTree.from_dict({"/": ds, "/results": ds}) @@ -1610,6 +1617,7 @@ def test_cum_method(self): class TestOps: + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_binary_op_on_int(self): ds1 = xr.Dataset({"a": [5], "b": [3]}) ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) @@ -1621,6 +1629,7 @@ def test_binary_op_on_int(self): result: DataTree = dt * 5 # type: ignore[assignment,operator] assert_equal(result, expected) + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_binary_op_on_dataset(self): ds1 = xr.Dataset({"a": [5], "b": [3]}) ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) @@ -1643,6 +1652,7 @@ def test_binary_op_on_dataset(self): result = dt * other_ds assert_equal(result, expected) + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_binary_op_on_datatree(self): ds1 = xr.Dataset({"a": [5], "b": [3]}) ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) @@ -1655,6 +1665,7 @@ def test_binary_op_on_datatree(self): result = dt * dt # type: ignore[operator] assert_equal(result, expected) + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_arithmetic_inherited_coords(self): tree = DataTree(xr.Dataset(coords={"x": [1, 2, 
3]})) tree["/foo"] = DataTree(xr.Dataset({"bar": ("x", [4, 5, 6])})) @@ -1669,6 +1680,8 @@ def test_arithmetic_inherited_coords(self): class TestUFuncs: + + @pytest.mark.xfail(reason="__array_ufunc__ not implemented yet") def test_tree(self, create_test_datatree): dt = create_test_datatree() expected = create_test_datatree(modify=lambda ds: np.sin(ds)) From 2ddf24115db5462a78ba86b570a73c7c474ced93 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 11:56:41 +0900 Subject: [PATCH 2/4] groups docstring --- xarray/core/datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index bc04b855a4c..57b7b087d95 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1468,7 +1468,7 @@ def pipe( @property def groups(self): - """Return all netCDF4 groups in the tree, given as a tuple of path-like strings.""" + """Return all groups in the tree, given as a tuple of path-like strings.""" return tuple(node.path for node in self.subtree) def to_netcdf( From b3be8781a31a1304b8fb9a0f404c202df6e47e1d Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 21:47:45 +0900 Subject: [PATCH 3/4] comment out removed DataTree methods --- doc/api.rst | 269 ++++++++++++++++++++++++++-------------------------- 1 file changed, 135 insertions(+), 134 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 87f116514cc..c1e3c09c77b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -725,17 +725,18 @@ Manipulate the contents of all nodes in a ``DataTree`` simultaneously. :toctree: generated/ DataTree.copy - DataTree.assign_coords - DataTree.merge - DataTree.rename - DataTree.rename_vars - DataTree.rename_dims - DataTree.swap_dims - DataTree.expand_dims - DataTree.drop_vars - DataTree.drop_dims - DataTree.set_coords - DataTree.reset_coords + + .. DataTree.assign_coords + .. DataTree.merge + .. DataTree.rename + .. DataTree.rename_vars + .. DataTree.rename_dims + .. DataTree.swap_dims + .. 
DataTree.expand_dims + .. DataTree.drop_vars + .. DataTree.drop_dims + .. DataTree.set_coords + .. DataTree.reset_coords DataTree Node Contents ---------------------- @@ -760,129 +761,129 @@ Compare one ``DataTree`` object to another. DataTree.equals DataTree.identical -Indexing --------- - -Index into all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.isel - DataTree.sel - DataTree.drop_sel - DataTree.drop_isel - DataTree.head - DataTree.tail - DataTree.thin - DataTree.squeeze - DataTree.interp - DataTree.interp_like - DataTree.reindex - DataTree.reindex_like - DataTree.set_index - DataTree.reset_index - DataTree.reorder_levels - DataTree.query - -.. - - Missing: - ``DataTree.loc`` - - -Missing Value Handling ----------------------- - -.. autosummary:: - :toctree: generated/ - - DataTree.isnull - DataTree.notnull - DataTree.combine_first - DataTree.dropna - DataTree.fillna - DataTree.ffill - DataTree.bfill - DataTree.interpolate_na - DataTree.where - DataTree.isin - -Computation ------------ - -Apply a computation to the data in all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.map - DataTree.reduce - DataTree.diff - DataTree.quantile - DataTree.differentiate - DataTree.integrate - DataTree.map_blocks - DataTree.polyfit - DataTree.curvefit - -Aggregation ------------ - -Aggregate data in all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.all - DataTree.any - DataTree.argmax - DataTree.argmin - DataTree.idxmax - DataTree.idxmin - DataTree.max - DataTree.min - DataTree.mean - DataTree.median - DataTree.prod - DataTree.sum - DataTree.std - DataTree.var - DataTree.cumsum - DataTree.cumprod - -ndarray methods ---------------- - -Methods copied from :py:class:`numpy.ndarray` objects, here applying to the data in all nodes in the subtree. - -.. 
autosummary:: - :toctree: generated/ - - DataTree.argsort - DataTree.astype - DataTree.clip - DataTree.conj - DataTree.conjugate - DataTree.round - DataTree.rank - -Reshaping and reorganising --------------------------- - -Reshape or reorganise the data in all nodes in the subtree. - -.. autosummary:: - :toctree: generated/ - - DataTree.transpose - DataTree.stack - DataTree.unstack - DataTree.shift - DataTree.roll - DataTree.pad - DataTree.sortby - DataTree.broadcast_like +.. Indexing +.. -------- + +.. Index into all nodes in the subtree simultaneously. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.isel +.. DataTree.sel +.. DataTree.drop_sel +.. DataTree.drop_isel +.. DataTree.head +.. DataTree.tail +.. DataTree.thin +.. DataTree.squeeze +.. DataTree.interp +.. DataTree.interp_like +.. DataTree.reindex +.. DataTree.reindex_like +.. DataTree.set_index +.. DataTree.reset_index +.. DataTree.reorder_levels +.. DataTree.query + +.. .. + +.. Missing: +.. ``DataTree.loc`` + + +.. Missing Value Handling +.. ---------------------- + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.isnull +.. DataTree.notnull +.. DataTree.combine_first +.. DataTree.dropna +.. DataTree.fillna +.. DataTree.ffill +.. DataTree.bfill +.. DataTree.interpolate_na +.. DataTree.where +.. DataTree.isin + +.. Computation +.. ----------- + +.. Apply a computation to the data in all nodes in the subtree simultaneously. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.map +.. DataTree.reduce +.. DataTree.diff +.. DataTree.quantile +.. DataTree.differentiate +.. DataTree.integrate +.. DataTree.map_blocks +.. DataTree.polyfit +.. DataTree.curvefit + +.. Aggregation +.. ----------- + +.. Aggregate data in all nodes in the subtree simultaneously. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.all +.. DataTree.any +.. DataTree.argmax +.. DataTree.argmin +.. DataTree.idxmax +.. DataTree.idxmin +.. DataTree.max +.. DataTree.min +.. DataTree.mean +.. 
DataTree.median +.. DataTree.prod +.. DataTree.sum +.. DataTree.std +.. DataTree.var +.. DataTree.cumsum +.. DataTree.cumprod + +.. ndarray methods +.. --------------- + +.. Methods copied from :py:class:`numpy.ndarray` objects, here applying to the data in all nodes in the subtree. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.argsort +.. DataTree.astype +.. DataTree.clip +.. DataTree.conj +.. DataTree.conjugate +.. DataTree.round +.. DataTree.rank + +.. Reshaping and reorganising +.. -------------------------- + +.. Reshape or reorganise the data in all nodes in the subtree. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.transpose +.. DataTree.stack +.. DataTree.unstack +.. DataTree.shift +.. DataTree.roll +.. DataTree.pad +.. DataTree.sortby +.. DataTree.broadcast_like IO / Conversion =============== From 4259f1e09584cd49ae1d313ef87ab3f699ce3620 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 21:59:05 +0900 Subject: [PATCH 4/4] update quick overview on DataTree --- doc/getting-started-guide/quick-overview.rst | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/getting-started-guide/quick-overview.rst b/doc/getting-started-guide/quick-overview.rst index 5efe3acc609..050de3dcb1c 100644 --- a/doc/getting-started-guide/quick-overview.rst +++ b/doc/getting-started-guide/quick-overview.rst @@ -314,19 +314,23 @@ And you can get a copy of just the node local values of :py:class:`~xarray.Datas ds_node_local = dt["simulation/coarse"].to_dataset(inherited=False) ds_node_local -Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by: +We intend to eventually implement most :py:class:`~xarray.Dataset` methods +(indexing, aggregation, arithmetic, etc) on :py:class:`~xarray.DataTree` +objects, but many methods have not been implemented yet. -.. ipython:: python +.. 
Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by: + +.. .. ipython:: python - avg = dt["simulation"].mean(dim="x") - avg +.. avg = dt["simulation"].mean(dim="x") +.. avg -Here the ``"x"`` dimension used is always the one local to that subgroup. +.. Here the ``"x"`` dimension used is always the one local to that subgroup. -You can do almost everything you can do with :py:class:`~xarray.Dataset` objects with :py:class:`~xarray.DataTree` objects -(including indexing and arithmetic), as operations will be mapped over every subgroup in the tree. -This allows you to work with multiple groups of non-alignable variables at once. +.. You can do almost everything you can do with :py:class:`~xarray.Dataset` objects with :py:class:`~xarray.DataTree` objects +.. (including indexing and arithmetic), as operations will be mapped over every subgroup in the tree. +.. This allows you to work with multiple groups of non-alignable variables at once. .. note::