From 1709721d4a76159f8ac61663ebfb91c87734cdfe Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 24 Oct 2024 14:19:01 -0700 Subject: [PATCH 1/5] Fix writing of DataTree subgroups to zarr or netCDF Consider a DataTree with a group, e.g., `tree = DataTree.from_dict({'/': ... '/child': ...})` If we write `tree['/child']` to disk, the result should have groups relative to `'/child'`, so writing and reading from the same path restores the same object. In addition, coordinates defined at the root should be written to disk instead of being omitted. --- xarray/core/datatree_io.py | 90 +++++++------------------- xarray/tests/test_backends_datatree.py | 36 +++++++++++ 2 files changed, 61 insertions(+), 65 deletions(-) diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index e5a4ca6bf9d..b2e374e9600 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -11,33 +11,6 @@ T_DataTreeNetcdfTypes = Literal["NETCDF4"] -def _get_nc_dataset_class(engine: T_DataTreeNetcdfEngine | None): - if engine == "netcdf4": - from netCDF4 import Dataset - elif engine == "h5netcdf": - from h5netcdf.legacyapi import Dataset - elif engine is None: - try: - from netCDF4 import Dataset - except ImportError: - from h5netcdf.legacyapi import Dataset - else: - raise ValueError(f"unsupported engine: {engine}") - return Dataset - - -def _create_empty_netcdf_group( - filename: str | PathLike, - group: str, - mode: NetcdfWriteModes, - engine: T_DataTreeNetcdfEngine | None, -): - ncDataset = _get_nc_dataset_class(engine) - - with ncDataset(filename, mode=mode) as rootgrp: - rootgrp.createGroup(group) - - def _datatree_to_netcdf( dt: DataTree, filepath: str | PathLike, @@ -85,34 +58,23 @@ def _datatree_to_netcdf( unlimited_dims = {} for node in dt.subtree: - ds = node.to_dataset(inherit=False) - group_path = node.path - if ds is None: - _create_empty_netcdf_group(filepath, group_path, mode, engine) - else: - ds.to_netcdf( - filepath, - group=group_path, - mode=mode, - encoding=encoding.get(node.path), - unlimited_dims=unlimited_dims.get(node.path), - engine=engine, - format=format, - compute=compute, - **kwargs, - ) + at_root = node is dt + ds = node.to_dataset(inherit=at_root) + group_path = None if at_root else "/" + node.relative_to(dt) + ds.to_netcdf( + filepath, + group=group_path, + mode=mode, + encoding=encoding.get(node.path), + unlimited_dims=unlimited_dims.get(node.path), + engine=engine, + format=format, + compute=compute, + **kwargs, + ) mode = "a" -def _create_empty_zarr_group( - store: MutableMapping | str | PathLike[str], group: str, mode: ZarrWriteModes -): - import zarr - - root = zarr.open_group(store, mode=mode) - root.create_group(group, overwrite=True) - - def _datatree_to_zarr( dt: DataTree, store: MutableMapping | str | PathLike[str], @@ -151,19 +113,17 @@ def _datatree_to_zarr( ) for node in dt.subtree: - ds = node.to_dataset(inherit=False) - group_path = node.path - if ds is None: - _create_empty_zarr_group(store, group_path, mode) - else: - ds.to_zarr( - store, - group=group_path, - mode=mode, - encoding=encoding.get(node.path), - consolidated=False, - **kwargs, - ) + at_root = node is dt + ds = node.to_dataset(inherit=at_root) + group_path = None if at_root else "/" + node.relative_to(dt) + ds.to_zarr( + store, + group=group_path, + mode=mode, + encoding=encoding.get(node.path), + consolidated=False, + **kwargs, + ) if "w" in mode: mode = "a" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 01b8b0ae81b..04c184c9581 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -196,6 +196,24 @@ def test_netcdf_encoding(self, tmpdir, simple_datatree): with pytest.raises(ValueError, match="unexpected encoding group.*"): original_dt.to_netcdf(filepath, encoding=enc, engine=self.engine) + def test_write_subgroup(self, tmpdir): + original_dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2, 3]}), + "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}), + } + ).children["child"] + + expected_dt = original_dt.copy() + expected_dt.name = None + + filepath = tmpdir / "test.zarr" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree(filepath, engine=self.engine) as roundtrip_dt: + assert_equal(original_dt, roundtrip_dt) + assert_identical(expected_dt, roundtrip_dt) + @requires_netCDF4 class TestNetCDF4DatatreeIO(DatatreeIOBase): @@ -532,3 +550,21 @@ def test_open_groups_chunks(self, tmpdir) -> None: for ds in dict_of_datasets.values(): ds.close() + + def test_write_subgroup(self, tmpdir): + original_dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2, 3]}), + "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}), + } + ).children["child"] + + expected_dt = original_dt.copy() + expected_dt.name = None + + filepath = tmpdir / "test.zarr" + original_dt.to_zarr(filepath) + + with open_datatree(filepath, engine="zarr") as roundtrip_dt: + assert_equal(original_dt, roundtrip_dt) + assert_identical(expected_dt, roundtrip_dt) From 7e5ccd2529c52c856253337d60187aa53726b3fe Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 24 Oct 2024 15:35:08 -0700 Subject: [PATCH 2/5] Add write_inherited_coords for additional control in DataTree.to_zarr As discussed in the last xarray meeting, this defaults to write_inherited_coords=True, which has a little more overhead but means you always get coordinates when opening a sub-group. --- xarray/core/datatree.py | 14 ++++++++++++++ xarray/core/datatree_io.py | 6 ++++-- xarray/tests/test_backends_datatree.py | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index d4ee2621557..8f38fd4eb4e 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1573,6 +1573,7 @@ def to_netcdf( format: T_DataTreeNetcdfTypes | None = None, engine: T_DataTreeNetcdfEngine | None = None, group: str | None = None, + write_inherited_coords: bool = True, compute: bool = True, **kwargs, ): @@ -1609,6 +1610,11 @@ def to_netcdf( group : str, optional Path to the netCDF4 group in the given file to open as the root group of the ``DataTree``. Currently, specifying a group is not supported. + write_inherited_coords : bool, default: True + If true, replicate inherited coordinates on all descendant nodes. + Otherwise, only write coordinates at the level at which they are + originally defined. This saves disk space, but requires opening the + full tree to load inherited coordinates. compute : bool, default: True If true compute immediately, otherwise return a ``dask.delayed.Delayed`` object that can be computed later. @@ -1632,6 +1638,7 @@ def to_netcdf( format=format, engine=engine, group=group, + write_inherited_coords=write_inherited_coords, compute=compute, **kwargs, ) @@ -1643,6 +1650,7 @@ def to_zarr( encoding=None, consolidated: bool = True, group: str | None = None, + write_inherited_coords: bool = True, compute: Literal[True] = True, **kwargs, ): @@ -1668,6 +1676,11 @@ def to_zarr( after writing metadata for all groups. group : str, optional Group path. (a.k.a. `path` in zarr terminology.) + write_inherited_coords : bool, default: True + If true, replicate inherited coordinates on all descendant nodes. + Otherwise, only write coordinates at the level at which they are + originally defined. This saves disk space, but requires opening the + full tree to load inherited coordinates. compute : bool, default: True If true compute immediately, otherwise return a ``dask.delayed.Delayed`` object that can be computed later. Metadata @@ -1690,6 +1703,7 @@ def to_zarr( encoding=encoding, consolidated=consolidated, group=group, + write_inherited_coords=write_inherited_coords, compute=compute, **kwargs, ) diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index b2e374e9600..a6868f9d0df 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -20,6 +20,7 @@ def _datatree_to_netcdf( format: T_DataTreeNetcdfTypes | None = None, engine: T_DataTreeNetcdfEngine | None = None, group: str | None = None, + write_inherited_coords: bool = True, compute: bool = True, **kwargs, ): @@ -59,7 +60,7 @@ def _datatree_to_netcdf( for node in dt.subtree: at_root = node is dt - ds = node.to_dataset(inherit=at_root) + ds = node.to_dataset(inherit=write_inherited_coords or at_root) group_path = None if at_root else "/" + node.relative_to(dt) ds.to_netcdf( filepath, @@ -82,6 +83,7 @@ def _datatree_to_zarr( encoding: Mapping[str, Any] | None = None, consolidated: bool = True, group: str | None = None, + write_inherited_coords: bool = True, compute: Literal[True] = True, **kwargs, ): @@ -114,7 +116,7 @@ def _datatree_to_zarr( for node in dt.subtree: at_root = node is dt - ds = node.to_dataset(inherit=at_root) + ds = node.to_dataset(inherit=write_inherited_coords or at_root) group_path = None if at_root else "/" + node.relative_to(dt) ds.to_zarr( store, diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 76a0e43af93..9b3631472db 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -592,3 +592,22 @@ def test_write_subgroup(self, tmpdir): with open_datatree(filepath, engine="zarr") as roundtrip_dt: assert_equal(original_dt, roundtrip_dt) assert_identical(expected_dt, roundtrip_dt) + + def test_write_inherited_coords(self, tmpdir): + original_dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2, 3]}), + "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}), + } + ) + + filepath = tmpdir / "test.zarr" + original_dt.to_zarr(filepath, write_inherited_coords=False) + + with open_datatree(filepath, engine="zarr") as roundtrip_dt: + assert_identical(original_dt, roundtrip_dt) + + expected_child = original_dt.children["child"].copy(inherit=False) + expected_child.name = None + with open_datatree(filepath, group="child", engine="zarr") as roundtrip_child: + assert_identical(expected_child, roundtrip_child) From 5646c61314964a2cd456f151c1232d30a3c2fda3 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 28 Oct 2024 14:06:20 -0700 Subject: [PATCH 3/5] Switch write_inherited_coords default to false --- xarray/core/datatree.py | 8 ++++---- xarray/core/datatree_io.py | 4 ++-- xarray/tests/test_backends_datatree.py | 21 ++++++++++++++++++++- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 140bf8260b0..6aa7eedde27 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1573,7 +1573,7 @@ def to_netcdf( format: T_DataTreeNetcdfTypes | None = None, engine: T_DataTreeNetcdfEngine | None = None, group: str | None = None, - write_inherited_coords: bool = True, + write_inherited_coords: bool = False, compute: bool = True, **kwargs, ): @@ -1610,7 +1610,7 @@ def to_netcdf( group : str, optional Path to the netCDF4 group in the given file to open as the root group of the ``DataTree``. Currently, specifying a group is not supported. - write_inherited_coords : bool, default: True + write_inherited_coords : bool, default: False If true, replicate inherited coordinates on all descendant nodes. Otherwise, only write coordinates at the level at which they are originally defined. This saves disk space, but requires opening the @@ -1650,7 +1650,7 @@ def to_zarr( encoding=None, consolidated: bool = True, group: str | None = None, - write_inherited_coords: bool = True, + write_inherited_coords: bool = False, compute: Literal[True] = True, **kwargs, ): @@ -1676,7 +1676,7 @@ def to_zarr( after writing metadata for all groups. group : str, optional Group path. (a.k.a. `path` in zarr terminology.) - write_inherited_coords : bool, default: True + write_inherited_coords : bool, default: False If true, replicate inherited coordinates on all descendant nodes. Otherwise, only write coordinates at the level at which they are originally defined. This saves disk space, but requires opening the diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index a6868f9d0df..d13de6cfc30 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -20,7 +20,7 @@ def _datatree_to_netcdf( format: T_DataTreeNetcdfTypes | None = None, engine: T_DataTreeNetcdfEngine | None = None, group: str | None = None, - write_inherited_coords: bool = True, + write_inherited_coords: bool = False, compute: bool = True, **kwargs, ): @@ -83,7 +83,7 @@ def _datatree_to_zarr( encoding: Mapping[str, Any] | None = None, consolidated: bool = True, group: str | None = None, - write_inherited_coords: bool = True, + write_inherited_coords: bool = False, compute: Literal[True] = True, **kwargs, ): diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 9b3631472db..608f9645ce8 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -593,7 +593,7 @@ def test_write_subgroup(self, tmpdir): assert_equal(original_dt, roundtrip_dt) assert_identical(expected_dt, roundtrip_dt) - def test_write_inherited_coords(self, tmpdir): + def test_write_inherited_coords_false(self, tmpdir): original_dt = DataTree.from_dict( { "/": xr.Dataset(coords={"x": [1, 2, 3]}), @@ -611,3 +611,22 @@ def test_write_inherited_coords(self, tmpdir): expected_child.name = None with open_datatree(filepath, group="child", engine="zarr") as roundtrip_child: assert_identical(expected_child, roundtrip_child) + + def test_write_inherited_coords_true(self, tmpdir): + original_dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2, 3]}), + "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}), + } + ) + + filepath = tmpdir / "test.zarr" + original_dt.to_zarr(filepath, write_inherited_coords=True) + + with open_datatree(filepath, engine="zarr") as roundtrip_dt: + assert_identical(original_dt, roundtrip_dt) + + expected_child = original_dt.children["child"].copy(inherit=True) + expected_child.name = None + with open_datatree(filepath, group="child", engine="zarr") as roundtrip_child: + assert_identical(expected_child, roundtrip_child) From 0311614cd581bc58102122083abbc98b37fecafb Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 28 Oct 2024 15:14:24 -0700 Subject: [PATCH 4/5] add whats new --- doc/whats-new.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 52f75c1a6f1..a73bbd0d30a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,9 @@ New Features ~~~~~~~~~~~~ - Added :py:meth:`DataTree.persist` method (:issue:`9675`, :pull:`9682`). By `Sam Levang `_. +- Added ``write_inherited_coords`` option to :py:meth:`DataTree.to_netcdf` + and :py:meth:`DataTree.to_zarr` (:pull:`9677`). + By `Stephan Hoyer `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -35,7 +38,11 @@ Deprecations Bug fixes ~~~~~~~~~ -- Fix inadvertent deep-copying of child data in DataTree. +- Fix inadvertent deep-copying of child data in DataTree (:issue:`9683`, + :pull:`9684`). + By `Stephan Hoyer `_. +- Avoid including parent groups when writing DataTree subgroups to Zarr or + netCDF (:pull:`9682`). By `Stephan Hoyer `_. Documentation From 6df6817420a210dd88c8e24302de1b050871bb4b Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 4 Nov 2024 13:22:11 -0800 Subject: [PATCH 5/5] remove unused import --- xarray/core/datatree_io.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index 00d50ef6197..3d0daa26b90 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -2,14 +2,11 @@ from collections.abc import Mapping, MutableMapping from os import PathLike -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import Any, Literal, get_args from xarray.core.datatree import DataTree from xarray.core.types import NetcdfWriteModes, ZarrWriteModes -if TYPE_CHECKING: - pass - T_DataTreeNetcdfEngine = Literal["netcdf4", "h5netcdf"] T_DataTreeNetcdfTypes = Literal["NETCDF4"]