From fe95b16a86587a7245f5e6c87467284b16e7eb61 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 22 Oct 2024 19:04:28 +0200
Subject: [PATCH 01/22] support chunking and default values in `open_groups`

---
 xarray/backends/api.py | 63 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index b367da586c7..9b02ab019ab 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -866,7 +866,22 @@ def open_datatree(
 
 def open_groups(
     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
+    *,
     engine: T_Engine = None,
+    chunks: T_Chunks = None,
+    cache: bool | None = None,
+    decode_cf: bool | None = None,
+    mask_and_scale: bool | Mapping[str, bool] | None = None,
+    decode_times: bool | Mapping[str, bool] | None = None,
+    decode_timedelta: bool | Mapping[str, bool] | None = None,
+    use_cftime: bool | Mapping[str, bool] | None = None,
+    concat_characters: bool | Mapping[str, bool] | None = None,
+    decode_coords: Literal["coordinates", "all"] | bool | None = None,
+    drop_variables: str | Iterable[str] | None = None,
+    inline_array: bool = False,
+    chunked_array_type: str | None = None,
+    from_array_kwargs: dict[str, Any] | None = None,
+    backend_kwargs: dict[str, Any] | None = None,
     **kwargs,
 ) -> dict[str, Dataset]:
     """
@@ -893,12 +908,58 @@ def open_groups(
     open_datatree()
     DataTree.from_dict()
     """
+    if cache is None:
+        cache = chunks is None
+
+    if backend_kwargs is not None:
+        kwargs.update(backend_kwargs)
+
     if engine is None:
         engine = plugins.guess_engine(filename_or_obj)
 
+    if from_array_kwargs is None:
+        from_array_kwargs = {}
+
     backend = plugins.get_backend(engine)
 
-    return backend.open_groups_as_dict(filename_or_obj, **kwargs)
+    decoders = _resolve_decoders_kwargs(
+        decode_cf,
+        open_backend_dataset_parameters=(),
+        mask_and_scale=mask_and_scale,
+        decode_times=decode_times,
+        decode_timedelta=decode_timedelta,
+        concat_characters=concat_characters,
+        use_cftime=use_cftime,
+        decode_coords=decode_coords,
+    )
+    overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
+
+    backend_groups = backend.open_groups_as_dict(
+        filename_or_obj,
+        drop_variables=drop_variables,
+        **decoders,
+        **kwargs,
+    )
+
+    groups = {
+        name: _dataset_from_backend_dataset(
+            backend_ds,
+            filename_or_obj,
+            engine,
+            chunks,
+            cache,
+            overwrite_encoded_chunks,
+            inline_array,
+            chunked_array_type,
+            from_array_kwargs,
+            drop_variables=drop_variables,
+            **decoders,
+            **kwargs,
+        )
+        for name, backend_ds in backend_groups.items()
+    }
+
+    return groups
 
 
 def open_mfdataset(
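A minimal usage sketch of the interface this patch introduces (the file name, variable name, and chunk sizes below are illustrative, not taken from the series):

    import xarray as xr

    # Each group comes back as its own Dataset; with ``chunks`` given,
    # variables are loaded lazily as dask arrays instead of NumPy arrays.
    groups = xr.open_groups("example.nc", engine="netcdf4", chunks={"x": 2})

    for path, ds in groups.items():
        print(path, dict(ds.chunksizes))

    # The decoding keywords mirror ``open_dataset`` and apply to every group;
    # "crs" is a hypothetical variable to drop.
    groups = xr.open_groups("example.nc", decode_times=False, drop_variables="crs")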
From 3bfbc3a12b891d05145b883572ea6595334c33b3 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 22 Oct 2024 19:30:56 +0200
Subject: [PATCH 02/22] same for `open_datatree`

---
 xarray/backends/api.py | 109 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 9b02ab019ab..0dc6adf23f6 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -41,6 +41,7 @@
 )
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
+from xarray.core.datatree_mapping import map_over_datasets
 from xarray.core.indexes import Index
 from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
 from xarray.core.utils import is_remote_uri
@@ -414,6 +415,54 @@ def _dataset_from_backend_dataset(
     return ds
 
 
+def _datatree_from_backend_datatree(
+    backend_tree,
+    filename_or_obj,
+    engine,
+    chunks,
+    cache,
+    overwrite_encoded_chunks,
+    inline_array,
+    chunked_array_type,
+    from_array_kwargs,
+    **extra_tokens,
+):
+    if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
+        raise ValueError(
+            f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}."
+        )
+
+    # _protect_datatree_variables_inplace(backend_tree, cache)
+    if chunks is None:
+        tree = backend_tree
+    else:
+        tree = map_over_datasets(
+            lambda ds: _chunk_ds(
+                ds,
+                filename_or_obj,
+                engine,
+                chunks,
+                overwrite_encoded_chunks,
+                inline_array,
+                chunked_array_type,
+                from_array_kwargs,
+                **extra_tokens,
+            ),
+            backend_tree,
+        )
+
+    # ds.set_close(backend_ds._close)
+
+    # Ensure source filename always stored in dataset object
+    if "source" not in tree.encoding:
+        path = getattr(filename_or_obj, "path", filename_or_obj)
+
+        if isinstance(path, str | os.PathLike):
+            tree.encoding["source"] = _normalize_path(path)
+
+    return tree
+
+
 def open_dataset(
     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
     *,
@@ -838,7 +887,22 @@ def open_dataarray(
 
 def open_datatree(
     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
+    *,
     engine: T_Engine = None,
+    chunks: T_Chunks = None,
+    cache: bool | None = None,
+    decode_cf: bool | None = None,
+    mask_and_scale: bool | Mapping[str, bool] | None = None,
+    decode_times: bool | Mapping[str, bool] | None = None,
+    decode_timedelta: bool | Mapping[str, bool] | None = None,
+    use_cftime: bool | Mapping[str, bool] | None = None,
+    concat_characters: bool | Mapping[str, bool] | None = None,
+    decode_coords: Literal["coordinates", "all"] | bool | None = None,
+    drop_variables: str | Iterable[str] | None = None,
+    inline_array: bool = False,
+    chunked_array_type: str | None = None,
+    from_array_kwargs: dict[str, Any] | None = None,
+    backend_kwargs: dict[str, Any] | None = None,
     **kwargs,
 ) -> DataTree:
     """
@@ -856,12 +920,55 @@ def open_datatree(
     -------
     xarray.DataTree
     """
+    if cache is None:
+        cache = chunks is None
+
+    if backend_kwargs is not None:
+        kwargs.update(backend_kwargs)
+
     if engine is None:
         engine = plugins.guess_engine(filename_or_obj)
 
+    if from_array_kwargs is None:
+        from_array_kwargs = {}
+
     backend = plugins.get_backend(engine)
 
-    return backend.open_datatree(filename_or_obj, **kwargs)
+    decoders = _resolve_decoders_kwargs(
+        decode_cf,
+        open_backend_dataset_parameters=(),
+        mask_and_scale=mask_and_scale,
+        decode_times=decode_times,
+        decode_timedelta=decode_timedelta,
+        concat_characters=concat_characters,
+        use_cftime=use_cftime,
+        decode_coords=decode_coords,
+    )
+    overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
+
+    backend_tree = backend.open_datatree(
+        filename_or_obj,
+        drop_variables=drop_variables,
+        **decoders,
+        **kwargs,
+    )
+
+    tree = _datatree_from_backend_datatree(
+        backend_tree,
+        filename_or_obj,
+        engine,
+        chunks,
+        cache,
+        overwrite_encoded_chunks,
+        inline_array,
+        chunked_array_type,
+        from_array_kwargs,
+        drop_variables=drop_variables,
+        **decoders,
+        **kwargs,
+    )
+
+    return tree
 
 
 def open_groups(
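The same keywords now work for whole trees; a sketch of the intended call (the path is hypothetical):

    import xarray as xr

    # chunks={} requests the engine's preferred chunking for every node.
    tree = xr.open_datatree("example.nc", engine="netcdf4", chunks={})

    # The helper above also records where the tree came from.
    print(tree.encoding.get("source"))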
From f4abb0168a885d150736f800dd78498ca19fc3bd Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 22 Oct 2024 20:42:24 +0200
Subject: [PATCH 03/22] use `group_subtrees` instead of `map_over_datasets`

---
 xarray/backends/api.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 0dc6adf23f6..4a96ea43a7a 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -41,8 +41,9 @@
 )
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
-from xarray.core.datatree_mapping import map_over_datasets
+from xarray.core.datatree import DataTree
 from xarray.core.indexes import Index
+from xarray.core.treenode import group_subtrees
 from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
 from xarray.core.utils import is_remote_uri
 from xarray.namedarray.daskmanager import DaskManager
@@ -75,7 +76,6 @@
     T_NetcdfTypes = Literal[
         "NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC"
     ]
-    from xarray.core.datatree import DataTree
 
 DATAARRAY_NAME = "__xarray_dataarray_name__"
 DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"
@@ -436,19 +436,21 @@ def _datatree_from_backend_datatree(
     if chunks is None:
         tree = backend_tree
     else:
-        tree = map_over_datasets(
-            lambda ds: _chunk_ds(
-                ds,
-                filename_or_obj,
-                engine,
-                chunks,
-                overwrite_encoded_chunks,
-                inline_array,
-                chunked_array_type,
-                from_array_kwargs,
-                **extra_tokens,
-            ),
-            backend_tree,
+        tree = DataTree.from_dict(
+            {
+                path: _chunk_ds(
+                    node.dataset,
+                    filename_or_obj,
+                    engine,
+                    chunks,
+                    overwrite_encoded_chunks,
+                    inline_array,
+                    chunked_array_type,
+                    from_array_kwargs,
+                    **extra_tokens,
+                )
+                for path, [node] in group_subtrees(backend_tree)
+            }
         )
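The rewrite leans on ``group_subtrees``, which walks one or more trees in matching order and yields each path together with a tuple of the nodes at that path — hence the ``for path, [node] in ...`` unpacking. A small sketch of the pattern (tree contents illustrative):

    import xarray as xr

    tree = xr.DataTree.from_dict(
        {"/": xr.Dataset({"a": ("x", [1, 2])}), "/child": xr.Dataset()}
    )

    # With a single tree, each item pairs a path with a one-element tuple.
    for path, [node] in xr.group_subtrees(tree):
        print(path, dict(node.dataset.sizes))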
From b0458aa8150fdb637a7398ca7f3615e5211b15e1 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Tue, 22 Oct 2024 22:06:53 +0200
Subject: [PATCH 04/22] check that `chunks` on `open_datatree` works

---
 xarray/tests/test_backends_datatree.py | 78 ++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index b9990de1f44..5537726a347 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -11,6 +11,7 @@
 from xarray.core.datatree import DataTree
 from xarray.testing import assert_equal, assert_identical
 from xarray.tests import (
+    requires_dask,
     requires_h5netcdf,
     requires_netCDF4,
     requires_zarr,
@@ -25,6 +26,43 @@
     pass
 
 
+def diff_chunks(comparison, tree1, tree2):
+    mismatching_variables = [loc for loc, equals in comparison.items() if not equals]
+
+    variable_messages = [
+        "\n".join(
+            [
+                f"L {path}:{name}: {tree1[path].variables[name].chunksizes}",
+                f"R {path}:{name}: {tree2[path].variables[name].chunksizes}",
+            ]
+        )
+        for path, name in mismatching_variables
+    ]
+    return "\n".join(["Differing chunk sizes:"] + variable_messages)
+
+
+def assert_chunks_equal(actual, expected, enforce_dask=False):
+    __tracebackhide__ = True
+
+    from xarray.namedarray.pycompat import array_type
+
+    dask_array_type = array_type("dask")
+
+    comparison = {
+        (path, name): (
+            (
+                not enforce_dask
+                or isinstance(node1.variables[name].data, dask_array_type)
+            )
+            and node1.variables[name].chunksizes == node2.variables[name].chunksizes
+        )
+        for path, (node1, node2) in xr.group_subtrees(actual, expected)
+        for name in node1.variables.keys()
+    }
+
+    assert all(comparison.values()), diff_chunks(comparison, actual, expected)
+
+
 @pytest.fixture(scope="module")
 def unaligned_datatree_nc(tmp_path_factory):
     """Creates a test netCDF4 file with the following unaligned structure, writes it to a /tmp directory
@@ -170,6 +208,26 @@ def test_open_datatree(self, unaligned_datatree_nc) -> None:
         ):
             open_datatree(unaligned_datatree_nc)
 
+    @requires_dask
+    def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
+        filepath = tmpdir / "test.nc"
+
+        root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
+        set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
+        set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
+        original_tree = DataTree.from_dict(
+            {
+                "/": root_data.chunk({"x": 2, "y": 1}),
+                "/group1": set1_data.chunk({"x": 1, "y": 2}),
+                "/group2": set2_data.chunk({"x": 2, "y": 3}),
+            }
+        )
+        original_tree.to_netcdf(filepath, engine="netcdf4")
+
+        with open_datatree(filepath, engine="netcdf4", chunks={}) as tree:
+            xr.testing.assert_identical(tree, original_tree)
+            assert_chunks_equal(tree, original_tree, enforce_dask=True)
+
     def test_open_groups(self, unaligned_datatree_nc) -> None:
         """Test `open_groups` with a netCDF4 file with an unaligned group hierarchy."""
         unaligned_dict_of_datasets = open_groups(unaligned_datatree_nc)
@@ -348,6 +406,26 @@ def test_open_datatree(self, unaligned_datatree_zarr) -> None:
         ):
             open_datatree(unaligned_datatree_zarr, engine="zarr")
 
+    @requires_dask
+    def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
+        filepath = tmpdir / "test.zarr"
+
+        root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
+        set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
+        set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
+        original_tree = DataTree.from_dict(
+            {
+                "/": root_data.chunk({"x": 2, "y": 1}),
+                "/group1": set1_data.chunk({"x": 1, "y": 2}),
+                "/group2": set2_data.chunk({"x": 2, "y": 3}),
+            }
+        )
+        original_tree.to_zarr(filepath)
+
+        with open_datatree(filepath, engine="zarr", chunks={}) as tree:
+            xr.testing.assert_identical(original_tree, tree)
+            assert_chunks_equal(tree, original_tree, enforce_dask=True)
+
     def test_open_groups(self, unaligned_datatree_zarr) -> None:
         """Test `open_groups` with a zarr store of an unaligned group hierarchy."""
From 4dbd91ef6e464c66b33d1722fa7039a854bc5321 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 19:03:05 +0200
Subject: [PATCH 05/22] specify the chunksizes when opening from disk

---
 xarray/tests/test_backends_datatree.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 5537726a347..85d422306a6 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -212,20 +212,23 @@ def test_open_datatree(self, unaligned_datatree_nc) -> None:
     def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
         filepath = tmpdir / "test.nc"
 
+        chunks = {"x": 2, "y": 1}
+
         root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
         set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
         set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
         original_tree = DataTree.from_dict(
             {
-                "/": root_data.chunk({"x": 2, "y": 1}),
-                "/group1": set1_data.chunk({"x": 1, "y": 2}),
-                "/group2": set2_data.chunk({"x": 2, "y": 3}),
+                "/": root_data.chunk(chunks),
+                "/group1": set1_data.chunk(chunks),
+                "/group2": set2_data.chunk(chunks),
             }
         )
         original_tree.to_netcdf(filepath, engine="netcdf4")
 
-        with open_datatree(filepath, engine="netcdf4", chunks={}) as tree:
+        with open_datatree(filepath, engine="netcdf4", chunks=chunks) as tree:
             xr.testing.assert_identical(tree, original_tree)
+
             assert_chunks_equal(tree, original_tree, enforce_dask=True)
 
     def test_open_groups(self, unaligned_datatree_nc) -> None:
@@ -410,19 +413,21 @@ def test_open_datatree(self, unaligned_datatree_zarr) -> None:
     def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
         filepath = tmpdir / "test.zarr"
 
+        chunks = {"x": 2, "y": 1}
+
         root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
         set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
         set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
         original_tree = DataTree.from_dict(
             {
-                "/": root_data.chunk({"x": 2, "y": 1}),
-                "/group1": set1_data.chunk({"x": 1, "y": 2}),
-                "/group2": set2_data.chunk({"x": 2, "y": 3}),
+                "/": root_data.chunk(chunks),
+                "/group1": set1_data.chunk(chunks),
+                "/group2": set2_data.chunk(chunks),
             }
         )
         original_tree.to_zarr(filepath)
 
-        with open_datatree(filepath, engine="zarr", chunks={}) as tree:
+        with open_datatree(filepath, engine="zarr", chunks=chunks) as tree:
             xr.testing.assert_identical(original_tree, tree)
             assert_chunks_equal(tree, original_tree, enforce_dask=True)
From 11850fdcc3ee8b0da363c742d592b5f77f635c49 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 19:27:39 +0200
Subject: [PATCH 06/22] check that `open_groups` with chunks works, too

---
 xarray/tests/test_backends_datatree.py | 60 ++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 85d422306a6..e867a9d8803 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -252,6 +252,36 @@ def test_open_groups(self, unaligned_datatree_nc) -> None:
         for ds in unaligned_dict_of_datasets.values():
             ds.close()
 
+    def test_open_groups_chunks(self, tmpdir) -> None:
+        """Test `open_groups` with chunks on a netcdf4 file."""
+
+        chunks = {"x": 2, "y": 1}
+        filepath = tmpdir / "test.nc"
+
+        chunks = {"x": 2, "y": 1}
+
+        root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
+        set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
+        set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
+        original_tree = DataTree.from_dict(
+            {
+                "/": root_data.chunk(chunks),
+                "/group1": set1_data.chunk(chunks),
+                "/group2": set2_data.chunk(chunks),
+            }
+        )
+        original_tree.to_netcdf(filepath, mode="w")
+
+        dict_of_datasets = open_groups(filepath, engine="netcdf4", chunks=chunks)
+
+        for path, ds in dict_of_datasets.items():
+            assert {
+                k: max(vs) for k, vs in ds.chunksizes.items()
+            } == chunks, f"unexpected chunking for {path}"
+
+        for ds in dict_of_datasets.values():
+            ds.close()
+
     def test_open_groups_to_dict(self, tmpdir) -> None:
         """Create an aligned netCDF4 with the following structure to test `open_groups`
        and `DataTree.from_dict`.
@@ -460,3 +490,33 @@ def test_open_groups(self, unaligned_datatree_zarr) -> None:
 
         for ds in unaligned_dict_of_datasets.values():
             ds.close()
+
+    def test_open_groups_chunks(self, tmpdir) -> None:
+        """Test `open_groups` with chunks on a zarr store."""
+
+        chunks = {"x": 2, "y": 1}
+        filepath = tmpdir / "test.zarr"
+
+        chunks = {"x": 2, "y": 1}
+
+        root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
+        set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
+        set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
+        original_tree = DataTree.from_dict(
+            {
+                "/": root_data.chunk(chunks),
+                "/group1": set1_data.chunk(chunks),
+                "/group2": set2_data.chunk(chunks),
+            }
+        )
+        original_tree.to_zarr(filepath, mode="w")
+
+        dict_of_datasets = open_groups(filepath, engine="zarr", chunks=chunks)
+
+        for path, ds in dict_of_datasets.items():
+            assert {
+                k: max(vs) for k, vs in ds.chunksizes.items()
+            } == chunks, f"unexpected chunking for {path}"
+
+        for ds in dict_of_datasets.values():
+            ds.close()

From a71f5e25432f90c7b2bda9f65ccd94461a2cf63d Mon Sep 17 00:00:00 2001
From: Tom Nicholas
Date: Wed, 23 Oct 2024 13:17:24 -0600
Subject: [PATCH 07/22] require dask for `test_open_groups_chunks`

---
 xarray/tests/test_backends_datatree.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index e867a9d8803..130d84dbc11 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -252,6 +252,7 @@ def test_open_groups(self, unaligned_datatree_nc) -> None:
         for ds in unaligned_dict_of_datasets.values():
             ds.close()
 
+    @requires_dask
     def test_open_groups_chunks(self, tmpdir) -> None:
         """Test `open_groups` with chunks on a netcdf4 file."""
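The assertions in these tests reduce each dataset's chunking to its largest chunk per dimension before comparing; the same idiom works interactively (the dataset is illustrative):

    import xarray as xr

    ds = xr.Dataset({"a": ("y", list(range(5)))}).chunk({"y": 2})

    # chunksizes maps each dimension to the tuple of chunk lengths,
    # here {"y": (2, 2, 1)}; max() per dimension recovers the request.
    assert {k: max(vs) for k, vs in ds.chunksizes.items()} == {"y": 2}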
From 6d3deeda7932cd73b92326443b3adc9967bd187b Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 23:14:25 +0200
Subject: [PATCH 08/22] protect variables from write operations

---
 xarray/backends/api.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 4a96ea43a7a..7a28435e33e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -265,6 +265,17 @@ def _protect_dataset_variables_inplace(dataset, cache):
             variable.data = data
 
 
+def _protect_datatree_variables_inplace(tree, cache):
+    for node in tree.subtree:
+        for name, variable in node.variables.items():
+            if name not in node._indexes:
+                # no need to protect IndexVariable objects
+                data = indexing.CopyOnWriteArray(variable._data)
+                if cache:
+                    data = indexing.MemoryCachedArray(data)
+                variable.data = data
+
+
 def _finalize_store(write, store):
     """Finalize this store by explicitly syncing and closing"""
     del write  # ensure writing is done first
@@ -432,7 +443,7 @@ def _datatree_from_backend_datatree(
             f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}."
         )
 
-    # _protect_datatree_variables_inplace(backend_tree, cache)
+    _protect_datatree_variables_inplace(backend_tree, cache)
     if chunks is None:
         tree = backend_tree
     else:

From 7f770cfe4a6e9856e7b0429ff26277288e8f1938 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 23:22:49 +0200
Subject: [PATCH 09/22] copy over `_close` from the backend tree

---
 xarray/backends/api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 7a28435e33e..a11727d1f66 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -464,7 +464,8 @@ def _datatree_from_backend_datatree(
             }
         )
 
-    # ds.set_close(backend_ds._close)
+    for path, [node] in group_subtrees(backend_tree):
+        tree[path].set_close(node._close)
 
     # Ensure source filename always stored in dataset object
     if "source" not in tree.encoding:
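Together these two patches give an opened tree the same safety net ``open_dataset`` provides: backend arrays are wrapped copy-on-write, and every node keeps its closer. A rough sketch of the resulting behavior (file and group names hypothetical; this is not a test from the series):

    import xarray as xr

    with xr.open_datatree("example.nc") as tree:
        # Assignment copies the wrapped backend array first, so the
        # file on disk is never modified through the opened tree.
        tree["/group1"]["a"][0] = 42
    # Leaving the block invokes the per-node closers copied over from
    # the backend tree, closing the underlying files.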
From 05efaf6370ef33f0a01752928d6605441e01a597 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 23:30:56 +0200
Subject: [PATCH 10/22] copy a lot of the docstring from `open_dataset`

---
 xarray/backends/api.py | 139 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 134 insertions(+), 5 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index a11727d1f66..3a045ac9451 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -926,13 +926,142 @@ def open_datatree(
     ----------
     filename_or_obj : str, Path, file-like, or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
-    engine : str, optional
-        Xarray backend engine to use. Valid options include `{"netcdf4", "h5netcdf", "zarr"}`.
-    **kwargs : dict
-        Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
+    engine : {"netcdf4", "h5netcdf", "zarr", None}, \
+        installed backend or xarray.backends.BackendEntrypoint, optional
+        Engine to use when reading files. If not provided, the default engine
+        is chosen based on available dependencies, with a preference for
+        "netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
+        can also be used.
+    chunks : int, dict, 'auto' or None, default: None
+        If provided, used to load the data into dask arrays.
+
+        - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
+          engine preferred chunks.
+        - ``chunks=None`` skips using dask, which is generally faster for
+          small arrays.
+        - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
+        - ``chunks={}`` loads the data with dask using the engine's preferred chunk
+          size, generally identical to the format's chunk size. If not available, a
+          single chunk for all arrays.
+
+        See dask chunking for more details.
+    cache : bool, optional
+        If True, cache data loaded from the underlying datastore in memory as
+        NumPy arrays when accessed to avoid reading from the underlying data-
+        store multiple times. Defaults to True unless you specify the `chunks`
+        argument to use dask, in which case it defaults to False. Does not
+        change the behavior of coordinates corresponding to dimensions, which
+        always load their data from disk into a ``pandas.Index``.
+    decode_cf : bool, optional
+        Whether to decode these variables, assuming they were saved according
+        to CF conventions.
+    mask_and_scale : bool or dict-like, optional
+        If True, replace array values equal to `_FillValue` with NA and scale
+        values according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue` or
+        `missing_value` attribute contains multiple values a warning will be
+        issued and all array values matching one of the multiple values will
+        be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    decode_times : bool or dict-like, optional
+        If True, decode times encoded in the standard NetCDF datetime format
+        into datetime objects. Otherwise, leave them encoded as numbers.
+        Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    decode_timedelta : bool or dict-like, optional
+        If True, decode variables and coordinates with time units in
+        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+        into timedelta objects. If False, leave them encoded as numbers.
+        If None (default), assume the same value of decode_time.
+        Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    use_cftime: bool or dict-like, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. "gregorian", "proleptic_gregorian", "standard", or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error. Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    concat_characters : bool or dict-like, optional
+        If True, concatenate along the last dimension of character arrays to
+        form string arrays. Dimensions will only be concatenated over (and
+        removed) if they have no corresponding variable and if they are only
+        used as the last dimension of character arrays.
+        Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    decode_coords : bool or {"coordinates", "all"}, optional
+        Controls which variables are set as coordinate variables:
+
+        - "coordinates" or True: Set variables referred to in the
+          ``'coordinates'`` attribute of the datasets or individual variables
+          as coordinate variables.
+        - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
+          other attributes as coordinate variables.
+
+        Only existing variables can be set as coordinates. Missing variables
+        will be silently ignored.
+    drop_variables: str or iterable of str, optional
+        A variable or list of variables to exclude from being parsed from the
+        dataset. This may be useful to drop variables with problems or
+        inconsistent values.
+    inline_array: bool, default: False
+        How to include the array in the dask task graph.
+        By default (``inline_array=False``) the array is included in a task by
+        itself, and each chunk refers to that task by its key. With
+        ``inline_array=True``, Dask will instead inline the array directly
+        in the values of the task graph. See :py:func:`dask.array.from_array`.
+    chunked_array_type: str, optional
+        Which chunked array type to coerce this dataset's arrays to.
+        Defaults to 'dask' if installed, else whatever is registered via the
+        `ChunkManagerEntrypoint` system.
+        Experimental API that should not be relied upon.
+    from_array_kwargs: dict
+        Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create
+        chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
+        For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
+        to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    backend_kwargs: dict
+        Additional keyword arguments passed on to the engine open function,
+        equivalent to `**kwargs`.
+    **kwargs: dict
+        Additional keyword arguments passed on to the engine open function.
+        For example:
+
+        - 'group': path to the group in the given file to open as the root group as
+          a str.
+        - 'lock': resource lock to use when reading data from disk. Only
+          relevant when using dask or another form of parallelism. By default,
+          appropriate locks are chosen to safely read and write files with the
+          currently active dask scheduler. Supported by "netcdf4", "h5netcdf",
+          "scipy".
+
+        See engine open function for kwargs accepted by each specific engine.
+
     Returns
     -------
-    xarray.DataTree
+    tree : DataTree
+        The newly created datatree.
+
+    Notes
+    -----
+    ``open_datatree`` opens the file with read-only access. When you modify
+    values of a DataTree, even one linked to files on disk, only the in-memory
+    copy you are manipulating in xarray is modified: the original file on disk
+    is never touched.
+
+    See Also
+    --------
+    xarray.open_groups
+    xarray.open_dataset
     """
     if cache is None:
         cache = chunks is None
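In short, the ``chunks`` variants documented above (the path is hypothetical):

    import xarray as xr

    xr.open_datatree("example.nc", chunks=None)      # no dask (default)
    xr.open_datatree("example.nc", chunks=-1)        # dask, one chunk per array
    xr.open_datatree("example.nc", chunks={})        # dask, engine-preferred chunks
    xr.open_datatree("example.nc", chunks="auto")    # dask auto-chunking
    xr.open_datatree("example.nc", chunks={"x": 2})  # explicit size per dimension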
From f9fee40ad2827158337e22a00221fa3a1ffb5a90 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 23:33:28 +0200
Subject: [PATCH 11/22] same for `open_groups`

---
 xarray/backends/api.py | 143 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 136 insertions(+), 7 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 3a045ac9451..96064b2fafe 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -1136,6 +1136,7 @@ def open_groups(
 ) -> dict[str, Dataset]:
     """
     Open and decode a file or file-like object, creating a dictionary containing one xarray Dataset for each group in the file.
+
     Useful for an HDF file ("netcdf4" or "h5netcdf") containing many groups that are not alignable with their parents
     and cannot be opened directly with ``open_datatree``. It is encouraged to use this function to inspect your data,
     then make the necessary changes to make the structure coercible to a `DataTree` object before calling `DataTree.from_dict()` and proceeding with your analysis.

     Parameters
     ----------
     filename_or_obj : str, Path, file-like, or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file.
-    engine : str, optional
-        Xarray backend engine to use. Valid options include `{"netcdf4", "h5netcdf"}`.
-    **kwargs : dict
-        Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
+    Parameters
+    ----------
+    filename_or_obj : str, Path, file-like, or DataStore
+        Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
+    engine : {"netcdf4", "h5netcdf", "zarr", None}, \
+        installed backend or xarray.backends.BackendEntrypoint, optional
+        Engine to use when reading files. If not provided, the default engine
+        is chosen based on available dependencies, with a preference for
+        "netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
+        can also be used.
+    chunks : int, dict, 'auto' or None, default: None
+        If provided, used to load the data into dask arrays.
+
+        - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
+          engine preferred chunks.
+        - ``chunks=None`` skips using dask, which is generally faster for
+          small arrays.
+        - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
+        - ``chunks={}`` loads the data with dask using the engine's preferred chunk
+          size, generally identical to the format's chunk size. If not available, a
+          single chunk for all arrays.
+
+        See dask chunking for more details.
+    cache : bool, optional
+        If True, cache data loaded from the underlying datastore in memory as
+        NumPy arrays when accessed to avoid reading from the underlying data-
+        store multiple times. Defaults to True unless you specify the `chunks`
+        argument to use dask, in which case it defaults to False. Does not
+        change the behavior of coordinates corresponding to dimensions, which
+        always load their data from disk into a ``pandas.Index``.
+    decode_cf : bool, optional
+        Whether to decode these variables, assuming they were saved according
+        to CF conventions.
+    mask_and_scale : bool or dict-like, optional
+        If True, replace array values equal to `_FillValue` with NA and scale
+        values according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue` or
+        `missing_value` attribute contains multiple values a warning will be
+        issued and all array values matching one of the multiple values will
+        be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    decode_times : bool or dict-like, optional
+        If True, decode times encoded in the standard NetCDF datetime format
+        into datetime objects. Otherwise, leave them encoded as numbers.
+        Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    decode_timedelta : bool or dict-like, optional
+        If True, decode variables and coordinates with time units in
+        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+        into timedelta objects. If False, leave them encoded as numbers.
+        If None (default), assume the same value of decode_time.
+        Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    use_cftime: bool or dict-like, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. "gregorian", "proleptic_gregorian", "standard", or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error. Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    concat_characters : bool or dict-like, optional
+        If True, concatenate along the last dimension of character arrays to
+        form string arrays. Dimensions will only be concatenated over (and
+        removed) if they have no corresponding variable and if they are only
+        used as the last dimension of character arrays.
+        Pass a mapping, e.g. ``{"my_variable": False}``,
+        to toggle this feature per-variable individually.
+        This keyword may not be supported by all the backends.
+    decode_coords : bool or {"coordinates", "all"}, optional
+        Controls which variables are set as coordinate variables:
+
+        - "coordinates" or True: Set variables referred to in the
+          ``'coordinates'`` attribute of the datasets or individual variables
+          as coordinate variables.
+        - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
+          other attributes as coordinate variables.
+
+        Only existing variables can be set as coordinates. Missing variables
+        will be silently ignored.
+    drop_variables: str or iterable of str, optional
+        A variable or list of variables to exclude from being parsed from the
+        dataset. This may be useful to drop variables with problems or
+        inconsistent values.
+    inline_array: bool, default: False
+        How to include the array in the dask task graph.
+        By default (``inline_array=False``) the array is included in a task by
+        itself, and each chunk refers to that task by its key. With
+        ``inline_array=True``, Dask will instead inline the array directly
+        in the values of the task graph. See :py:func:`dask.array.from_array`.
+    chunked_array_type: str, optional
+        Which chunked array type to coerce this dataset's arrays to.
+        Defaults to 'dask' if installed, else whatever is registered via the
+        `ChunkManagerEntrypoint` system.
+        Experimental API that should not be relied upon.
+    from_array_kwargs: dict
+        Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create
+        chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
+        For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
+        to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    backend_kwargs: dict
+        Additional keyword arguments passed on to the engine open function,
+        equivalent to `**kwargs`.
+    **kwargs: dict
+        Additional keyword arguments passed on to the engine open function.
+        For example:
+
+        - 'group': path to the group in the given file to open as the root group as
+          a str.
+        - 'lock': resource lock to use when reading data from disk. Only
+          relevant when using dask or another form of parallelism. By default,
+          appropriate locks are chosen to safely read and write files with the
+          currently active dask scheduler. Supported by "netcdf4", "h5netcdf",
+          "scipy".
+
+        See engine open function for kwargs accepted by each specific engine.
 
     Returns
     -------
-    dict[str, xarray.Dataset]
+    groups : dict of str to xarray.Dataset
+        The groups as Dataset objects
+
+    Notes
+    -----
+    ``open_groups`` opens the file with read-only access. When you modify
+    values of a Dataset, even one linked to files on disk, only the in-memory
+    copy you are manipulating in xarray is modified: the original file on disk
+    is never touched.
 
     See Also
     --------
-    open_datatree()
-    DataTree.from_dict()
+    xarray.open_datatree
+    xarray.open_dataset
+    xarray.DataTree.from_dict
     """
     if cache is None:
         cache = chunks is None

From a4e99c6e8b812bd7d8186ed36e0d20707b35f5bb Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 23:37:42 +0200
Subject: [PATCH 12/22] reuse `_protect_dataset_variables_inplace`

---
 xarray/backends/api.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 3088490ae3f..f53fdd52250 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -268,13 +268,7 @@ def _protect_dataset_variables_inplace(dataset, cache):
 
 def _protect_datatree_variables_inplace(tree, cache):
     for node in tree.subtree:
-        for name, variable in node.variables.items():
-            if name not in node._indexes:
-                # no need to protect IndexVariable objects
-                data = indexing.CopyOnWriteArray(variable._data)
-                if cache:
-                    data = indexing.MemoryCachedArray(data)
-                variable.data = data
+        _protect_dataset_variables_inplace(node, cache)
 
 
 def _finalize_store(write, store):

From 3e8b80cc4b730587753270bc4227bfbf750d025c Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Wed, 23 Oct 2024 23:44:15 +0200
Subject: [PATCH 13/22] final missing `requires_dask`

---
 xarray/tests/test_backends_datatree.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 48c9c3185fd..84b73699fd9 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -497,6 +497,7 @@ def test_open_groups(self, unaligned_datatree_zarr) -> None:
         for ds in unaligned_dict_of_datasets.values():
             ds.close()
 
+    @requires_dask
     def test_open_groups_chunks(self, tmpdir) -> None:
         """Test `open_groups` with chunks on a zarr store."""

From cf1a6b045ea3522d12e36763fc6e76f608a71fe9 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 10:52:19 +0200
Subject: [PATCH 14/22] typing for the test utils

Co-authored-by: Tom Nicholas
---
 xarray/tests/test_backends_datatree.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 84b73699fd9..85d991f28b8 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -28,7 +28,7 @@
 have_zarr_v3 = xr.backends.zarr._zarr_v3()
 
 
-def diff_chunks(comparison, tree1, tree2):
+def diff_chunks(comparison: dict[tuple[str, str], bool]], tree1: DataTree, tree2: DataTree) -> str:
     mismatching_variables = [loc for loc, equals in comparison.items() if not equals]
 
     variable_messages = [
@@ -43,7 +43,7 @@ def diff_chunks(comparison: dict[tuple[str, str], bool]], tree1: DataTree, tree2
     return "\n".join(["Differing chunk sizes:"] + variable_messages)
 
 
-def assert_chunks_equal(actual, expected, enforce_dask=False):
+def assert_chunks_equal(actual: DataTree, expected: DataTree, enforce_dask: bool = False) -> None:
     __tracebackhide__ = True
From 114c4dcd1e47895a9b205555d30b8f9d36d9d05e Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 10:52:59 +0200
Subject: [PATCH 15/22] type hints for `_protect_datatree_variables_inplace`

Co-authored-by: Tom Nicholas
---
 xarray/backends/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index f53fdd52250..72a873ff31e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -266,7 +266,7 @@ def _protect_dataset_variables_inplace(dataset, cache):
             variable.data = data
 
 
-def _protect_datatree_variables_inplace(tree, cache):
+def _protect_datatree_variables_inplace(tree: DataTree, cache: bool) -> None:
     for node in tree.subtree:
         _protect_dataset_variables_inplace(node, cache)

From 9eac19d4f0e33a1c816b08cdf235e4f929e30a6d Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 10:54:46 +0200
Subject: [PATCH 16/22] type hints for `_protect_dataset_variables_inplace`

---
 xarray/backends/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 72a873ff31e..2183adcf08c 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -256,7 +256,7 @@ def _get_mtime(filename_or_obj):
     return mtime
 
 
-def _protect_dataset_variables_inplace(dataset, cache):
+def _protect_dataset_variables_inplace(dataset: Dataset, cache: bool) -> None:
     for name, variable in dataset.variables.items():
         if name not in dataset._indexes:
             # no need to protect IndexVariable objects

From 446a53d9cc381d4c7139cff4b91f83a166710504 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 10:56:36 +0200
Subject: [PATCH 17/22] copy over the name of the backend tree

Co-authored-by: Tom Nicholas
---
 xarray/backends/api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 2183adcf08c..dfb120e7761 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -456,7 +456,8 @@ def _datatree_from_backend_datatree(
                     **extra_tokens,
                 )
                 for path, [node] in group_subtrees(backend_tree)
-            }
+            },
+            name=backend_tree.name,
         )
 
     for path, [node] in group_subtrees(backend_tree):
         tree[path].set_close(node._close)
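Forwarding ``name=`` matters because ``DataTree.from_dict`` otherwise builds an unnamed root, dropping whatever name the backend read. A sketch of the difference (the name is illustrative):

    import xarray as xr

    backend_tree = xr.DataTree(name="root")

    rebuilt = xr.DataTree.from_dict(
        {path: node.dataset for path, [node] in xr.group_subtrees(backend_tree)},
        name=backend_tree.name,  # without this, rebuilt.name would be None
    )
    assert rebuilt.name == backend_tree.name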
From 5b367015be99b330ad545fde27cd6cab37e9670f Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 11:03:08 +0200
Subject: [PATCH 18/22] typo

---
 xarray/tests/test_backends_datatree.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 85d991f28b8..ec650a4f048 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -28,7 +28,9 @@
 have_zarr_v3 = xr.backends.zarr._zarr_v3()
 
 
-def diff_chunks(comparison: dict[tuple[str, str], bool]], tree1: DataTree, tree2: DataTree) -> str:
+def diff_chunks(
+    comparison: dict[tuple[str, str], bool], tree1: DataTree, tree2: DataTree
+) -> str:
     mismatching_variables = [loc for loc, equals in comparison.items() if not equals]
 
     variable_messages = [
@@ -43,7 +45,9 @@ def diff_chunks(comparison: dict[tuple[str, str], bool]], tree1: DataTree, tree2
     return "\n".join(["Differing chunk sizes:"] + variable_messages)
 
 
-def assert_chunks_equal(actual: DataTree, expected: DataTree, enforce_dask: bool = False) -> None:
+def assert_chunks_equal(
+    actual: DataTree, expected: DataTree, enforce_dask: bool = False
+) -> None:
     __tracebackhide__ = True

From 66616f705c4d91ca277aec5551a900d714dc0e1b Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 11:08:35 +0200
Subject: [PATCH 19/22] swap the order of arguments to `assert_identical`

---
 xarray/tests/test_backends_datatree.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index ec650a4f048..0d23d9257cd 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -468,7 +468,7 @@ def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
         original_tree.to_zarr(filepath)
 
         with open_datatree(filepath, engine="zarr", chunks=chunks) as tree:
-            xr.testing.assert_identical(original_tree, tree)
+            xr.testing.assert_identical(tree, original_tree)
             assert_chunks_equal(tree, original_tree, enforce_dask=True)
 
     def test_open_groups(self, unaligned_datatree_zarr) -> None:

From 843b2fc72d10b2f8b3011400cdae1439b9a0af46 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 15:04:47 +0200
Subject: [PATCH 20/22] try explicitly typing `data`

---
 xarray/backends/api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index dfb120e7761..c9d32a129c7 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -260,6 +260,7 @@ def _protect_dataset_variables_inplace(dataset: Dataset, cache: bool) -> None:
     for name, variable in dataset.variables.items():
         if name not in dataset._indexes:
             # no need to protect IndexVariable objects
+            data: indexing.ExplicitlyIndexedNDarrayMixin
             data = indexing.CopyOnWriteArray(variable._data)
             if cache:
                 data = indexing.MemoryCachedArray(data)
             variable.data = data

From 8950841c7a4e370222934a55978bb49bd5343172 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 15:43:24 +0200
Subject: [PATCH 21/22] typo

---
 xarray/backends/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index c9d32a129c7..a52e73701ab 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -260,7 +260,7 @@ def _protect_dataset_variables_inplace(dataset: Dataset, cache: bool) -> None:
     for name, variable in dataset.variables.items():
         if name not in dataset._indexes:
             # no need to protect IndexVariable objects
-            data: indexing.ExplicitlyIndexedNDarrayMixin
+            data: indexing.ExplicitlyIndexedNDArrayMixin
             data = indexing.CopyOnWriteArray(variable._data)
             if cache:
                 data = indexing.MemoryCachedArray(data)

From 4d93ada63213f4917669bd397631e7be014bfb97 Mon Sep 17 00:00:00 2001
From: Justus Magin
Date: Thu, 24 Oct 2024 15:44:38 +0200
Subject: [PATCH 22/22] use `Hashable` for variable names

---
 xarray/tests/test_backends_datatree.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 0d23d9257cd..01b8b0ae81b 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import re
+from collections.abc import Hashable
 from typing import TYPE_CHECKING, cast
 
 import numpy as np
@@ -29,7 +30,7 @@
 
 def diff_chunks(
-    comparison: dict[tuple[str, str], bool], tree1: DataTree, tree2: DataTree
+    comparison: dict[tuple[str, Hashable], bool], tree1: DataTree, tree2: DataTree
 ) -> str:
     mismatching_variables = [loc for loc, equals in comparison.items() if not equals]
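Taken together, the series enables the inspect-then-assemble workflow the ``open_groups`` docstring recommends; a sketch under the assumption of a store whose groups are already alignable (path and engine illustrative):

    import xarray as xr

    # Inspect every group lazily first ...
    groups = xr.open_groups("example.zarr", engine="zarr", chunks={})
    for path, ds in groups.items():
        print(path, dict(ds.sizes))

    # ... apply whatever fixes alignment requires, then assemble the tree.
    tree = xr.DataTree.from_dict(groups)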