From 5b62b7c42724a85d038d2eea64a9782e984abb83 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Wed, 8 Nov 2023 23:41:43 -0500 Subject: [PATCH 1/9] implement auto region and transpose --- xarray/backends/api.py | 46 ++++++++++++++-- xarray/backends/zarr.py | 19 ++++--- xarray/tests/test_backends.py | 99 +++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 10 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 27e155872de..4307c807cff 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -27,6 +27,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler +from xarray.backends.zarr import open_zarr from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -1446,6 +1447,44 @@ def save_mfdataset( ) +def _auto_detect_region(ds_new, ds_orig, dim): + # Create a mapping array of coordinates to indices on the original array + coord = ds_orig[dim] + da_map = DataArray(np.arange(coord.size), coords={dim: coord}) + + try: + da_idxs = da_map.sel({dim: ds_new[dim]}) + except KeyError as e: + if "not all values found" in str(e): + raise KeyError( + f"Not all values of coordinate '{dim}' in the new array were" + " found in the original store. Writing to a zarr region slice" + " requires that no dimensions or metadata are changed by the write." + ) + else: + raise e + + if (da_idxs.diff(dim) != 1).any(): + raise ValueError( + f"The auto-detected region of coordinate '{dim}' for writing new data" + " to the original store had non-contiguous indices. Writing to a zarr" + " region slice requires that the new data constitute a contiguous subset" + " of the original store." + ) + + dim_slice = slice(da_idxs.values[0], da_idxs.values[-1] + 1) + + return dim_slice + + +def _auto_detect_regions(ds, region, store): + ds_original = open_zarr(store) + for key, val in region.items(): + if val == "auto": + region[key] = _auto_detect_region(ds, ds_original, key) + return region + + def _validate_region(ds, region): if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") @@ -1532,7 +1571,7 @@ def to_zarr( compute: Literal[True] = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1556,7 +1595,7 @@ def to_zarr( compute: Literal[False], consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1578,7 +1617,7 @@ def to_zarr( compute: bool = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1643,6 +1682,7 @@ def to_zarr( _validate_dataset_names(dataset) if region is not None: + region = _auto_detect_regions(dataset, region, store) _validate_region(dataset, region) if append_dim is not None and append_dim in region: raise ValueError( diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2b41fa5224e..2dd620e0244 100644 --- 
a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -322,12 +322,15 @@ def encode_zarr_variable(var, needs_copy=True, name=None): def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim): if new_var.dims != existing_var.dims: - raise ValueError( - f"variable {var_name!r} already exists with different " - f"dimension names {existing_var.dims} != " - f"{new_var.dims}, but changing variable " - f"dimensions is not supported by to_zarr()." - ) + if set(existing_var.dims) == set(new_var.dims): + new_var = new_var.transpose(*existing_var.dims) + else: + raise ValueError( + f"variable {var_name!r} already exists with different " + f"dimension names {existing_var.dims} != " + f"{new_var.dims}, but changing variable " + f"dimensions is not supported by to_zarr()." + ) existing_sizes = {} for dim, size in existing_var.sizes.items(): @@ -347,6 +350,8 @@ def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim) f"explicitly appending, but append_dim={append_dim!r}." ) + return new_var + def _put_attrs(zarr_obj, attrs): """Raise a more informative error message for invalid attrs.""" @@ -616,7 +621,7 @@ def store( for var_name in existing_variable_names: new_var = variables_encoded[var_name] existing_var = existing_vars[var_name] - _validate_existing_dims( + new_var = _validate_existing_dims( var_name, new_var, existing_var, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 73352c3f7e1..4d30c7b638b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5431,3 +5431,102 @@ def test_raise_writing_to_nczarr(self, mode) -> None: def test_pickle_open_mfdataset_dataset(): ds = open_example_mfdataset(["bears.nc"]) assert_identical(ds, pickle.loads(pickle.dumps(ds))) + + +@requires_zarr +class TestZarrRegionAuto: + def test_zarr_region_auto_success(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + + ds_updated = xr.open_zarr(tmp_path / "test.zarr") + + expected = ds.copy() + expected["test"][2:4, 6:8] += 1 + assert_identical(ds_updated, expected) + + def test_zarr_region_auto_noncontiguous(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=[0, 2, 3], y=[5, 6]) + with pytest.raises(ValueError): + ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + + def test_zarr_region_auto_new_coord_vals(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + x = np.arange(5, 55, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + with pytest.raises(KeyError): + ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + + +def 
test_zarr_region_transpose(tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=[0], y=[0]).transpose() + ds_region.to_zarr( + tmp_path / "test.zarr", region={"x": slice(0, 1), "y": slice(0, 1)} + ) From b0058ae668b058656e14cefa1511abf8099d6a9c Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Thu, 9 Nov 2023 11:02:57 -0500 Subject: [PATCH 2/9] fix validation --- xarray/backends/api.py | 10 +++++++--- xarray/backends/zarr.py | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4307c807cff..7b64c273d26 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1485,10 +1485,13 @@ def _auto_detect_regions(ds, region, store): return region -def _validate_region(ds, region): +def _validate_and_autodetect_region(ds, region, store) -> dict[str, slice]: if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") + if any(v == "auto" for v in region.values()): + region = _auto_detect_regions(ds, region, store) + for k, v in region.items(): if k not in ds.dims: raise ValueError( @@ -1520,6 +1523,8 @@ def _validate_region(ds, region): f".drop_vars({non_matching_vars!r})" ) + return region + def _validate_datatypes_for_zarr_append(zstore, dataset): """If variable exists in the store, confirm dtype of the data to append is compatible with @@ -1682,8 +1687,7 @@ def to_zarr( _validate_dataset_names(dataset) if region is not None: - region = _auto_detect_regions(dataset, region, store) - _validate_region(dataset, region) + region = _validate_and_autodetect_region(dataset, region, store) if append_dim is not None and append_dim in region: raise ValueError( f"cannot list the same dimension in both ``append_dim`` and " diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2dd620e0244..fc2d6433ccf 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -320,7 +320,9 @@ def encode_zarr_variable(var, needs_copy=True, name=None): return var -def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim): +def _validate_and_transpose_existing_dims( + var_name, new_var, existing_var, region, append_dim +): if new_var.dims != existing_var.dims: if set(existing_var.dims) == set(new_var.dims): new_var = new_var.transpose(*existing_var.dims) @@ -621,7 +623,7 @@ def store( for var_name in existing_variable_names: new_var = variables_encoded[var_name] existing_var = existing_vars[var_name] - new_var = _validate_existing_dims( + new_var = _validate_and_transpose_existing_dims( var_name, new_var, existing_var, From 17fbca9c7db07db70638c40f455b32803cb21530 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Thu, 9 Nov 2023 20:48:49 -0500 Subject: [PATCH 3/9] support str auto, Dataset doc string, and user-guide example --- doc/user-guide/io.rst | 13 ++++++++----- xarray/backends/api.py | 27 +++++++++++++++++++-------- xarray/core/dataset.py | 14 ++++++++++---- xarray/tests/test_backends.py | 30 ++++++++++++++++++++++++++++-- 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 9656a2ba973..4fb6ef4e3ec 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -876,17 +876,20 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata ds.to_zarr(path, 
compute=False) Now, a Zarr store with the correct variable shapes and attributes exists that -can be filled out by subsequent calls to ``to_zarr``. The ``region`` provides a -mapping from dimension names to Python ``slice`` objects indicating where the -data should be written (in index space, not coordinate space), e.g., +can be filled out by subsequent calls to ``to_zarr``. ``region`` can be +specified as ``"auto"``, which opens the existing store and determines the +correct alignment of the new data with the existing coordinates, or as an +explicit mapping from dimension names to Python ``slice`` objects indicating +where the data should be written (in index space, not coordinate space), e.g., .. ipython:: python # For convenience, we'll slice a single dataset, but in the real use-case # we would create them separately possibly even from separate processes. ds = xr.Dataset({"foo": ("x", np.arange(30))}) - ds.isel(x=slice(0, 10)).to_zarr(path, region={"x": slice(0, 10)}) - ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": slice(10, 20)}) + # Any of the following region specifications are valid + ds.isel(x=slice(0, 10)).to_zarr(path, region="auto") + ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"}) ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)}) Concurrent writes with ``region`` are safe as long as they modify distinct diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 7b64c273d26..6ba49f5a56e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1477,20 +1477,23 @@ def _auto_detect_region(ds_new, ds_orig, dim): return dim_slice -def _auto_detect_regions(ds, region, store): - ds_original = open_zarr(store) +def _auto_detect_regions(ds, region, open_kwargs): + ds_original = open_zarr(**open_kwargs) for key, val in region.items(): if val == "auto": region[key] = _auto_detect_region(ds, ds_original, key) return region -def _validate_and_autodetect_region(ds, region, store) -> dict[str, slice]: +def _validate_and_autodetect_region(ds, region, open_kwargs) -> dict[str, slice]: + if region == "auto": + region = {dim: "auto" for dim in ds.dims} + if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") if any(v == "auto" for v in region.values()): - region = _auto_detect_regions(ds, region, store) + region = _auto_detect_regions(ds, region, open_kwargs) for k, v in region.items(): if k not in ds.dims: @@ -1576,7 +1579,7 @@ def to_zarr( compute: Literal[True] = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice | Literal["auto"]] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1600,7 +1603,7 @@ def to_zarr( compute: Literal[False], consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice | Literal["auto"]] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -1622,7 +1625,7 @@ def to_zarr( compute: bool = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice | Literal["auto"]] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, 
zarr_version: int | None = None, @@ -1687,7 +1690,15 @@ def to_zarr( _validate_dataset_names(dataset) if region is not None: - region = _validate_and_autodetect_region(dataset, region, store) + open_kwargs = dict( + store=store, + synchronizer=synchronizer, + group=group, + consolidated=consolidated, + storage_options=storage_options, + zarr_version=zarr_version, + ) + region = _validate_and_autodetect_region(dataset, region, open_kwargs) if append_dim is not None and append_dim in region: raise ValueError( f"cannot list the same dimension in both ``append_dim`` and " diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 10deea5f62b..81ee33cb0e2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2305,7 +2305,7 @@ def to_zarr( compute: Literal[True] = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -2328,7 +2328,7 @@ def to_zarr( compute: Literal[False], consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -2349,7 +2349,7 @@ def to_zarr( compute: bool = True, consolidated: bool | None = None, append_dim: Hashable | None = None, - region: Mapping[str, slice] | None = None, + region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, @@ -2411,7 +2411,7 @@ def to_zarr( append_dim : hashable, optional If set, the dimension along which the data will be appended. All other dimensions on overridden variables must remain the same size. - region : dict, optional + region : dict or "auto", optional Optional mapping from dimension names to integer slices along dataset dimensions to indicate the region of existing zarr array(s) in which to write this dataset's data. For example, @@ -2419,6 +2419,12 @@ def to_zarr( that values should be written to the region ``0:1000`` along ``x`` and ``10000:11000`` along ``y``. + Can also specify ``"auto"``, in which case the existing store will be + opened and the region inferred by matching the new data's coordinates. + ``"auto"`` can be used as a single string, which will automatically infer + the region for all dimensions, or as dictionary values for specific + dimensions mixed together with explicit slices for other dimensions. 
+ Two restrictions apply to the use of ``region``: - If ``region`` is set, _all_ variables in a dataset must have at diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4d30c7b638b..8df5fb3b4b0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5435,7 +5435,7 @@ def test_pickle_open_mfdataset_dataset(): @requires_zarr class TestZarrRegionAuto: - def test_zarr_region_auto_success(self, tmp_path): + def test_zarr_region_auto_all(self, tmp_path): x = np.arange(0, 50, 10) y = np.arange(0, 20, 2) data = np.ones((5, 10)) @@ -5451,7 +5451,33 @@ def test_zarr_region_auto_success(self, tmp_path): ds.to_zarr(tmp_path / "test.zarr") ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) - ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + ds_region.to_zarr(tmp_path / "test.zarr", region="auto") + + ds_updated = xr.open_zarr(tmp_path / "test.zarr") + + expected = ds.copy() + expected["test"][2:4, 6:8] += 1 + assert_identical(ds_updated, expected) + + def test_zarr_region_auto_mixed(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + ds_region.to_zarr( + tmp_path / "test.zarr", region={"x": "auto", "y": slice(6, 8)} + ) ds_updated = xr.open_zarr(tmp_path / "test.zarr") From d10c0296b727d9cdb4931e33ac24b94ad080ae1a Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Thu, 9 Nov 2023 21:00:14 -0500 Subject: [PATCH 4/9] add whats new entry --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e28177814b7..18d9c698b7f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,6 +26,10 @@ New Features By `Deepak Cherian `_. (:issue:`7764`, :pull:`8373`). - Add ``DataArray.dt.total_seconds()`` method to match the Pandas API. (:pull:`8435`). By `Ben Mares `_. +- Allow passing ``region="auto"`` in :py:meth:`Dataset.to_zarr` to automatically infer the + region to write in the original store. Also implement automatic transpose when dimension + order does not match the original store. (:issue:`7702`, :issue:`8421`, :pull:`8434`). + By `Sam Levang `_. Breaking changes ~~~~~~~~~~~~~~~~ From 7e3419b9d375ca5e23665e6985317f0172363590 Mon Sep 17 00:00:00 2001 From: Sam Levang <39069044+slevang@users.noreply.github.com> Date: Thu, 9 Nov 2023 21:22:21 -0500 Subject: [PATCH 5/9] Update doc/user-guide/io.rst Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/user-guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 4fb6ef4e3ec..71270dffcba 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -880,7 +880,7 @@ can be filled out by subsequent calls to ``to_zarr``. ``region`` can be specified as ``"auto"``, which opens the existing store and determines the correct alignment of the new data with the existing coordinates, or as an explicit mapping from dimension names to Python ``slice`` objects indicating -where the data should be written (in index space, not coordinate space), e.g., +where the data should be written (in index space, not label space), e.g., .. 
ipython:: python From 5120f1f1ea65af6e5b54355ae86432b0922554e4 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Mon, 13 Nov 2023 21:22:59 -0500 Subject: [PATCH 6/9] drop indices and test that they are not written --- xarray/backends/api.py | 25 +++++++++++++++++++++--- xarray/backends/zarr.py | 5 ++++- xarray/tests/test_backends.py | 36 +++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 6ba49f5a56e..4faf15e36c0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -40,6 +40,7 @@ from xarray.core.indexes import Index from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.utils import is_remote_uri +from xarray.core.variable import IndexVariable if TYPE_CHECKING: try: @@ -1485,7 +1486,7 @@ def _auto_detect_regions(ds, region, open_kwargs): return region -def _validate_and_autodetect_region(ds, region, open_kwargs) -> dict[str, slice]: +def _validate_and_autodetect_region(ds, region, mode, open_kwargs) -> dict[str, slice]: if region == "auto": region = {dim: "auto" for dim in ds.dims} @@ -1493,7 +1494,14 @@ def _validate_and_autodetect_region(ds, region, open_kwargs) -> dict[str, slice] raise TypeError(f"``region`` must be a dict, got {type(region)}") if any(v == "auto" for v in region.values()): + region_was_autodetected = True + if mode != "r+": + raise ValueError( + f"``mode`` must be 'r+' when using ``region='auto'``, got {mode}" + ) region = _auto_detect_regions(ds, region, open_kwargs) + else: + region_was_autodetected = False for k, v in region.items(): if k not in ds.dims: @@ -1526,7 +1534,7 @@ def _validate_and_autodetect_region(ds, region, open_kwargs) -> dict[str, slice] f".drop_vars({non_matching_vars!r})" ) - return region + return region, region_was_autodetected def _validate_datatypes_for_zarr_append(zstore, dataset): @@ -1698,7 +1706,18 @@ def to_zarr( storage_options=storage_options, zarr_version=zarr_version, ) - region = _validate_and_autodetect_region(dataset, region, open_kwargs) + region, region_was_autodetected = _validate_and_autodetect_region( + dataset, region, mode, open_kwargs + ) + # drop indices to avoid potential race condition with auto region + if region_was_autodetected: + dataset = dataset.drop_vars( + [ + name + for name, v in dataset.variables.items() + if isinstance(v, IndexVariable) + ] + ) if append_dim is not None and append_dim in region: raise ValueError( f"cannot list the same dimension in both ``append_dim`` and " diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index fc2d6433ccf..6632e40cf6f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -349,7 +349,10 @@ def _validate_and_transpose_existing_dims( f"variable {var_name!r} already exists with different " f"dimension sizes: {existing_sizes} != {new_sizes}. " f"to_zarr() only supports changing dimension sizes when " - f"explicitly appending, but append_dim={append_dim!r}." + f"explicitly appending, but append_dim={append_dim!r}. " + f"If you are attempting to write to a subset of the " + f"existing store without changing dimension sizes, " + f"consider using the region argument in to_zarr()." 
) return new_var diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8df5fb3b4b0..cb1d2e59bd4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5536,7 +5536,43 @@ def test_zarr_region_auto_new_coord_vals(self, tmp_path): with pytest.raises(KeyError): ds_region.to_zarr(tmp_path / "test.zarr", region={"x": "auto", "y": "auto"}) + def test_zarr_region_index_write(self, tmp_path): + from xarray.backends.zarr import ZarrStore + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + + ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8)) + + ds.to_zarr(tmp_path / "test.zarr") + + with patch.object( + ZarrStore, + "set_variables", + side_effect=ZarrStore.set_variables, + autospec=True, + ) as mock: + ds_region.to_zarr(tmp_path / "test.zarr", region="auto", mode="r+") + + # should write the data vars but never the index vars with auto mode + for call in mock.call_args_list: + written_variables = call.args[1].keys() + assert "test" in written_variables + assert "x" not in written_variables + assert "y" not in written_variables + + +@requires_zarr def test_zarr_region_transpose(tmp_path): x = np.arange(0, 50, 10) y = np.arange(0, 20, 2) From 9eb1b58731eae3885700c066ff8d832d04ecea86 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Mon, 13 Nov 2023 21:37:32 -0500 Subject: [PATCH 7/9] test that auto append fails --- xarray/tests/test_backends.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b6188214b7e..1c8a24770d7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5350,6 +5350,40 @@ def test_zarr_region_index_write(self, tmp_path): assert "x" not in written_variables assert "y" not in written_variables + def test_zarr_region_append(self, tmp_path): + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) + data = np.ones((5, 10)) + ds = xr.Dataset( + { + "test": xr.DataArray( + data, + dims=("x", "y"), + coords={"x": x, "y": y}, + ) + } + ) + ds.to_zarr(tmp_path / "test.zarr") + + x_new = np.arange(40, 70, 10) + data_new = np.ones((3, 10)) + ds_new = xr.Dataset( + { + "test": xr.DataArray( + data_new, + dims=("x", "y"), + coords={"x": x_new, "y": y}, + ) + } + ) + + # Don't allow auto region detection in append mode due to complexities in + # implementing the overlap logic and lack of safety with parallel writes + with pytest.raises(ValueError): + ds_new.to_zarr( + tmp_path / "test.zarr", mode="a", append_dim="x", region="auto" + ) + @requires_zarr def test_zarr_region_transpose(tmp_path): From e1a2cb3ce8ec3b49c6d15ca5f3d979889a900e73 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Mon, 13 Nov 2023 22:09:38 -0500 Subject: [PATCH 8/9] more concise indexes detection --- xarray/backends/api.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1bee97ed1a0..d732bfd3978 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -40,7 +40,6 @@ from xarray.core.indexes import Index from xarray.core.parallelcompat import guess_chunkmanager from xarray.core.utils import is_remote_uri -from xarray.core.variable import IndexVariable if TYPE_CHECKING: try: @@ -1708,13 +1707,7 @@ def to_zarr( ) # drop indices to avoid potential race condition with auto region if region_was_autodetected: - dataset = 
dataset.drop_vars( - [ - name - for name, v in dataset.variables.items() - if isinstance(v, IndexVariable) - ] - ) + dataset = dataset.drop_vars(dataset.indexes) if append_dim is not None and append_dim in region: raise ValueError( f"cannot list the same dimension in both ``append_dim`` and " From d4b8a0d227ccdf3febe2bd3834c73b171057a194 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Mon, 13 Nov 2023 23:01:53 -0500 Subject: [PATCH 9/9] fix typing --- xarray/backends/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d732bfd3978..3e6d00a8059 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1482,7 +1482,9 @@ def _auto_detect_regions(ds, region, open_kwargs): return region -def _validate_and_autodetect_region(ds, region, mode, open_kwargs) -> dict[str, slice]: +def _validate_and_autodetect_region( + ds, region, mode, open_kwargs +) -> tuple[dict[str, slice], bool]: if region == "auto": region = {dim: "auto" for dim in ds.dims}
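
Taken together, these patches let ``to_zarr`` infer the target region from the new data's coordinates and transpose dimension order to match the existing store. Below is a minimal usage sketch, mirroring the new tests and the updated ``doc/user-guide/io.rst`` example; the store path ``"test.zarr"`` is illustrative only:

import numpy as np
import xarray as xr

path = "test.zarr"  # assumed local store path, for illustration

# Write the full store once.
x = np.arange(0, 50, 10)
y = np.arange(0, 20, 2)
ds = xr.Dataset(
    {"test": xr.DataArray(np.ones((5, 10)), dims=("x", "y"), coords={"x": x, "y": y})}
)
ds.to_zarr(path)

# Later, overwrite a contiguous subset. With region="auto" the existing store is
# opened and the slices are inferred by matching the subset's coordinate values;
# index variables are dropped from the write to avoid racing on the coordinates.
ds_region = 1 + ds.isel(x=slice(2, 4), y=slice(6, 8))
ds_region.to_zarr(path, region="auto")

# "auto" can also be mixed with explicit slices on a per-dimension basis.
ds_region.to_zarr(path, region={"x": "auto", "y": slice(6, 8)})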