From 4673c787ad694c0a03148a66cc2e6e2aae57851a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Gunyh=C3=B3?= <20118130+mgunyho@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:33:41 +0200 Subject: [PATCH 1/5] Add tests for issue #7823 --- xarray/tests/test_dataarray.py | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 1fbb834b679..8c9d13d8596 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3822,6 +3822,51 @@ def test_to_dataset_retains_keys(self) -> None: assert_equal(array, result) + def test_to_dataset_coord_value_is_dim(self) -> None: + # github issue #7823 + + array = DataArray( + np.zeros((3, 3)), + coords={ + # 'a' is both a coordinate value and the name of a coordinate + "x": ["a", "b", "c"], + "a": [1, 2, 3], + }, + ) + + with pytest.raises( + ValueError, + match=( + re.escape("dimension 'x' would produce the variables ('a',)") + + ".*" + + re.escape("DataArray.rename(a=...) or DataArray.assign_coords(x=...)") + ), + ): + array.to_dataset("x") + + # test error message formatting when there are multiple ambiguous + # values/coordinates + array2 = DataArray( + np.zeros((3, 3, 2)), + coords={ + "x": ["a", "b", "c"], + "a": [1, 2, 3], + "b": [0.0, 0.1], + }, + ) + + with pytest.raises( + ValueError, + match=( + re.escape("dimension 'x' would produce the variables ('a', 'b')") + + ".*" + + re.escape( + "DataArray.rename(a=..., b=...) or DataArray.assign_coords(x=...)" + ) + ), + ): + array2.to_dataset("x") + def test__title_for_slice(self) -> None: array = DataArray( np.ones((4, 3, 2)), From 303e88f473eecf2e8d7093c28b88d51e712ae3a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Gunyh=C3=B3?= <20118130+mgunyho@users.noreply.github.com> Date: Tue, 14 Nov 2023 08:47:58 +0200 Subject: [PATCH 2/5] Use 2D array to properly test to_dataset error handling logic --- xarray/tests/test_dataarray.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8c9d13d8596..67165495a11 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3793,8 +3793,16 @@ def test_to_dataset_whole(self) -> None: actual = named.to_dataset("bar") def test_to_dataset_split(self) -> None: - array = DataArray([1, 2, 3], coords=[("x", list("abc"))], attrs={"a": 1}) - expected = Dataset({"a": 1, "b": 2, "c": 3}, attrs={"a": 1}) + array = DataArray( + [[1, 2], [3, 4], [5, 6]], + coords=[("x", list("abc")), ("y", [0.0, 0.1])], + attrs={"a": 1}, + ) + expected = Dataset( + {"a": ("y", [1, 2]), "b": ("y", [3, 4]), "c": ("y", [5, 6])}, + coords={"y": [0.0, 0.1]}, + attrs={"a": 1}, + ) actual = array.to_dataset("x") assert_identical(expected, actual) From 08a57ba953edb564593fd3e7feebf6a862e35a2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Gunyh=C3=B3?= <20118130+mgunyho@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:35:59 +0200 Subject: [PATCH 3/5] Raise exception if a variable is also a coordinate in to_dataset Co-authored-by: Deepak Cherian --- xarray/core/dataarray.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 27eb3cdfddc..904286d6792 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -579,9 +579,22 @@ def subset(dim, label): array.attrs = {} return as_variable(array) - variables = {label: subset(dim, label) for label in self.get_index(dim)} - variables.update({k: v for k, v in self._coords.items() if k != dim}) + variables_from_split = { + label: subset(dim, label) for label in self.get_index(dim) + } coord_names = set(self._coords) - {dim} + + ambiguous_vars = set(variables_from_split) & coord_names + if ambiguous_vars: + rename_msg_fmt = ", ".join([f"{v}=..." for v in sorted(ambiguous_vars)]) + raise ValueError( + f"Splitting along the dimension {dim!r} would produce the variables " + f"{tuple(sorted(ambiguous_vars))} which are also existing coordinate " + f"variables. Use DataArray.rename({rename_msg_fmt}) or " + f"DataArray.assign_coords({dim}=...) to resolve this ambiguity." + ) + + variables = variables_from_split | {c: self._coords[c] for c in coord_names} indexes = filter_indexes_from_coords(self._indexes, coord_names) dataset = Dataset._construct_direct( variables, coord_names, indexes=indexes, attrs=self.attrs From 2042ef90a91947de7b9f8bad7bde5d7e4b474c7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Gunyh=C3=B3?= <20118130+mgunyho@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:42:36 +0200 Subject: [PATCH 4/5] Update whats-new --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4da1d45a3dd..80788255027 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -65,6 +65,9 @@ Bug fixes - Fix to once again support date offset strings as input to the loffset parameter of resample and test this functionality (:pull:`8422`, :issue:`8399`). By `Katelyn FitzGerald `_. +- Fix a bug where :py:meth:`DataArray.to_dataset` silently drops a variable + if a coordinate with the same name already exists (:pull:`8433`, :issue:`7823`). + By `András Gunyhó `_. Documentation ~~~~~~~~~~~~~ From a9a45bb8e39a302aca20aec27be3037b3ba1479c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Gunyh=C3=B3?= <20118130+mgunyho@users.noreply.github.com> Date: Tue, 14 Nov 2023 22:13:42 +0200 Subject: [PATCH 5/5] Ensure that coordinates are in the original order --- xarray/core/dataarray.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 904286d6792..014a3533555 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -594,7 +594,9 @@ def subset(dim, label): f"DataArray.assign_coords({dim}=...) to resolve this ambiguity." ) - variables = variables_from_split | {c: self._coords[c] for c in coord_names} + variables = variables_from_split | { + k: v for k, v in self._coords.items() if k != dim + } indexes = filter_indexes_from_coords(self._indexes, coord_names) dataset = Dataset._construct_direct( variables, coord_names, indexes=indexes, attrs=self.attrs