Skip to content

Commit 867646f

Browse files
TomNicholas, mathause, dcherian, Illviljan
authored
Combine by coords dataarray bugfix (#5834)
* fixed bug
* added tests + reorganised slightly
* clarified logic for dealing with mixed sets of objects
* removed commented out old code
* recorded bugfix in whatsnew
* Update doc/whats-new.rst
  Co-authored-by: Mathias Hauser <[email protected]>
* Update xarray/core/combine.py
  Co-authored-by: Mathias Hauser <[email protected]>
* removed pointless renaming
* update tests to look for capitalized error message
* clarified return type in docstring
* added test for combining two dataarrays with the same name
* Update xarray/tests/test_combine.py
  Co-authored-by: Deepak Cherian <[email protected]>
* Update doc/whats-new.rst
  Co-authored-by: Deepak Cherian <[email protected]>
* added examples to docstrings
* correct docstring example
* re-trigger CI
* Update xarray/core/combine.py
  Co-authored-by: Illviljan <[email protected]>

Co-authored-by: Mathias Hauser <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: Illviljan <[email protected]>
1 parent 1d94b1e commit 867646f

File tree

3 files changed

+139
-38
lines changed

3 files changed

+139
-38
lines changed

doc/whats-new.rst

+2
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,8 @@ Bug fixes
8080
- Fixed performance bug where a ``cftime`` import was attempted within various core operations if ``cftime`` was not
8181
installed (:pull:`5640`).
8282
By `Luke Sewell <https://github.com/lusewell>`_
83+
- Fixed bug when combining named DataArrays using :py:func:`combine_by_coords` (:pull:`5834`).
84+
By `Tom Nicholas <https://github.com/TomNicholas>`_.
8385
- When a custom engine was used in :py:func:`~xarray.open_dataset` the engine
8486
wasn't initialized properly, causing missing argument errors or inconsistent
8587
method signatures. (:pull:`5684`)

xarray/core/combine.py

+79-25
Original file line number | Diff line number | Diff line change
@@ -673,7 +673,7 @@ def combine_by_coords(
673673
Attempt to auto-magically combine the given datasets (or data arrays)
674674
into one by using dimension coordinates.
675675
676-
This method attempts to combine a group of datasets along any number of
676+
This function attempts to combine a group of datasets along any number of
677677
dimensions into a single entity by inspecting coords and metadata and using
678678
a combination of concat and merge.
679679
@@ -765,6 +765,8 @@ def combine_by_coords(
765765
Returns
766766
-------
767767
combined : xarray.Dataset or xarray.DataArray
768+
Will return a Dataset unless all the inputs are unnamed DataArrays, in which case a
769+
DataArray will be returned.
768770
769771
See also
770772
--------
@@ -870,6 +872,50 @@ def combine_by_coords(
870872
Data variables:
871873
temperature (y, x) float64 10.98 14.3 12.06 nan ... 18.89 10.44 8.293
872874
precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
875+
876+
You can also combine DataArray objects, but the behaviour will differ depending on
877+
whether or not the DataArrays are named. If all DataArrays are named then they will
878+
be promoted to Datasets before combining, and then the resultant Dataset will be
879+
returned, e.g.
880+
881+
>>> named_da1 = xr.DataArray(
882+
... name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"
883+
... )
884+
>>> named_da1
885+
<xarray.DataArray 'a' (x: 2)>
886+
array([1., 2.])
887+
Coordinates:
888+
* x (x) int64 0 1
889+
890+
>>> named_da2 = xr.DataArray(
891+
... name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"
892+
... )
893+
>>> named_da2
894+
<xarray.DataArray 'a' (x: 2)>
895+
array([3., 4.])
896+
Coordinates:
897+
* x (x) int64 2 3
898+
899+
>>> xr.combine_by_coords([named_da1, named_da2])
900+
<xarray.Dataset>
901+
Dimensions: (x: 4)
902+
Coordinates:
903+
* x (x) int64 0 1 2 3
904+
Data variables:
905+
a (x) float64 1.0 2.0 3.0 4.0
906+
907+
If all the DataArrays are unnamed, a single DataArray will be returned, e.g.
908+
909+
>>> unnamed_da1 = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
910+
>>> unnamed_da2 = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
911+
>>> xr.combine_by_coords([unnamed_da1, unnamed_da2])
912+
<xarray.DataArray (x: 4)>
913+
array([1., 2., 3., 4.])
914+
Coordinates:
915+
* x (x) int64 0 1 2 3
916+
917+
Finally, if you attempt to combine a mix of unnamed DataArrays with either named
918+
DataArrays or Datasets, a ValueError will be raised (as this is an ambiguous operation).
873919
"""
874920

875921
# TODO remove after version 0.21, see PR4696
@@ -883,33 +929,41 @@ def combine_by_coords(
883929
if not data_objects:
884930
return Dataset()
885931

886-
mixed_arrays_and_datasets = any(
932+
objs_are_unnamed_dataarrays = [
887933
isinstance(data_object, DataArray) and data_object.name is None
888934
for data_object in data_objects
889-
) and any(isinstance(data_object, Dataset) for data_object in data_objects)
890-
if mixed_arrays_and_datasets:
891-
raise ValueError("Can't automatically combine datasets with unnamed arrays.")
892-
893-
all_unnamed_data_arrays = all(
894-
isinstance(data_object, DataArray) and data_object.name is None
895-
for data_object in data_objects
896-
)
897-
if all_unnamed_data_arrays:
898-
unnamed_arrays = data_objects
899-
temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]
900-
901-
combined_temp_dataset = _combine_single_variable_hypercube(
902-
temp_datasets,
903-
fill_value=fill_value,
904-
data_vars=data_vars,
905-
coords=coords,
906-
compat=compat,
907-
join=join,
908-
combine_attrs=combine_attrs,
909-
)
910-
return DataArray()._from_temp_dataset(combined_temp_dataset)
911-
935+
]
936+
if any(objs_are_unnamed_dataarrays):
937+
if all(objs_are_unnamed_dataarrays):
938+
# Combine into a single larger DataArray
939+
temp_datasets = [
940+
unnamed_dataarray._to_temp_dataset()
941+
for unnamed_dataarray in data_objects
942+
]
943+
944+
combined_temp_dataset = _combine_single_variable_hypercube(
945+
temp_datasets,
946+
fill_value=fill_value,
947+
data_vars=data_vars,
948+
coords=coords,
949+
compat=compat,
950+
join=join,
951+
combine_attrs=combine_attrs,
952+
)
953+
return DataArray()._from_temp_dataset(combined_temp_dataset)
954+
else:
955+
# Must be a mix of unnamed dataarrays with either named dataarrays or with datasets
956+
# Can't combine these as we wouldn't know whether to merge or concatenate the arrays
957+
raise ValueError(
958+
"Can't automatically combine unnamed DataArrays with either named DataArrays or Datasets."
959+
)
912960
else:
961+
# Promote any named DataArrays to single-variable Datasets to simplify combining
962+
data_objects = [
963+
obj.to_dataset() if isinstance(obj, DataArray) else obj
964+
for obj in data_objects
965+
]
966+
913967
# Group by data vars
914968
sorted_datasets = sorted(data_objects, key=vars_as_keys)
915969
grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)

xarray/tests/test_combine.py

+58-13
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
combine_by_coords,
1313
combine_nested,
1414
concat,
15+
merge,
1516
)
1617
from xarray.core import dtypes
1718
from xarray.core.combine import (
@@ -688,7 +689,7 @@ def test_nested_combine_mixed_datasets_arrays(self):
688689
combine_nested(objs, "x")
689690

690691

691-
class TestCombineAuto:
692+
class TestCombineDatasetsbyCoords:
692693
def test_combine_by_coords(self):
693694
objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
694695
actual = combine_by_coords(objs)
@@ -730,17 +731,6 @@ def test_combine_by_coords(self):
730731
def test_empty_input(self):
731732
assert_identical(Dataset(), combine_by_coords([]))
732733

733-
def test_combine_coords_mixed_datasets_arrays(self):
734-
objs = [
735-
DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
736-
Dataset({"x": [2, 3]}),
737-
]
738-
with pytest.raises(
739-
ValueError,
740-
match=r"Can't automatically combine datasets with unnamed arrays.",
741-
):
742-
combine_by_coords(objs)
743-
744734
@pytest.mark.parametrize(
745735
"join, expected",
746736
[
@@ -1044,7 +1034,35 @@ def test_combine_by_coords_incomplete_hypercube(self):
10441034
with pytest.raises(ValueError):
10451035
combine_by_coords([x1, x2, x3], fill_value=None)
10461036

1047-
def test_combine_by_coords_unnamed_arrays(self):
1037+
1038+
class TestCombineMixedObjectsbyCoords:
1039+
def test_combine_by_coords_mixed_unnamed_dataarrays(self):
1040+
named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
1041+
unnamed_da = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
1042+
1043+
with pytest.raises(
1044+
ValueError, match="Can't automatically combine unnamed DataArrays with"
1045+
):
1046+
combine_by_coords([named_da, unnamed_da])
1047+
1048+
da = DataArray([0, 1], dims="x", coords=({"x": [0, 1]}))
1049+
ds = Dataset({"x": [2, 3]})
1050+
with pytest.raises(
1051+
ValueError,
1052+
match="Can't automatically combine unnamed DataArrays with",
1053+
):
1054+
combine_by_coords([da, ds])
1055+
1056+
def test_combine_coords_mixed_datasets_named_dataarrays(self):
1057+
da = DataArray(name="a", data=[4, 5], dims="x", coords=({"x": [0, 1]}))
1058+
ds = Dataset({"b": ("x", [2, 3])})
1059+
actual = combine_by_coords([da, ds])
1060+
expected = Dataset(
1061+
{"a": ("x", [4, 5]), "b": ("x", [2, 3])}, coords={"x": ("x", [0, 1])}
1062+
)
1063+
assert_identical(expected, actual)
1064+
1065+
def test_combine_by_coords_all_unnamed_dataarrays(self):
10481066
unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
10491067

10501068
actual = combine_by_coords([unnamed_array])
@@ -1060,6 +1078,33 @@ def test_combine_by_coords_unnamed_arrays(self):
10601078
)
10611079
assert_identical(expected, actual)
10621080

1081+
def test_combine_by_coords_all_named_dataarrays(self):
1082+
named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
1083+
1084+
actual = combine_by_coords([named_da])
1085+
expected = named_da.to_dataset()
1086+
assert_identical(expected, actual)
1087+
1088+
named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
1089+
named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
1090+
1091+
actual = combine_by_coords([named_da1, named_da2])
1092+
expected = Dataset(
1093+
{
1094+
"a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"),
1095+
"b": DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"),
1096+
}
1097+
)
1098+
assert_identical(expected, actual)
1099+
1100+
def test_combine_by_coords_all_dataarrays_with_the_same_name(self):
1101+
named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
1102+
named_da2 = DataArray(name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
1103+
1104+
actual = combine_by_coords([named_da1, named_da2])
1105+
expected = merge([named_da1, named_da2])
1106+
assert_identical(expected, actual)
1107+
10631108

10641109
@requires_cftime
10651110
def test_combine_by_coords_distant_cftime_dates():

0 commit comments

Comments
 (0)