Skip to content

Combine by coords dataarray bugfix #5834

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Oct 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ Bug fixes
- Fixed performance bug where ``cftime`` import attempted within various core operations if ``cftime`` not
installed (:pull:`5640`).
By `Luke Sewell <https://github.com/lusewell>`_
- Fixed bug when combining named DataArrays using :py:func:`combine_by_coords`. (:pull:`5834`).
By `Tom Nicholas <https://github.com/TomNicholas>`_.
- When a custom engine was used in :py:func:`~xarray.open_dataset` the engine
wasn't initialized properly, causing missing argument errors or inconsistent
method signatures. (:pull:`5684`)
Expand Down
104 changes: 79 additions & 25 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ def combine_by_coords(
Attempt to auto-magically combine the given datasets (or data arrays)
into one by using dimension coordinates.

This method attempts to combine a group of datasets along any number of
This function attempts to combine a group of datasets along any number of
dimensions into a single entity by inspecting coords and metadata and using
a combination of concat and merge.

Expand Down Expand Up @@ -765,6 +765,8 @@ def combine_by_coords(
Returns
-------
combined : xarray.Dataset or xarray.DataArray
Will return a Dataset unless all the inputs are unnamed DataArrays, in which case a
DataArray will be returned.

See also
--------
Expand Down Expand Up @@ -870,6 +872,50 @@ def combine_by_coords(
Data variables:
temperature (y, x) float64 10.98 14.3 12.06 nan ... 18.89 10.44 8.293
precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176

You can also combine DataArray objects, but the behaviour will differ depending on
whether or not the DataArrays are named. If all DataArrays are named then they will
be promoted to Datasets before combining, and then the resultant Dataset will be
returned, e.g.

>>> named_da1 = xr.DataArray(
... name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"
... )
>>> named_da1
<xarray.DataArray 'a' (x: 2)>
array([1., 2.])
Coordinates:
* x (x) int64 0 1

>>> named_da2 = xr.DataArray(
... name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"
... )
>>> named_da2
<xarray.DataArray 'a' (x: 2)>
array([3., 4.])
Coordinates:
* x (x) int64 2 3

>>> xr.combine_by_coords([named_da1, named_da2])
<xarray.Dataset>
Dimensions: (x: 4)
Coordinates:
* x (x) int64 0 1 2 3
Data variables:
a (x) float64 1.0 2.0 3.0 4.0

If all the DataArrays are unnamed, a single DataArray will be returned, e.g.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
If all the DataArrays are unnamed, a single DataArray will be returned, e.g.
If all the DataArrays are unnamed, a single DataArray will be returned, e.g.

Maybe it's complaining about the missing newline?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That fixed it!


>>> unnamed_da1 = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
>>> unnamed_da2 = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
>>> xr.combine_by_coords([unnamed_da1, unnamed_da2])
<xarray.DataArray (x: 4)>
array([1., 2., 3., 4.])
Coordinates:
* x (x) int64 0 1 2 3

Finally, if you attempt to combine a mix of unnamed DataArrays with either named
DataArrays or Datasets, a ValueError will be raised (as this is an ambiguous operation).
"""

# TODO remove after version 0.21, see PR4696
Expand All @@ -883,33 +929,41 @@ def combine_by_coords(
if not data_objects:
return Dataset()

mixed_arrays_and_datasets = any(
objs_are_unnamed_dataarrays = [
isinstance(data_object, DataArray) and data_object.name is None
for data_object in data_objects
) and any(isinstance(data_object, Dataset) for data_object in data_objects)
if mixed_arrays_and_datasets:
raise ValueError("Can't automatically combine datasets with unnamed arrays.")

all_unnamed_data_arrays = all(
isinstance(data_object, DataArray) and data_object.name is None
for data_object in data_objects
)
if all_unnamed_data_arrays:
unnamed_arrays = data_objects
temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]

combined_temp_dataset = _combine_single_variable_hypercube(
temp_datasets,
fill_value=fill_value,
data_vars=data_vars,
coords=coords,
compat=compat,
join=join,
combine_attrs=combine_attrs,
)
return DataArray()._from_temp_dataset(combined_temp_dataset)

]
if any(objs_are_unnamed_dataarrays):
if all(objs_are_unnamed_dataarrays):
# Combine into a single larger DataArray
temp_datasets = [
unnamed_dataarray._to_temp_dataset()
for unnamed_dataarray in data_objects
]

combined_temp_dataset = _combine_single_variable_hypercube(
temp_datasets,
fill_value=fill_value,
data_vars=data_vars,
coords=coords,
compat=compat,
join=join,
combine_attrs=combine_attrs,
)
return DataArray()._from_temp_dataset(combined_temp_dataset)
else:
# Must be a mix of unnamed dataarrays with either named dataarrays or with datasets
# Can't combine these as we wouldn't know whether to merge or concatenate the arrays
raise ValueError(
"Can't automatically combine unnamed DataArrays with either named DataArrays or Datasets."
)
else:
# Promote any named DataArrays to single-variable Datasets to simplify combining
data_objects = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if the input is all named DataArrays with the same name, the output is a Dataset? Or is that handled somplace else?

Copy link
Member Author

@TomNicholas TomNicholas Oct 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't think of that case, but yes the output is a Dataset. I think that makes sense, because it still matches what the docstring says about the return type, and it also matches the results of merge([named_da, named_da]). (Not coincidentally, because delegating to merge is what combine is doing under the hood in this case.)

Docstring says:

Returns
-------
combined : xarray.Dataset or xarray.DataArray
    Will return a Dataset unless all the inputs are unnamed DataArrays, in which case a
    DataArray will be returned.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add some examples in the docstring to illustrate this?

obj.to_dataset() if isinstance(obj, DataArray) else obj
for obj in data_objects
]

# Group by data vars
sorted_datasets = sorted(data_objects, key=vars_as_keys)
grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
Expand Down
71 changes: 58 additions & 13 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
combine_by_coords,
combine_nested,
concat,
merge,
)
from xarray.core import dtypes
from xarray.core.combine import (
Expand Down Expand Up @@ -688,7 +689,7 @@ def test_nested_combine_mixed_datasets_arrays(self):
combine_nested(objs, "x")


class TestCombineAuto:
class TestCombineDatasetsbyCoords:
def test_combine_by_coords(self):
objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
actual = combine_by_coords(objs)
Expand Down Expand Up @@ -730,17 +731,6 @@ def test_combine_by_coords(self):
def test_empty_input(self):
assert_identical(Dataset(), combine_by_coords([]))

def test_combine_coords_mixed_datasets_arrays(self):
objs = [
DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
Dataset({"x": [2, 3]}),
]
with pytest.raises(
ValueError,
match=r"Can't automatically combine datasets with unnamed arrays.",
):
combine_by_coords(objs)

@pytest.mark.parametrize(
"join, expected",
[
Expand Down Expand Up @@ -1044,7 +1034,35 @@ def test_combine_by_coords_incomplete_hypercube(self):
with pytest.raises(ValueError):
combine_by_coords([x1, x2, x3], fill_value=None)

def test_combine_by_coords_unnamed_arrays(self):

class TestCombineMixedObjectsbyCoords:
def test_combine_by_coords_mixed_unnamed_dataarrays(self):
named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
unnamed_da = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

with pytest.raises(
ValueError, match="Can't automatically combine unnamed DataArrays with"
):
combine_by_coords([named_da, unnamed_da])

da = DataArray([0, 1], dims="x", coords=({"x": [0, 1]}))
ds = Dataset({"x": [2, 3]})
with pytest.raises(
ValueError,
match="Can't automatically combine unnamed DataArrays with",
):
combine_by_coords([da, ds])

def test_combine_coords_mixed_datasets_named_dataarrays(self):
da = DataArray(name="a", data=[4, 5], dims="x", coords=({"x": [0, 1]}))
ds = Dataset({"b": ("x", [2, 3])})
actual = combine_by_coords([da, ds])
expected = Dataset(
{"a": ("x", [4, 5]), "b": ("x", [2, 3])}, coords={"x": ("x", [0, 1])}
)
assert_identical(expected, actual)

def test_combine_by_coords_all_unnamed_dataarrays(self):
unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")

actual = combine_by_coords([unnamed_array])
Expand All @@ -1060,6 +1078,33 @@ def test_combine_by_coords_unnamed_arrays(self):
)
assert_identical(expected, actual)

def test_combine_by_coords_all_named_dataarrays(self):
named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")

actual = combine_by_coords([named_da])
expected = named_da.to_dataset()
assert_identical(expected, actual)

named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

actual = combine_by_coords([named_da1, named_da2])
expected = Dataset(
{
"a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"),
"b": DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"),
}
)
assert_identical(expected, actual)

def test_combine_by_coords_all_dataarrays_with_the_same_name(self):
named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
named_da2 = DataArray(name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")

actual = combine_by_coords([named_da1, named_da2])
expected = merge([named_da1, named_da2])
assert_identical(expected, actual)


@requires_cftime
def test_combine_by_coords_distant_cftime_dates():
Expand Down