Skip to content

Commit 47889ee

Browse files
authored
add a drop_conflicts strategy for merging attrs (#4827)
1 parent 59088a0 commit 47889ee

File tree

7 files changed

+247
-29
lines changed

7 files changed

+247
-29
lines changed

doc/whats-new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ New Features
6060
By `Maximilian Roos <https://github.com/max-sixty>`_.
6161

6262
- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
63+
By `Deepak Cherian <https://github.com/dcherian>`_.
64+
- Add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg
65+
(:issue:`4749`, :pull:`4827`).
66+
By `Justus Magin <https://github.com/keewis>`_.
6367
By `Deepak Cherian <https://github.com/dcherian>`_.
6468
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
6569
in the form of kwargs as well as a dict, like most similar methods.

xarray/core/combine.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -412,14 +412,16 @@ def combine_nested(
412412
- "override": if indexes are of same size, rewrite indexes to be
413413
those of the first object with that dimension. Indexes for the same
414414
dimension must have the same size in all objects.
415-
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
416-
default: "drop"
415+
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
416+
"override"}, default: "drop"
417417
String indicating how to combine attrs of the objects being merged:
418418
419419
- "drop": empty attrs on returned Dataset.
420420
- "identical": all attrs must be the same on every object.
421421
- "no_conflicts": attrs from all objects are combined, any that have
422422
the same name must also have the same value.
423+
- "drop_conflicts": attrs from all objects are combined, any that have
424+
the same name but different values are dropped.
423425
- "override": skip comparing and copy attrs from the first dataset to
424426
the result.
425427
@@ -625,14 +627,16 @@ def combine_by_coords(
625627
- "override": if indexes are of same size, rewrite indexes to be
626628
those of the first object with that dimension. Indexes for the same
627629
dimension must have the same size in all objects.
628-
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
629-
default: "drop"
630+
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
631+
"override"}, default: "drop"
630632
String indicating how to combine attrs of the objects being merged:
631633
632634
- "drop": empty attrs on returned Dataset.
633635
- "identical": all attrs must be the same on every object.
634636
- "no_conflicts": attrs from all objects are combined, any that have
635637
the same name must also have the same value.
638+
- "drop_conflicts": attrs from all objects are combined, any that have
639+
the same name but different values are dropped.
636640
- "override": skip comparing and copy attrs from the first dataset to
637641
the result.
638642

xarray/core/concat.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,16 @@ def concat(
142142
- "override": if indexes are of same size, rewrite indexes to be
143143
those of the first object with that dimension. Indexes for the same
144144
dimension must have the same size in all objects.
145-
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
146-
default: "override"
145+
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
146+
"override"}, default: "override"
147147
String indicating how to combine attrs of the objects being merged:
148148
149149
- "drop": empty attrs on returned Dataset.
150150
- "identical": all attrs must be the same on every object.
151151
- "no_conflicts": attrs from all objects are combined, any that have
152152
the same name must also have the same value.
153+
- "drop_conflicts": attrs from all objects are combined, any that have
154+
the same name but different values are dropped.
153155
- "override": skip comparing and copy attrs from the first dataset to
154156
the result.
155157

xarray/core/merge.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from . import dtypes, pdcompat
2121
from .alignment import deep_align
2222
from .duck_array_ops import lazy_array_equiv
23-
from .utils import Frozen, compat_dict_union, dict_equiv
23+
from .utils import Frozen, compat_dict_union, dict_equiv, equivalent
2424
from .variable import Variable, as_variable, assert_unique_multiindex_level_names
2525

2626
if TYPE_CHECKING:
@@ -513,6 +513,24 @@ def merge_attrs(variable_attrs, combine_attrs):
513513
"the same. Merging %s with %s" % (str(result), str(attrs))
514514
)
515515
return result
516+
elif combine_attrs == "drop_conflicts":
517+
result = {}
518+
dropped_keys = set()
519+
for attrs in variable_attrs:
520+
result.update(
521+
{
522+
key: value
523+
for key, value in attrs.items()
524+
if key not in result and key not in dropped_keys
525+
}
526+
)
527+
result = {
528+
key: value
529+
for key, value in result.items()
530+
if key not in attrs or equivalent(attrs[key], value)
531+
}
532+
dropped_keys |= {key for key in attrs if key not in result}
533+
return result
516534
elif combine_attrs == "identical":
517535
result = dict(variable_attrs[0])
518536
for attrs in variable_attrs[1:]:
@@ -556,7 +574,8 @@ def merge_core(
556574
Compatibility checks to use when merging variables.
557575
join : {"outer", "inner", "left", "right"}, optional
558576
How to combine objects with different indexes.
559-
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, optional
577+
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
578+
"override"}, optional
560579
How to combine attributes of objects
561580
priority_arg : int, optional
562581
Optional argument in `objects` that takes precedence over the others.
@@ -668,14 +687,16 @@ def merge(
668687
Value to use for newly missing values. If a dict-like, maps
669688
variable names to fill values. Use a data array's name to
670689
refer to its values.
671-
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
672-
default: "drop"
690+
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
691+
"override"}, default: "drop"
673692
String indicating how to combine attrs of the objects being merged:
674693
675694
- "drop": empty attrs on returned Dataset.
676695
- "identical": all attrs must be the same on every object.
677696
- "no_conflicts": attrs from all objects are combined, any that have
678697
the same name must also have the same value.
698+
- "drop_conflicts": attrs from all objects are combined, any that have
699+
the same name but different values are dropped.
679700
- "override": skip comparing and copy attrs from the first dataset to
680701
the result.
681702

xarray/tests/test_combine.py

+11
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,17 @@ def test_combine_coords_combine_attrs_identical(self):
732732
objs, concat_dim="x", join="outer", combine_attrs="identical"
733733
)
734734

735+
def test_combine_nested_combine_attrs_drop_conflicts(self):
736+
objs = [
737+
Dataset({"x": [0], "y": [0]}, attrs={"a": 1, "b": 2, "c": 3}),
738+
Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 0, "d": 3}),
739+
]
740+
expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "c": 3, "d": 3})
741+
actual = combine_nested(
742+
objs, concat_dim="x", join="outer", combine_attrs="drop_conflicts"
743+
)
744+
assert_identical(expected, actual)
745+
735746
def test_infer_order_from_coords(self):
736747
data = create_test_data()
737748
objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]

xarray/tests/test_concat.py

+110-19
Original file line numberDiff line numberDiff line change
@@ -258,27 +258,118 @@ def test_concat_join_kwarg(self):
258258
)
259259
assert_identical(actual, expected)
260260

261-
def test_concat_combine_attrs_kwarg(self):
262-
ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs={"b": 42})
263-
ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs={"b": 42, "c": 43})
264-
265-
expected = {}
266-
expected["drop"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]})
267-
expected["no_conflicts"] = Dataset(
268-
{"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42, "c": 43}
269-
)
270-
expected["override"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42})
271-
272-
with raises_regex(ValueError, "combine_attrs='identical'"):
273-
actual = concat([ds1, ds2], dim="x", combine_attrs="identical")
274-
with raises_regex(ValueError, "combine_attrs='no_conflicts'"):
275-
ds3 = ds2.copy(deep=True)
276-
ds3.attrs["b"] = 44
277-
actual = concat([ds1, ds3], dim="x", combine_attrs="no_conflicts")
261+
@pytest.mark.parametrize(
262+
"combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception",
263+
[
264+
(
265+
"no_conflicts",
266+
{"a": 1, "b": 2},
267+
{"a": 1, "c": 3},
268+
{"a": 1, "b": 2, "c": 3},
269+
False,
270+
),
271+
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
272+
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
273+
(
274+
"no_conflicts",
275+
{"a": 1, "b": 2},
276+
{"a": 4, "c": 3},
277+
{"a": 1, "b": 2, "c": 3},
278+
True,
279+
),
280+
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
281+
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
282+
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
283+
(
284+
"override",
285+
{"a": 1, "b": 2},
286+
{"a": 4, "b": 5, "c": 3},
287+
{"a": 1, "b": 2},
288+
False,
289+
),
290+
(
291+
"drop_conflicts",
292+
{"a": 41, "b": 42, "c": 43},
293+
{"b": 2, "c": 43, "d": 44},
294+
{"a": 41, "c": 43, "d": 44},
295+
False,
296+
),
297+
],
298+
)
299+
def test_concat_combine_attrs_kwarg(
300+
self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception
301+
):
302+
ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs=var1_attrs)
303+
ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs=var2_attrs)
304+
305+
if expect_exception:
306+
with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
307+
concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
308+
else:
309+
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
310+
expected = Dataset(
311+
{"a": ("x", [0, 0])}, {"x": [0, 1]}, attrs=expected_attrs
312+
)
278313

279-
for combine_attrs in expected:
314+
assert_identical(actual, expected)
315+
316+
@pytest.mark.skip(reason="not implemented, yet (see #4827)")
317+
@pytest.mark.parametrize(
318+
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
319+
[
320+
(
321+
"no_conflicts",
322+
{"a": 1, "b": 2},
323+
{"a": 1, "c": 3},
324+
{"a": 1, "b": 2, "c": 3},
325+
False,
326+
),
327+
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
328+
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
329+
(
330+
"no_conflicts",
331+
{"a": 1, "b": 2},
332+
{"a": 4, "c": 3},
333+
{"a": 1, "b": 2, "c": 3},
334+
True,
335+
),
336+
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
337+
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
338+
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
339+
(
340+
"override",
341+
{"a": 1, "b": 2},
342+
{"a": 4, "b": 5, "c": 3},
343+
{"a": 1, "b": 2},
344+
False,
345+
),
346+
(
347+
"drop_conflicts",
348+
{"a": 41, "b": 42, "c": 43},
349+
{"b": 2, "c": 43, "d": 44},
350+
{"a": 41, "c": 43, "d": 44},
351+
False,
352+
),
353+
],
354+
)
355+
def test_concat_combine_attrs_kwarg_variables(
356+
self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
357+
):
358+
"""check that combine_attrs is used on data variables and coords"""
359+
ds1 = Dataset({"a": ("x", [0], attrs1)}, coords={"x": ("x", [0], attrs1)})
360+
ds2 = Dataset({"a": ("x", [0], attrs2)}, coords={"x": ("x", [1], attrs2)})
361+
362+
if expect_exception:
363+
with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
364+
concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
365+
else:
280366
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
281-
assert_identical(actual, expected[combine_attrs])
367+
expected = Dataset(
368+
{"a": ("x", [0, 0], expected_attrs)},
369+
{"x": ("x", [0, 1], expected_attrs)},
370+
)
371+
372+
assert_identical(actual, expected)
282373

283374
def test_concat_promote_shape(self):
284375
# mixed dims within variables

xarray/tests/test_merge.py

+85
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,20 @@ def test_merge_arrays_attrs_default(self):
9292
{"a": 1, "b": 2},
9393
False,
9494
),
95+
(
96+
"drop_conflicts",
97+
{"a": 1, "b": 2, "c": 3},
98+
{"b": 1, "c": 3, "d": 4},
99+
{"a": 1, "c": 3, "d": 4},
100+
False,
101+
),
102+
(
103+
"drop_conflicts",
104+
{"a": 1, "b": np.array([2]), "c": np.array([3])},
105+
{"b": 1, "c": np.array([3]), "d": 4},
106+
{"a": 1, "c": np.array([3]), "d": 4},
107+
False,
108+
),
95109
],
96110
)
97111
def test_merge_arrays_attrs(
@@ -109,13 +123,84 @@ def test_merge_arrays_attrs(
109123
expected.attrs = expected_attrs
110124
assert_identical(actual, expected)
111125

126+
@pytest.mark.skip(reason="not implemented, yet (see #4827)")
127+
@pytest.mark.parametrize(
128+
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
129+
[
130+
(
131+
"no_conflicts",
132+
{"a": 1, "b": 2},
133+
{"a": 1, "c": 3},
134+
{"a": 1, "b": 2, "c": 3},
135+
False,
136+
),
137+
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
138+
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
139+
(
140+
"no_conflicts",
141+
{"a": 1, "b": 2},
142+
{"a": 4, "c": 3},
143+
{"a": 1, "b": 2, "c": 3},
144+
True,
145+
),
146+
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
147+
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
148+
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
149+
(
150+
"override",
151+
{"a": 1, "b": 2},
152+
{"a": 4, "b": 5, "c": 3},
153+
{"a": 1, "b": 2},
154+
False,
155+
),
156+
(
157+
"drop_conflicts",
158+
{"a": 1, "b": 2, "c": 3},
159+
{"b": 1, "c": 3, "d": 4},
160+
{"a": 1, "c": 3, "d": 4},
161+
False,
162+
),
163+
],
164+
)
165+
def test_merge_arrays_attrs_variables(
166+
self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
167+
):
168+
"""check that combine_attrs is used on data variables and coords"""
169+
data = create_test_data()
170+
data1 = data.copy()
171+
data1.var1.attrs = attrs1
172+
data1.dim1.attrs = attrs1
173+
data2 = data.copy()
174+
data2.var1.attrs = attrs2
175+
data2.dim1.attrs = attrs2
176+
177+
if expect_exception:
178+
with raises_regex(MergeError, "combine_attrs"):
179+
actual = xr.merge([data1, data2], combine_attrs=combine_attrs)
180+
else:
181+
actual = xr.merge([data1, data2], combine_attrs=combine_attrs)
182+
expected = data.copy()
183+
expected.var1.attrs = expected_attrs
184+
expected.dim1.attrs = expected_attrs
185+
186+
assert_identical(actual, expected)
187+
112188
def test_merge_attrs_override_copy(self):
113189
ds1 = xr.Dataset(attrs={"x": 0})
114190
ds2 = xr.Dataset(attrs={"x": 1})
115191
ds3 = xr.merge([ds1, ds2], combine_attrs="override")
116192
ds3.attrs["x"] = 2
117193
assert ds1.x == 0
118194

195+
def test_merge_attrs_drop_conflicts(self):
196+
ds1 = xr.Dataset(attrs={"a": 0, "b": 0, "c": 0})
197+
ds2 = xr.Dataset(attrs={"b": 0, "c": 1, "d": 0})
198+
ds3 = xr.Dataset(attrs={"a": 0, "b": 1, "c": 0, "e": 0})
199+
200+
actual = xr.merge([ds1, ds2, ds3], combine_attrs="drop_conflicts")
201+
expected = xr.Dataset(attrs={"a": 0, "d": 0, "e": 0})
202+
assert_identical(actual, expected)
203+
119204
def test_merge_dicts_simple(self):
120205
actual = xr.merge([{"foo": 0}, {"bar": "one"}, {"baz": 3.5}])
121206
expected = xr.Dataset({"foo": 0, "bar": "one", "baz": 3.5})

0 commit comments

Comments
 (0)