
Commit 4cf912a

Split out concat tests into test_combine.py
1 parent 6015f8d commit 4cf912a

File tree

5 files changed: +295 / -251 lines


doc/whats-new.rst (+4, -1)

@@ -17,7 +17,10 @@ Backwards incompatible changes
 
 - The optional arguments ``concat_over`` and ``mode`` in :py:func:`~xray.concat` have
   been removed and replaced by ``data_vars`` and ``coords``. The new arguments are both
-  more easily understood and more robustly implemented.
+  more easily understood and more robustly implemented, and allowed us to fix a bug
+  where ``concat`` accidentally loaded data into memory. If you set values for
+  these optional arguments manually, you will need to update your code. The default
+  behavior should be unchanged.
 
 Enhancements
 ~~~~~~~~~~~~
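
A minimal sketch of the migration described in the changelog entry above, assuming two toy datasets split along a 'time' dimension; the old-keyword calls shown in the comments are inferred from the tests added in this commit rather than from the changelog text itself:

    import numpy as np
    import xray

    # Two illustrative pieces of a dataset split along 'time'.
    parts = [xray.Dataset({'foo': ('time', np.arange(3))}),
             xray.Dataset({'foo': ('time', np.arange(3, 6))})]

    # Previously one might have written something like this; these keywords
    # now raise ValueError ("no longer a valid argument"):
    #   xray.concat(parts, dim='time', mode='different')
    #   xray.concat(parts, dim='time', concat_over=['foo'])

    # With this change, the same intent is expressed via data_vars/coords:
    combined = xray.concat(parts, dim='time', data_vars='different')
    combined = xray.concat(parts, dim='time', data_vars=['foo'], coords='minimal')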

xray/core/combine.py (+1, -1)

@@ -90,7 +90,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
                       FutureWarning, stacklevel=2)
         dim = 'concat_dims'
 
-    if indexers is not None:
+    if indexers is not None:  # pragma: nocover
         warnings.warn('indexers has been renamed to positions; the alias '
                       'will be removed in a future version of xray',
                       FutureWarning, stacklevel=2)

xray/test/test_combine.py (+290, new file)

@@ -0,0 +1,290 @@
from copy import deepcopy

import numpy as np
import pandas as pd

from xray import Dataset, DataArray, auto_combine, concat
from xray.core.pycompat import iteritems, OrderedDict

from . import (TestCase, unittest, InaccessibleArray, UnexpectedDataAccess,
               requires_dask)
from .test_dataset import create_test_data


class TestConcatDataset(TestCase):
    def test_concat(self):
        # TODO: simplify and split this test case

        # drop the third dimension to keep things relatively understandable
        data = create_test_data().drop('dim3')

        split_data = [data.isel(dim1=slice(3)),
                      data.isel(dim1=slice(3, None))]
        self.assertDatasetIdentical(data, concat(split_data, 'dim1'))

        def rectify_dim_order(dataset):
            # return a new dataset with all variable dimensions tranposed into
            # the order in which they are found in `data`
            return Dataset(dict((k, v.transpose(*data[k].dims))
                                for k, v in iteritems(dataset.data_vars)),
                           dataset.coords, attrs=dataset.attrs)

        for dim in ['dim1', 'dim2']:
            datasets = [g for _, g in data.groupby(dim, squeeze=False)]
            self.assertDatasetIdentical(data, concat(datasets, dim))
            self.assertDatasetIdentical(
                data, concat(datasets, data[dim]))
            self.assertDatasetIdentical(
                data, concat(datasets, data[dim], coords='minimal'))

            datasets = [g for _, g in data.groupby(dim, squeeze=True)]
            concat_over = [k for k, v in iteritems(data.coords)
                           if dim in v.dims and k != dim]
            actual = concat(datasets, data[dim], coords=concat_over)
            self.assertDatasetIdentical(data, rectify_dim_order(actual))

            actual = concat(datasets, data[dim], coords='different')
            self.assertDatasetIdentical(data, rectify_dim_order(actual))

        # make sure the coords argument behaves as expected
        data.coords['extra'] = ('dim4', np.arange(3))
        for dim in ['dim1', 'dim2']:
            datasets = [g for _, g in data.groupby(dim, squeeze=True)]
            actual = concat(datasets, data[dim], coords='all')
            expected = np.array([data['extra'].values
                                 for _ in range(data.dims[dim])])
            self.assertArrayEqual(actual['extra'].values, expected)

            actual = concat(datasets, data[dim], coords='different')
            self.assertDataArrayEqual(data['extra'], actual['extra'])
            actual = concat(datasets, data[dim], coords='minimal')
            self.assertDataArrayEqual(data['extra'], actual['extra'])

        # verify that the dim argument takes precedence over
        # concatenating dataset variables of the same name
        dim = (2 * data['dim1']).rename('dim1')
        datasets = [g for _, g in data.groupby('dim1', squeeze=False)]
        expected = data.copy()
        expected['dim1'] = dim
        self.assertDatasetIdentical(expected, concat(datasets, dim))

    def test_concat_data_vars(self):
        data = Dataset({'foo': ('x', np.random.randn(10))})
        objs = [data.isel(x=slice(5)), data.isel(x=slice(5, None))]
        for data_vars in ['minimal', 'different', 'all', [], ['foo']]:
            actual = concat(objs, dim='x', data_vars=data_vars)
            self.assertDatasetIdentical(data, actual)

    def test_concat_coords(self):
        data = Dataset({'foo': ('x', np.random.randn(10))})
        expected = data.assign_coords(c=('x', [0] * 5 + [1] * 5))
        objs = [data.isel(x=slice(5)).assign_coords(c=0),
                data.isel(x=slice(5, None)).assign_coords(c=1)]
        for coords in ['different', 'all', ['c']]:
            actual = concat(objs, dim='x', coords=coords)
            self.assertDatasetIdentical(expected, actual)
        for coords in ['minimal', []]:
            with self.assertRaisesRegexp(ValueError, 'not equal across'):
                concat(objs, dim='x', coords=coords)

    def test_concat_constant_index(self):
        # GH425
        ds1 = Dataset({'foo': 1.5}, {'y': 1})
        ds2 = Dataset({'foo': 2.5}, {'y': 1})
        expected = Dataset({'foo': ('y', [1.5, 2.5]), 'y': [1, 1]})
        for mode in ['different', 'all', ['foo']]:
            actual = concat([ds1, ds2], 'y', data_vars=mode)
            self.assertDatasetIdentical(expected, actual)
        with self.assertRaisesRegexp(ValueError, 'not equal across datasets'):
            concat([ds1, ds2], 'y', data_vars='minimal')

    def test_concat_size0(self):
        data = create_test_data()
        split_data = [data.isel(dim1=slice(0, 0)), data]
        actual = concat(split_data, 'dim1')
        self.assertDatasetIdentical(data, actual)

        actual = concat(split_data[::-1], 'dim1')
        self.assertDatasetIdentical(data, actual)

    def test_concat_errors(self):
        data = create_test_data()
        split_data = [data.isel(dim1=slice(3)),
                      data.isel(dim1=slice(3, None))]

        with self.assertRaisesRegexp(ValueError, 'must supply at least one'):
            concat([], 'dim1')

        with self.assertRaisesRegexp(ValueError, 'are not coordinates'):
            concat([data, data], 'new_dim', coords=['not_found'])

        with self.assertRaisesRegexp(ValueError, 'global attributes not'):
            data0, data1 = deepcopy(split_data)
            data1.attrs['foo'] = 'bar'
            concat([data0, data1], 'dim1', compat='identical')
        self.assertDatasetIdentical(
            data, concat([data0, data1], 'dim1', compat='equals'))

        with self.assertRaisesRegexp(ValueError, 'encountered unexpected'):
            data0, data1 = deepcopy(split_data)
            data1['foo'] = ('bar', np.random.randn(10))
            concat([data0, data1], 'dim1')

        with self.assertRaisesRegexp(ValueError, 'not equal across datasets'):
            data0, data1 = deepcopy(split_data)
            data1['dim2'] = 2 * data1['dim2']
            concat([data0, data1], 'dim1', coords='minimal')

        with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'):
            concat([data0, data1], 'dim1')

        with self.assertRaisesRegexp(ValueError, 'compat.* invalid'):
            concat(split_data, 'dim1', compat='foobar')

        with self.assertRaisesRegexp(ValueError, 'unexpected value for'):
            concat([data, data], 'new_dim', coords='foobar')

        with self.assertRaisesRegexp(ValueError,
                                     'coordinate in some datasets but not others'):
            concat([Dataset({'x': 0}), Dataset({'x': [1]})], dim='z')

        with self.assertRaisesRegexp(ValueError,
                                     'coordinate in some datasets but not others'):
            concat([Dataset({'x': 0}), Dataset({}, {'x': 1})], dim='z')

        with self.assertRaisesRegexp(ValueError, 'no longer a valid'):
            concat([data, data], 'new_dim', mode='different')
        with self.assertRaisesRegexp(ValueError, 'no longer a valid'):
            concat([data, data], 'new_dim', concat_over='different')

    def test_concat_promote_shape(self):
        # mixed dims within variables
        objs = [Dataset({}, {'x': 0}), Dataset({'x': [1]})]
        actual = concat(objs, 'x')
        expected = Dataset({'x': [0, 1]})
        self.assertDatasetIdentical(actual, expected)

        objs = [Dataset({'x': [0]}), Dataset({}, {'x': 1})]
        actual = concat(objs, 'x')
        self.assertDatasetIdentical(actual, expected)

        # mixed dims between variables
        objs = [Dataset({'x': [2], 'y': 3}), Dataset({'x': [4], 'y': 5})]
        actual = concat(objs, 'x')
        expected = Dataset({'x': [2, 4], 'y': ('x', [3, 5])})
        self.assertDatasetIdentical(actual, expected)

        # mixed dims in coord variable
        objs = [Dataset({'x': [0]}, {'y': -1}),
                Dataset({'x': [1]}, {'y': ('x', [-2])})]
        actual = concat(objs, 'x')
        expected = Dataset({'x': [0, 1]}, {'y': ('x', [-1, -2])})
        self.assertDatasetIdentical(actual, expected)

        # scalars with mixed lengths along concat dim -- values should repeat
        objs = [Dataset({'x': [0]}, {'y': -1}),
                Dataset({'x': [1, 2]}, {'y': -2})]
        actual = concat(objs, 'x')
        expected = Dataset({}, {'y': ('x', [-1, -2, -2])})
        self.assertDatasetIdentical(actual, expected)

        # broadcast 1d x 1d -> 2d
        objs = [Dataset({'z': ('x', [-1])}, {'x': [0], 'y': [0]}),
                Dataset({'z': ('y', [1])}, {'x': [1], 'y': [0]})]
        actual = concat(objs, 'x')
        expected = Dataset({'z': (('x', 'y'), [[-1], [1]])})
        self.assertDatasetIdentical(actual, expected)

    def test_concat_do_not_promote(self):
        # GH438
        objs = [Dataset({'y': ('t', [1])}, {'x': 1}),
                Dataset({'y': ('t', [2])}, {'x': 1})]
        expected = Dataset({'y': ('t', [1, 2])}, {'x': 1, 't': [0, 0]})
        actual = concat(objs, 't')
        self.assertDatasetIdentical(expected, actual)

        objs = [Dataset({'y': ('t', [1])}, {'x': 1}),
                Dataset({'y': ('t', [2])}, {'x': 2})]
        with self.assertRaises(ValueError):
            concat(objs, 't', coords='minimal')

    @requires_dask  # only for toolz
    def test_auto_combine(self):
        objs = [Dataset({'x': [0]}), Dataset({'x': [1]})]
        actual = auto_combine(objs)
        expected = Dataset({'x': [0, 1]})
        self.assertDatasetIdentical(expected, actual)

        actual = auto_combine([actual])
        self.assertDatasetIdentical(expected, actual)

        objs = [Dataset({'x': [0, 1]}), Dataset({'x': [2]})]
        actual = auto_combine(objs)
        expected = Dataset({'x': [0, 1, 2]})
        self.assertDatasetIdentical(expected, actual)

        # ensure auto_combine handles non-sorted dimensions
        objs = [Dataset(OrderedDict([('x', ('a', [0])), ('y', ('a', [0]))])),
                Dataset(OrderedDict([('y', ('a', [1])), ('x', ('a', [1]))]))]
        actual = auto_combine(objs)
        expected = Dataset({'x': ('a', [0, 1]), 'y': ('a', [0, 1]), 'a': [0, 0]})
        self.assertDatasetIdentical(expected, actual)

        objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'y': [1], 'x': [1]})]
        with self.assertRaisesRegexp(ValueError, 'too many .* dimensions'):
            auto_combine(objs)

        objs = [Dataset({'x': 0}), Dataset({'x': 1})]
        with self.assertRaisesRegexp(ValueError, 'cannot infer dimension'):
            auto_combine(objs)

        objs = [Dataset({'x': [0], 'y': [0]}), Dataset({'x': [0]})]
        with self.assertRaises(KeyError):
            auto_combine(objs)


class TestConcatDataArray(TestCase):
    def test_concat(self):
        ds = Dataset({'foo': (['x', 'y'], np.random.random((10, 20))),
                      'bar': (['x', 'y'], np.random.random((10, 20)))})
        foo = ds['foo']
        bar = ds['bar']

        # from dataset array:
        expected = DataArray(np.array([foo.values, bar.values]),
                             dims=['w', 'x', 'y'])
        actual = concat([foo, bar], 'w')
        self.assertDataArrayEqual(expected, actual)
        # from iteration:
        grouped = [g for _, g in foo.groupby('x')]
        stacked = concat(grouped, ds['x'])
        self.assertDataArrayIdentical(foo, stacked)
        # with an index as the 'dim' argument
        stacked = concat(grouped, ds.indexes['x'])
        self.assertDataArrayIdentical(foo, stacked)

        actual = concat([foo[0], foo[1]], pd.Index([0, 1])).reset_coords(drop=True)
        expected = foo[:2].rename({'x': 'concat_dim'})
        self.assertDataArrayIdentical(expected, actual)

        actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True)
        expected = foo[:2].rename({'x': 'concat_dim'})
        self.assertDataArrayIdentical(expected, actual)

        with self.assertRaisesRegexp(ValueError, 'not identical'):
            concat([foo, bar], dim='w', compat='identical')

        with self.assertRaisesRegexp(ValueError, 'not a valid argument'):
            concat([foo, bar], dim='w', data_vars='minimal')

    @requires_dask
    def test_concat_lazy(self):
        import dask.array as da

        arrays = [DataArray(
            da.from_array(InaccessibleArray(np.zeros((3, 3))), 3),
            dims=['x', 'y']) for _ in range(2)]
        # should not raise
        combined = concat(arrays, dim='z')
        self.assertEqual(combined.shape, (2, 3, 3))
        self.assertEqual(combined.dims, ('z', 'x', 'y'))

xray/test/test_dataarray.py (-45)

@@ -1149,51 +1149,6 @@ def test_resample_upsampling(self):
             actual = array.resample('12H', 'time', how=how)
             self.assertDataArrayIdentical(expected, actual)
 
-    def test_concat(self):
-        self.ds['bar'] = Variable(['x', 'y'], np.random.randn(10, 20))
-        foo = self.ds['foo']
-        bar = self.ds['bar']
-        # from dataset array:
-        expected = DataArray(np.array([foo.values, bar.values]),
-                             dims=['w', 'x', 'y'])
-        actual = concat([foo, bar], 'w')
-        self.assertDataArrayEqual(expected, actual)
-        # from iteration:
-        grouped = [g for _, g in foo.groupby('x')]
-        stacked = concat(grouped, self.ds['x'])
-        self.assertDataArrayIdentical(foo, stacked)
-        # with an index as the 'dim' argument
-        stacked = concat(grouped, self.ds.indexes['x'])
-        self.assertDataArrayIdentical(foo, stacked)
-
-        actual = concat([foo[0], foo[1]], pd.Index([0, 1])).reset_coords(drop=True)
-        expected = foo[:2].rename({'x': 'concat_dim'})
-        self.assertDataArrayIdentical(expected, actual)
-
-        actual = concat([foo[0], foo[1]], [0, 1]).reset_coords(drop=True)
-        expected = foo[:2].rename({'x': 'concat_dim'})
-        self.assertDataArrayIdentical(expected, actual)
-
-        with self.assertRaisesRegexp(ValueError, 'not identical'):
-            concat([foo, bar], dim='w', compat='identical')
-
-        with self.assertRaisesRegexp(ValueError, 'not a valid argument'):
-            concat([foo, bar], dim='w', data_vars='minimal')
-
-    @requires_dask
-    def test_concat_lazy(self):
-        import dask.array as da
-        from xray import DataArray, concat
-        from xray.test import InaccessibleArray
-        import numpy as np
-        arrays = [DataArray(
-            da.from_array(InaccessibleArray(np.zeros((3, 3))), 3),
-            dims=['x', 'y']) for _ in range(2)]
-        # should not raise
-        combined = concat(arrays, dim='z')
-        self.assertEqual(combined.shape, (2, 3, 3))
-        self.assertEqual(combined.dims, ('z', 'x', 'y'))
-
     def test_align(self):
         self.ds['x'] = ('x', np.array(list('abcdefghij')))
         dv1, dv2 = align(self.dv, self.dv[:5], join='inner')
