diff --git a/xarray/core/common.py b/xarray/core/common.py index 0acff90c149..2a5650011b3 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -341,14 +341,16 @@ def groupby(self, group, squeeze=True): """ if isinstance(group, basestring): group = self[group] + elif isinstance(group, (list, tuple)): + group = [self[g] if isinstance(g, basestring) else g for g in group] return self.groupby_cls(self, group, squeeze=squeeze) def groupby_bins(self, group, bins, right=True, labels=None, precision=3, include_lowest=False, squeeze=True): """Returns a GroupBy object for performing grouped operations. - Rather than using all unique values of `group`, the values are discretized - first by applying `pandas.cut` [1]_ to `group`. + Rather than using all unique values of `group`, the values are + discretized first by applying `pandas.cut` [1]_ to `group`. Parameters ---------- diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 868961ee653..ee30802c510 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -286,9 +286,11 @@ def _to_dataset_whole(self, name=None, shallow_copy=True): if name is None: raise ValueError('unable to convert unnamed DataArray to a ' 'Dataset without providing an explicit name') - if name in self.coords: + if (name in self.coords and + not self.variable.identical(self._coords[name])): raise ValueError('cannot create a Dataset from a DataArray with ' - 'the same name as one of its coordinates') + 'the same name as one of its coordinates ' + 'unless they are identical') # use private APIs here for speed: this is called by _to_temp_dataset(), # which is used in the guts of a lot of operations (e.g., reindex) variables = self._coords.copy() diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 3d5d61c747e..f0f77836c21 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -4,11 +4,13 @@ from . import nputils from . import ops +from .alignment import broadcast from .combine import concat from .common import ( ImplementsArrayReduce, ImplementsDatasetReduce, _maybe_promote, ) -from .pycompat import zip +from .merge import merge +from .pycompat import zip, OrderedDict from .utils import peek_at, maybe_wrap_array, safe_cast_to_index from .variable import as_variable, Variable, Coordinate @@ -19,22 +21,28 @@ def unique_value_groups(ar): Parameters ---------- ar : array-like - Input array. This will be flattened if it is not already 1-D. + One dimensional array-like. Returns ------- - values : np.ndarray - Sorted, unique values as returned by `np.unique`. + values : pd.Index + Sorted, unique values as returned by `pd.factorize`. indices : list of lists of int Each element provides the integer indices in `ar` with values given by the corresponding value in `unique_values`. """ - inverse, values = pd.factorize(ar, sort=True) + index = safe_cast_to_index(ar) + inverse, values = pd.factorize(index, sort=True) groups = [[] for _ in range(len(values))] for n, g in enumerate(inverse): if g >= 0: # pandas uses -1 to mark NaN, but doesn't include them in values groups[g].append(n) + + if isinstance(values, pd.MultiIndex): + # restore level names + values = values.set_names(index.names) + return values, groups @@ -114,6 +122,11 @@ def _inverse_permutation_indices(positions): return indices +def _is_monotonic_unique(group): + index = safe_cast_to_index(group) + return index.is_monotonic and index.is_unique + + class GroupBy(object): """A object that implements the split-apply-combine pattern. @@ -131,7 +144,7 @@ class GroupBy(object): DataArray.groupby """ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, - cut_kwargs={}): + cut_kwargs={}): """Create a GroupBy object Parameters @@ -152,44 +165,104 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, cut_kwargs : dict, optional Extra keyword arguments to pass to `pandas.cut` """ - from .dataset import as_dataset + from .dataset import Dataset from .dataarray import DataArray - if getattr(group, 'name', None) is None: - raise ValueError('`group` must have a name') - self._stacked_dim = None - if group.ndim != 1: + def check_valid_group(group_obj): + if not isinstance(group_obj, (DataArray, Variable)): + raise TypeError('`group` must be a DataArray, Variable or list ' + 'of DataArrays and/or Variables') + if getattr(group_obj, 'name', None) is None: + raise ValueError('each item in `group` must have a name') + + if grouper is not None and bins is not None: + raise TypeError("Can't specify both `grouper` and `bins`.") + + if isinstance(group, (list, tuple)): + if not group: + raise ValueError('must supply at least one item to groupby') + for g in group: + check_valid_group(g) + group_names = [g.name for g in group] + # we merge multiple groupby variables into Dataset, so they can be + # stacked if they use multiple dimensions + group = merge(group) + else: + check_valid_group(group) + group_names = [] + + orig_dims = [] + stacked_dim_name = None + if len(group.dims) > 1: # try to stack the dims of the group into a single dim # TODO: figure out how to exclude dimensions from the stacking # (e.g. group over space dims but leave time dim intact) - orig_dims = group.dims + orig_dims = tuple(group.dims) stacked_dim_name = 'stacked_' + '_'.join(orig_dims) + # the copy is necessary here, otherwise read only array raises error # in pandas: https://github.com/pydata/pandas/issues/12813 group = group.stack(**{stacked_dim_name: orig_dims}).copy() obj = obj.stack(**{stacked_dim_name: orig_dims}) - self._stacked_dim = stacked_dim_name - self._unstacked_dims = orig_dims - if not hasattr(group, 'dims'): - raise ValueError("`group` must have a 'dims' attribute") - group_dim, = group.dims - try: - expected_size = obj.dims[group_dim] - except TypeError: - expected_size = obj.shape[obj.get_axis_num(group_dim)] + grouped_dim_name = None + + if isinstance(group, Dataset): + # list or tuple input is now a 1-dimensional Dataset + + unstacked_group_names = [g for g in group_names + if g not in orig_dims] + stacked_group_names = [g for g in group_names if g in orig_dims] + + levels = [] + labels = [] + names = [] + if unstacked_group_names: + if len(unstacked_group_names) == 1: + # MultiIndex.from_array returns a normal Index when passed + # a single argument, so we use factorize instead. + unstacked_name, = unstacked_group_names + label, level = pd.factorize( + group[unstacked_name].to_index()) + levels.append(level) + labels.append(label) + names.append(unstacked_name) + else: + index = pd.MultiIndex.from_arrays( + [group[name].to_index() + for name in unstacked_group_names], + names=unstacked_group_names) + levels.extend(index.levels) + labels.extend(index.labels) + names.extend(index.names) + + if stacked_group_names: + index = group.coords[stacked_dim_name].to_index() + for level, label, name in zip( + index.levels, index.labels, index.names): + if name in stacked_group_names: + levels.append(level) + labels.append(label) + names.append(name) + + group_index = pd.MultiIndex(levels, labels, names=names) + grouped_dim_name = 'grouped_' + '_'.join(group_names) + group = DataArray(group_index, group.coords, + dims=list(group.dims), name=grouped_dim_name) + + group_dim, = group.dims + expected_size = obj.coords[group_dim].size if group.size != expected_size: raise ValueError('the group variable\'s length does not ' 'match the length of this variable along its ' 'dimension') full_index = None - if grouper is not None and bins is not None: - raise TypeError("Can't specify both `grouper` and `bins`.") if bins is not None: binned = pd.cut(group.values, bins, **cut_kwargs) new_dim_name = group.name + '_bins' group = DataArray(binned, group.coords, name=new_dim_name) + if grouper is not None: index = safe_cast_to_index(group) if not index.is_monotonic: @@ -205,13 +278,10 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, group_indices = ([slice(i, j) for i, j in zip(sbins[:-1], sbins[1:])] + [slice(sbins[-1], None)]) unique_coord = Coordinate(group.name, first_items.index) - elif group.name in obj.dims and bins is None: - # assume that group already has sorted, unique values - # (if using bins, the group will have the same name as a dimension - # but different values) - if group.dims != (group.name,): - raise ValueError('`group` is required to be a coordinate if ' - '`group.name` is a dimension in `obj`') + elif group.name in obj.dims and _is_monotonic_unique(group): + # TODO(shoyer): Figure out how to handle cases where group is a + # dimension coordinate, but not monotonic unique. How should we + # handle squeeze? group_indices = np.arange(group.size) if not squeeze: # group_indices = group_indices.reshape(-1, 1) @@ -230,6 +300,8 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None, self.unique_coord = unique_coord self._groups = None self._full_index = full_index + self._stacked_dim = stacked_dim_name + self._grouped_dim = grouped_dim_name @property def groups(self): @@ -296,7 +368,7 @@ def _maybe_restore_empty_groups(self, combined): """Our index contained empty groups (e.g., from a resampling). If we reduced on that dimension, we want to restore the full index. """ - if (self._full_index is not None and self.group.name in combined.dims): + if self._full_index is not None and self.group.name in combined.dims: indexers = {self.group.name: self._full_index} combined = combined.reindex(**indexers) return combined @@ -304,8 +376,12 @@ def _maybe_restore_empty_groups(self, combined): def _maybe_unstack_array(self, arr): """This gets called if we are applying on an array with a multidimensional group.""" - if self._stacked_dim is not None and self._stacked_dim in arr.dims: - arr = arr.unstack(self._stacked_dim) + if self._stacked_dim is not None: + if self._stacked_dim in arr.dims: + arr = arr.unstack(self._stacked_dim) + elif (self._grouped_dim is not None + and self._grouped_dim in arr.dims): + arr = arr.unstack(self._grouped_dim) return arr def fillna(self, value): @@ -426,12 +502,6 @@ def lookup_order(dimension): new_order = sorted(stacked.dims, key=lookup_order) return stacked.transpose(*new_order) - def _restore_multiindex(self, combined): - if self._stacked_dim is not None and self._stacked_dim in combined.dims: - stacked_dim = self.group[self._stacked_dim] - combined[self._stacked_dim] = stacked_dim - return combined - def apply(self, func, shortcut=False, **kwargs): """Apply a function over each array in the group and concatenate them together into a new array. @@ -490,7 +560,6 @@ def _concat(self, applied, shortcut=False): combined = _maybe_reorder(combined, concat_dim, positions) if isinstance(combined, type(self.obj)): combined = self._restore_dim_order(combined) - combined = self._restore_multiindex(combined) return combined def reduce(self, func, dim=None, axis=None, keep_attrs=False, diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index e873280721b..837116d443f 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -1131,270 +1131,6 @@ def test_fillna(self): actual = a.groupby('b').fillna(DataArray([0, 2], dims='b')) self.assertDataArrayIdentical(expected, actual) - def test_groupby_iter(self): - for ((act_x, act_dv), (exp_x, exp_ds)) in \ - zip(self.dv.groupby('y'), self.ds.groupby('y')): - self.assertEqual(exp_x, act_x) - self.assertDataArrayIdentical(exp_ds['foo'], act_dv) - for ((_, exp_dv), act_dv) in zip(self.dv.groupby('x'), self.dv): - self.assertDataArrayIdentical(exp_dv, act_dv) - - def make_groupby_example_array(self): - da = self.dv.copy() - da.coords['abc'] = ('y', np.array(['a'] * 9 + ['c'] + ['b'] * 10)) - da.coords['y'] = 20 + 100 * da['y'] - return da - - def test_groupby_properties(self): - grouped = self.make_groupby_example_array().groupby('abc') - expected_unique = Variable('abc', ['a', 'b', 'c']) - self.assertVariableEqual(expected_unique, grouped.unique_coord) - self.assertEqual(3, len(grouped)) - - def test_groupby_apply_identity(self): - expected = self.make_groupby_example_array() - idx = expected.coords['y'] - - def identity(x): - return x - - for g in ['x', 'y', 'abc', idx]: - for shortcut in [False, True]: - for squeeze in [False, True]: - grouped = expected.groupby(g, squeeze=squeeze) - actual = grouped.apply(identity, shortcut=shortcut) - self.assertDataArrayIdentical(expected, actual) - - def test_groupby_sum(self): - array = self.make_groupby_example_array() - grouped = array.groupby('abc') - - expected_sum_all = Dataset( - {'foo': Variable(['abc'], np.array([self.x[:, :9].sum(), - self.x[:, 10:].sum(), - self.x[:, 9:10].sum()]).T), - 'abc': Variable(['abc'], np.array(['a', 'b', 'c']))})['foo'] - self.assertDataArrayAllClose(expected_sum_all, grouped.reduce(np.sum)) - self.assertDataArrayAllClose(expected_sum_all, grouped.sum()) - - expected = DataArray([array['y'].values[idx].sum() for idx - in [slice(9), slice(10, None), slice(9, 10)]], - [['a', 'b', 'c']], ['abc']) - actual = array['y'].groupby('abc').apply(np.sum) - self.assertDataArrayAllClose(expected, actual) - actual = array['y'].groupby('abc').sum() - self.assertDataArrayAllClose(expected, actual) - - expected_sum_axis1 = Dataset( - {'foo': (['x', 'abc'], np.array([self.x[:, :9].sum(1), - self.x[:, 10:].sum(1), - self.x[:, 9:10].sum(1)]).T), - 'x': self.ds['x'], - 'abc': Variable(['abc'], np.array(['a', 'b', 'c']))})['foo'] - self.assertDataArrayAllClose(expected_sum_axis1, - grouped.reduce(np.sum, 'y')) - self.assertDataArrayAllClose(expected_sum_axis1, grouped.sum('y')) - - def test_groupby_count(self): - array = DataArray([0, 0, np.nan, np.nan, 0, 0], - coords={'cat': ('x', ['a', 'b', 'b', 'c', 'c', 'c'])}, - dims='x') - actual = array.groupby('cat').count() - expected = DataArray([1, 1, 2], coords=[('cat', ['a', 'b', 'c'])]) - self.assertDataArrayIdentical(actual, expected) - - @unittest.skip('needs to be fixed for shortcut=False, keep_attrs=False') - def test_groupby_reduce_attrs(self): - array = self.make_groupby_example_array() - array.attrs['foo'] = 'bar' - - for shortcut in [True, False]: - for keep_attrs in [True, False]: - print('shortcut=%s, keep_attrs=%s' % (shortcut, keep_attrs)) - actual = array.groupby('abc').reduce( - np.mean, keep_attrs=keep_attrs, shortcut=shortcut) - expected = array.groupby('abc').mean() - if keep_attrs: - expected.attrs['foo'] = 'bar' - self.assertDataArrayIdentical(expected, actual) - - def test_groupby_apply_center(self): - def center(x): - return x - np.mean(x) - - array = self.make_groupby_example_array() - grouped = array.groupby('abc') - - expected_ds = array.to_dataset() - exp_data = np.hstack([center(self.x[:, :9]), - center(self.x[:, 9:10]), - center(self.x[:, 10:])]) - expected_ds['foo'] = (['x', 'y'], exp_data) - expected_centered = expected_ds['foo'] - self.assertDataArrayAllClose(expected_centered, grouped.apply(center)) - - def test_groupby_apply_ndarray(self): - # regression test for #326 - array = self.make_groupby_example_array() - grouped = array.groupby('abc') - actual = grouped.apply(np.asarray) - self.assertDataArrayEqual(array, actual) - - def test_groupby_apply_changes_metadata(self): - def change_metadata(x): - x.coords['x'] = x.coords['x'] * 2 - x.attrs['fruit'] = 'lemon' - return x - - array = self.make_groupby_example_array() - grouped = array.groupby('abc') - actual = grouped.apply(change_metadata) - expected = array.copy() - expected = change_metadata(expected) - self.assertDataArrayEqual(expected, actual) - - def test_groupby_math(self): - array = self.make_groupby_example_array() - for squeeze in [True, False]: - grouped = array.groupby('x', squeeze=squeeze) - - expected = array + array.coords['x'] - actual = grouped + array.coords['x'] - self.assertDataArrayIdentical(expected, actual) - - actual = array.coords['x'] + grouped - self.assertDataArrayIdentical(expected, actual) - - ds = array.coords['x'].to_dataset('X') - expected = array + ds - actual = grouped + ds - self.assertDatasetIdentical(expected, actual) - - actual = ds + grouped - self.assertDatasetIdentical(expected, actual) - - grouped = array.groupby('abc') - expected_agg = (grouped.mean() - np.arange(3)).rename(None) - actual = grouped - DataArray(range(3), [('abc', ['a', 'b', 'c'])]) - actual_agg = actual.groupby('abc').mean() - self.assertDataArrayAllClose(expected_agg, actual_agg) - - with self.assertRaisesRegexp(TypeError, 'only support binary ops'): - grouped + 1 - with self.assertRaisesRegexp(TypeError, 'only support binary ops'): - grouped + grouped - with self.assertRaisesRegexp(TypeError, 'in-place operations'): - array += grouped - - def test_groupby_math_not_aligned(self): - array = DataArray(range(4), {'b': ('x', [0, 0, 1, 1])}, dims='x') - other = DataArray([10], dims='b') - actual = array.groupby('b') + other - expected = DataArray([10, 11, np.nan, np.nan], array.coords) - self.assertDataArrayIdentical(expected, actual) - - other = DataArray([10], coords={'c': 123}, dims='b') - actual = array.groupby('b') + other - expected.coords['c'] = (['x'], [123] * 2 + [np.nan] * 2) - self.assertDataArrayIdentical(expected, actual) - - other = Dataset({'a': ('b', [10])}) - actual = array.groupby('b') + other - expected = Dataset({'a': ('x', [10, 11, np.nan, np.nan])}, - array.coords) - self.assertDatasetIdentical(expected, actual) - - def test_groupby_restore_dim_order(self): - array = DataArray(np.random.randn(5, 3), - coords={'a': ('x', range(5)), 'b': ('y', range(3))}, - dims=['x', 'y']) - for by, expected_dims in [('x', ('x', 'y')), - ('y', ('x', 'y')), - ('a', ('a', 'y')), - ('b', ('x', 'b'))]: - result = array.groupby(by).apply(lambda x: x.squeeze()) - self.assertEqual(result.dims, expected_dims) - - def test_groupby_first_and_last(self): - array = DataArray([1, 2, 3, 4, 5], dims='x') - by = DataArray(['a'] * 2 + ['b'] * 3, dims='x', name='ab') - - expected = DataArray([1, 3], [('ab', ['a', 'b'])]) - actual = array.groupby(by).first() - self.assertDataArrayIdentical(expected, actual) - - expected = DataArray([2, 5], [('ab', ['a', 'b'])]) - actual = array.groupby(by).last() - self.assertDataArrayIdentical(expected, actual) - - array = DataArray(np.random.randn(5, 3), dims=['x', 'y']) - expected = DataArray(array[[0, 2]], {'ab': ['a', 'b']}, ['ab', 'y']) - actual = array.groupby(by).first() - self.assertDataArrayIdentical(expected, actual) - - actual = array.groupby('x').first() - expected = array # should be a no-op - self.assertDataArrayIdentical(expected, actual) - - def make_groupby_multidim_example_array(self): - return DataArray([[[0,1],[2,3]],[[5,10],[15,20]]], - coords={'lon': (['ny', 'nx'], [[30., 40.], [40., 50.]] ), - 'lat': (['ny', 'nx'], [[10., 10.], [20., 20.]] ),}, - dims=['time', 'ny', 'nx']) - - def test_groupby_multidim(self): - array = self.make_groupby_multidim_example_array() - for dim, expected_sum in [ - ('lon', DataArray([5, 28, 23], coords={'lon': [30., 40., 50.]})), - ('lat', DataArray([16, 40], coords={'lat': [10., 20.]}))]: - actual_sum = array.groupby(dim).sum() - self.assertDataArrayIdentical(expected_sum, actual_sum) - - def test_groupby_multidim_apply(self): - array = self.make_groupby_multidim_example_array() - actual = array.groupby('lon').apply( - lambda x : x - x.mean(), shortcut=False) - expected = DataArray([[[-2.5, -6.], [-5., -8.5]], - [[ 2.5, 3.], [ 8., 8.5]]], - coords=array.coords, dims=array.dims) - self.assertDataArrayIdentical(expected, actual) - - def test_groupby_bins(self): - array = DataArray(np.arange(4), dims='dim_0') - # the first value should not be part of any group ("right" binning) - array[0] = 99 - # bins follow conventions for pandas.cut - # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html - bins = [0,1.5,5] - bin_coords = ['(0, 1.5]', '(1.5, 5]'] - expected = DataArray([1,5], dims='dim_0_bins', - coords={'dim_0_bins': bin_coords}) - # the problem with this is that it overwrites the dimensions of array! - #actual = array.groupby('dim_0', bins=bins).sum() - actual = array.groupby_bins('dim_0', bins).apply( - lambda x : x.sum(), shortcut=False) - self.assertDataArrayIdentical(expected, actual) - # make sure original array dims are unchanged - # (would fail with shortcut=True above) - self.assertEqual(len(array.dim_0), 4) - - def test_groupby_bins_multidim(self): - array = self.make_groupby_multidim_example_array() - bins = [0,15,20] - bin_coords = ['(0, 15]', '(15, 20]'] - expected = DataArray([16, 40], dims='lat_bins', - coords={'lat_bins': bin_coords}) - actual = array.groupby_bins('lat', bins).apply( - lambda x : x.sum(), shortcut=False) - self.assertDataArrayIdentical(expected, actual) - # modify the array coordinates to be non-monotonic after unstacking - array['lat'].data = np.array([[10., 20.], [20., 10.]]) - expected = DataArray([28, 28], dims='lat_bins', - coords={'lat_bins': bin_coords}) - actual = array.groupby_bins('lat', bins).apply( - lambda x : x.sum(), shortcut=False) - self.assertDataArrayIdentical(expected, actual) - def make_rolling_example_array(self): times = pd.date_range('2000-01-01', freq='1D', periods=21) values = np.random.random((21, 4)) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 7636b67178a..7059fa8ff17 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -1499,141 +1499,6 @@ def get_args(v): with self.assertRaisesRegexp(ValueError, 'cannot select a dimension'): data.squeeze('y') - def test_groupby(self): - data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}, - {'x': ('x', list('abc')), - 'c': ('x', [0, 1, 0])}) - groupby = data.groupby('x') - self.assertEqual(len(groupby), 3) - expected_groups = {'a': 0, 'b': 1, 'c': 2} - self.assertEqual(groupby.groups, expected_groups) - expected_items = [('a', data.isel(x=0)), - ('b', data.isel(x=1)), - ('c', data.isel(x=2))] - for actual, expected in zip(groupby, expected_items): - self.assertEqual(actual[0], expected[0]) - self.assertDatasetEqual(actual[1], expected[1]) - - identity = lambda x: x - for k in ['x', 'c', 'y']: - actual = data.groupby(k, squeeze=False).apply(identity) - self.assertDatasetEqual(data, actual) - - def test_groupby_returns_new_type(self): - data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}) - - actual = data.groupby('x').apply(lambda ds: ds['z']) - expected = data['z'] - self.assertDataArrayIdentical(expected, actual) - - actual = data['z'].groupby('x').apply(lambda x: x.to_dataset()) - expected = data - self.assertDatasetIdentical(expected, actual) - - def test_groupby_iter(self): - data = create_test_data() - for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]): - self.assertEqual(data['dim1'][n], t) - self.assertVariableEqual(data['var1'][n], sub['var1']) - self.assertVariableEqual(data['var2'][n], sub['var2']) - self.assertVariableEqual(data['var3'][:, n], sub['var3']) - - def test_groupby_errors(self): - data = create_test_data() - with self.assertRaisesRegexp(ValueError, 'must have a name'): - data.groupby(np.arange(10)) - with self.assertRaisesRegexp(ValueError, 'length does not match'): - data.groupby(data['dim1'][:3]) - with self.assertRaisesRegexp(ValueError, "must have a 'dims'"): - data.groupby(data.coords['dim1'].to_index()) - - def test_groupby_reduce(self): - data = Dataset({'xy': (['x', 'y'], np.random.randn(3, 4)), - 'xonly': ('x', np.random.randn(3)), - 'yonly': ('y', np.random.randn(4)), - 'letters': ('y', ['a', 'a', 'b', 'b'])}) - - expected = data.mean('y') - expected['yonly'] = expected['yonly'].variable.expand_dims({'x': 3}) - actual = data.groupby('x').mean() - self.assertDatasetAllClose(expected, actual) - - actual = data.groupby('x').mean('y') - self.assertDatasetAllClose(expected, actual) - - letters = data['letters'] - expected = Dataset({'xy': data['xy'].groupby(letters).mean(), - 'xonly': (data['xonly'].mean().variable - .expand_dims({'letters': 2})), - 'yonly': data['yonly'].groupby(letters).mean()}) - actual = data.groupby('letters').mean() - self.assertDatasetAllClose(expected, actual) - - def test_groupby_math(self): - reorder_dims = lambda x: x.transpose('dim1', 'dim2', 'dim3', 'time') - - ds = create_test_data() - for squeeze in [True, False]: - grouped = ds.groupby('dim1', squeeze=squeeze) - - expected = reorder_dims(ds + ds.coords['dim1']) - actual = grouped + ds.coords['dim1'] - self.assertDatasetIdentical(expected, reorder_dims(actual)) - - actual = ds.coords['dim1'] + grouped - self.assertDatasetIdentical(expected, reorder_dims(actual)) - - ds2 = 2 * ds - expected = reorder_dims(ds + ds2) - actual = grouped + ds2 - self.assertDatasetIdentical(expected, reorder_dims(actual)) - - actual = ds2 + grouped - self.assertDatasetIdentical(expected, reorder_dims(actual)) - - grouped = ds.groupby('numbers') - zeros = DataArray([0, 0, 0, 0], [('numbers', range(4))]) - expected = ((ds + Variable('dim3', np.zeros(10))) - .transpose('dim3', 'dim1', 'dim2', 'time')) - actual = grouped + zeros - self.assertDatasetEqual(expected, actual) - - actual = zeros + grouped - self.assertDatasetEqual(expected, actual) - - with self.assertRaisesRegexp(ValueError, 'dimensions .* do not exist'): - grouped + ds - with self.assertRaisesRegexp(ValueError, 'dimensions .* do not exist'): - ds + grouped - with self.assertRaisesRegexp(TypeError, 'only support binary ops'): - grouped + 1 - with self.assertRaisesRegexp(TypeError, 'only support binary ops'): - grouped + grouped - with self.assertRaisesRegexp(TypeError, 'in-place operations'): - ds += grouped - - ds = Dataset({'x': ('time', np.arange(100)), - 'time': pd.date_range('2000-01-01', periods=100)}) - with self.assertRaisesRegexp(ValueError, 'incompat.* grouped binary'): - ds + ds.groupby('time.month') - - def test_groupby_math_virtual(self): - ds = Dataset({'x': ('t', [1, 2, 3])}, - {'t': pd.date_range('20100101', periods=3)}) - grouped = ds.groupby('t.day') - actual = grouped - grouped.mean() - expected = Dataset({'x': ('t', [0, 0, 0])}, - ds[['t', 't.day']]) - self.assertDatasetIdentical(actual, expected) - - def test_groupby_nan(self): - # nan should be excluded from groupby - ds = Dataset({'foo': ('x', [1, 2, 3, 4])}, - {'bar': ('x', [1, 1, 2, np.nan])}) - actual = ds.groupby('bar').mean() - expected = Dataset({'foo': ('bar', [1.5, 3]), 'bar': [1, 2]}) - self.assertDatasetIdentical(actual, expected) - def test_resample_and_first(self): times = pd.date_range('2000-01-01', freq='6H', periods=10) ds = Dataset({'foo': (['time', 'x', 'y'], np.random.randn(10, 5, 3)), diff --git a/xarray/test/test_groupby.py b/xarray/test/test_groupby.py index c10e574888c..5b8467cb73a 100644 --- a/xarray/test/test_groupby.py +++ b/xarray/test/test_groupby.py @@ -1,8 +1,12 @@ import numpy as np +import pandas as pd +import pytest + import xarray as xr from xarray.core.groupby import _consolidate_slices -import pytest +from . import TestCase, unittest +from .test_dataset import create_test_data def test_consolidate_slices(): @@ -19,6 +23,420 @@ def test_consolidate_slices(): _consolidate_slices([slice(3), 4]) +class TestDatasetGroupBy(TestCase): + + def test_groupby(self): + data = xr.Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}, + {'x': ('x', list('abc')), + 'c': ('x', [0, 1, 0])}) + groupby = data.groupby('x') + self.assertEqual(len(groupby), 3) + expected_groups = {'a': 0, 'b': 1, 'c': 2} + self.assertEqual(groupby.groups, expected_groups) + expected_items = [('a', data.isel(x=0)), + ('b', data.isel(x=1)), + ('c', data.isel(x=2))] + for actual, expected in zip(groupby, expected_items): + self.assertEqual(actual[0], expected[0]) + self.assertDatasetEqual(actual[1], expected[1]) + + identity = lambda x: x + for k in ['x', 'c', 'y']: + actual = data.groupby(k, squeeze=False).apply(identity) + self.assertDatasetEqual(data, actual) + + def test_groupby_returns_new_type(self): + data = xr.Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}) + + actual = data.groupby('x').apply(lambda ds: ds['z']) + expected = data['z'] + self.assertDataArrayIdentical(expected, actual) + + actual = data['z'].groupby('x').apply(lambda x: x.to_dataset()) + expected = data + self.assertDatasetIdentical(expected, actual) + + def test_groupby_iter(self): + data = create_test_data() + for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]): + self.assertEqual(data['dim1'][n], t) + self.assertVariableEqual(data['var1'][n], sub['var1']) + self.assertVariableEqual(data['var2'][n], sub['var2']) + self.assertVariableEqual(data['var3'][:, n], sub['var3']) + + def test_groupby_errors(self): + data = create_test_data() + with self.assertRaisesRegexp(TypeError, 'must be'): + data.groupby(np.arange(10)) + with self.assertRaisesRegexp(ValueError, 'must have a name'): + data.groupby(xr.DataArray(range(10), dims='foo')) + with self.assertRaisesRegexp(ValueError, 'length does not match'): + data.groupby(data['dim1'][:3]) + with self.assertRaisesRegexp(TypeError, 'must be'): + data.groupby(data.coords['dim1'].to_index()) + + def test_groupby_reduce(self): + data = xr.Dataset({'xy': (['x', 'y'], np.random.randn(3, 4)), + 'xonly': ('x', np.random.randn(3)), + 'yonly': ('y', np.random.randn(4)), + 'letters': ('y', ['a', 'a', 'b', 'b'])}) + + expected = data.mean('y') + expected['yonly'] = expected['yonly'].variable.expand_dims({'x': 3}) + actual = data.groupby('x').mean() + self.assertDatasetAllClose(expected, actual) + + actual = data.groupby('x').mean('y') + self.assertDatasetAllClose(expected, actual) + + letters = data['letters'] + expected = xr.Dataset({'xy': data['xy'].groupby(letters).mean(), + 'xonly': (data['xonly'].mean().variable + .expand_dims({'letters': 2})), + 'yonly': data['yonly'].groupby(letters).mean()}) + actual = data.groupby('letters').mean() + self.assertDatasetAllClose(expected, actual) + + def test_groupby_math(self): + reorder_dims = lambda x: x.transpose('dim1', 'dim2', 'dim3', 'time') + + ds = create_test_data() + for squeeze in [True, False]: + grouped = ds.groupby('dim1', squeeze=squeeze) + + expected = reorder_dims(ds + ds.coords['dim1']) + actual = grouped + ds.coords['dim1'] + self.assertDatasetIdentical(expected, reorder_dims(actual)) + + actual = ds.coords['dim1'] + grouped + self.assertDatasetIdentical(expected, reorder_dims(actual)) + + ds2 = 2 * ds + expected = reorder_dims(ds + ds2) + actual = grouped + ds2 + self.assertDatasetIdentical(expected, reorder_dims(actual)) + + actual = ds2 + grouped + self.assertDatasetIdentical(expected, reorder_dims(actual)) + + grouped = ds.groupby('numbers') + zeros = xr.DataArray([0, 0, 0, 0], [('numbers', range(4))]) + expected = ((ds + xr.Variable('dim3', np.zeros(10))) + .transpose('dim3', 'dim1', 'dim2', 'time')) + actual = grouped + zeros + self.assertDatasetEqual(expected, actual) + + actual = zeros + grouped + self.assertDatasetEqual(expected, actual) + + with self.assertRaisesRegexp(ValueError, 'dimensions .* do not exist'): + grouped + ds + with self.assertRaisesRegexp(ValueError, 'dimensions .* do not exist'): + ds + grouped + with self.assertRaisesRegexp(TypeError, 'only support binary ops'): + grouped + 1 + with self.assertRaisesRegexp(TypeError, 'only support binary ops'): + grouped + grouped + with self.assertRaisesRegexp(TypeError, 'in-place operations'): + ds += grouped + + ds = xr.Dataset({'x': ('time', np.arange(100)), + 'time': pd.date_range('2000-01-01', periods=100)}) + with self.assertRaisesRegexp(ValueError, 'incompat.* grouped binary'): + ds + ds.groupby('time.month') + + def test_groupby_math_virtual(self): + ds = xr.Dataset({'x': ('t', [1, 2, 3])}, + {'t': pd.date_range('20100101', periods=3)}) + grouped = ds.groupby('t.day') + actual = grouped - grouped.mean() + expected = xr.Dataset({'x': ('t', [0, 0, 0])}, + ds[['t', 't.day']]) + self.assertDatasetIdentical(actual, expected) + + def test_groupby_nan(self): + # nan should be excluded from groupby + ds = xr.Dataset({'foo': ('x', [1, 2, 3, 4])}, + {'bar': ('x', [1, 1, 2, np.nan])}) + actual = ds.groupby('bar').mean() + expected = xr.Dataset({'foo': ('bar', [1.5, 3]), 'bar': [1, 2]}) + self.assertDatasetIdentical(actual, expected) + + +class TestDataArrayGroupBy(TestCase): + + def make_groupby_example_array(self): + da = xr.DataArray(np.random.RandomState(0).rand(10, 20), + {'abc': ('y', ['a'] * 9 + ['c'] + ['b'] * 10), + 'y': 20 + 100 * np.arange(20)}, + ('x', 'y'), + name='foo', + attrs={'attr1': 'value1', 'attr2': 2929}) + return da + + def test_groupby_iter(self): + array = self.make_groupby_example_array() + ds = array.to_dataset() + for ((act_x, act_dv), (exp_x, exp_ds)) in \ + zip(array.groupby('y'), ds.groupby('y')): + self.assertEqual(exp_x, act_x) + self.assertDataArrayIdentical(exp_ds['foo'], act_dv) + for ((_, exp_dv), act_dv) in zip(array.groupby('x'), array): + self.assertDataArrayIdentical(exp_dv, act_dv) + + def test_groupby_properties(self): + grouped = self.make_groupby_example_array().groupby('abc') + expected_unique = xr.Variable('abc', ['a', 'b', 'c']) + self.assertVariableEqual(expected_unique, grouped.unique_coord) + self.assertEqual(3, len(grouped)) + + def test_groupby_apply_identity(self): + expected = self.make_groupby_example_array() + idx = expected.coords['y'] + + def identity(x): + return x + + for g in ['x', 'y', 'abc', idx]: + for shortcut in [False, True]: + for squeeze in [False, True]: + grouped = expected.groupby(g, squeeze=squeeze) + actual = grouped.apply(identity, shortcut=shortcut) + self.assertDataArrayIdentical(expected, actual) + + def test_groupby_sum(self): + array = self.make_groupby_example_array() + grouped = array.groupby('abc') + + expected_sum_all = xr.Dataset( + {'foo': (['abc'], np.array([array.values[:, :9].sum(), + array.values[:, 10:].sum(), + array.values[:, 9:10].sum()]).T), + 'abc': (['abc'], np.array(['a', 'b', 'c']))})['foo'] + self.assertDataArrayAllClose(expected_sum_all, grouped.reduce(np.sum)) + self.assertDataArrayAllClose(expected_sum_all, grouped.sum()) + + expected = xr.DataArray([array['y'].values[idx].sum() for idx + in [slice(9), slice(10, None), slice(9, 10)]], + [['a', 'b', 'c']], ['abc']) + actual = array['y'].groupby('abc').apply(np.sum) + self.assertDataArrayAllClose(expected, actual) + actual = array['y'].groupby('abc').sum() + self.assertDataArrayAllClose(expected, actual) + + expected_sum_axis1 = xr.Dataset( + {'foo': (['x', 'abc'], np.array([array.values[:, :9].sum(1), + array.values[:, 10:].sum(1), + array.values[:, 9:10].sum(1)]).T), + 'x': array['x'], + 'abc': (['abc'], np.array(['a', 'b', 'c']))})['foo'] + self.assertDataArrayAllClose(expected_sum_axis1, + grouped.reduce(np.sum, 'y')) + self.assertDataArrayAllClose(expected_sum_axis1, grouped.sum('y')) + + def test_groupby_count(self): + array = xr.DataArray([0, 0, np.nan, np.nan, 0, 0], + coords={'cat': ('x', ['a', 'b', 'b', 'c', 'c', 'c'])}, + dims='x') + actual = array.groupby('cat').count() + expected = xr.DataArray([1, 1, 2], coords=[('cat', ['a', 'b', 'c'])]) + self.assertDataArrayIdentical(actual, expected) + + @unittest.skip('needs to be fixed for shortcut=False, keep_attrs=False') + def test_groupby_reduce_attrs(self): + array = self.make_groupby_example_array() + array.attrs['foo'] = 'bar' + + for shortcut in [True, False]: + for keep_attrs in [True, False]: + print('shortcut=%s, keep_attrs=%s' % (shortcut, keep_attrs)) + actual = array.groupby('abc').reduce( + np.mean, keep_attrs=keep_attrs, shortcut=shortcut) + expected = array.groupby('abc').mean() + if keep_attrs: + expected.attrs['foo'] = 'bar' + self.assertDataArrayIdentical(expected, actual) + + def test_groupby_apply_center(self): + def center(x): + return x - np.mean(x) + + array = self.make_groupby_example_array() + grouped = array.groupby('abc') + + expected_ds = array.to_dataset() + exp_data = np.hstack([center(array.values[:, :9]), + center(array.values[:, 9:10]), + center(array.values[:, 10:])]) + expected_ds['foo'] = (['x', 'y'], exp_data) + expected_centered = expected_ds['foo'] + self.assertDataArrayAllClose(expected_centered, grouped.apply(center)) + + def test_groupby_apply_ndarray(self): + # regression test for #326 + array = self.make_groupby_example_array() + grouped = array.groupby('abc') + actual = grouped.apply(np.asarray) + self.assertDataArrayEqual(array, actual) + + def test_groupby_apply_changes_metadata(self): + def change_metadata(x): + x.coords['x'] = x.coords['x'] * 2 + x.attrs['fruit'] = 'lemon' + return x + + array = self.make_groupby_example_array() + grouped = array.groupby('abc') + actual = grouped.apply(change_metadata) + expected = array.copy() + expected = change_metadata(expected) + self.assertDataArrayEqual(expected, actual) + + def test_groupby_math(self): + array = self.make_groupby_example_array() + for squeeze in [True, False]: + grouped = array.groupby('x', squeeze=squeeze) + + expected = array + array.coords['x'] + actual = grouped + array.coords['x'] + self.assertDataArrayIdentical(expected, actual) + + actual = array.coords['x'] + grouped + self.assertDataArrayIdentical(expected, actual) + + ds = array.coords['x'].to_dataset('X') + expected = array + ds + actual = grouped + ds + self.assertDatasetIdentical(expected, actual) + + actual = ds + grouped + self.assertDatasetIdentical(expected, actual) + + grouped = array.groupby('abc') + expected_agg = (grouped.mean() - np.arange(3)).rename(None) + actual = grouped - xr.DataArray(range(3), [('abc', ['a', 'b', 'c'])]) + actual_agg = actual.groupby('abc').mean() + self.assertDataArrayAllClose(expected_agg, actual_agg) + + with self.assertRaisesRegexp(TypeError, 'only support binary ops'): + grouped + 1 + with self.assertRaisesRegexp(TypeError, 'only support binary ops'): + grouped + grouped + with self.assertRaisesRegexp(TypeError, 'in-place operations'): + array += grouped + + def test_groupby_math_not_aligned(self): + array = xr.DataArray(range(4), {'b': ('x', [0, 0, 1, 1])}, dims='x') + other = xr.DataArray([10], dims='b') + actual = array.groupby('b') + other + expected = xr.DataArray([10, 11, np.nan, np.nan], array.coords) + self.assertDataArrayIdentical(expected, actual) + + other = xr.DataArray([10], coords={'c': 123}, dims='b') + actual = array.groupby('b') + other + expected.coords['c'] = (['x'], [123] * 2 + [np.nan] * 2) + self.assertDataArrayIdentical(expected, actual) + + other = xr.Dataset({'a': ('b', [10])}) + actual = array.groupby('b') + other + expected = xr.Dataset({'a': ('x', [10, 11, np.nan, np.nan])}, + array.coords) + self.assertDatasetIdentical(expected, actual) + + def test_groupby_restore_dim_order(self): + array = xr.DataArray(np.random.randn(5, 3), + coords={'a': ('x', range(5)), 'b': ('y', range(3))}, + dims=['x', 'y']) + for by, expected_dims in [('x', ('x', 'y')), + ('y', ('x', 'y')), + ('a', ('a', 'y')), + ('b', ('x', 'b'))]: + result = array.groupby(by).apply(lambda x: x.squeeze()) + self.assertEqual(result.dims, expected_dims) + + def test_groupby_first_and_last(self): + array = xr.DataArray([1, 2, 3, 4, 5], dims='x') + by = xr.DataArray(['a'] * 2 + ['b'] * 3, dims='x', name='ab') + + expected = xr.DataArray([1, 3], [('ab', ['a', 'b'])]) + actual = array.groupby(by).first() + self.assertDataArrayIdentical(expected, actual) + + expected = xr.DataArray([2, 5], [('ab', ['a', 'b'])]) + actual = array.groupby(by).last() + self.assertDataArrayIdentical(expected, actual) + + array = xr.DataArray(np.random.randn(5, 3), dims=['x', 'y']) + expected = xr.DataArray(array[[0, 2]], {'ab': ['a', 'b']}, ['ab', 'y']) + actual = array.groupby(by).first() + self.assertDataArrayIdentical(expected, actual) + + actual = array.groupby('x').first() + expected = array # should be a no-op + self.assertDataArrayIdentical(expected, actual) + + def make_groupby_multidim_example_array(self): + return xr.DataArray([[[0, 1], [2, 3]], [[5, 10], [15, 20]]], + coords={'lon': (['ny', 'nx'], [[30., 40.], [40., 50.]]), + 'lat': (['ny', 'nx'], + [[10., 10.], [20., 20.]])}, + dims=['time', 'ny', 'nx']) + + def test_groupby_multidim(self): + array = self.make_groupby_multidim_example_array() + for dim, expected_sum in [ + ('lon', xr.DataArray([5, 28, 23], + coords={'lon': [30., 40., 50.]})), + ('lat', xr.DataArray([16, 40], coords={'lat': [10., 20.]}))]: + actual_sum = array.groupby(dim).sum() + self.assertDataArrayIdentical(expected_sum, actual_sum) + + def test_groupby_multidim_apply(self): + array = self.make_groupby_multidim_example_array() + actual = array.groupby('lon').apply( + lambda x: x - x.mean(), shortcut=False) + expected = xr.DataArray([[[-2.5, -6.], [-5., -8.5]], + [[2.5, 3.], [8., 8.5]]], + coords=array.coords, dims=array.dims) + self.assertDataArrayIdentical(expected, actual) + + def test_groupby_bins(self): + array = xr.DataArray(np.arange(4), dims='dim_0') + # the first value should not be part of any group ("right" binning) + array[0] = 99 + # bins follow conventions for pandas.cut + # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html + bins = [0, 1.5, 5] + bin_coords = ['(0, 1.5]', '(1.5, 5]'] + expected = xr.DataArray([1, 5], dims='dim_0_bins', + coords={'dim_0_bins': bin_coords}) + # the problem with this is that it overwrites the dimensions of array! + #actual = array.groupby('dim_0', bins=bins).sum() + actual = array.groupby_bins('dim_0', bins).apply( + lambda x: x.sum(), shortcut=False) + self.assertDataArrayIdentical(expected, actual) + # make sure original array dims are unchanged + # (would fail with shortcut=True above) + self.assertEqual(len(array.dim_0), 4) + + def test_groupby_bins_multidim(self): + array = self.make_groupby_multidim_example_array() + bins = [0, 15, 20] + bin_coords = ['(0, 15]', '(15, 20]'] + expected = xr.DataArray([16, 40], dims='lat_bins', + coords={'lat_bins': bin_coords}) + actual = array.groupby_bins('lat', bins).apply( + lambda x: x.sum(), shortcut=False) + self.assertDataArrayIdentical(expected, actual) + # modify the array coordinates to be non-monotonic after unstacking + array['lat'].data = np.array([[10., 20.], [20., 10.]]) + expected = xr.DataArray([28, 28], dims='lat_bins', + coords={'lat_bins': bin_coords}) + actual = array.groupby_bins('lat', bins).apply( + lambda x: x.sum(), shortcut=False) + self.assertDataArrayIdentical(expected, actual) + + def test_multi_index_groupby_apply(): # regression test for GH873 ds = xr.Dataset({'foo': (('x', 'y'), np.random.randn(3, 4))}, @@ -43,4 +461,20 @@ def test_multi_index_groupby_sum(): assert expected.equals(actual) -# TODO: move other groupby tests from test_dataset and test_dataarray over here +class TestMultipleArgumentGroupby(TestCase): + + def test_apply_identity(self): + ds = xr.Dataset({'foo': ('x', [1, 2, 3])}, {'y': ('x', list('abc'))}) + roundtripped = ds.groupby(['x', 'y']).apply(lambda x: x) + pandas_roundtripped = xr.Dataset.from_dataframe( + ds.to_dataframe().reset_index().groupby(['x', 'y']).apply(lambda x: x)) + self.assertDatasetIdentical(ds, roundtripped) + self.assertDatasetIdentical(ds, pandas_roundtripped) + + def test_sum_identity(self): + ds = xr.Dataset({'foo': ('x', [1, 2, 3])}, {'y': ('x', list('abc'))}) + roundtripped = ds.groupby(['x', 'y']).sum() + pandas_roundtripped = xr.Dataset.from_dataframe( + ds.to_dataframe().reset_index().groupby(['x', 'y']).sum()) + self.assertDatasetIdentical(ds, roundtripped) + self.assertDatasetIdentical(ds, pandas_roundtripped)