Skip to content

WIP: progress toward making groupby work with multiple arguments #924

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,14 +341,16 @@ def groupby(self, group, squeeze=True):
"""
if isinstance(group, basestring):
group = self[group]
elif isinstance(group, (list, tuple)):
group = [self[g] if isinstance(g, basestring) else g for g in group]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I think I see... here is where the list of group names is converted to a list of DataArrays.

return self.groupby_cls(self, group, squeeze=squeeze)

def groupby_bins(self, group, bins, right=True, labels=None, precision=3,
include_lowest=False, squeeze=True):
"""Returns a GroupBy object for performing grouped operations.

Rather than using all unique values of `group`, the values are discretized
first by applying `pandas.cut` [1]_ to `group`.
Rather than using all unique values of `group`, the values are
discretized first by applying `pandas.cut` [1]_ to `group`.

Parameters
----------
Expand Down
6 changes: 4 additions & 2 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,9 +286,11 @@ def _to_dataset_whole(self, name=None, shallow_copy=True):
if name is None:
raise ValueError('unable to convert unnamed DataArray to a '
'Dataset without providing an explicit name')
if name in self.coords:
if (name in self.coords and
not self.variable.identical(self._coords[name])):
raise ValueError('cannot create a Dataset from a DataArray with '
'the same name as one of its coordinates')
'the same name as one of its coordinates '
'unless they are identical')
# use private APIs here for speed: this is called by _to_temp_dataset(),
# which is used in the guts of a lot of operations (e.g., reindex)
variables = self._coords.copy()
Expand Down
69 changes: 43 additions & 26 deletions xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@

from . import nputils
from . import ops
from .alignment import broadcast
from .combine import concat
from .common import (
ImplementsArrayReduce, ImplementsDatasetReduce, _maybe_promote,
)
from .merge import merge
from .pycompat import zip
from .utils import peek_at, maybe_wrap_array, safe_cast_to_index
from .variable import as_variable, Variable, Coordinate
Expand Down Expand Up @@ -131,7 +133,7 @@ class GroupBy(object):
DataArray.groupby
"""
def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
cut_kwargs={}):
cut_kwargs={}):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😳 these PEP8 violations are from my PR. Sorry! I have since started linting...

"""Create a GroupBy object

Parameters
Expand All @@ -152,44 +154,69 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
cut_kwargs : dict, optional
Extra keyword arguments to pass to `pandas.cut`
"""
from .dataset import as_dataset
from .dataset import Dataset
from .dataarray import DataArray

if getattr(group, 'name', None) is None:
raise ValueError('`group` must have a name')
def check_valid_group(group_obj):
    """Validate that *group_obj* is a usable grouping object.

    Raises TypeError when it is not a DataArray or Variable, and
    ValueError when it carries no ``name``.
    """
    valid_types = (DataArray, Variable)
    if not isinstance(group_obj, valid_types):
        raise TypeError('`group` must be a DataArray, Variable or list '
                        'of DataArrays and/or Variables')
    name = getattr(group_obj, 'name', None)
    if name is None:
        raise ValueError('each item in `group` must have a name')

def is_monotonic_unique(group_obj):
    """Return True if *group_obj*'s values form a monotonic, unique index.

    Bug fix: the original body called ``safe_cast_to_index(group)``,
    closing over the enclosing scope's ``group`` variable and silently
    ignoring the ``group_obj`` parameter. Since ``group`` may have been
    reassigned by the time this helper runs, the check could test the
    wrong object. Use the parameter, as the name and call site intend.
    """
    index = safe_cast_to_index(group_obj)
    return index.is_monotonic and index.is_unique

if grouper is not None and bins is not None:
raise TypeError("Can't specify both `grouper` and `bins`.")

if isinstance(group, (list, tuple)):
for g in group:
check_valid_group(g)
group = merge(group)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm trying to understand what happens here if the group is a list of dimension names (e.g. group=['x', 'y']). What will merge return?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind...I get it now. By this point, all the items in the list should already be DataArrays or Variables. Still not sure I can visualize what merge is doing though.

else:
check_valid_group(group)

self._stacked_dim = None
if group.ndim != 1:
if len(group.dims) > 1:
# try to stack the dims of the group into a single dim
# TODO: figure out how to exclude dimensions from the stacking
# (e.g. group over space dims but leave time dim intact)
orig_dims = group.dims
orig_dims = tuple(group.dims)
stacked_dim_name = 'stacked_' + '_'.join(orig_dims)

# the copy is necessary here, otherwise read only array raises error
# in pandas: https://github.com/pydata/pandas/issues/12813
group = group.stack(**{stacked_dim_name: orig_dims}).copy()

obj = obj.stack(**{stacked_dim_name: orig_dims})
self._stacked_dim = stacked_dim_name
self._unstacked_dims = orig_dims
if not hasattr(group, 'dims'):
raise ValueError("`group` must have a 'dims' attribute")

if isinstance(group, Dataset):
# list or tuple input is now a 1-dimensional Dataset
group_values = pd.MultiIndex.from_arrays(
[v.values for v in group.values()])
group = DataArray(group_values, group.coords, name='__joined__')

group_dim, = group.dims

try:
try: # Dataset
expected_size = obj.dims[group_dim]
except TypeError:
except TypeError: # DataArray
expected_size = obj.shape[obj.get_axis_num(group_dim)]
if group.size != expected_size:
raise ValueError('the group variable\'s length does not '
'match the length of this variable along its '
'dimension')
full_index = None

if grouper is not None and bins is not None:
raise TypeError("Can't specify both `grouper` and `bins`.")
if bins is not None:
binned = pd.cut(group.values, bins, **cut_kwargs)
new_dim_name = group.name + '_bins'
group = DataArray(binned, group.coords, name=new_dim_name)

if grouper is not None:
index = safe_cast_to_index(group)
if not index.is_monotonic:
Expand All @@ -205,13 +232,10 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
group_indices = ([slice(i, j) for i, j in zip(sbins[:-1], sbins[1:])] +
[slice(sbins[-1], None)])
unique_coord = Coordinate(group.name, first_items.index)
elif group.name in obj.dims and bins is None:
# assume that group already has sorted, unique values
# (if using bins, the group will have the same name as a dimension
# but different values)
if group.dims != (group.name,):
raise ValueError('`group` is required to be a coordinate if '
'`group.name` is a dimension in `obj`')
elif group.name in obj.dims and is_monotonic_unique(group):
# if group.dims != (group.name,):
# raise ValueError('`group` is required to be a coordinate if '
# '`group.name` is a dimension in `obj`')
group_indices = np.arange(group.size)
if not squeeze:
# group_indices = group_indices.reshape(-1, 1)
Expand Down Expand Up @@ -426,12 +450,6 @@ def lookup_order(dimension):
new_order = sorted(stacked.dims, key=lookup_order)
return stacked.transpose(*new_order)

def _restore_multiindex(self, combined):
if self._stacked_dim is not None and self._stacked_dim in combined.dims:
stacked_dim = self.group[self._stacked_dim]
combined[self._stacked_dim] = stacked_dim
return combined

def apply(self, func, shortcut=False, **kwargs):
"""Apply a function over each array in the group and concatenate them
together into a new array.
Expand Down Expand Up @@ -490,7 +508,6 @@ def _concat(self, applied, shortcut=False):
combined = _maybe_reorder(combined, concat_dim, positions)
if isinstance(combined, type(self.obj)):
combined = self._restore_dim_order(combined)
combined = self._restore_multiindex(combined)
return combined

def reduce(self, func, dim=None, axis=None, keep_attrs=False,
Expand Down
Loading