Skip to content

Commit ad9a913

Browse files
committed
Array.groupby
We should probably remove Array.aggregate to reduce confusion, but for now I'll keep aggregate for error checks.
1 parent 4cd1361 commit ad9a913

11 files changed

+505
-137
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.).
1111
but keeps ancillary variables and metadata intact.
1212
- Array broadcasting based on dimension names and coordinate indices
1313
instead of only shapes.
14-
- Aggregate variables across dimensions or grouped by other variables.
14+
- Flexible split-apply-combine functionality with the `Array.groupby` method
15+
(patterned after [pandas][pandas]).
1516
- Fast label-based indexing and (limited) time-series functionality built on
1617
[pandas][pandas].
1718

src/scidata/array_.py

Lines changed: 64 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import functools
22
import warnings
33
from collections import OrderedDict
4+
from itertools import izip
45

56
import numpy as np
67

78
import conventions
89
import dataset
910
import dataset_array
11+
import groupby
1012
import ops
1113
import utils
1214
from common import AbstractArray
@@ -350,6 +352,9 @@ def _collapse(self, f, dim, **kwargs):
350352
+ ': ' + f.__name__)
351353
return new_var
352354

355+
def groupby(self, group_name, group_array, squeeze=True):
356+
return groupby.GroupBy(self, group_name, group_array, squeeze=squeeze)
357+
353358
def aggregate(self, func, new_dim_name, group_by, **kwargs):
354359
"""Aggregate this variable by applying `func` to grouped elements
355360
@@ -396,7 +401,7 @@ def aggregate(self, func, new_dim_name, group_by, **kwargs):
396401

397402
@classmethod
398403
def from_stack(cls, variables, dimension='stacked_dimension',
399-
length=None):
404+
stacked_indexers=None, length=None, template=None):
400405
"""Stack variables along a new or existing dimension to form a new
401406
variable
402407
@@ -406,12 +411,13 @@ def from_stack(cls, variables, dimension='stacked_dimension',
406411
Arrays to stack together. Each variable is expected to have
407412
matching dimensions and shape except for along the stacked
408413
dimension.
409-
dimension : str, optional
414+
dimension : str or DatasetArray, optional
410415
Name of the dimension to stack along. This can either be a new
411416
dimension name, in which case it is added along axis=0, or an
412417
existing dimension name, in which case the location of the
413418
dimension is unchanged. Where to insert the new dimension is
414419
determined by the first variable.
420+
stacked_indexers : iterable of indexers, optional
415421
length : int, optional
416422
Length of the new dimension. This is used to allocate the new data
417423
array for the stacked variable data before iterating over all
@@ -423,73 +429,68 @@ def from_stack(cls, variables, dimension='stacked_dimension',
423429
Stacked variable formed by stacking all the supplied variables
424430
along the new dimension.
425431
"""
426-
if length is None:
432+
if not isinstance(dimension, basestring):
433+
length = dimension.size
434+
dimension, = dimension.dimensions
435+
436+
if length is None or stacked_indexers is None:
427437
# so much for lazy evaluation! we need to look at all the variables
428-
# to figure out the dimensions of the stacked variable
438+
# to figure out the indexers and/or dimensions of the stacked
439+
# variable
429440
variables = list(variables)
430-
length = 0
431-
for var in variables:
441+
steps = [var.shape[var.dimensions.index(dimension)]
442+
if dimension in var.dimensions else 1
443+
for var in variables]
444+
if length is None:
445+
length = sum(steps)
446+
if stacked_indexers is None:
447+
stacked_indexers = []
448+
i = 0
449+
for step in steps:
450+
stacked_indexers.append(slice(i, i + step))
451+
i += step
452+
if i != length:
453+
raise ValueError('actual length of stacked variables '
454+
'along %s is %r but expected length was '
455+
'%s' % (dimension, i, length))
456+
457+
# initialize the stacked variable with empty data
458+
first_var, variables = groupby.peek_at(variables)
459+
if dimension in first_var.dimensions:
460+
axis = first_var.dimensions.index(dimension)
461+
shape = tuple(length if n == axis else s
462+
for n, s in enumerate(first_var.shape))
463+
dims = first_var.dimensions
464+
else:
465+
axis = 0
466+
shape = (length,) + first_var.shape
467+
dims = (dimension,) + first_var.dimensions
468+
attr = OrderedDict() if template is None else template.attributes
469+
470+
stacked = cls(dims, np.empty(shape, dtype=first_var.dtype), attr)
471+
stacked.attributes.update(first_var.attributes)
472+
473+
alt_dims = tuple(d for d in dims if d != dimension)
474+
475+
# copy in the data from the variables
476+
for var, indexer in izip(variables, stacked_indexers):
477+
if template is None:
478+
# do sanity checks if we don't have a template
432479
if dimension in var.dimensions:
433-
axis = var.dimensions.index(dimension)
434-
length += var.shape[axis]
435-
else:
436-
length += 1
437-
438-
# manually keep track of progress along
439-
i = 0
440-
for var in variables:
441-
if i == 0:
442-
# initialize the stacked variable with empty data
443-
if dimension not in var.dimensions:
444-
shape = (length,) + var.shape
445-
dims = (dimension,) + var.dimensions
446-
else:
447-
shape = tuple(length if d == dimension else s
448-
for d, s in zip(var.dimensions, var.shape))
449-
dims = var.dimensions
450-
stacked = cls(dims, np.empty(shape, dtype=var.dtype),
451-
var.attributes)
452-
# required dimensions (including order) if we have any N - 1
453-
# dimensional variables
454-
alt_dims = tuple(d for d in dims if d != dimension)
455-
456-
if dimension in var.dimensions:
457-
# transpose requires that the dimensions are equivalent
458-
var = var.transpose(*stacked.dimensions)
459-
axis = var.dimensions.index(dimension)
460-
step = var.shape[axis]
461-
elif var.dimensions == alt_dims:
462-
step = 1
463-
else:
464-
raise ValueError('inconsistent dimensions')
465-
466-
if i + step > length:
467-
raise ValueError('actual length of stacked variables along %s '
468-
'is greater than expected length %s'
469-
% (dimension, length))
470-
471-
indexer = tuple((slice(i, i + step) if step > 1 else i)
472-
if d == dimension else slice(None)
473-
for d in stacked.dimensions)
474-
# by-pass variable indexing for possible speedup
475-
stacked.data[indexer] = var.data
476-
utils.remove_incompatible_items(stacked.attributes, var.attributes)
477-
i += step
478-
479-
if i != length:
480-
raise ValueError('actual length of stacked variables along %s is '
481-
'%s but expected length was %s'
482-
% (dimension, i, length))
480+
# transpose verifies that the dimensions are equivalent
481+
if var.dimensions != stacked.dimensions:
482+
var = var.transpose(*stacked.dimensions)
483+
elif var.dimensions != alt_dims:
484+
raise ValueError('inconsistent dimensions')
485+
utils.remove_incompatible_items(stacked.attributes,
486+
var.attributes)
487+
488+
key = tuple(indexer if n == axis else slice(None)
489+
for n in range(stacked.ndim))
490+
stacked.data[tuple(key)] = var.data
483491

484492
return stacked
485493

486-
def apply(self, func, *args, **kwargs):
487-
"""Apply `func` with *args and **kwargs to this variable's data and
488-
return the result as a new variable with the same dimensions
489-
"""
490-
data = np.asarray(func(self.data, *args, **kwargs))
491-
return type(self)(self.dimensions, data, self.attributes)
492-
493494
def __array_wrap__(self, result):
494495
return type(self)(self.dimensions, result, self.attributes)
495496

src/scidata/common.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
11

2-
class AbstractArray(object):
2+
class ImplementsCollapse(object):
3+
@classmethod
4+
def _collapse_method(cls, f, name=None, module=None):
5+
def func(self, dimension=cls._collapse_dimension_default,
6+
axis=cls._collapse_axis_default, **kwargs):
7+
return self.collapse(f, dimension, axis, **kwargs)
8+
if name is None:
9+
name = f.__name__
10+
func.__name__ = name
11+
func.__doc__ = cls._collapse_method_docstring.format(
12+
name=('' if module is None else module + '.') + name,
13+
cls=cls.__name__)
14+
return func
15+
16+
17+
class AbstractArray(ImplementsCollapse):
318
@property
419
def dtype(self):
520
return self._data.dtype
@@ -78,14 +93,5 @@ def T(self):
7893
indicated dimension(s) removed.
7994
"""
8095

81-
@classmethod
82-
def _collapse_method(cls, f, name=None, module=None):
83-
def func(self, dimension=None, axis=None, **kwargs):
84-
return self.collapse(f, dimension, axis, **kwargs)
85-
if name is None:
86-
name = f.__name__
87-
func.__name__ = name
88-
func.__doc__ = cls._collapse_method_docstring.format(
89-
name=('' if module is None else module + '.') + name,
90-
cls=cls.__name__)
91-
return func
96+
_collapse_dimension_default = None
97+
_collapse_axis_default = None

src/scidata/dataset.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ def virtual_variables(self):
309309
for k in self._datetimeindices:
310310
for suffix in _DATETIMEINDEX_COMPONENTS + ['season']:
311311
possible_vars.append('%s.%s' % (k, suffix))
312-
return tuple(k for k in possible_vars if k not in self)
312+
return tuple(k for k in possible_vars if k not in self.variables)
313313

314314
def __getitem__(self, key):
315315
if key not in self.variables:
@@ -327,6 +327,9 @@ def __setitem__(self, key, value):
327327
# (We would need to change DatasetArray.__setitem__ in that case, because
328328
# we definitely don't want to override focus variables.)
329329
if isinstance(value, DatasetArray):
330+
# print 'value was ', repr(value)
331+
# print 'renamed to ', repr(value.renamed())
332+
# print 'setting item', repr(value.renamed(key).dataset)
330333
self.merge(value.renamed(key).dataset, inplace=True)
331334
elif isinstance(value, array.Array):
332335
self.set_variable(key, value)
@@ -830,7 +833,8 @@ def unselect(self, *names, **kwargs):
830833
*names : str
831834
Names of the variables to omit from the returned object.
832835
omit_dimensions : bool, optional (default True)
833-
Whether or not to also omit dimensions with the given names.
836+
Whether or not to also omit dimensions with the given names. All
837+
variables along omitted dimensions will also be removed.
834838
835839
Returns
836840
-------
@@ -848,6 +852,9 @@ def unselect(self, *names, **kwargs):
848852
dimensions = OrderedDict((k, v) for k, v
849853
in self.dimensions.iteritems()
850854
if k not in names)
855+
variables = OrderedDict((k, v) for k, v in variables.iteritems()
856+
if all(d in dimensions
857+
for d in v.dimensions))
851858
indices = {k: v for k, v in self.indices.cache.items()
852859
if k not in names}
853860
else:

0 commit comments

Comments
 (0)