pydata
diff --git a/‎README.md
Lines changed: 2 additions & 1 deletion b/‎README.md
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/scidata/array_.py
Lines changed: 64 additions & 63 deletions b/‎src/scidata/array_.py
Lines changed: 64 additions & 63 deletions
diff --git a/‎src/scidata/common.py
Lines changed: 18 additions & 12 deletions b/‎src/scidata/common.py
Lines changed: 18 additions & 12 deletions
diff --git a/‎src/scidata/dataset.py
Lines changed: 9 additions & 2 deletions b/‎src/scidata/dataset.py
Lines changed: 9 additions & 2 deletions
@@ -11,7 +11,8 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.).
     but keeps ancillary variables and metadata intact.
   - Array broadcasting based on dimension names and coordinate indices
     instead of only shapes.
-  - Aggregate variables across dimensions or grouped by other variables.
+  - Flexible split-apply-combine functionality with the `Array.groupby` method
+    (patterned after [pandas][pandas]).
   - Fast label-based indexing and (limited) time-series functionality built on
     [pandas][pandas].
 
 
@@ -1,12 +1,14 @@
 import functools
 import warnings
 from collections import OrderedDict
+from itertools import izip
 
 import numpy as np
 
 import conventions
 import dataset
 import dataset_array
+import groupby
 import ops
 import utils
 from common import AbstractArray
@@ -350,6 +352,9 @@ def _collapse(self, f, dim, **kwargs):
                                         + ': ' + f.__name__)
         return new_var
 
+    def groupby(self, group_name, group_array, squeeze=True):
+        return groupby.GroupBy(self, group_name, group_array, squeeze=squeeze)
+
     def aggregate(self, func, new_dim_name, group_by, **kwargs):
         """Aggregate this variable by applying `func` to grouped elements
 
@@ -396,7 +401,7 @@ def aggregate(self, func, new_dim_name, group_by, **kwargs):
 
     @classmethod
     def from_stack(cls, variables, dimension='stacked_dimension',
-                   length=None):
+                   stacked_indexers=None, length=None, template=None):
         """Stack variables along a new or existing dimension to form a new
         variable
 
@@ -406,12 +411,13 @@ def from_stack(cls, variables, dimension='stacked_dimension',
             Arrays to stack together. Each variable is expected to have
             matching dimensions and shape except for along the stacked
             dimension.
-        dimension : str, optional
+        dimension : str or DatasetArray, optional
             Name of the dimension to stack along. This can either be a new
             dimension name, in which case it is added along axis=0, or an
             existing dimension name, in which case the location of the
             dimension is unchanged. Where to insert the new dimension is
             determined by the first variable.
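+        stacked_indexers : iterable of indexers, optional
+            Indexers, one per variable, giving where each variable's data is
+            placed along the stacked dimension. If not supplied, the
+            variables are placed in consecutive slices in iteration order.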
         length : int, optional
             Length of the new dimension. This is used to allocate the new data
             array for the stacked variable data before iterating over all
@@ -423,73 +429,68 @@ def from_stack(cls, variables, dimension='stacked_dimension',
             Stacked variable formed by stacking all the supplied variables
             along the new dimension.
         """
-        if length is None:
+        if not isinstance(dimension, basestring):
+            length = dimension.size
+            dimension, = dimension.dimensions
+
+        if length is None or stacked_indexers is None:
             # so much for lazy evaluation! we need to look at all the variables
-            # to figure out the dimensions of the stacked variable
+            # to figure out the indexers and/or dimensions of the stacked
+            # variable
             variables = list(variables)
-            length = 0
-            for var in variables:
+            steps = [var.shape[var.dimensions.index(dimension)]
+                     if dimension in var.dimensions else 1
+                     for var in variables]
+            if length is None:
+                length = sum(steps)
+            if stacked_indexers is None:
+                stacked_indexers = []
+                i = 0
+                for step in steps:
+                    stacked_indexers.append(slice(i, i + step))
+                    i += step
+                if i != length:
+                    raise ValueError('actual length of stacked variables '
+                                     'along %s is %r but expected length was '
+                                     '%s' % (dimension, i, length))
+
+        # initialize the stacked variable with empty data
+        first_var, variables = groupby.peek_at(variables)
+        if dimension in first_var.dimensions:
+            axis = first_var.dimensions.index(dimension)
+            shape = tuple(length if n == axis else s
+                          for n, s in enumerate(first_var.shape))
+            dims = first_var.dimensions
+        else:
+            axis = 0
+            shape = (length,) + first_var.shape
+            dims = (dimension,) + first_var.dimensions
+        attr = OrderedDict() if template is None else template.attributes
+
+        stacked = cls(dims, np.empty(shape, dtype=first_var.dtype), attr)
+        stacked.attributes.update(first_var.attributes)
+
+        alt_dims = tuple(d for d in dims if d != dimension)
+
+        # copy in the data from the variables
+        for var, indexer in izip(variables, stacked_indexers):
+            if template is None:
+                # do sanity checks if we don't have a template
                 if dimension in var.dimensions:
-                    axis = var.dimensions.index(dimension)
-                    length += var.shape[axis]
-                else:
-                    length += 1
-
-        # manually keep track of progress along
-        i = 0
-        for var in variables:
-            if i == 0:
-                # initialize the stacked variable with empty data
-                if dimension not in var.dimensions:
-                    shape = (length,) + var.shape
-                    dims = (dimension,) + var.dimensions
-                else:
-                    shape = tuple(length if d == dimension else s
-                                  for d, s in zip(var.dimensions, var.shape))
-                    dims = var.dimensions
-                stacked = cls(dims, np.empty(shape, dtype=var.dtype),
-                              var.attributes)
-                # required dimensions (including order) if we have any N - 1
-                # dimensional variables
-                alt_dims = tuple(d for d in dims if d != dimension)
-
-            if dimension in var.dimensions:
-                # transpose requires that the dimensions are equivalent
-                var = var.transpose(*stacked.dimensions)
-                axis = var.dimensions.index(dimension)
-                step = var.shape[axis]
-            elif var.dimensions == alt_dims:
-                step = 1
-            else:
-                raise ValueError('inconsistent dimensions')
-
-            if i + step > length:
-                raise ValueError('actual length of stacked variables along %s '
-                                 'is greater than expected length %s'
-                                 % (dimension, length))
-
-            indexer = tuple((slice(i, i + step) if step > 1 else i)
-                            if d == dimension else slice(None)
-                            for d in stacked.dimensions)
-            # by-pass variable indexing for possible speedup
-            stacked.data[indexer] = var.data
-            utils.remove_incompatible_items(stacked.attributes, var.attributes)
-            i += step
-
-        if i != length:
-            raise ValueError('actual length of stacked variables along %s is '
-                             '%s but expected length was %s'
-                             % (dimension, i, length))
+                    # transpose verifies that the dimensions are equivalent
+                    if var.dimensions != stacked.dimensions:
+                        var = var.transpose(*stacked.dimensions)
+                elif var.dimensions != alt_dims:
+                    raise ValueError('inconsistent dimensions')
+                utils.remove_incompatible_items(stacked.attributes,
+                                                var.attributes)
+
+            key = tuple(indexer if n == axis else slice(None)
+                        for n in range(stacked.ndim))
+            stacked.data[tuple(key)] = var.data
 
         return stacked
 
-    def apply(self, func, *args, **kwargs):
-        """Apply `func` with *args and **kwargs to this variable's data and
-        return the result as a new variable with the same dimensions
-        """
-        data = np.asarray(func(self.data, *args, **kwargs))
-        return type(self)(self.dimensions, data, self.attributes)
-
     def __array_wrap__(self, result):
         return type(self)(self.dimensions, result, self.attributes)
 
 
@@ -1,5 +1,20 @@
 
-class AbstractArray(object):
+class ImplementsCollapse(object):
+    @classmethod
+    def _collapse_method(cls, f, name=None, module=None):
+        def func(self, dimension=cls._collapse_dimension_default,
+                 axis=cls._collapse_axis_default, **kwargs):
+            return self.collapse(f, dimension, axis, **kwargs)
+        if name is None:
+            name = f.__name__
+        func.__name__ = name
+        func.__doc__ = cls._collapse_method_docstring.format(
+            name=('' if module is None else module + '.') + name,
+            cls=cls.__name__)
+        return func
+
+
+class AbstractArray(ImplementsCollapse):
     @property
     def dtype(self):
         return self._data.dtype
@@ -78,14 +93,5 @@ def T(self):
             indicated dimension(s) removed.
         """
 
-    @classmethod
-    def _collapse_method(cls, f, name=None, module=None):
-        def func(self, dimension=None, axis=None, **kwargs):
-            return self.collapse(f, dimension, axis, **kwargs)
-        if name is None:
-            name = f.__name__
-        func.__name__ = name
-        func.__doc__ = cls._collapse_method_docstring.format(
-            name=('' if module is None else module + '.') + name,
-            cls=cls.__name__)
-        return func
+    _collapse_dimension_default = None
+    _collapse_axis_default = None
@@ -309,7 +309,7 @@ def virtual_variables(self):
         for k in self._datetimeindices:
             for suffix in _DATETIMEINDEX_COMPONENTS + ['season']:
                 possible_vars.append('%s.%s' % (k, suffix))
-        return tuple(k for k in possible_vars if k not in self)
+        return tuple(k for k in possible_vars if k not in self.variables)
 
     def __getitem__(self, key):
         if key not in self.variables:
@@ -327,6 +327,9 @@ def __setitem__(self, key, value):
         # (We would need to change DatasetArray.__setitem__ in that case, because
         # we definitely don't want to override focus variables.)
         if isinstance(value, DatasetArray):
+            # print 'value was ', repr(value)
+            # print 'renamed to ', repr(value.renamed())
+            # print 'setting item', repr(value.renamed(key).dataset)
             self.merge(value.renamed(key).dataset, inplace=True)
         elif isinstance(value, array.Array):
             self.set_variable(key, value)
@@ -830,7 +833,8 @@ def unselect(self, *names, **kwargs):
         *names : str
             Names of the variables to omit from the returned object.
         omit_dimensions : bool, optional (default True)
-            Whether or not to also omit dimensions with the given names.
+            Whether or not to also omit dimensions with the given names. All
+            variables along omitted dimensions will also be removed.
 
         Returns
         -------
@@ -848,6 +852,9 @@ def unselect(self, *names, **kwargs):
             dimensions = OrderedDict((k, v) for k, v
                                      in self.dimensions.iteritems()
                                      if k not in names)
+            variables = OrderedDict((k, v) for k, v in variables.iteritems()
+                                    if all(d in dimensions
+                                           for d in v.dimensions))
             indices = {k: v for k, v in self.indices.cache.items()
                        if k not in names}
         else: