Merge pull request #1615 from shoyer/apply-ufunc-vectorize

shoyer · web-flow · commit da7972c6bb8e · 2017-10-27T16:43:46.000-07:00
Add vectorize=True option to apply_ufunc
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
@@ -88,6 +88,28 @@ def __repr__(self):
                    list(self.input_core_dims),
                    list(self.output_core_dims)))
 
+    def __str__(self):
+        lhs = ','.join('({})'.format(','.join(dims))
+                       for dims in self.input_core_dims)
+        rhs = ','.join('({})'.format(','.join(dims))
+                       for dims in self.output_core_dims)
+        return '{}->{}'.format(lhs, rhs)
+
+    def to_gufunc_string(self):
+        """Create an equivalent signature string for a NumPy gufunc.
+
+        Unlike __str__, handles dimension names that are not valid Python
+        identifiers by renaming each to a placeholder of the form ``dimN``.
+        """
+        all_dims = self.all_core_dims
+        dims_map = dict(zip(sorted(all_dims), range(len(all_dims))))
+        input_core_dims = [['dim%d' % dims_map[dim] for dim in core_dims]
+                           for core_dims in self.input_core_dims]
+        output_core_dims = [['dim%d' % dims_map[dim] for dim in core_dims]
+                            for core_dims in self.output_core_dims]
+        alt_signature = type(self)(input_core_dims, output_core_dims)
+        return str(alt_signature)
+
 
 def result_name(objects):
     # type: List[object] -> Any
@@ -636,6 +658,7 @@ def apply_ufunc(func, *args, **kwargs):
                    input_core_dims : Optional[Sequence[Sequence]] = None,
                    output_core_dims : Optional[Sequence[Sequence]] = ((),),
                    exclude_dims : Collection = frozenset(),
+                   vectorize : bool = False,
                    join : str = 'exact',
                    dataset_join : str = 'exact',
                    dataset_fill_value : Any = _NO_FILL_VALUE,
@@ -659,8 +682,9 @@ def apply_ufunc(func, *args, **kwargs):
         (``.data``) that returns an array or tuple of arrays. If multiple
         arguments with non-matching dimensions are supplied, this function is
         expected to vectorize (broadcast) over axes of positional arguments in
-        the style of NumPy universal functions [1]_. If this function returns
-        multiple outputs, you most set ``output_core_dims`` as well.
+        the style of NumPy universal functions [1]_ (if this is not the case,
+        set ``vectorize=True``). If this function returns multiple outputs, you
+        must set ``output_core_dims`` as well.
     *args : Dataset, DataArray, GroupBy, Variable, numpy/dask arrays or scalars
         Mix of labeled and/or unlabeled arrays to which to apply the function.
     input_core_dims : Sequence[Sequence], optional
@@ -689,6 +713,12 @@ def apply_ufunc(func, *args, **kwargs):
         broadcasting entirely. Any input coordinates along these dimensions
         will be dropped. Each excluded dimension must also appear in
         ``input_core_dims`` for at least one argument.
+    vectorize : bool, optional
+        If True, then assume ``func`` only takes arrays defined over core
+        dimensions as input and vectorize it automatically with
+        :py:func:`numpy.vectorize`. This option exists for convenience, but is
+        almost always slower than supplying a pre-vectorized function.
+        Using this option requires NumPy version 1.12 or newer.
     join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
         Method for joining the indexes of the passed objects along each
         dimension, and the variables of Dataset objects with mismatched
@@ -779,15 +809,31 @@ def stack(objects, dim, new_coord):
             result[dim] = new_coord
             return result
 
+    If your function is not vectorized but can be applied only to core
+    dimensions, you can use ``vectorize=True`` to turn it into a vectorized
+    function. This wraps :py:func:`numpy.vectorize`, so the operation isn't
+    terribly fast. Here we'll use it to calculate the distance between
+    empirical samples from two probability distributions, using a scipy
+    function that needs to be applied to vectors::
+
+        import scipy.stats
+
+        def earth_mover_distance(first_samples,
+                                 second_samples,
+                                 dim='ensemble'):
+            return apply_ufunc(scipy.stats.wasserstein_distance,
+                               first_samples, second_samples,
+                               input_core_dims=[[dim], [dim]],
+                               vectorize=True)
+
     Most of NumPy's builtin functions already broadcast their inputs
     appropriately for use in `apply`. You may find helper functions such as
-    numpy.broadcast_arrays or numpy.vectorize helpful in writing your function.
-    `apply_ufunc` also works well with numba's vectorize and guvectorize.
+    numpy.broadcast_arrays helpful in writing your function. `apply_ufunc` also
+    works well with numba's vectorize and guvectorize.
 
     See also
     --------
     numpy.broadcast_arrays
-    numpy.vectorize
     numba.vectorize
     numba.guvectorize
 
@@ -802,6 +848,7 @@ def stack(objects, dim, new_coord):
 
     input_core_dims = kwargs.pop('input_core_dims', None)
     output_core_dims = kwargs.pop('output_core_dims', ((),))
+    vectorize = kwargs.pop('vectorize', False)
     join = kwargs.pop('join', 'exact')
     dataset_join = kwargs.pop('dataset_join', 'exact')
     keep_attrs = kwargs.pop('keep_attrs', False)
@@ -827,6 +874,12 @@ def stack(objects, dim, new_coord):
     if kwargs_:
         func = functools.partial(func, **kwargs_)
 
+    if vectorize:
+        func = np.vectorize(func,
+                            otypes=output_dtypes,
+                            signature=signature.to_gufunc_string(),
+                            excluded=set(kwargs))
+
     variables_ufunc = functools.partial(apply_variable_ufunc, func,
                                         signature=signature,
                                         exclude_dims=exclude_dims,
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
@@ -2,8 +2,10 @@
 import operator
 from collections import OrderedDict
 
+from distutils.version import LooseVersion
 import numpy as np
 from numpy.testing import assert_array_equal
+import pandas as pd
 
 import pytest
 
@@ -32,6 +34,8 @@ def test_signature_properties():
     assert sig.all_output_core_dims == frozenset(['z'])
     assert sig.num_inputs == 2
     assert sig.num_outputs == 1
+    assert str(sig) == '(x),(x,y)->(z)'
+    assert sig.to_gufunc_string() == '(dim0),(dim0,dim1)->(dim2)'
     # dimension names matter
     assert _UFuncSignature([['x']]) != _UFuncSignature([['y']])
 
@@ -675,6 +679,37 @@ def func(x):
     assert_identical(expected, actual)
 
 
+def pandas_median(x):
+    return pd.Series(x).median()
+
+
+def test_vectorize():
+    if LooseVersion(np.__version__) < LooseVersion('1.12.0'):
+        pytest.skip('numpy 1.12 or later to support vectorize=True.')
+
+    data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=('x', 'y'))
+    expected = xr.DataArray([1, 2], dims=['x'])
+    actual = apply_ufunc(pandas_median, data_array,
+                         input_core_dims=[['y']],
+                         vectorize=True)
+    assert_identical(expected, actual)
+
+
+@requires_dask
+def test_vectorize_dask():
+    if LooseVersion(np.__version__) < LooseVersion('1.12.0'):
+        pytest.skip('numpy 1.12 or later to support vectorize=True.')
+
+    data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=('x', 'y'))
+    expected = xr.DataArray([1, 2], dims=['x'])
+    actual = apply_ufunc(pandas_median, data_array.chunk({'x': 1}),
+                         input_core_dims=[['y']],
+                         vectorize=True,
+                         dask='parallelized',
+                         output_dtypes=[float])
+    assert_identical(expected, actual)
+
+
 def test_where():
     cond = xr.DataArray([True, False], dims='x')
     actual = xr.where(cond, 1, 0)