Skip to content

Commit da7972c

Browse files
authored
Merge pull request #1615 from shoyer/apply-ufunc-vectorize
Add vectorize=True option to apply_ufunc
2 parents 6390230 + 0331c5c commit da7972c

File tree

2 files changed

+93
-5
lines changed

2 files changed

+93
-5
lines changed

xarray/core/computation.py

+58-5
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,28 @@ def __repr__(self):
8888
list(self.input_core_dims),
8989
list(self.output_core_dims)))
9090

91+
def __str__(self):
92+
lhs = ','.join('({})'.format(','.join(dims))
93+
for dims in self.input_core_dims)
94+
rhs = ','.join('({})'.format(','.join(dims))
95+
for dims in self.output_core_dims)
96+
return '{}->{}'.format(lhs, rhs)
97+
98+
def to_gufunc_string(self):
99+
"""Create an equivalent signature string for a NumPy gufunc.
100+
101+
Unlike __str__, handles dimensions that don't map to Python
102+
identifiers.
103+
"""
104+
all_dims = self.all_core_dims
105+
dims_map = dict(zip(sorted(all_dims), range(len(all_dims))))
106+
input_core_dims = [['dim%d' % dims_map[dim] for dim in core_dims]
107+
for core_dims in self.input_core_dims]
108+
output_core_dims = [['dim%d' % dims_map[dim] for dim in core_dims]
109+
for core_dims in self.output_core_dims]
110+
alt_signature = type(self)(input_core_dims, output_core_dims)
111+
return str(alt_signature)
112+
91113

92114
def result_name(objects):
93115
# type: List[object] -> Any
@@ -636,6 +658,7 @@ def apply_ufunc(func, *args, **kwargs):
636658
input_core_dims : Optional[Sequence[Sequence]] = None,
637659
output_core_dims : Optional[Sequence[Sequence]] = ((),),
638660
exclude_dims : Collection = frozenset(),
661+
vectorize : bool = False,
639662
join : str = 'exact',
640663
dataset_join : str = 'exact',
641664
dataset_fill_value : Any = _NO_FILL_VALUE,
@@ -659,8 +682,9 @@ def apply_ufunc(func, *args, **kwargs):
659682
(``.data``) that returns an array or tuple of arrays. If multiple
660683
arguments with non-matching dimensions are supplied, this function is
661684
expected to vectorize (broadcast) over axes of positional arguments in
662-
the style of NumPy universal functions [1]_. If this function returns
663-
multiple outputs, you most set ``output_core_dims`` as well.
685+
the style of NumPy universal functions [1]_ (if this is not the case,
686+
set ``vectorize=True``). If this function returns multiple outputs, you
687+
must set ``output_core_dims`` as well.
664688
*args : Dataset, DataArray, GroupBy, Variable, numpy/dask arrays or scalars
665689
Mix of labeled and/or unlabeled arrays to which to apply the function.
666690
input_core_dims : Sequence[Sequence], optional
@@ -689,6 +713,12 @@ def apply_ufunc(func, *args, **kwargs):
689713
broadcasting entirely. Any input coordinates along these dimensions
690714
will be dropped. Each excluded dimension must also appear in
691715
``input_core_dims`` for at least one argument.
716+
vectorize : bool, optional
717+
If True, then assume ``func`` only takes arrays defined over core
718+
dimensions as input and vectorize it automatically with
719+
:py:func:`numpy.vectorize`. This option exists for convenience, but is
720+
almost always slower than supplying a pre-vectorized function.
721+
Using this option requires NumPy version 1.12 or newer.
692722
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
693723
Method for joining the indexes of the passed objects along each
694724
dimension, and the variables of Dataset objects with mismatched
@@ -779,15 +809,31 @@ def stack(objects, dim, new_coord):
779809
result[dim] = new_coord
780810
return result
781811
812+
If your function is not vectorized but can be applied only to core
813+
dimensions, you can use ``vectorize=True`` to turn into a vectorized
814+
function. This wraps :py:func:`numpy.vectorize`, so the operation isn't
815+
terribly fast. Here we'll use it to calculate the distance between
816+
empirical samples from two probability distributions, using a scipy
817+
function that needs to be applied to vectors::
818+
819+
import scipy.stats
820+
821+
def earth_mover_distance(first_samples,
822+
second_samples,
823+
dim='ensemble'):
824+
return apply_ufunc(scipy.stats.wasserstein_distance,
825+
first_samples, second_samples,
826+
input_core_dims=[[dim], [dim]],
827+
vectorize=True)
828+
782829
Most of NumPy's builtin functions already broadcast their inputs
783830
appropriately for use in `apply`. You may find helper functions such as
784-
numpy.broadcast_arrays or numpy.vectorize helpful in writing your function.
785-
`apply_ufunc` also works well with numba's vectorize and guvectorize.
831+
numpy.broadcast_arrays helpful in writing your function. `apply_ufunc` also
832+
works well with numba's vectorize and guvectorize.
786833
787834
See also
788835
--------
789836
numpy.broadcast_arrays
790-
numpy.vectorize
791837
numba.vectorize
792838
numba.guvectorize
793839
@@ -802,6 +848,7 @@ def stack(objects, dim, new_coord):
802848

803849
input_core_dims = kwargs.pop('input_core_dims', None)
804850
output_core_dims = kwargs.pop('output_core_dims', ((),))
851+
vectorize = kwargs.pop('vectorize', False)
805852
join = kwargs.pop('join', 'exact')
806853
dataset_join = kwargs.pop('dataset_join', 'exact')
807854
keep_attrs = kwargs.pop('keep_attrs', False)
@@ -827,6 +874,12 @@ def stack(objects, dim, new_coord):
827874
if kwargs_:
828875
func = functools.partial(func, **kwargs_)
829876

877+
if vectorize:
878+
func = np.vectorize(func,
879+
otypes=output_dtypes,
880+
signature=signature.to_gufunc_string(),
881+
excluded=set(kwargs))
882+
830883
variables_ufunc = functools.partial(apply_variable_ufunc, func,
831884
signature=signature,
832885
exclude_dims=exclude_dims,

xarray/tests/test_computation.py

+35
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import operator
33
from collections import OrderedDict
44

5+
from distutils.version import LooseVersion
56
import numpy as np
67
from numpy.testing import assert_array_equal
8+
import pandas as pd
79

810
import pytest
911

@@ -32,6 +34,8 @@ def test_signature_properties():
3234
assert sig.all_output_core_dims == frozenset(['z'])
3335
assert sig.num_inputs == 2
3436
assert sig.num_outputs == 1
37+
assert str(sig) == '(x),(x,y)->(z)'
38+
assert sig.to_gufunc_string() == '(dim0),(dim0,dim1)->(dim2)'
3539
# dimension names matter
3640
assert _UFuncSignature([['x']]) != _UFuncSignature([['y']])
3741

@@ -675,6 +679,37 @@ def func(x):
675679
assert_identical(expected, actual)
676680

677681

682+
def pandas_median(x):
683+
return pd.Series(x).median()
684+
685+
686+
def test_vectorize():
687+
if LooseVersion(np.__version__) < LooseVersion('1.12.0'):
688+
pytest.skip('numpy 1.12 or later to support vectorize=True.')
689+
690+
data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=('x', 'y'))
691+
expected = xr.DataArray([1, 2], dims=['x'])
692+
actual = apply_ufunc(pandas_median, data_array,
693+
input_core_dims=[['y']],
694+
vectorize=True)
695+
assert_identical(expected, actual)
696+
697+
698+
@requires_dask
699+
def test_vectorize_dask():
700+
if LooseVersion(np.__version__) < LooseVersion('1.12.0'):
701+
pytest.skip('numpy 1.12 or later to support vectorize=True.')
702+
703+
data_array = xr.DataArray([[0, 1, 2], [1, 2, 3]], dims=('x', 'y'))
704+
expected = xr.DataArray([1, 2], dims=['x'])
705+
actual = apply_ufunc(pandas_median, data_array.chunk({'x': 1}),
706+
input_core_dims=[['y']],
707+
vectorize=True,
708+
dask='parallelized',
709+
output_dtypes=[float])
710+
assert_identical(expected, actual)
711+
712+
678713
def test_where():
679714
cond = xr.DataArray([True, False], dims='x')
680715
actual = xr.where(cond, 1, 0)

0 commit comments

Comments
 (0)