Skip to content

Commit a0c71c1

Browse files
authored
Faster unstacking (#4746)
* Significantly improve unstacking performance * Hack to get sparse tests passing * Use the existing unstack function for dask & sparse * Add whatsnew * Require numpy 1.17 for new unstack * Also special case pint * Revert "Also special case pint" This reverts commit b33aded. * Only run fast unstack on numpy arrays * Update asvs for unstacking * Update whatsnew
1 parent d555172 commit a0c71c1

File tree

4 files changed

+153
-12
lines changed

4 files changed

+153
-12
lines changed

asv_bench/benchmarks/unstacking.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,23 @@
77

88
class Unstacking:
99
def setup(self):
10-
data = np.random.RandomState(0).randn(1, 1000, 500)
11-
self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
10+
data = np.random.RandomState(0).randn(500, 1000)
11+
self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
12+
self.da_missing = self.da_full[:-1]
13+
self.df_missing = self.da_missing.to_pandas()
1214

1315
def time_unstack_fast(self):
14-
self.ds.unstack("flat_dim")
16+
self.da_full.unstack("flat_dim")
1517

1618
def time_unstack_slow(self):
17-
self.ds[:, ::-1].unstack("flat_dim")
19+
self.da_missing.unstack("flat_dim")
20+
21+
def time_unstack_pandas_slow(self):
22+
self.df_missing.unstack()
1823

1924

2025
class UnstackingDask(Unstacking):
2126
def setup(self, *args, **kwargs):
2227
requires_dask()
2328
super().setup(**kwargs)
24-
self.ds = self.ds.chunk({"flat_dim": 50})
29+
self.da_full = self.da_full.chunk({"flat_dim": 50})

doc/whats-new.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ What's New
1717
1818
.. _whats-new.0.16.3:
1919

20-
v0.16.3 (unreleased)
20+
v0.17.0 (unreleased)
2121
--------------------
2222

2323
Breaking changes
@@ -45,6 +45,11 @@ Breaking changes
4545

4646
New Features
4747
~~~~~~~~~~~~
48+
- Significantly higher ``unstack`` performance on numpy-backed arrays which
49+
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
50+
(:pull:`4746`);
51+
By `Maximilian Roos <https://github.com/max-sixty>`_.
52+
4853
- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
4954
By `Deepak Cherian <https://github.com/dcherian>`_.
5055
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims

xarray/core/dataset.py

+72-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import sys
55
import warnings
66
from collections import defaultdict
7+
from distutils.version import LooseVersion
78
from html import escape
89
from numbers import Number
910
from operator import methodcaller
@@ -79,7 +80,7 @@
7980
)
8081
from .missing import get_clean_interp_index
8182
from .options import OPTIONS, _get_keep_attrs
82-
from .pycompat import is_duck_dask_array
83+
from .pycompat import is_duck_dask_array, sparse_array_type
8384
from .utils import (
8485
Default,
8586
Frozen,
@@ -3715,7 +3716,40 @@ def ensure_stackable(val):
37153716

37163717
return data_array
37173718

3718-
def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset":
3719+
def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
3720+
index = self.get_index(dim)
3721+
index = remove_unused_levels_categories(index)
3722+
3723+
variables: Dict[Hashable, Variable] = {}
3724+
indexes = {k: v for k, v in self.indexes.items() if k != dim}
3725+
3726+
for name, var in self.variables.items():
3727+
if name != dim:
3728+
if dim in var.dims:
3729+
if isinstance(fill_value, Mapping):
3730+
fill_value_ = fill_value[name]
3731+
else:
3732+
fill_value_ = fill_value
3733+
3734+
variables[name] = var._unstack_once(
3735+
index=index, dim=dim, fill_value=fill_value_
3736+
)
3737+
else:
3738+
variables[name] = var
3739+
3740+
for name, lev in zip(index.names, index.levels):
3741+
variables[name] = IndexVariable(name, lev)
3742+
indexes[name] = lev
3743+
3744+
coord_names = set(self._coord_names) - {dim} | set(index.names)
3745+
3746+
return self._replace_with_new_dims(
3747+
variables, coord_names=coord_names, indexes=indexes
3748+
)
3749+
3750+
def _unstack_full_reindex(
3751+
self, dim: Hashable, fill_value, sparse: bool
3752+
) -> "Dataset":
37193753
index = self.get_index(dim)
37203754
index = remove_unused_levels_categories(index)
37213755
full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
@@ -3812,7 +3846,38 @@ def unstack(
38123846

38133847
result = self.copy(deep=False)
38143848
for dim in dims:
3815-
result = result._unstack_once(dim, fill_value, sparse)
3849+
3850+
if (
3851+
# Dask arrays don't support assignment by index, which the fast unstack
3852+
# function requires.
3853+
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
3854+
any(is_duck_dask_array(v.data) for v in self.variables.values())
3855+
# Sparse doesn't currently support this (though we could special-case
3856+
# it)
3857+
# https://github.com/pydata/sparse/issues/422
3858+
or any(
3859+
isinstance(v.data, sparse_array_type)
3860+
for v in self.variables.values()
3861+
)
3862+
or sparse
3863+
# numpy `full_like` only added the `shape` argument in 1.17
3864+
or LooseVersion(np.__version__) < LooseVersion("1.17")
3865+
# Until https://github.com/pydata/xarray/pull/4751 is resolved,
3866+
# we check explicitly whether it's a numpy array. Once that is
3867+
# resolved, explicitly exclude pint arrays.
3868+
# # pint doesn't implement `np.full_like` in a way that's
3869+
# # currently compatible.
3870+
# # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
3871+
# # or any(
3872+
# # isinstance(v.data, pint_array_type) for v in self.variables.values()
3873+
# # )
3874+
or any(
3875+
not isinstance(v.data, np.ndarray) for v in self.variables.values()
3876+
)
3877+
):
3878+
result = result._unstack_full_reindex(dim, fill_value, sparse)
3879+
else:
3880+
result = result._unstack_once(dim, fill_value)
38163881
return result
38173882

38183883
def update(self, other: "CoercibleMapping") -> "Dataset":
@@ -4982,6 +5047,10 @@ def _set_numpy_data_from_dataframe(
49825047
self[name] = (dims, values)
49835048
return
49845049

5050+
# NB: similar, more general logic, now exists in
5051+
# variable.unstack_once; we could consider combining them at some
5052+
# point.
5053+
49855054
shape = tuple(lev.size for lev in idx.levels)
49865055
indexer = tuple(idx.codes)
49875056

xarray/core/variable.py

+65-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
Any,
1111
Dict,
1212
Hashable,
13+
List,
1314
Mapping,
1415
Optional,
1516
Sequence,
@@ -1488,7 +1489,7 @@ def set_dims(self, dims, shape=None):
14881489
)
14891490
return expanded_var.transpose(*dims)
14901491

1491-
def _stack_once(self, dims, new_dim):
1492+
def _stack_once(self, dims: List[Hashable], new_dim: Hashable):
14921493
if not set(dims) <= set(self.dims):
14931494
raise ValueError("invalid existing dimensions: %s" % dims)
14941495

@@ -1544,7 +1545,15 @@ def stack(self, dimensions=None, **dimensions_kwargs):
15441545
result = result._stack_once(dims, new_dim)
15451546
return result
15461547

1547-
def _unstack_once(self, dims, old_dim):
1548+
def _unstack_once_full(
1549+
self, dims: Mapping[Hashable, int], old_dim: Hashable
1550+
) -> "Variable":
1551+
"""
1552+
Unstacks the variable without needing an index.
1553+
1554+
Unlike `_unstack_once`, this function requires the existing dimension to
1555+
contain the full product of the new dimensions.
1556+
"""
15481557
new_dim_names = tuple(dims.keys())
15491558
new_dim_sizes = tuple(dims.values())
15501559

@@ -1573,13 +1582,64 @@ def _unstack_once(self, dims, old_dim):
15731582

15741583
return Variable(new_dims, new_data, self._attrs, self._encoding, fastpath=True)
15751584

1585+
def _unstack_once(
1586+
self,
1587+
index: pd.MultiIndex,
1588+
dim: Hashable,
1589+
fill_value=dtypes.NA,
1590+
) -> "Variable":
1591+
"""
1592+
Unstacks this variable given an index to unstack and the name of the
1593+
dimension to which the index refers.
1594+
"""
1595+
1596+
reordered = self.transpose(..., dim)
1597+
1598+
new_dim_sizes = [lev.size for lev in index.levels]
1599+
new_dim_names = index.names
1600+
indexer = index.codes
1601+
1602+
# Potentially we could replace `len(other_dims)` with just `-1`
1603+
other_dims = [d for d in self.dims if d != dim]
1604+
new_shape = list(reordered.shape[: len(other_dims)]) + new_dim_sizes
1605+
new_dims = reordered.dims[: len(other_dims)] + new_dim_names
1606+
1607+
if fill_value is dtypes.NA:
1608+
is_missing_values = np.prod(new_shape) > np.prod(self.shape)
1609+
if is_missing_values:
1610+
dtype, fill_value = dtypes.maybe_promote(self.dtype)
1611+
else:
1612+
dtype = self.dtype
1613+
fill_value = dtypes.get_fill_value(dtype)
1614+
else:
1615+
dtype = self.dtype
1616+
1617+
# Currently fails on sparse due to https://github.com/pydata/sparse/issues/422
1618+
data = np.full_like(
1619+
self.data,
1620+
fill_value=fill_value,
1621+
shape=new_shape,
1622+
dtype=dtype,
1623+
)
1624+
1625+
# Indexer is a list of lists of locations. Each list is the locations
1626+
# on the new dimension. This is robust to the data being sparse; in that
1627+
# case the destinations will be NaN / zero.
1628+
data[(..., *indexer)] = reordered
1629+
1630+
return self._replace(dims=new_dims, data=data)
1631+
15761632
def unstack(self, dimensions=None, **dimensions_kwargs):
15771633
"""
15781634
Unstack an existing dimension into multiple new dimensions.
15791635
15801636
New dimensions will be added at the end, and the order of the data
15811637
along each new dimension will be in contiguous (C) order.
15821638
1639+
Note that unlike ``DataArray.unstack`` and ``Dataset.unstack``, this
1640+
method requires the existing dimension to contain the full product of
1641+
the new dimensions.
1642+
15831643
Parameters
15841644
----------
15851645
dimensions : mapping of hashable to mapping of hashable to int
@@ -1598,11 +1658,13 @@ def unstack(self, dimensions=None, **dimensions_kwargs):
15981658
See also
15991659
--------
16001660
Variable.stack
1661+
DataArray.unstack
1662+
Dataset.unstack
16011663
"""
16021664
dimensions = either_dict_or_kwargs(dimensions, dimensions_kwargs, "unstack")
16031665
result = self
16041666
for old_dim, dims in dimensions.items():
1605-
result = result._unstack_once(dims, old_dim)
1667+
result = result._unstack_once_full(dims, old_dim)
16061668
return result
16071669

16081670
def fillna(self, value):

0 commit comments

Comments
 (0)