
Commit cdfcf37

Faster unstacking to sparse (#5577)
1 parent f086728 commit cdfcf37

File tree

  asv_bench/asv.conf.json             +1
  asv_bench/benchmarks/__init__.py    +7
  asv_bench/benchmarks/unstacking.py  +36 -1
  doc/whats-new.rst                   +6
  xarray/core/dataset.py              +6 -4
  xarray/core/variable.py             +32 -12
  xarray/tests/test_dataset.py        +29 -1

7 files changed: +117 -18


asv_bench/asv.conf.json  (+1)

@@ -65,6 +65,7 @@
         "bottleneck": [""],
         "dask": [""],
         "distributed": [""],
+        "sparse": [""]
     },

asv_bench/benchmarks/__init__.py  (+7)

@@ -22,6 +22,13 @@ def requires_dask():
         raise NotImplementedError()


+def requires_sparse():
+    try:
+        import sparse  # noqa: F401
+    except ImportError:
+        raise NotImplementedError()
+
+
 def randn(shape, frac_nan=None, chunks=None, seed=0):
     rng = np.random.RandomState(seed)
     if chunks is None:

asv_bench/benchmarks/unstacking.py  (+36 -1)

@@ -1,8 +1,9 @@
 import numpy as np
+import pandas as pd

 import xarray as xr

-from . import requires_dask
+from . import requires_dask, requires_sparse


 class Unstacking:

@@ -27,3 +28,37 @@ def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
         self.da_full = self.da_full.chunk({"flat_dim": 25})
+
+
+class UnstackingSparse(Unstacking):
+    def setup(self, *args, **kwargs):
+        requires_sparse()
+
+        import sparse
+
+        data = sparse.random((500, 1000), random_state=0, fill_value=0)
+        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
+        self.da_missing = self.da_full[:-1]
+
+        mindex = pd.MultiIndex.from_arrays([np.arange(100), np.arange(100)])
+        self.da_eye_2d = xr.DataArray(np.ones((100,)), dims="z", coords={"z": mindex})
+        self.da_eye_3d = xr.DataArray(
+            np.ones((100, 50)),
+            dims=("z", "foo"),
+            coords={"z": mindex, "foo": np.arange(50)},
+        )
+
+    def time_unstack_to_sparse_2d(self):
+        self.da_eye_2d.unstack(sparse=True)
+
+    def time_unstack_to_sparse_3d(self):
+        self.da_eye_3d.unstack(sparse=True)
+
+    def peakmem_unstack_to_sparse_2d(self):
+        self.da_eye_2d.unstack(sparse=True)
+
+    def peakmem_unstack_to_sparse_3d(self):
+        self.da_eye_3d.unstack(sparse=True)
+
+    def time_unstack_pandas_slow(self):
+        pass

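For a quick interactive check outside of asv, the 2-D benchmark case above can be reproduced directly. This is a minimal sketch, not part of the commit, assuming numpy, pandas, xarray, and the optional sparse package are installed:

import timeit

import numpy as np
import pandas as pd
import xarray as xr

# Same setup as UnstackingSparse.setup for the 2-D case: a length-100
# MultiIndex whose unstacked result has one non-fill value per row/column.
mindex = pd.MultiIndex.from_arrays([np.arange(100), np.arange(100)])
da_eye_2d = xr.DataArray(np.ones((100,)), dims="z", coords={"z": mindex})

# Time the operation the time_unstack_to_sparse_2d benchmark measures.
print(timeit.timeit(lambda: da_eye_2d.unstack(sparse=True), number=10))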
doc/whats-new.rst  (+6)

@@ -33,6 +33,12 @@ Deprecations
 ~~~~~~~~~~~~


+Performance
+~~~~~~~~~~~
+
+- Significantly faster unstacking to a ``sparse`` array. :pull:`5577`
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+
 Bug fixes
 ~~~~~~~~~
 - :py:func:`xr.map_blocks` and :py:func:`xr.corr` now work when dask is not installed (:issue:`3391`, :issue:`5715`, :pull:`5731`).

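As context for the entry above, a minimal usage sketch of the feature being sped up, unstacking a MultiIndexed dimension directly to a sparse-backed result (assumes the optional sparse package is installed; not taken from the commit itself):

import numpy as np
import pandas as pd
import xarray as xr

mindex = pd.MultiIndex.from_arrays([np.arange(3), np.arange(3)], names=["a", "b"])
da = xr.DataArray(np.ones(3), dims="z", coords={"z": mindex})

# sparse=True builds a sparse COO result instead of materializing the
# mostly-fill_value dense array.
unstacked = da.unstack("z", sparse=True, fill_value=0)
print(type(unstacked.data))    # a sparse.COO array
print(unstacked.data.density)  # well below 1.0 for this diagonal pattern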
xarray/core/dataset.py  (+6 -4)

@@ -4045,7 +4045,9 @@ def ensure_stackable(val):

         return data_array

-    def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
+    def _unstack_once(
+        self, dim: Hashable, fill_value, sparse: bool = False
+    ) -> "Dataset":
         index = self.get_index(dim)
         index = remove_unused_levels_categories(index)

@@ -4061,7 +4063,7 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
                         fill_value_ = fill_value

                     variables[name] = var._unstack_once(
-                        index=index, dim=dim, fill_value=fill_value_
+                        index=index, dim=dim, fill_value=fill_value_, sparse=sparse
                     )
                 else:
                     variables[name] = var

@@ -4195,7 +4197,7 @@ def unstack(
             # Once that is resolved, explicitly exclude pint arrays.
             # pint doesn't implement `np.full_like` in a way that's
             # currently compatible.
-            needs_full_reindex = sparse or any(
+            needs_full_reindex = any(
                 is_duck_dask_array(v.data)
                 or isinstance(v.data, sparse_array_type)
                 or not isinstance(v.data, np.ndarray)

@@ -4206,7 +4208,7 @@ def unstack(
             if needs_full_reindex:
                 result = result._unstack_full_reindex(dim, fill_value, sparse)
             else:
-                result = result._unstack_once(dim, fill_value)
+                result = result._unstack_once(dim, fill_value, sparse)
         return result

     def update(self, other: "CoercibleMapping") -> "Dataset":

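The change above means that requesting sparse output no longer forces the slower _unstack_full_reindex path by itself; roughly, only non-numpy inputs (dask, sparse, or other duck arrays) still do. A simplified, hypothetical sketch of that decision, not xarray's exact internals:

import numpy as np

def choose_unstack_path(var_data, sparse_output: bool) -> str:
    # Simplified stand-in for the dispatch in Dataset.unstack: the fast path
    # handles plain numpy inputs and can now build a sparse result itself,
    # so `sparse_output` no longer factors into the choice.
    needs_full_reindex = not isinstance(var_data, np.ndarray)
    return "_unstack_full_reindex" if needs_full_reindex else "_unstack_once"

print(choose_unstack_path(np.ones(4), sparse_output=True))  # _unstack_once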
xarray/core/variable.py  (+32 -12)

@@ -1631,6 +1631,7 @@ def _unstack_once(
         index: pd.MultiIndex,
         dim: Hashable,
         fill_value=dtypes.NA,
+        sparse: bool = False,
     ) -> "Variable":
         """
         Unstacks this variable given an index to unstack and the name of the

@@ -1658,19 +1659,38 @@ def _unstack_once(
         else:
             dtype = self.dtype

-        data = np.full_like(
-            self.data,
-            fill_value=fill_value,
-            shape=new_shape,
-            dtype=dtype,
-        )
+        if sparse:
+            # unstacking a dense multitindexed array to a sparse array
+            from sparse import COO
+
+            codes = zip(*index.codes)
+            if reordered.ndim == 1:
+                indexes = codes
+            else:
+                sizes = itertools.product(*[range(s) for s in reordered.shape[:-1]])
+                tuple_indexes = itertools.product(sizes, codes)
+                indexes = map(lambda x: list(itertools.chain(*x)), tuple_indexes)  # type: ignore
+
+            data = COO(
+                coords=np.array(list(indexes)).T,
+                data=self.data.astype(dtype).ravel(),
+                fill_value=fill_value,
+                shape=new_shape,
+                sorted=index.is_monotonic_increasing,
+            )
+
+        else:
+            data = np.full_like(
+                self.data,
+                fill_value=fill_value,
+                shape=new_shape,
+                dtype=dtype,
+            )

-        # Indexer is a list of lists of locations. Each list is the locations
-        # on the new dimension. This is robust to the data being sparse; in that
-        # case the destinations will be NaN / zero.
-        # sparse doesn't support item assigment,
-        # https://github.com/pydata/sparse/issues/114
-        data[(..., *indexer)] = reordered
+            # Indexer is a list of lists of locations. Each list is the locations
+            # on the new dimension. This is robust to the data being sparse; in that
+            # case the destinations will be NaN / zero.
+            data[(..., *indexer)] = reordered

         return self._replace(dims=new_dims, data=data)

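The core of the speed-up is visible above: for a sparse result, the MultiIndex codes already are the coordinates of the non-fill values, so the COO array can be built directly instead of allocating an output and assigning into it (which sparse does not support). A standalone sketch of that idea for the 1-D case, with hypothetical example data rather than xarray's internal variables:

import numpy as np
import pandas as pd
import sparse

# Hypothetical stacked data: three values indexed by a pandas MultiIndex.
index = pd.MultiIndex.from_arrays([[0, 1, 2], [0, 1, 2]], names=["a", "b"])
values = np.array([10.0, 20.0, 30.0])

# Each code array gives, per element, its position along one unstacked
# dimension, so the stacked codes are exactly the COO coordinates.
coords = np.asarray(index.codes)                    # shape (ndim, nnz)
new_shape = tuple(len(lev) for lev in index.levels) # (3, 3)

unstacked = sparse.COO(coords=coords, data=values, shape=new_shape, fill_value=np.nan)
print(unstacked.todense())  # values on the diagonal, NaN elsewhere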
xarray/tests/test_dataset.py  (+29 -1)

@@ -28,7 +28,7 @@
 from xarray.core import dtypes, indexing, utils
 from xarray.core.common import duck_array_ops, full_like
 from xarray.core.indexes import Index
-from xarray.core.pycompat import integer_types
+from xarray.core.pycompat import integer_types, sparse_array_type
 from xarray.core.utils import is_scalar

 from . import (

@@ -3085,14 +3085,42 @@ def test_unstack_sparse(self):
         # test fill_value
         actual = ds.unstack("index", sparse=True)
         expected = ds.unstack("index")
+        assert isinstance(actual["var"].data, sparse_array_type)
         assert actual["var"].variable._to_dense().equals(expected["var"].variable)
         assert actual["var"].data.density < 1.0

         actual = ds["var"].unstack("index", sparse=True)
         expected = ds["var"].unstack("index")
+        assert isinstance(actual.data, sparse_array_type)
         assert actual.variable._to_dense().equals(expected.variable)
         assert actual.data.density < 1.0

+        mindex = pd.MultiIndex.from_arrays(
+            [np.arange(3), np.arange(3)], names=["a", "b"]
+        )
+        ds_eye = Dataset(
+            {"var": (("z", "foo", "bar"), np.ones((3, 4, 5)))},
+            coords={"z": mindex, "foo": np.arange(4), "bar": np.arange(5)},
+        )
+        actual = ds_eye.unstack(sparse=True, fill_value=0)
+        assert isinstance(actual["var"].data, sparse_array_type)
+        expected = xr.Dataset(
+            {
+                "var": (
+                    ("foo", "bar", "a", "b"),
+                    np.broadcast_to(np.eye(3, 3), (4, 5, 3, 3)),
+                )
+            },
+            coords={
+                "foo": np.arange(4),
+                "bar": np.arange(5),
+                "a": np.arange(3),
+                "b": np.arange(3),
+            },
+        )
+        actual["var"].data = actual["var"].data.todense()
+        assert_equal(expected, actual)
+
     def test_stack_unstack_fast(self):
         ds = Dataset(
             {
