From 73c2a917172e83f475398dbece302e9d92b91390 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <taugspurger@microsoft.com>
Date: Thu, 28 Oct 2021 08:36:18 -0500
Subject: [PATCH 1/4] Avoid accessing slow .data in unstack

Closes https://github.com/pydata/xarray/issues/5902
---
 xarray/core/dataset.py | 58 ++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 550c3587aa6..0335d4cf163 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4153,34 +4153,38 @@ def unstack(
                 )
 
         result = self.copy(deep=False)
-        for dim in dims:
 
-            if (
-                # Dask arrays don't support assignment by index, which the fast unstack
-                # function requires.
-                # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
-                any(is_duck_dask_array(v.data) for v in self.variables.values())
-                # Sparse doesn't currently support (though we could special-case
-                # it)
-                # https://github.com/pydata/sparse/issues/422
-                or any(
-                    isinstance(v.data, sparse_array_type)
-                    for v in self.variables.values()
-                )
-                or sparse
-                # Until https://github.com/pydata/xarray/pull/4751 is resolved,
-                # we check explicitly whether it's a numpy array. Once that is
-                # resolved, explicitly exclude pint arrays.
-                # # pint doesn't implement `np.full_like` in a way that's
-                # # currently compatible.
-                # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
-                # # or any(
-                # #     isinstance(v.data, pint_array_type) for v in self.variables.values()
-                # # )
-                or any(
-                    not isinstance(v.data, np.ndarray) for v in self.variables.values()
-                )
-            ):
+        # we want to avoid allocating an object-dtype ndarray for a MultiIndex,
+        # so we can't just access self.variables[v].data for every variable.
+        # We only check the non-index variables.
+        # https://github.com/pydata/xarray/issues/5902
+        nonindexes = [
+            self.variables[k] for k in set(self.variables) - set(self.indexes)
+        ]
+        needs_full_reindex = (
+            # Dask arrays don't support assignment by index, which the fast unstack
+            # function requires.
+            # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
+            any(is_duck_dask_array(v.data) for v in nonindexes)
+            # Sparse doesn't currently support (though we could special-case
+            # it)
+            # https://github.com/pydata/sparse/issues/422
+            or any(isinstance(v.data, sparse_array_type) for v in nonindexes)
+            or sparse
+            # Until https://github.com/pydata/xarray/pull/4751 is resolved,
+            # we check explicitly whether it's a numpy array. Once that is
+            # resolved, explicitly exclude pint arrays.
+            # # pint doesn't implement `np.full_like` in a way that's
+            # # currently compatible.
+            # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
+            # # or any(
+            # #     isinstance(v.data, pint_array_type) for v in self.variables.values()
+            # # )
+            or any(not isinstance(v.data, np.ndarray) for v in nonindexes)
+        )
+
+        for dim in dims:
+            if needs_full_reindex:
                 result = result._unstack_full_reindex(dim, fill_value, sparse)
             else:
                 result = result._unstack_once(dim, fill_value)

From d0a76575f6a8940abbc4aaf273bc27222493d5db Mon Sep 17 00:00:00 2001
From: Tom Augspurger <taugspurger@microsoft.com>
Date: Thu, 28 Oct 2021 08:39:59 -0500
Subject: [PATCH 2/4] Added PR number

---
 doc/whats-new.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index df2afdd9c4d..d46bf3a5d24 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -82,6 +82,7 @@ Bug fixes
   By `Maxime Liquet <https://github.com/maximlt>`_.
 - ``open_mfdataset()`` now accepts a single ``pathlib.Path`` object (:issue: `5881`).
   By `Panos Mavrogiorgos <https://github.com/pmav99>`_.
+- Improved performance of :py:meth:`Dataset.unstack` (:pull:`5906`). By `Tom Augspurger <https://github.com/TomAugspurger>`_.
 
 Documentation
 ~~~~~~~~~~~~~

From bba423f6061f1aff462656c3d86b4b0c6b000b24 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <dcherian@users.noreply.github.com>
Date: Thu, 28 Oct 2021 22:58:31 +0530
Subject: [PATCH 3/4] Update xarray/core/dataset.py

---
 xarray/core/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 0335d4cf163..ecf8946c76c 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4159,7 +4159,7 @@ def unstack(
         # We only check the non-index variables.
         # https://github.com/pydata/xarray/issues/5902
         nonindexes = [
-            self.variables[k] for k in set(self.variables) - set(self.indexes)
+            self.variables[k] for k in set(self.variables) - set(self.xindexes)
         ]
         needs_full_reindex = (
             # Dask arrays don't support assignment by index, which the fast unstack

From 6363a761c62605bcf3552cacce1c48d5b543eeca Mon Sep 17 00:00:00 2001
From: Tom Augspurger <taugspurger@microsoft.com>
Date: Thu, 28 Oct 2021 15:21:26 -0500
Subject: [PATCH 4/4] Refactor checks to avoid .data more

---
 xarray/core/dataset.py | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index ecf8946c76c..9880108f734 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4161,26 +4161,22 @@ def unstack(
         nonindexes = [
             self.variables[k] for k in set(self.variables) - set(self.xindexes)
         ]
-        needs_full_reindex = (
-            # Dask arrays don't support assignment by index, which the fast unstack
-            # function requires.
-            # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
-            any(is_duck_dask_array(v.data) for v in nonindexes)
-            # Sparse doesn't currently support (though we could special-case
-            # it)
-            # https://github.com/pydata/sparse/issues/422
-            or any(isinstance(v.data, sparse_array_type) for v in nonindexes)
-            or sparse
-            # Until https://github.com/pydata/xarray/pull/4751 is resolved,
-            # we check explicitly whether it's a numpy array. Once that is
-            # resolved, explicitly exclude pint arrays.
-            # # pint doesn't implement `np.full_like` in a way that's
-            # # currently compatible.
-            # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
-            # # or any(
-            # #     isinstance(v.data, pint_array_type) for v in self.variables.values()
-            # # )
-            or any(not isinstance(v.data, np.ndarray) for v in nonindexes)
+        # Notes for each of these cases:
+        # 1. Dask arrays don't support assignment by index, which the fast unstack
+        #    function requires.
+        #    https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
+        # 2. Sparse doesn't currently support (though we could special-case it)
+        #    https://github.com/pydata/sparse/issues/422
+        # 3. pint requires checking if it's a NumPy array until
+        #    https://github.com/pydata/xarray/pull/4751 is resolved,
+        #    Once that is resolved, explicitly exclude pint arrays.
+        #    pint doesn't implement `np.full_like` in a way that's
+        #    currently compatible.
+        needs_full_reindex = sparse or any(
+            is_duck_dask_array(v.data)
+            or isinstance(v.data, sparse_array_type)
+            or not isinstance(v.data, np.ndarray)
+            for v in nonindexes
         )
 
         for dim in dims: