diff --git a/doc/whats-new.rst b/doc/whats-new.rst index df2afdd9c4d..d46bf3a5d24 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -82,6 +82,7 @@ Bug fixes By `Maxime Liquet `_. - ``open_mfdataset()`` now accepts a single ``pathlib.Path`` object (:issue: `5881`). By `Panos Mavrogiorgos `_. +- Improved performance of :py:meth:`Dataset.unstack` (:pull:`5906`). By `Tom Augspurger `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 550c3587aa6..9880108f734 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4153,34 +4153,34 @@ def unstack( ) result = self.copy(deep=False) - for dim in dims: - if ( - # Dask arrays don't support assignment by index, which the fast unstack - # function requires. - # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 - any(is_duck_dask_array(v.data) for v in self.variables.values()) - # Sparse doesn't currently support (though we could special-case - # it) - # https://github.com/pydata/sparse/issues/422 - or any( - isinstance(v.data, sparse_array_type) - for v in self.variables.values() - ) - or sparse - # Until https://github.com/pydata/xarray/pull/4751 is resolved, - # we check explicitly whether it's a numpy array. Once that is - # resolved, explicitly exclude pint arrays. - # # pint doesn't implement `np.full_like` in a way that's - # # currently compatible. - # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 - # # or any( - # # isinstance(v.data, pint_array_type) for v in self.variables.values() - # # ) - or any( - not isinstance(v.data, np.ndarray) for v in self.variables.values() - ) - ): + # we want to avoid allocating an object-dtype ndarray for a MultiIndex, + # so we can't just access self.variables[v].data for every variable. + # We only check the non-index variables. + # https://github.com/pydata/xarray/issues/5902 + nonindexes = [ + self.variables[k] for k in set(self.variables) - set(self.xindexes) + ] + # Notes for each of these cases: + # 1. Dask arrays don't support assignment by index, which the fast unstack + # function requires. + # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 + # 2. Sparse doesn't currently support (though we could special-case it) + # https://github.com/pydata/sparse/issues/422 + # 3. pint requires checking if it's a NumPy array until + # https://github.com/pydata/xarray/pull/4751 is resolved, + # Once that is resolved, explicitly exclude pint arrays. + # pint doesn't implement `np.full_like` in a way that's + # currently compatible. + needs_full_reindex = sparse or any( + is_duck_dask_array(v.data) + or isinstance(v.data, sparse_array_type) + or not isinstance(v.data, np.ndarray) + for v in nonindexes + ) + + for dim in dims: + if needs_full_reindex: result = result._unstack_full_reindex(dim, fill_value, sparse) else: result = result._unstack_once(dim, fill_value)