From 73c2a917172e83f475398dbece302e9d92b91390 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Oct 2021 08:36:18 -0500 Subject: [PATCH 1/4] Avoid accessing slow .data in unstack Closes https://github.com/pydata/xarray/issues/5902 --- xarray/core/dataset.py | 58 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 550c3587aa6..0335d4cf163 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4153,34 +4153,38 @@ def unstack( ) result = self.copy(deep=False) - for dim in dims: - if ( - # Dask arrays don't support assignment by index, which the fast unstack - # function requires. - # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 - any(is_duck_dask_array(v.data) for v in self.variables.values()) - # Sparse doesn't currently support (though we could special-case - # it) - # https://github.com/pydata/sparse/issues/422 - or any( - isinstance(v.data, sparse_array_type) - for v in self.variables.values() - ) - or sparse - # Until https://github.com/pydata/xarray/pull/4751 is resolved, - # we check explicitly whether it's a numpy array. Once that is - # resolved, explicitly exclude pint arrays. - # # pint doesn't implement `np.full_like` in a way that's - # # currently compatible. - # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 - # # or any( - # # isinstance(v.data, pint_array_type) for v in self.variables.values() - # # ) - or any( - not isinstance(v.data, np.ndarray) for v in self.variables.values() - ) - ): + # we want to avoid allocating an object-dtype ndarray for a MultiIndex, + # so we can't just access self.variables[v].data for every variable. + # We only check the non-index variables. + # https://github.com/pydata/xarray/issues/5902 + nonindexes = [ + self.variables[k] for k in set(self.variables) - set(self.indexes) + ] + needs_full_reindex = ( + # Dask arrays don't support assignment by index, which the fast unstack + # function requires. + # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 + any(is_duck_dask_array(v.data) for v in nonindexes) + # Sparse doesn't currently support (though we could special-case + # it) + # https://github.com/pydata/sparse/issues/422 + or any(isinstance(v.data, sparse_array_type) for v in nonindexes) + or sparse + # Until https://github.com/pydata/xarray/pull/4751 is resolved, + # we check explicitly whether it's a numpy array. Once that is + # resolved, explicitly exclude pint arrays. + # # pint doesn't implement `np.full_like` in a way that's + # # currently compatible. + # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 + # # or any( + # # isinstance(v.data, pint_array_type) for v in self.variables.values() + # # ) + or any(not isinstance(v.data, np.ndarray) for v in nonindexes) + ) + + for dim in dims: + if needs_full_reindex: result = result._unstack_full_reindex(dim, fill_value, sparse) else: result = result._unstack_once(dim, fill_value) From d0a76575f6a8940abbc4aaf273bc27222493d5db Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Oct 2021 08:39:59 -0500 Subject: [PATCH 2/4] Added PR number --- doc/whats-new.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index df2afdd9c4d..d46bf3a5d24 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -82,6 +82,7 @@ Bug fixes By `Maxime Liquet `_. - ``open_mfdataset()`` now accepts a single ``pathlib.Path`` object (:issue: `5881`). By `Panos Mavrogiorgos `_. +- Improved performance of :py:meth:`Dataset.unstack` (:pull:`5906`). By `Tom Augspurger `_. Documentation ~~~~~~~~~~~~~ From bba423f6061f1aff462656c3d86b4b0c6b000b24 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 28 Oct 2021 22:58:31 +0530 Subject: [PATCH 3/4] Update xarray/core/dataset.py --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0335d4cf163..ecf8946c76c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4159,7 +4159,7 @@ def unstack( # We only check the non-index variables. # https://github.com/pydata/xarray/issues/5902 nonindexes = [ - self.variables[k] for k in set(self.variables) - set(self.indexes) + self.variables[k] for k in set(self.variables) - set(self.xindexes) ] needs_full_reindex = ( # Dask arrays don't support assignment by index, which the fast unstack From 6363a761c62605bcf3552cacce1c48d5b543eeca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Oct 2021 15:21:26 -0500 Subject: [PATCH 4/4] Refactor checks to avoid .data more --- xarray/core/dataset.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ecf8946c76c..9880108f734 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4161,26 +4161,22 @@ def unstack( nonindexes = [ self.variables[k] for k in set(self.variables) - set(self.xindexes) ] - needs_full_reindex = ( - # Dask arrays don't support assignment by index, which the fast unstack - # function requires. - # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 - any(is_duck_dask_array(v.data) for v in nonindexes) - # Sparse doesn't currently support (though we could special-case - # it) - # https://github.com/pydata/sparse/issues/422 - or any(isinstance(v.data, sparse_array_type) for v in nonindexes) - or sparse - # Until https://github.com/pydata/xarray/pull/4751 is resolved, - # we check explicitly whether it's a numpy array. Once that is - # resolved, explicitly exclude pint arrays. - # # pint doesn't implement `np.full_like` in a way that's - # # currently compatible. - # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 - # # or any( - # # isinstance(v.data, pint_array_type) for v in self.variables.values() - # # ) - or any(not isinstance(v.data, np.ndarray) for v in nonindexes) + # Notes for each of these cases: + # 1. Dask arrays don't support assignment by index, which the fast unstack + # function requires. + # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 + # 2. Sparse doesn't currently support (though we could special-case it) + # https://github.com/pydata/sparse/issues/422 + # 3. pint requires checking if it's a NumPy array until + # https://github.com/pydata/xarray/pull/4751 is resolved, + # Once that is resolved, explicitly exclude pint arrays. + # pint doesn't implement `np.full_like` in a way that's + # currently compatible. + needs_full_reindex = sparse or any( + is_duck_dask_array(v.data) + or isinstance(v.data, sparse_array_type) + or not isinstance(v.data, np.ndarray) + for v in nonindexes ) for dim in dims: