Skip to content

Commit b27b601

Browse files
Tom Augspurger, snowman2
Tom Augspurger
authored and committed
Avoid accessing slow .data in unstack (pydata#5906)
1 parent 27dabd0 commit b27b601

File tree

2 files changed

+28
-27
lines changed

2 files changed

+28
-27
lines changed

doc/whats-new.rst

+1
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ Bug fixes
9090
By `Mike Taves <https://github.com/mwtoews>`_.
9191
- ``open_mfdataset()`` now accepts a single ``pathlib.Path`` object (:issue:`5881`).
9292
By `Panos Mavrogiorgos <https://github.com/pmav99>`_.
93+
- Improved performance of :py:meth:`Dataset.unstack` (:pull:`5906`). By `Tom Augspurger <https://github.com/TomAugspurger>`_.
9394

9495
Documentation
9596
~~~~~~~~~~~~~

xarray/core/dataset.py

+27-27
Original file line numberDiff line numberDiff line change
@@ -4153,34 +4153,34 @@ def unstack(
41534153
)
41544154

41554155
result = self.copy(deep=False)
4156-
for dim in dims:
41574156

4158-
if (
4159-
# Dask arrays don't support assignment by index, which the fast unstack
4160-
# function requires.
4161-
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
4162-
any(is_duck_dask_array(v.data) for v in self.variables.values())
4163-
# Sparse doesn't currently support (though we could special-case
4164-
# it)
4165-
# https://github.com/pydata/sparse/issues/422
4166-
or any(
4167-
isinstance(v.data, sparse_array_type)
4168-
for v in self.variables.values()
4169-
)
4170-
or sparse
4171-
# Until https://github.com/pydata/xarray/pull/4751 is resolved,
4172-
# we check explicitly whether it's a numpy array. Once that is
4173-
# resolved, explicitly exclude pint arrays.
4174-
# # pint doesn't implement `np.full_like` in a way that's
4175-
# # currently compatible.
4176-
# # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
4177-
# # or any(
4178-
# # isinstance(v.data, pint_array_type) for v in self.variables.values()
4179-
# # )
4180-
or any(
4181-
not isinstance(v.data, np.ndarray) for v in self.variables.values()
4182-
)
4183-
):
4157+
# we want to avoid allocating an object-dtype ndarray for a MultiIndex,
4158+
# so we can't just access self.variables[v].data for every variable.
4159+
# We only check the non-index variables.
4160+
# https://github.com/pydata/xarray/issues/5902
4161+
nonindexes = [
4162+
self.variables[k] for k in set(self.variables) - set(self.xindexes)
4163+
]
4164+
# Notes for each of these cases:
4165+
# 1. Dask arrays don't support assignment by index, which the fast unstack
4166+
# function requires.
4167+
# https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
4168+
# 2. Sparse arrays aren't currently supported by the fast unstack function (though we could special-case them)
4169+
# https://github.com/pydata/sparse/issues/422
4170+
# 3. pint requires checking if it's a NumPy array until
4171+
# https://github.com/pydata/xarray/pull/4751 is resolved.
4172+
# Once that is resolved, explicitly exclude pint arrays.
4173+
# pint doesn't implement `np.full_like` in a way that's
4174+
# currently compatible.
4175+
needs_full_reindex = sparse or any(
4176+
is_duck_dask_array(v.data)
4177+
or isinstance(v.data, sparse_array_type)
4178+
or not isinstance(v.data, np.ndarray)
4179+
for v in nonindexes
4180+
)
4181+
4182+
for dim in dims:
4183+
if needs_full_reindex:
41844184
result = result._unstack_full_reindex(dim, fill_value, sparse)
41854185
else:
41864186
result = result._unstack_once(dim, fill_value)

0 commit comments

Comments
 (0)