Skip to content

Commit f1211e7

Browse files
authored
PERF: Faster transposition of frames with masked arrays (#52836)
1 parent db27c36 commit f1211e7

File tree

4 files changed

+67
-6
lines changed

4 files changed

+67
-6
lines changed

Diff for: asv_bench/benchmarks/reshape.py

+19
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,25 @@ def time_transpose(self, dtype):
9393
self.df.T
9494

9595

96+
class ReshapeMaskedArrayDtype(ReshapeExtensionDtype):
    """Benchmark fixture that uses masked dtypes for the reshape timings.

    Overrides the parametrization and ``setup`` of ``ReshapeExtensionDtype``.
    """

    params = ["Int64", "Float64"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Build a MultiIndexed Series of ``dtype`` and its unstacked frame."""
        outer = pd.Index(list("ABCDEFGHIJ"))
        inner = pd.Index(range(1000))
        index = MultiIndex.from_product([outer, inner], names=["foo", "bar"])

        # Random normal draws truncated to integers, one per index entry
        # (10 outer labels x 1000 inner labels).
        data = np.random.randn(10_000).astype(int)

        self.ser = pd.Series(data, dtype=dtype, index=index)
        # roundtrips -> self.df.stack().equals(self.ser)
        self.df = self.ser.unstack("bar")
113+
114+
96115
class Unstack:
97116
params = ["int", "category"]
98117

Diff for: doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,7 @@ Performance improvements
396396
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
397397
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
398398
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
399+
- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64Dtype` (:issue:`52836`)
399400
- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`)
400401
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
401402
- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`)

Diff for: pandas/core/arrays/masked.py

+24
Original file line numberDiff line numberDiff line change
@@ -1462,3 +1462,27 @@ def _groupby_op(
14621462
# res_values should already have the correct dtype, we just need to
14631463
# wrap in a MaskedArray
14641464
return self._maybe_mask_result(res_values, result_mask)
1465+
1466+
1467+
def transpose_homogenous_masked_arrays(
    masked_arrays: Sequence[BaseMaskedArray],
) -> list[BaseMaskedArray]:
    """
    Transpose masked arrays in a list, but faster.

    Parameters
    ----------
    masked_arrays : Sequence[BaseMaskedArray]
        1-dim masked arrays of equal length that all have the same dtype.
        The caller is responsible for ensuring validity of input data.

    Returns
    -------
    list[BaseMaskedArray]
        One array per position in the inputs: the i-th result holds the
        i-th element (data and mask) of every input array, in input order.
        An empty input yields an empty list.
    """
    if len(masked_arrays) == 0:
        # Nothing to stack: np.concatenate rejects an empty list and
        # masked_arrays[0] below would raise IndexError.
        return []

    arr_type = masked_arrays[0].dtype.construct_array_type()

    # Lay every input out as one row of a 2D block, once for the data and
    # once for the masks, so the transpose is done on plain ndarrays.
    stacked_values = np.concatenate(
        [arr._data.reshape(1, -1) for arr in masked_arrays], axis=0
    )
    stacked_masks = np.concatenate(
        [arr._mask.reshape(1, -1) for arr in masked_arrays], axis=0
    )

    # Iterating a transposed block yields its columns, i.e. ``block[:, i]``.
    return [
        arr_type(col_values, mask=col_mask)
        for col_values, col_mask in zip(stacked_values.T, stacked_masks.T)
    ]

Diff for: pandas/core/frame.py

+23-6
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@
108108
)
109109
from pandas.core.dtypes.dtypes import (
110110
ArrowDtype,
111+
BaseMaskedDtype,
111112
ExtensionDtype,
112113
)
113114
from pandas.core.dtypes.missing import (
@@ -127,6 +128,7 @@
127128
from pandas.core.array_algos.take import take_2d_multi
128129
from pandas.core.arraylike import OpsMixin
129130
from pandas.core.arrays import (
131+
BaseMaskedArray,
130132
DatetimeArray,
131133
ExtensionArray,
132134
PeriodArray,
@@ -3619,14 +3621,29 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:
36193621
and dtypes
36203622
and isinstance(dtypes[0], ExtensionDtype)
36213623
):
3622-
# We have EAs with the same dtype. We can preserve that dtype in transpose.
3623-
dtype = dtypes[0]
3624-
arr_type = dtype.construct_array_type()
3625-
values = self.values
3624+
if isinstance(dtypes[0], BaseMaskedDtype):
3625+
# We have masked arrays with the same dtype. We can transpose faster.
3626+
from pandas.core.arrays.masked import transpose_homogenous_masked_arrays
3627+
3628+
if isinstance(self._mgr, ArrayManager):
3629+
masked_arrays = self._mgr.arrays
3630+
else:
3631+
masked_arrays = list(self._iter_column_arrays())
3632+
new_values = transpose_homogenous_masked_arrays(
3633+
cast(list[BaseMaskedArray], masked_arrays)
3634+
)
3635+
else:
3636+
# We have other EAs with the same dtype. We preserve dtype in transpose.
3637+
dtyp = dtypes[0]
3638+
arr_typ = dtyp.construct_array_type()
3639+
values = self.values
3640+
new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values]
36263641

3627-
new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
36283642
result = type(self)._from_arrays(
3629-
new_values, index=self.columns, columns=self.index
3643+
new_values,
3644+
index=self.columns,
3645+
columns=self.index,
3646+
verify_integrity=False,
36303647
)
36313648

36323649
else:

0 commit comments

Comments
 (0)