Skip to content

Commit f218fa3

Browse files
authored
PERF: DataFrame.transpose for pyarrow-backed (#54224)
* PERF: DataFrame.transpose for pyarrow-backed * whatsnew * mypy * move condition into _iter_column_arrays * shorten name
1 parent c9b8633 commit f218fa3

File tree

4 files changed

+40
-10
lines changed

4 files changed

+40
-10
lines changed

Diff for: doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ Performance improvements
403403
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
404404
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
405405
- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`)
406+
- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single pyarrow dtype (:issue:`54224`)
406407
- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`)
407408
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
408409
- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`)

Diff for: pandas/core/arrays/arrow/array.py

+16
Original file line numberDiff line numberDiff line change
@@ -2564,3 +2564,19 @@ def _dt_tz_convert(self, tz):
25642564
current_unit = self.dtype.pyarrow_dtype.unit
25652565
result = self._pa_array.cast(pa.timestamp(current_unit, tz))
25662566
return type(self)(result)
2567+
2568+
2569+
def transpose_homogeneous_pyarrow(
    arrays: Sequence[ArrowExtensionArray],
) -> list[ArrowExtensionArray]:
    """Transpose arrow extension arrays in a list, but faster.

    Input should be a list of arrays of equal length and all have the same
    dtype. The caller is responsible for ensuring validity of input data.

    Parameters
    ----------
    arrays : Sequence[ArrowExtensionArray]
        The column arrays to transpose. Must contain at least one array.
        The input is materialized into a list first, so a one-shot
        iterator is consumed exactly once.

    Returns
    -------
    list[ArrowExtensionArray]
        One array per input row, each of length ``len(arrays)``.
    """
    arrays = list(arrays)
    nrows, ncols = len(arrays[0]), len(arrays)
    # After concatenating the columns end to end, position ``k`` holds the
    # value at (column ``k // nrows``, row ``k % nrows``). Reading the
    # (ncols, nrows) grid of positions in transposed order therefore yields
    # the values row by row.
    indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten()
    chunks = [chunk for column in arrays for chunk in column._pa_array.chunks]
    # Pass the type explicitly: pyarrow cannot infer it from an empty chunk
    # list, which happens when the (zero-length) inputs carry no chunks.
    combined = pa.chunked_array(chunks, type=arrays[0]._pa_array.type)
    reordered = combined.take(indices)
    # Each run of ``ncols`` consecutive values is one transposed row.
    return [
        ArrowExtensionArray(reordered.slice(i * ncols, ncols)) for i in range(nrows)
    ]

Diff for: pandas/core/arrays/masked.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1464,14 +1464,15 @@ def _groupby_op(
14641464
return self._maybe_mask_result(res_values, result_mask)
14651465

14661466

1467-
def transpose_homogenous_masked_arrays(
1467+
def transpose_homogeneous_masked_arrays(
14681468
masked_arrays: Sequence[BaseMaskedArray],
14691469
) -> list[BaseMaskedArray]:
14701470
"""Transpose masked arrays in a list, but faster.
14711471
14721472
Input should be a list of 1-dim masked arrays of equal length and all have the
14731473
same dtype. The caller is responsible for ensuring validity of input data.
14741474
"""
1475+
masked_arrays = list(masked_arrays)
14751476
values = [arr._data.reshape(1, -1) for arr in masked_arrays]
14761477
transposed_values = np.concatenate(values, axis=0)
14771478

Diff for: pandas/core/frame.py

+21-9
Original file line numberDiff line numberDiff line change
@@ -3664,16 +3664,25 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:
36643664
and dtypes
36653665
and isinstance(dtypes[0], ExtensionDtype)
36663666
):
3667+
new_values: list
36673668
if isinstance(dtypes[0], BaseMaskedDtype):
36683669
# We have masked arrays with the same dtype. We can transpose faster.
3669-
from pandas.core.arrays.masked import transpose_homogenous_masked_arrays
3670+
from pandas.core.arrays.masked import (
3671+
transpose_homogeneous_masked_arrays,
3672+
)
36703673

3671-
if isinstance(self._mgr, ArrayManager):
3672-
masked_arrays = self._mgr.arrays
3673-
else:
3674-
masked_arrays = list(self._iter_column_arrays())
3675-
new_values = transpose_homogenous_masked_arrays(
3676-
cast(list[BaseMaskedArray], masked_arrays)
3674+
new_values = transpose_homogeneous_masked_arrays(
3675+
cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
3676+
)
3677+
elif isinstance(dtypes[0], ArrowDtype):
3678+
# We have arrow EAs with the same dtype. We can transpose faster.
3679+
from pandas.core.arrays.arrow.array import (
3680+
ArrowExtensionArray,
3681+
transpose_homogeneous_pyarrow,
3682+
)
3683+
3684+
new_values = transpose_homogeneous_pyarrow(
3685+
cast(Sequence[ArrowExtensionArray], self._iter_column_arrays())
36773686
)
36783687
else:
36793688
# We have other EAs with the same dtype. We preserve dtype in transpose.
@@ -3788,8 +3797,11 @@ def _iter_column_arrays(self) -> Iterator[ArrayLike]:
37883797
Warning! The returned array is a view but doesn't handle Copy-on-Write,
37893798
so this should be used with caution (for read-only purposes).
37903799
"""
3791-
for i in range(len(self.columns)):
3792-
yield self._get_column_array(i)
3800+
if isinstance(self._mgr, ArrayManager):
3801+
yield from self._mgr.arrays
3802+
else:
3803+
for i in range(len(self.columns)):
3804+
yield self._get_column_array(i)
37933805

37943806
def _getitem_nocopy(self, key: list):
37953807
"""

0 commit comments

Comments
 (0)