ENH: better dtype inference when doing DataFrame reductions #52788
Changes from 52 commits
```diff
@@ -52,7 +52,7 @@ def _reductions(
     axis : int, optional, default None
     """
     if not skipna:
-        if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
+        if mask.any() or check_below_min_count(values.shape, None, min_count):
             return libmissing.NA
         else:
             return func(values, axis=axis, **kwargs)
```
```diff
@@ -119,11 +119,11 @@ def _minmax(
             # min/max with empty array raise in numpy, pandas returns NA
             return libmissing.NA
         else:
-            return func(values)
+            return func(values, axis=axis)
     else:
         subset = values[~mask]
         if subset.size:
-            return func(subset)
+            return func(values, where=~mask, axis=axis, initial=subset[0])
         else:
             # min/max with empty array raise in numpy, pandas returns NA
             return libmissing.NA
```

Review discussion on the new `func(values, where=~mask, axis=axis, initial=subset[0])` call:

- Is there a reason you needed to pass the …
- This is intended to work for 2d arrays returning a 1d array, but without …
- Ah, OK, I see your changes …
- I've changed back to use …
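For readers unfamiliar with the numpy keywords used in the new `_minmax` call, here is a small standalone sketch (toy data, not pandas code) of how `where` and `initial` interact in a reduction:

```python
import numpy as np

# ``where`` restricts the reduction to unmasked positions; ``initial`` gives numpy
# a starting value so slots with nothing left to reduce do not raise.  Note that
# ``initial`` also takes part in the comparison, so it must not be able to "win"
# against real values unless that is intended (hence np.inf for a minimum).
values = np.array([[1.0, 5.0], [7.0, 3.0], [9.0, 2.0]])
mask = np.array([[False, True], [False, False], [True, True]])  # True = missing

col_min = np.min(values, axis=0, where=~mask, initial=np.inf)
print(col_min)  # [1. 3.] -- masked entries are ignored
```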
```diff
@@ -1533,6 +1533,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
 
         return result.as_py()
 
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        """Takes the result of ``_reduce`` and wraps it in a ndarray/extensionArray."""
+        result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
+        result = pa.array([result.as_py()], type=result.type)
+        return type(self)(result)
+
     def __setitem__(self, key, value) -> None:
         """Set one or more values inplace.
```

Review discussion on the new `_reduce_and_wrap` method:

- why are you adding another method here? what's wrong with just fixing `_reduce`?
- … Also, we can't supply …
- so why don't you just update `_reduce`?
- There was a version similar to this that added a `keepdims` keyword to `_reduce` and we decided that this was better because it didn't require a deprecation path for third-party EAs.
- `_reduce` calls other methods, e.g. `sum`. It's in those methods that the failures happen when we give `keepdims=True`, and those methods are public. Do we want to change their signatures (and the `._reduce` signature) to include `keepdims`?
- If it's just adding the `keepdims` keyword to …
- Compatibility concerns aside, I think the …
- If a hypothetical EA wanted to do a reduction lazily, that would be much easier with a `keepdims` keyword than with a `_reduce_and_wrap` method. Just a thought, not worth contorting ourselves over a hypothetical EA.
- I'm not opposed to a … One way to address the compatibility concerns could be to introspect the signature of … In v3.0 we'll skip the signature introspection and make the …
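As a rough illustration of the `keepdims` alternative discussed above, a caller could introspect `_reduce` and only pass the keyword when the EA supports it. This is only a hedged sketch of the idea, not code from this PR; `call_reduce` is a hypothetical helper.

```python
import inspect

# Hypothetical compatibility shim: pass keepdims=True only to EAs whose _reduce
# already accepts it; otherwise take the scalar result and wrap it into a
# length-1 array ourselves so the dtype is preserved.
def call_reduce(values, name: str, skipna: bool = True, **kwargs):
    if "keepdims" in inspect.signature(values._reduce).parameters:
        return values._reduce(name, skipna=skipna, keepdims=True, **kwargs)
    result = values._reduce(name, skipna=skipna, **kwargs)
    return type(values)._from_sequence([result], dtype=values.dtype)
```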
```diff
@@ -2099,6 +2099,10 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     # ------------------------------------------------------------------
     # Reductions
 
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        result = self._reduce(name, skipna=skipna, **kwargs)
+        return type(self)([result], dtype=self.dtype)
+
     def min(self, *, skipna: bool = True, **kwargs):
         """
         The minimum value of the object.
```

Review discussion on `_reduce_and_wrap` for `Categorical`:

- do we get here with e.g. any/all?
- Categorical doesn't support any/all, IDK why actually; seems like it could, if the categories do. Do you have any specific issue or other array in mind?
- gentle ping...
- maybe a comment that if any/all are ever supported then we shouldn't do this wrapping?
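As an illustration of what this wrapping gives back (assumed behaviour, not a test from the PR):

```python
import pandas as pd

cat = pd.Categorical(["a", "b", "c"], ordered=True)

# The same pattern as _reduce_and_wrap above: take the scalar reduction result
# and put it back into a length-1 Categorical that keeps the original dtype.
wrapped = type(cat)([cat.min()], dtype=cat.dtype)
print(wrapped)        # ['a'] with the original (ordered) categories
print(wrapped.dtype)  # category
```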
```diff
@@ -34,6 +34,10 @@
     Shape,
     npt,
 )
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
```
```diff
@@ -1088,25 +1092,62 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
 
         # median, skew, kurt, sem
         op = getattr(nanops, f"nan{name}")
-        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
+        axis = kwargs.pop("axis", None)
+        result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
         if np.isnan(result):
-            return libmissing.NA
+            result = libmissing.NA
 
-        return result
+        return self._wrap_reduction_result(
+            name, result, skipna=skipna, axis=axis, **kwargs
+        )
+
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        df = self.reshape(-1, 1)
+        res = df._reduce(name=name, skipna=skipna, axis=0, **kwargs)
+        return res
 
     def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
+        axis = kwargs["axis"]
         if isinstance(result, np.ndarray):
-            axis = kwargs["axis"]
             if skipna:
                 # we only retain mask for all-NA rows/columns
                 mask = self._mask.all(axis=axis)
             else:
                 mask = self._mask.any(axis=axis)
 
             return self._maybe_mask_result(result, mask)
+        elif result is libmissing.NA and self.ndim == 2:
+            result = self._wrap_na_result(name=name, axis=axis)
+            return result
         return result
```
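A small standalone illustration (toy arrays, not pandas internals) of the mask logic in `_wrap_reduction_result` above:

```python
import numpy as np

data_mask = np.array([[False, True, True],
                      [False, False, True]])  # True = NA; columns reduced with axis=0

# skipna=True: a column's result is only NA when the whole column is NA
mask_skipna = data_mask.all(axis=0)     # [False False  True]
# skipna=False: any NA in the column makes the result NA
mask_no_skipna = data_mask.any(axis=0)  # [False  True  True]
print(mask_skipna, mask_no_skipna)
```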
|
||
def _wrap_na_result(self, *, name, axis): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this feels like it got a lot more complicated than the first attempt at "keepdims". does this just address more corner cases the first attempt missed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this is not great, and I've been tinkered quite a lot with other implementations. I think/hope I will have a simpler solution tonight. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I looked into this and unless I do a relatively major refactoring, it looks like I just move complexity around by changing this. So without a bigger rework of Suggestions/ideas that show otherwise welcome, of course. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to add, the underlying issue is that the functions in The solution would be to return proper 2D results from
and the wrap the result in an array in However, I 'm not sure if that's the direction we want to pursue because unless we want to support 2d masked arrays, the current solution could still be simpler than building out 2d reduction support. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Longer term, that indeed sounds as a good alternative, even for the current 1D-reshaped-as-2D case. One potential disadvantage is that you still need to do some actual calculation on the data, unnecessarily, to get the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We only "know" because of the hard-coding here, for example IMO that's a possible future PR, if we choose to go in that direction. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In the end, also numpy kind of "hard codes" this, but just in the implementation itself. And we get the difficulty of combining the algo of numpy with our own logic, where we also want to know this information. So I don't think it's necessarily "wrong" to hard code this on our side as well (i.e. essentially keep it as you are doing in this PR, also long term (at least as long as the arrays are only 1D)). Sidenote, from seeing the additional complexity for 32bit systems in deciding the result dtype, I do wonder if we actually want to get rid of that system-dependent behaviour? (we also don't follow numpy's behaviour in the constructors, but always default to int64) |
```diff
+    def _wrap_na_result(self, *, name, axis):
+        mask_size = self.shape[1] if axis == 0 else self.shape[0]
+        mask = np.ones(mask_size, dtype=bool)
+
+        float_dtyp = "float32" if self.dtype == "Float32" else "float64"
+        if name in ["mean", "median", "var", "std", "skew"]:
+            np_dtype = float_dtyp
+        elif name in ["min", "max"] or self.dtype.itemsize == 8:
+            np_dtype = self.dtype.numpy_dtype.name
+        else:
+            is_windows_or_32bit = is_platform_windows() or not IS64
+            int_dtyp = "int32" if is_windows_or_32bit else "int64"
+            uint_dtyp = "uint32" if is_windows_or_32bit else "uint64"
+            np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[
+                self.dtype.kind
+            ]
+
+        value = np.array([1], dtype=np_dtype)
+        return self._maybe_mask_result(value, mask=mask)
+
+    def _wrap_min_count_reduction_result(
+        self, name: str, result, skipna, min_count, **kwargs
+    ):
+        if min_count == 0 and isinstance(result, np.ndarray):
+            return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool))
+        return self._wrap_reduction_result(name, result, skipna, **kwargs)
+
     def sum(
         self,
         *,
```
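For context on the review discussion above, here is a hedged sketch of the alternative that was floated: infer the result dtype by running the numpy reduction on an empty array instead of hard-coding it per operation. This is illustrative only; it is not how the PR is implemented, and `infer_result_dtype` is a hypothetical helper.

```python
import numpy as np

def infer_result_dtype(name: str, dtype: np.dtype) -> np.dtype:
    # min/max always preserve the input dtype but raise on empty input,
    # so short-circuit them; for the rest, let numpy decide.
    if name in ("min", "max"):
        return dtype
    empty = np.empty(0, dtype=dtype)
    return getattr(np, name)(empty).dtype

print(infer_result_dtype("sum", np.dtype("int32")))   # int64 on most 64-bit platforms
print(infer_result_dtype("mean", np.dtype("int32")))  # float64 (numpy warns about the empty slice)
```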
```diff
@@ -1124,8 +1165,8 @@ def sum(
             min_count=min_count,
             axis=axis,
         )
-        return self._wrap_reduction_result(
-            "sum", result, skipna=skipna, axis=axis, **kwargs
+        return self._wrap_min_count_reduction_result(
+            "sum", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs
         )
 
     def prod(
```
```diff
@@ -1137,15 +1178,16 @@ def prod(
         **kwargs,
     ):
         nv.validate_prod((), kwargs)
 
         result = masked_reductions.prod(
             self._data,
             self._mask,
             skipna=skipna,
             min_count=min_count,
+            axis=axis,
         )
-        return self._wrap_reduction_result(
-            "prod", result, skipna=skipna, axis=axis, **kwargs
+        return self._wrap_min_count_reduction_result(
+            "prod", result, skipna=skipna, min_count=min_count, axis=axis, **kwargs
         )
 
     def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
```
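The behaviour the `min_count` wrapping is preserving, illustrated on a 1D nullable array (outputs assumed from current pandas semantics):

```python
import pandas as pd

arr = pd.array([pd.NA, pd.NA], dtype="Int64")

# With the default min_count=0 an all-NA sum is 0, not NA, so the wrapped
# result must not be masked in that case; min_count=1 flips it back to NA.
print(arr.sum())             # 0
print(arr.sum(min_count=1))  # <NA>
```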
```diff
@@ -1192,23 +1234,29 @@ def std(
 
     def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_min((), kwargs)
-        return masked_reductions.min(
+        result = masked_reductions.min(
             self._data,
             self._mask,
             skipna=skipna,
             axis=axis,
         )
+        return self._wrap_reduction_result(
+            "min", result, skipna=skipna, axis=axis, **kwargs
+        )
 
     def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_max((), kwargs)
-        return masked_reductions.max(
+        result = masked_reductions.max(
             self._data,
             self._mask,
             skipna=skipna,
             axis=axis,
         )
+        return self._wrap_reduction_result(
+            "max", result, skipna=skipna, axis=axis, **kwargs
+        )
 
-    def any(self, *, skipna: bool = True, **kwargs):
+    def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         """
         Return whether any element is truthy.
```

Review discussion on the `axis` keyword added to `any`:

- Is it needed to add the …
- I'll look into it, could be connected to your previous comment.
- I think this works, and I've made another version; will see if it passes, and then I'll look into your other comments.
```diff
@@ -1227,6 +1275,7 @@ def any(self, *, skipna: bool = True, **kwargs):
             If `skipna` is False, the result will still be True if there is
             at least one element that is truthy, otherwise NA will be returned
             if there are NA's present.
+        axis : int, optional, default 0
         **kwargs : any, default None
             Additional keywords have no effect but might be accepted for
             compatibility with NumPy.
@@ -1270,7 +1319,6 @@ def any(self, *, skipna: bool = True, **kwargs):
         >>> pd.array([0, 0, pd.NA]).any(skipna=False)
         <NA>
         """
-        kwargs.pop("axis", None)
         nv.validate_any((), kwargs)
 
         values = self._data.copy()
@@ -1289,7 +1337,7 @@ def any(self, *, skipna: bool = True, **kwargs):
         else:
             return self.dtype.na_value
 
-    def all(self, *, skipna: bool = True, **kwargs):
+    def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         """
         Return whether all elements are truthy.
@@ -1308,6 +1356,7 @@ def all(self, *, skipna: bool = True, **kwargs):
             If `skipna` is False, the result will still be False if there is
             at least one element that is falsey, otherwise NA will be returned
             if there are NA's present.
+        axis : int, optional, default 0
         **kwargs : any, default None
             Additional keywords have no effect but might be accepted for
             compatibility with NumPy.
@@ -1351,7 +1400,6 @@ def all(self, *, skipna: bool = True, **kwargs):
         >>> pd.array([1, 0, pd.NA]).all(skipna=False)
         False
         """
-        kwargs.pop("axis", None)
         nv.validate_all((), kwargs)
 
         values = self._data.copy()
@@ -1361,7 +1409,7 @@ def all(self, *, skipna: bool = True, **kwargs):
         # bool, int, float, complex, str, bytes,
         # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
         np.putmask(values, self._mask, self._truthy_value)  # type: ignore[arg-type]
-        result = values.all()
+        result = values.all(axis=axis)
 
         if skipna:
             return result
```
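A toy illustration (not pandas internals) of the skipna trick `any`/`all` rely on, now applied along an axis:

```python
import numpy as np

values = np.array([[True, False],
                   [True, True]])
mask = np.array([[False, True],
                 [False, False]])   # True = NA

filled = values.copy()
# For all(), NA positions are overwritten with the "truthy" value so they can
# never flip a column's result; any() does the same with the "falsey" value.
np.putmask(filled, mask, True)
print(filled.all(axis=0))  # [ True  True ] -- per-column result, NAs ignored
```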
Review discussion on the whatsnew note for this change:

- Just wondering, is it worth adding this note? It's a significant bug fix / enhancement (that changes behaviour, so we give it visibility in the whatsnew notes), but we still don't do this for many other fixes / enhancements.
- I'll wait to see what the consensus is here.
- I agree with Joris.
- Ok, I've removed the note.
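For reference, the user-facing effect the note (and this PR) is about, sketched with assumed outputs on a build that includes the change:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, pd.NA]}, dtype="Int64")

result = df.sum()
print(result.dtype)  # Int64 -- the nullable dtype is preserved by the DataFrame reduction
```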