pandas-dev · jbrockmendel · Aug 21, 2020 · Aug 21, 2020 · Aug 21, 2020 · Aug 21, 2020
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -51,7 +51,7 @@ cdef class _BaseGrouper:
             # See the comment in indexes/base.py about _index_data.
             # We need this for EA-backed indexes that have a reference
             # to a 1-d ndarray like datetime / timedelta / period.
-            object.__setattr__(cached_ityp, '_index_data', islider.buf)
+            object.__setattr__(cached_ityp, '_data', islider.buf)
             cached_ityp._engine.clear_mapping()
             cached_ityp._cache.clear()  # e.g. inferred_freq must go
             object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf)
@@ -296,7 +296,7 @@ cdef class Slider:
     Only handles contiguous data for now
     """
     cdef:
-        ndarray values, buf
+        ndarray values, buf, orig_buf
         Py_ssize_t stride, orig_len, orig_stride
         char *orig_data
 
@@ -308,28 +308,22 @@ cdef class Slider:
             values = values.copy()
 
         self.values = values
+        self.orig_buf = buf
         self.buf = buf
         self.stride = values.strides[0]
 
         self.orig_data = self.buf.data
         self.orig_len = self.buf.shape[0]
         self.orig_stride = self.buf.strides[0]
 
-        self.buf.data = self.values.data
-        self.buf.strides[0] = self.stride
-
     cdef move(self, int start, int end):
         """
         For slicing
         """
-        self.buf.data = self.values.data + self.stride * start
-        self.buf.shape[0] = end - start
+        self.buf = self.values[start:end]
 
     cdef reset(self):
-
-        self.buf.shape[0] = self.orig_len
-        self.buf.data = self.orig_data
-        self.buf.strides[0] = self.orig_stride
+        self.buf = self.orig_buf
 
 
 class InvalidApply(Exception):
@@ -359,7 +353,7 @@ def apply_frame_axis0(object frame, object f, object names,
             slider.move(starts[i], ends[i])
 
             item_cache.clear()  # ugh
-            chunk = slider.dummy
+            chunk = slider.frame[starts[i]:ends[i]]
             object.__setattr__(chunk, 'name', names[i])
 
             try:
@@ -406,7 +400,8 @@ cdef class BlockSlider:
         object frame, dummy, index
         int nblocks
         Slider idx_slider
-        list blocks
+        list blocks, blk_values
+        ndarray orig_blklocs, orig_blknos
 
     cdef:
         char **base_ptrs
@@ -420,20 +415,27 @@ cdef class BlockSlider:
         self.dummy = frame[:0]
         self.index = self.dummy.index
 
-        self.blocks = [b.values for b in self.dummy._mgr.blocks]
+        # GH#35417 attributes we need to restore at each step in case
+        #  the function modified them.
+        mgr = self.dummy._mgr
+        self.orig_blklocs = mgr.blklocs
+        self.orig_blknos = mgr.blknos
+        self.blocks = [x for x in self.dummy._mgr.blocks]
 
-        for x in self.blocks:
+        self.blk_values = [b.values for b in self.dummy._mgr.blocks]
+
+        for x in self.blk_values:
             util.set_array_not_contiguous(x)
 
-        self.nblocks = len(self.blocks)
+        self.nblocks = len(self.blk_values)
         # See the comment in indexes/base.py about _index_data.
         # We need this for EA-backed indexes that have a reference to a 1-d
         # ndarray like datetime / timedelta / period.
         self.idx_slider = Slider(
             self.frame.index._index_data, self.dummy.index._index_data)
 
-        self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blocks))
-        for i, block in enumerate(self.blocks):
+        self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blk_values))
+        for i, block in enumerate(self.blk_values):
             self.base_ptrs[i] = (<ndarray>block).data
 
     def __dealloc__(self):
@@ -444,9 +446,11 @@ cdef class BlockSlider:
             ndarray arr
             Py_ssize_t i
 
+        self._restore_blocks()
+
         # move blocks
         for i in range(self.nblocks):
-            arr = self.blocks[i]
+            arr = self.blk_values[i]
 
             # axis=1 is the frame's axis=0
             arr.data = self.base_ptrs[i] + arr.strides[1] * start
@@ -459,14 +463,25 @@ cdef class BlockSlider:
         self.index._engine.clear_mapping()
         self.index._cache.clear()  # e.g. inferred_freq must go
 
+    cdef _restore_blocks(self):
+        """
+        Ensure that we have the original blocks, blknos, and blklocs.
+        """
+        mgr = self.dummy._mgr
+        mgr.blocks = self.blocks
+        mgr._blklocs = self.orig_blklocs
+        mgr._blknos = self.orig_blknos
+
     cdef reset(self):
         cdef:
             ndarray arr
             Py_ssize_t i
 
+        self._restore_blocks()
+
         # reset blocks
         for i in range(self.nblocks):
-            arr = self.blocks[i]
+            arr = self.blk_values[i]
 
             # axis=1 is the frame's axis=0
             arr.data = self.base_ptrs[i]

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -404,6 +404,18 @@ def _data(self):
         #  e.g. fastparquet
         return self._mgr
 
+    @property
+    def _can_use_libreduction(self) -> bool:
+        # groupby ops can only use libreduction fast-path if we are all-numpy
+        if self.index._has_complex_internals:
+            return False
+
+        is_invalid = lambda x: is_extension_array_dtype(x) or x.kind in ["m", "M"]
+        if self.ndim == 1:
+            return not is_invalid(self.dtype)
+        else:
+            return not self.dtypes.apply(is_invalid).any()
+
     # ----------------------------------------------------------------------
     # Axis
     _stat_axis_number = 0

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -74,7 +74,14 @@
     get_groupby,
     group_selection_context,
 )
-from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same
+from pandas.core.indexes.api import (
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    PeriodIndex,
+    TimedeltaIndex,
+    all_indexes_same,
+)
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import BlockManager
 from pandas.core.series import Series
@@ -256,16 +263,25 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
             if self.grouper.nkeys > 1:
                 return self._python_agg_general(func, *args, **kwargs)
 
-            try:
-                return self._python_agg_general(func, *args, **kwargs)
-            except (ValueError, KeyError):
-                # TODO: KeyError is raised in _python_agg_general,
-                #  see see test_groupby.test_basic
-                result = self._aggregate_named(func, *args, **kwargs)
+            if isinstance(
+                self._selected_obj.index, (DatetimeIndex, TimedeltaIndex, PeriodIndex)
+            ):
+                # using _python_agg_general would end up incorrectly patching
+                #  _index_data in reduction.pyx
+                result = self._aggregate_maybe_named(func, *args, **kwargs)
+            else:
+                try:
+                    return self._python_agg_general(func, *args, **kwargs)
+                except (ValueError, KeyError):
+                    # TODO: KeyError is raised in _python_agg_general,
+                    #  see see test_groupby.test_basic
+                    result = self._aggregate_maybe_named(func, *args, **kwargs)
 
-            index = Index(sorted(result), name=self.grouper.names[0])
+            # name setting -> test_metadata_propagation_indiv
+            index = self.grouper.result_index
+            obj = self._selected_obj
             ret = create_series_with_explicit_dtype(
-                result, index=index, dtype_if_empty=object
+                result, index=index, dtype_if_empty=object, name=obj.name
             )
 
         if not self.as_index:  # pragma: no cover
@@ -469,14 +485,34 @@ def _get_index() -> Index:
             )
             return self._reindex_output(result)
 
-    def _aggregate_named(self, func, *args, **kwargs):
+    def _aggregate_maybe_named(self, func, *args, **kwargs):
+        """
+        Try the named-aggregator first, then unnamed, which better matches
+        what libreduction does.
+        """
+        try:
+            return self._aggregate_named(func, *args, named=True, **kwargs)
+        except KeyError:
+            return self._aggregate_named(func, *args, named=False, **kwargs)
+
+    def _aggregate_named(self, func, *args, named: bool = True, **kwargs):
         result = {}
 
-        for name, group in self:
-            group.name = name
+        for name, group in self:  # TODO: could we have duplicate names?
+            if named:
+                group.name = name
+
             output = func(group, *args, **kwargs)
             if isinstance(output, (Series, Index, np.ndarray)):
-                raise ValueError("Must produce aggregated value")
+                if (
+                    isinstance(output, Series)
+                    and len(output) == 1
+                    and name in output.index
+                ):
+                    # FIXME: kludge for test_resampler_grouper.test_apply
+                    output = output.iloc[0]
+                else:
+                    raise ValueError("Must produce aggregated value")
             result[name] = output
 
         return result

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -156,18 +156,13 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
         result_values = None
 
         sdata: FrameOrSeries = splitter._get_sorted_data()
-        if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)):
-            # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
-            #  if we pass EA instead of ndarray
-            #  TODO: can we have a workaround for EAs backed by ndarray?
-            pass
-
-        elif (
+        if (
             com.get_callable_name(f) not in base.plotting_methods
             and isinstance(splitter, FrameSplitter)
             and axis == 0
             # fast_apply/libreduction doesn't allow non-numpy backed indexes
-            and not sdata.index._has_complex_internals
+            #  or columns
+            and sdata._can_use_libreduction
         ):
             try:
                 result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
@@ -609,15 +604,10 @@ def agg_series(self, obj: Series, func: F):
             # SeriesGrouper would raise if we were to call _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
-        elif is_extension_array_dtype(obj.dtype):
+        elif not obj._can_use_libreduction:
             # _aggregate_series_fast would raise TypeError when
             #  calling libreduction.Slider
             # In the datetime64tz case it would incorrectly cast to tz-naive
-            # TODO: can we get a performant workaround for EAs backed by ndarray?
-            return self._aggregate_series_pure_python(obj, func)
-
-        elif obj.index._has_complex_internals:
-            # Preempt TypeError in _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
         try:

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4053,7 +4053,9 @@ def _has_complex_internals(self) -> bool:
         Indicates if an index is not directly backed by a numpy array
         """
         # used to avoid libreduction code paths, which raise or require conversion
-        return False
+        return isinstance(self, (ABCMultiIndex, ABCRangeIndex)) or not isinstance(
+            self._data, np.ndarray
+        )
 
     def _is_memory_usage_qualified(self) -> bool:
         """

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -343,11 +343,6 @@ def values(self):
         """ return the underlying data, which is a Categorical """
         return self._data
 
-    @property
-    def _has_complex_internals(self) -> bool:
-        # used to avoid libreduction code paths, which raise or require conversion
-        return True
-
     @doc(Index.__contains__)
     def __contains__(self, key: Any) -> bool:
         # if key is a NaN, check if any NaN is in self.

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -394,11 +394,6 @@ def values(self) -> IntervalArray:
         """
         return self._data
 
-    @property
-    def _has_complex_internals(self) -> bool:
-        # used to avoid libreduction code paths, which raise or require conversion
-        return True
-
     def __array_wrap__(self, result, context=None):
         # we don't want the superclass implementation
         return result

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1508,11 +1508,6 @@ def _get_level_number(self, level) -> int:
                 ) from err
         return level
 
-    @property
-    def _has_complex_internals(self) -> bool:
-        # used to avoid libreduction code paths, which raise or require conversion
-        return True
-
     @cache_readonly
     def is_monotonic_increasing(self) -> bool:
         """

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -247,11 +247,6 @@ def _simple_new(cls, values: PeriodArray, name: Label = None):
     def values(self):
         return np.asarray(self)
 
-    @property
-    def _has_complex_internals(self):
-        # used to avoid libreduction code paths, which raise or require conversion
-        return True
-
     def _shallow_copy(self, values=None, name: Label = no_default):
         name = name if name is not no_default else self.name
         cache = self._cache.copy() if values is None else {}

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1002,7 +1002,6 @@ def test_apply_function_with_indexing_return_column():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="GH-34998")
 def test_apply_with_timezones_aware():
     # GH: 27212