pandas-dev · jreback · Oct 15, 2021 · Aug 9, 2021 · Aug 9, 2021 · Aug 9, 2021
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -47,6 +47,7 @@ class providing the base-class of operations.
     F,
     FrameOrSeries,
     IndexLabel,
+    PositionalIndexer,
     RandomState,
     Scalar,
     T,
@@ -66,6 +67,7 @@ class providing the base-class of operations.
     is_bool_dtype,
     is_datetime64_dtype,
     is_float_dtype,
+    is_integer,
     is_integer_dtype,
     is_numeric_dtype,
     is_object_dtype,
@@ -98,6 +100,7 @@ class providing the base-class of operations.
     numba_,
     ops,
 )
+from pandas.core.groupby.indexing import GroupByIndexingMixin
 from pandas.core.indexes.api import (
     CategoricalIndex,
     Index,
@@ -568,7 +571,7 @@ def group_selection_context(groupby: GroupBy) -> Iterator[GroupBy]:
 ]
 
 
-class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries]):
+class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries], GroupByIndexingMixin):
     _group_selection: IndexLabel | None = None
     _apply_allowlist: frozenset[str] = frozenset()
     _hidden_attrs = PandasObject._hidden_attrs | {
@@ -2373,20 +2376,25 @@ def backfill(self, limit=None):
     @Substitution(name="groupby")
     @Substitution(see_also=_common_see_also)
     def nth(
-        self, n: int | list[int], dropna: Literal["any", "all", None] = None
-    ) -> DataFrame:
+        self,
+        arg: PositionalIndexer | tuple,
+        dropna: Literal["any", "all", None] = None,
+    ) -> FrameOrSeries:
         """
-        Take the nth row from each group if n is an int, or a subset of rows
-        if n is a list of ints.
+        Take the nth row from each group if arg is an int, otherwise a subset of rows.
 
         If dropna, will take the nth non-null row, dropna is either
         'all' or 'any'; this is equivalent to calling dropna(how=dropna)
         before the groupby.
 
         Parameters
         ----------
-        n : int or list of ints
-            A single nth value for the row or a list of nth values.
+        arg : int, slice or list of ints and slices
+            A single nth value for the row or a list of nth values or slices.
+
+            .. versionchanged:: 1.4.0
+                Added slice and lists containing slices
+
         dropna : {'any', 'all', None}, default None
             Apply the specified dropna operation before counting which row is
             the nth row.
@@ -2424,6 +2432,12 @@ def nth(
         1  2.0
         2  3.0
         2  5.0
+        >>> g.nth(slice(None, -1))
+             B
+        A
+        1  NaN
+        1  2.0
+        2  3.0
 
         Specifying `dropna` allows count ignoring ``NaN``
 
@@ -2448,58 +2462,27 @@ def nth(
         1  1  2.0
         4  2  5.0
         """
-        valid_containers = (set, list, tuple)
-        if not isinstance(n, (valid_containers, int)):
-            raise TypeError("n needs to be an int or a list/set/tuple of ints")
-
         if not dropna:
+            if isinstance(arg, Iterable):
+                return self._rows[tuple(arg)]
 
-            if isinstance(n, int):
-                nth_values = [n]
-            elif isinstance(n, valid_containers):
-                nth_values = list(set(n))
-
-            nth_array = np.array(nth_values, dtype=np.intp)
-            with group_selection_context(self):
-
-                mask_left = np.in1d(self._cumcount_array(), nth_array)
-                mask_right = np.in1d(
-                    self._cumcount_array(ascending=False) + 1, -nth_array
-                )
-                mask = mask_left | mask_right
-
-                ids, _, _ = self.grouper.group_info
-
-                # Drop NA values in grouping
-                mask = mask & (ids != -1)
-
-                out = self._selected_obj[mask]
-                if not self.as_index:
-                    return out
-
-                result_index = self.grouper.result_index
-                out.index = result_index[ids[mask]]
-
-                if not self.observed and isinstance(result_index, CategoricalIndex):
-                    out = out.reindex(result_index)
-
-                out = self._reindex_output(out)
-                return out.sort_index() if self.sort else out
+            return self._rows[arg]
 
         # dropna is truthy
-        if isinstance(n, valid_containers):
-            raise ValueError("dropna option with a list of nth values is not supported")
+        if not is_integer(arg):
+            raise ValueError("dropna option only supported for an integer argument")
 
         if dropna not in ["any", "all"]:
             # Note: when agg-ing picker doesn't raise this, just returns NaN
             raise ValueError(
-                "For a DataFrame groupby, dropna must be "
+                "For a DataFrame groupby.nth, dropna must be "
                 "either None, 'any' or 'all', "
                 f"(was passed {dropna})."
             )
 
         # old behaviour, but with all and any support for DataFrames.
         # modified in GH 7559 to have better perf
+        n = cast(int, arg)
         max_len = n if n >= 0 else -1 - n
         dropped = self.obj.dropna(how=dropna, axis=self.axis)
 

diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+from typing import (
+    Iterable,
+    cast,
+)
+
+import numpy as np
+
+from pandas._typing import (
+    FrameOrSeries,
+    PositionalIndexer,
+)
+from pandas.util._decorators import (
+    cache_readonly,
+    doc,
+)
+
+from pandas.core.dtypes.common import (
+    is_integer,
+    is_list_like,
+)
+
+from pandas.core.groupby import groupby
+from pandas.core.indexes.api import CategoricalIndex
+
+
+class GroupByIndexingMixin:
+    """
+    Mixin that adds the private ``_rows`` positional indexer to GroupBy.
+    """
+
+    @property
+    def _rows(self) -> _rowsGroupByIndexer:
+        # A new indexer is built on every access.  ``cast`` has no runtime
+        # effect; it only tells the type checker that this mixin's ``self``
+        # is a GroupBy.
+        grouped = cast(groupby.GroupBy, self)
+        return _rowsGroupByIndexer(grouped)
+
+
+@doc(GroupByIndexingMixin._rows)
+class _rowsGroupByIndexer:
+    """
+    Positional, per-group row selector returned by ``GroupBy._rows``.
+
+    Each supported index is converted into a boolean mask over the rows of
+    the grouped object, using the per-group row counts supplied by
+    ``GroupBy._cumcount_array`` in ascending and descending order.
+    """
+
+    def __init__(self, grouped: groupby.GroupBy):
+        # The GroupBy object whose rows are being selected.
+        self.grouped = grouped
+
+    def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries:
+        """
+        Positional index for selection by integer location per group.
+
+        Used to implement GroupBy._rows which is used to implement GroupBy.nth
+        when keyword dropna is None or absent.
+        The behaviour extends GroupBy.nth and handles DataFrame.groupby()
+        keyword parameters such as as_index and dropna in a compatible way.
+
+        The additions to nth(arg) are:
+        - Handles iterables such as range.
+        - Handles slice(start, stop, step) with
+            start: positive, negative or None.
+            stop: positive, negative or None.
+            step: positive or None.
+
+        Parameters
+        ----------
+        arg : PositionalIndexer | tuple
+            Allowed values are:
+            - Integer
+            - Integer values iterable such as list or range
+            - Slice
+            - Comma separated list of integers and slices
+
+        Returns
+        -------
+        Series
+            The filtered subset of the original groupby Series.
+        DataFrame
+            The filtered subset of the original groupby DataFrame.
+
+        Raises
+        ------
+        TypeError
+            If ``arg`` is not an integer, a list-like, a slice or a tuple.
+        ValueError
+            If a tuple contains something other than integers and slices, or
+            if a slice has a negative step.
+
+        See Also
+        --------
+        DataFrame.iloc : Purely integer-location based indexing for selection by
+            position.
+        GroupBy.head : Return first n rows of each group.
+        GroupBy.tail : Return last n rows of each group.
+        GroupBy.nth : Take the nth row from each group if n is an int, or a
+            subset of rows, if n is a list of ints.
+
+        Examples
+        --------
+            >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
+            ...                   columns=["A", "B"])
+            >>> df.groupby("A", as_index=False)._rows[1:2]
+               A  B
+            1  a  2
+            4  b  5
+
+            >>> df.groupby("A", as_index=False)._rows[1, -1]
+               A  B
+            1  a  2
+            2  a  3
+            4  b  5
+        """
+        with groupby.group_selection_context(self.grouped):
+            if isinstance(arg, tuple):
+                if all(is_integer(i) for i in arg):
+                    mask = self._handle_list(arg)
+
+                else:
+                    mask = self._handle_tuple(arg)
+
+            elif isinstance(arg, slice):
+                mask = self._handle_slice(arg)
+
+            elif is_integer(arg):
+                mask = self._handle_int(cast(int, arg))
+
+            elif is_list_like(arg):
+                mask = self._handle_list(cast(Iterable[int], arg))
+
+            else:
+                raise TypeError(
+                    f"Invalid index {type(arg)}. "
+                    "Must be integer, list-like, slice or a tuple of "
+                    "integers and slices"
+                )
+
+            ids, _, _ = self.grouped.grouper.group_info
+
+            # Drop NA values in grouping.  The handlers may return a plain
+            # bool ("every row" / "no row"); and-ing it with the id array
+            # always yields a boolean ndarray, so no scalar special case is
+            # needed afterwards.
+            mask = mask & (ids != -1)
+
+            result = self.grouped._selected_obj[mask]
+
+            if self.grouped.as_index:
+                # Label every selected row with the key of its group.
+                result_index = self.grouped.grouper.result_index
+                result.index = result_index[ids[mask]]
+
+                if not self.grouped.observed and isinstance(
+                    result_index, CategoricalIndex
+                ):
+                    result = result.reindex(result_index)
+
+                result = self.grouped._reindex_output(result)
+                if self.grouped.sort:
+                    result = result.sort_index()
+
+            return result
+
+    def _handle_int(self, arg: int) -> bool | np.ndarray:
+        """Return the mask of rows at position ``arg`` within their group."""
+        if arg >= 0:
+            return self._ascending_count == arg
+
+        # Negative positions are matched against the descending count,
+        # where -1 maps to 0, -2 to 1, and so on.
+        return self._descending_count == (-arg - 1)
+
+    def _handle_list(self, args: Iterable[int]) -> bool | np.ndarray:
+        """
+        Return the mask of rows whose per-group position is listed in ``args``.
+
+        An empty ``args`` gives the scalar ``False`` (no row selected).
+        """
+        # Materialise once: the positions are scanned twice below, and a
+        # one-shot iterable (generator, iterator) would be exhausted by the
+        # first pass, silently dropping every negative position.
+        positions = list(args)
+        positive = [pos for pos in positions if pos >= 0]
+        negative = [-pos - 1 for pos in positions if pos < 0]
+
+        mask: bool | np.ndarray = False
+
+        if positive:
+            mask |= np.isin(self._ascending_count, positive)
+
+        if negative:
+            mask |= np.isin(self._descending_count, negative)
+
+        return mask
+
+    def _handle_tuple(self, args: tuple) -> bool | np.ndarray:
+        """
+        Return the union of the masks of the integers and slices in ``args``.
+
+        Raises ValueError for any element that is neither an integer nor a
+        slice.
+        """
+        mask: bool | np.ndarray = False
+
+        for arg in args:
+            if is_integer(arg):
+                mask |= self._handle_int(cast(int, arg))
+
+            elif isinstance(arg, slice):
+                mask |= self._handle_slice(arg)
+
+            else:
+                raise ValueError(
+                    f"Invalid argument {type(arg)}. Should be int or slice."
+                )
+
+        return mask
+
+    def _handle_slice(self, arg: slice) -> bool | np.ndarray:
+        """
+        Return the mask of rows selected by applying slice ``arg`` per group.
+
+        A slice that restricts nothing gives the scalar ``True``.
+        Raises ValueError if the step is negative.
+        """
+        start = arg.start
+        stop = arg.stop
+        step = arg.step
+
+        if step is not None and step < 0:
+            raise ValueError(f"Invalid step {step}. Must be non-negative")
+
+        mask: bool | np.ndarray = True
+
+        if step is None:
+            step = 1
+
+        if start is None:
+            if step > 1:
+                mask &= self._ascending_count % step == 0
+
+        elif start >= 0:
+            mask &= self._ascending_count >= start
+
+            if step > 1:
+                mask &= (self._ascending_count - start) % step == 0
+
+        else:
+            mask &= self._descending_count < -start
+
+            # The stride only filters anything when step > 1.  Skipping it
+            # otherwise gives the same mask, avoids needless work for the
+            # default step and avoids a modulo by zero when step == 0.
+            if step > 1:
+                offset_array = self._descending_count + start + 1
+                # True where ascending + descending + start + 1 < 0; for
+                # those rows the stride is counted from the ascending count.
+                limit_array = (
+                    self._ascending_count + self._descending_count + (start + 1)
+                ) < 0
+                offset_array = np.where(
+                    limit_array, self._ascending_count, offset_array
+                )
+
+                mask &= offset_array % step == 0
+
+        if stop is not None:
+            if stop >= 0:
+                mask &= self._ascending_count < stop
+
+            else:
+                mask &= self._descending_count >= -stop
+
+        return mask
+
+    @cache_readonly
+    def _ascending_count(self) -> np.ndarray:
+        """Cached result of ``grouped._cumcount_array()``."""
+        return self.grouped._cumcount_array()
+
+    @cache_readonly
+    def _descending_count(self) -> np.ndarray:
+        """Cached result of ``grouped._cumcount_array(ascending=False)``."""
+        return self.grouped._cumcount_array(ascending=False)