diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index d44824183f05f..35c88d544c985 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -178,6 +178,7 @@ Performance improvements
 - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
 - Performance improvement in indexing operations for string dtypes (:issue:`56997`)
 - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``)
+- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.bug_fixes:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 3add32293ea06..730b84fdc6201 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -271,7 +271,7 @@ def axis(request):
     return request.param
 
 
-@pytest.fixture(params=[True, False, None])
+@pytest.fixture(params=[True, False])
 def observed(request):
     """
     Pass in the observed keyword to groupby for [True, False]
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 668c9863612d8..9768a932f73f6 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -596,7 +596,7 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
 
     @final
     def __len__(self) -> int:
-        return len(self.groups)
+        return self._grouper.ngroups
 
     @final
     def __repr__(self) -> str:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 46a307e2f28d9..d02e22c29159f 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -149,6 +149,29 @@ def test_len_nan_group():
     assert len(df.groupby(["a", "b"])) == 0
 
 
+@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
+def test_len_categorical(dropna, observed, keys):
+    # GH#57595
+    df = DataFrame(
+        {
+            "a": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]),
+            "b": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]),
+            "c": 1,
+        }
+    )
+    gb = df.groupby(keys, observed=observed, dropna=dropna)
+    result = len(gb)
+    if observed and dropna:
+        expected = 2
+    elif observed and not dropna:
+        expected = 3
+    elif len(keys) == 1:
+        expected = 3 if dropna else 4
+    else:
+        expected = 9 if dropna else 16
+    assert result == expected, f"{result} vs {expected}"
+
+
 def test_basic_regression():
     # regression
     result = Series([1.0 * x for x in list(range(1, 10)) * 10])