CLN: Enforce deprecation of groupby with as_index=False excluding out-of-axis groupings (#57741)

rhshadrach · web-flow · commit fe2ef376f862 · 2024-03-06T16:01:46.000-08:00
* CLN: Enforce deprecation of groupby with as_index=False excluding out-of-axis groupings

* type annotation fixup
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -191,6 +191,7 @@ Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 - :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
 - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
+- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will now include in the result the groupings that do not arise from the input, rather than excluding them (:issue:`49519`)
 - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
 - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
 - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1286,34 +1286,43 @@ def _set_result_index_ordered(
         return result
 
     @final
-    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
+    def _insert_inaxis_grouper(
+        self, result: Series | DataFrame, qs: npt.NDArray[np.float64] | None = None
+    ) -> DataFrame:
         if isinstance(result, Series):
             result = result.to_frame()
 
+        n_groupings = len(self._grouper.groupings)
+
+        if qs is not None:
+            result.insert(
+                0, f"level_{n_groupings}", np.tile(qs, len(result) // len(qs))
+            )
+
         # zip in reverse so we can always insert at loc 0
-        columns = result.columns
-        for name, lev, in_axis in zip(
-            reversed(self._grouper.names),
-            reversed(self._grouper.get_group_levels()),
-            reversed([grp.in_axis for grp in self._grouper.groupings]),
+        for level, (name, lev, in_axis) in enumerate(
+            zip(
+                reversed(self._grouper.names),
+                reversed(self._grouper.get_group_levels()),
+                reversed([grp.in_axis for grp in self._grouper.groupings]),
+            )
         ):
+            if name is None:
+                # Behave the same as .reset_index() when a level is unnamed
+                name = (
+                    "index"
+                    if n_groupings == 1 and qs is None
+                    else f"level_{n_groupings - level - 1}"
+                )
+
             # GH #28549
             # When using .apply(-), name will be in columns already
-            if name not in columns:
-                if in_axis:
+            if name not in result.columns:
+                # GH#49519: insert out-of-axis groupings too
+                if qs is None:
                     result.insert(0, name, lev)
                 else:
-                    msg = (
-                        "A grouping was used that is not in the columns of the "
-                        "DataFrame and so was excluded from the result. This grouping "
-                        "will be included in a future version of pandas. Add the "
-                        "grouping as a column of the DataFrame to silence this warning."
-                    )
-                    warnings.warn(
-                        message=msg,
-                        category=FutureWarning,
-                        stacklevel=find_stack_level(),
-                    )
+                    result.insert(0, name, Index(np.repeat(lev, len(qs))))
 
         return result
 
@@ -1340,18 +1349,17 @@ def _wrap_aggregated_output(
         if not self.as_index:
             # `not self.as_index` is only relevant for DataFrameGroupBy,
             #   enforced in __init__
-            result = self._insert_inaxis_grouper(result)
+            result = self._insert_inaxis_grouper(result, qs=qs)
             result = result._consolidate()
-            index = Index(range(self._grouper.ngroups))
+            result.index = RangeIndex(len(result))
 
         else:
             index = self._grouper.result_index
-
-        if qs is not None:
-            # We get here with len(qs) != 1 and not self.as_index
-            #  in test_pass_args_kwargs
-            index = _insert_quantile_level(index, qs)
-        result.index = index
+            if qs is not None:
+                # We get here with len(qs) != 1 and self.as_index
+                #  in test_pass_args_kwargs
+                index = _insert_quantile_level(index, qs)
+            result.index = index
 
         return result
 
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1248,18 +1248,15 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
     tsframe.columns = ["A", "B", "A", "C"]
     gb = tsframe.groupby(lambda x: x.month, as_index=as_index)
 
-    warn = None if as_index else FutureWarning
-    msg = "A grouping .* was excluded from the result"
-    with tm.assert_produces_warning(warn, match=msg):
-        res = gb.agg(np.percentile, 80, axis=0)
+    res = gb.agg(np.percentile, 80, axis=0)
 
     ex_data = {
         1: tsframe[tsframe.index.month == 1].quantile(0.8),
         2: tsframe[tsframe.index.month == 2].quantile(0.8),
     }
     expected = DataFrame(ex_data).T
     if not as_index:
-        # TODO: try to get this more consistent?
+        expected.insert(0, "index", [1, 2])
         expected.index = Index(range(2))
 
     tm.assert_frame_equal(res, expected)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -779,24 +779,27 @@ def test_as_index():
 
     # function grouper
     f = lambda r: df.loc[r, "A"]
-    msg = "A grouping .* was excluded from the result"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby(["cat", f], as_index=False, observed=True).sum()
+    result = df.groupby(["cat", f], as_index=False, observed=True).sum()
     expected = DataFrame(
         {
             "cat": Categorical([1, 2], categories=df.cat.cat.categories),
+            "level_1": [10, 11],
             "A": [10, 22],
             "B": [101, 205],
         },
-        columns=["cat", "A", "B"],
     )
     tm.assert_frame_equal(result, expected)
 
     # another not in-axis grouper (conflicting names in index)
     s = Series(["a", "b", "b"], name="cat")
-    msg = "A grouping .* was excluded from the result"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby(["cat", s], as_index=False, observed=True).sum()
+    result = df.groupby(["cat", s], as_index=False, observed=True).sum()
+    expected = DataFrame(
+        {
+            "cat": ["a", "b"],
+            "A": [10, 22],
+            "B": [101, 205],
+        },
+    )
     tm.assert_frame_equal(result, expected)
 
     # is original index dropped?
@@ -1852,7 +1855,7 @@ def test_category_order_reducer(
     request, as_index, sort, observed, reduction_func, index_kind, ordered
 ):
     # GH#48749
-    if reduction_func == "corrwith" and not as_index:
+    if reduction_func == "corrwith" and not as_index and index_kind != "single":
         msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
         request.applymarker(pytest.mark.xfail(reason=msg))
     elif index_kind != "range" and not as_index:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -103,26 +103,22 @@ def f(x, q=None, axis=0):
     # DataFrame
     for as_index in [True, False]:
         df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
-        warn = None if as_index else FutureWarning
-        msg = "A grouping .* was excluded from the result"
-        with tm.assert_produces_warning(warn, match=msg):
-            agg_result = df_grouped.agg(np.percentile, 80, axis=0)
-        with tm.assert_produces_warning(warn, match=msg):
-            apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
-        with tm.assert_produces_warning(warn, match=msg):
-            expected = df_grouped.quantile(0.8)
+        agg_result = df_grouped.agg(np.percentile, 80, axis=0)
+        apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
+        expected = df_grouped.quantile(0.8)
         tm.assert_frame_equal(apply_result, expected, check_names=False)
         tm.assert_frame_equal(agg_result, expected)
 
         apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
-        with tm.assert_produces_warning(warn, match=msg):
-            expected_seq = df_grouped.quantile([0.4, 0.8])
+        expected_seq = df_grouped.quantile([0.4, 0.8])
+        if not as_index:
+            # apply treats the op as a transform; .quantile knows it's a reduction
+            apply_result = apply_result.reset_index()
+            apply_result["level_0"] = [1, 1, 2, 2]
         tm.assert_frame_equal(apply_result, expected_seq, check_names=False)
 
-        with tm.assert_produces_warning(warn, match=msg):
-            agg_result = df_grouped.agg(f, q=80)
-        with tm.assert_produces_warning(warn, match=msg):
-            apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
+        agg_result = df_grouped.agg(f, q=80)
+        apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
         tm.assert_frame_equal(agg_result, expected)
         tm.assert_frame_equal(apply_result, expected, check_names=False)
 
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -552,11 +552,6 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki
             expected = expected.set_index(["x", "x2"])
         else:
             expected = expected.set_index("x")
-    elif index_kind != "range" and reduction_func != "size":
-        # size, unlike other methods, has the desired behavior in GH#49519
-        expected = expected.drop(columns="x")
-        if index_kind == "multi":
-            expected = expected.drop(columns="x2")
     if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
         # expected was computed with a RangeIndex; need to translate to index values
         values = expected["y"].values.tolist()
@@ -572,13 +567,7 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki
         if as_index:
             expected = expected["size"].rename(None)
 
-    if as_index or index_kind == "range" or reduction_func == "size":
-        warn = None
-    else:
-        warn = FutureWarning
-    msg = "A grouping .* was excluded from the result"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = getattr(gb_keepna, reduction_func)(*args)
+    result = getattr(gb_keepna, reduction_func)(*args)
 
     # size will return a Series, others are DataFrame
     tm.assert_equal(result, expected)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
@@ -1125,12 +1125,8 @@ def test_grouping_by_key_is_in_axis():
     assert not gb._grouper.groupings[0].in_axis
     assert gb._grouper.groupings[1].in_axis
 
-    # Currently only in-axis groupings are including in the result when as_index=False;
-    # This is likely to change in the future.
-    msg = "A grouping .* was excluded from the result"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = gb.sum()
-    expected = DataFrame({"b": [1, 2], "c": [7, 5]})
+    result = gb.sum()
+    expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]})
     tm.assert_frame_equal(result, expected)