Skip to content

CLN: Enforce deprecation of groupby with as_index=False excluding out-of-axis groupings #57741

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ Other Deprecations
Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
Expand Down
62 changes: 35 additions & 27 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1287,34 +1287,43 @@ def _set_result_index_ordered(
return result

@final
def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
def _insert_inaxis_grouper(
self, result: Series | DataFrame, qs: npt.NDArray[np.float64] | None = None
) -> DataFrame:
if isinstance(result, Series):
result = result.to_frame()

n_groupings = len(self._grouper.groupings)

if qs is not None:
result.insert(
0, f"level_{n_groupings}", np.tile(qs, len(result) // len(qs))
)

# zip in reverse so we can always insert at loc 0
columns = result.columns
for name, lev, in_axis in zip(
reversed(self._grouper.names),
reversed(self._grouper.get_group_levels()),
reversed([grp.in_axis for grp in self._grouper.groupings]),
for level, (name, lev, in_axis) in enumerate(
zip(
reversed(self._grouper.names),
reversed(self._grouper.get_group_levels()),
reversed([grp.in_axis for grp in self._grouper.groupings]),
)
):
if name is None:
# Behave the same as .reset_index() when a level is unnamed
name = (
"index"
if n_groupings == 1 and qs is None
else f"level_{n_groupings - level - 1}"
)

# GH #28549
# When using .apply(-), name will be in columns already
if name not in columns:
if in_axis:
if name not in result.columns:
# if in_axis:
if qs is None:
result.insert(0, name, lev)
else:
msg = (
"A grouping was used that is not in the columns of the "
"DataFrame and so was excluded from the result. This grouping "
"will be included in a future version of pandas. Add the "
"grouping as a column of the DataFrame to silence this warning."
)
warnings.warn(
message=msg,
category=FutureWarning,
stacklevel=find_stack_level(),
)
result.insert(0, name, Index(np.repeat(lev, len(qs))))

return result

Expand All @@ -1341,18 +1350,17 @@ def _wrap_aggregated_output(
if not self.as_index:
# `not self.as_index` is only relevant for DataFrameGroupBy,
# enforced in __init__
result = self._insert_inaxis_grouper(result)
result = self._insert_inaxis_grouper(result, qs=qs)
result = result._consolidate()
index = Index(range(self._grouper.ngroups))
result.index = RangeIndex(len(result))

else:
index = self._grouper.result_index

if qs is not None:
# We get here with len(qs) != 1 and not self.as_index
# in test_pass_args_kwargs
index = _insert_quantile_level(index, qs)
result.index = index
if qs is not None:
# We get here with len(qs) != 1 and not self.as_index
# in test_pass_args_kwargs
index = _insert_quantile_level(index, qs)
result.index = index

return result

Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,18 +1250,15 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
tsframe.columns = ["A", "B", "A", "C"]
gb = tsframe.groupby(lambda x: x.month, as_index=as_index)

warn = None if as_index else FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
res = gb.agg(np.percentile, 80, axis=0)
res = gb.agg(np.percentile, 80, axis=0)

ex_data = {
1: tsframe[tsframe.index.month == 1].quantile(0.8),
2: tsframe[tsframe.index.month == 2].quantile(0.8),
}
expected = DataFrame(ex_data).T
if not as_index:
# TODO: try to get this more consistent?
expected.insert(0, "index", [1, 2])
expected.index = Index(range(2))

tm.assert_frame_equal(res, expected)
Expand Down
19 changes: 11 additions & 8 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,24 +779,27 @@ def test_as_index():

# function grouper
f = lambda r: df.loc[r, "A"]
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["cat", f], as_index=False, observed=True).sum()
result = df.groupby(["cat", f], as_index=False, observed=True).sum()
expected = DataFrame(
{
"cat": Categorical([1, 2], categories=df.cat.cat.categories),
"level_1": [10, 11],
"A": [10, 22],
"B": [101, 205],
},
columns=["cat", "A", "B"],
)
tm.assert_frame_equal(result, expected)

# another not in-axis grouper (conflicting names in index)
s = Series(["a", "b", "b"], name="cat")
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby(["cat", s], as_index=False, observed=True).sum()
result = df.groupby(["cat", s], as_index=False, observed=True).sum()
expected = DataFrame(
{
"cat": ["a", "b"],
"A": [10, 22],
"B": [101, 205],
},
)
tm.assert_frame_equal(result, expected)

# is original index dropped?
Expand Down Expand Up @@ -1852,7 +1855,7 @@ def test_category_order_reducer(
request, as_index, sort, observed, reduction_func, index_kind, ordered
):
# GH#48749
if reduction_func == "corrwith" and not as_index:
if reduction_func == "corrwith" and not as_index and index_kind != "single":
msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
request.applymarker(pytest.mark.xfail(reason=msg))
elif index_kind != "range" and not as_index:
Expand Down
24 changes: 10 additions & 14 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,26 +103,22 @@ def f(x, q=None, axis=0):
# DataFrame
for as_index in [True, False]:
df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
warn = None if as_index else FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
agg_result = df_grouped.agg(np.percentile, 80, axis=0)
with tm.assert_produces_warning(warn, match=msg):
apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
with tm.assert_produces_warning(warn, match=msg):
expected = df_grouped.quantile(0.8)
agg_result = df_grouped.agg(np.percentile, 80, axis=0)
apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
expected = df_grouped.quantile(0.8)
tm.assert_frame_equal(apply_result, expected, check_names=False)
tm.assert_frame_equal(agg_result, expected)

apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
with tm.assert_produces_warning(warn, match=msg):
expected_seq = df_grouped.quantile([0.4, 0.8])
expected_seq = df_grouped.quantile([0.4, 0.8])
if not as_index:
# apply treats the op as a transform; .quantile knows it's a reduction
apply_result = apply_result.reset_index()
apply_result["level_0"] = [1, 1, 2, 2]
tm.assert_frame_equal(apply_result, expected_seq, check_names=False)

with tm.assert_produces_warning(warn, match=msg):
agg_result = df_grouped.agg(f, q=80)
with tm.assert_produces_warning(warn, match=msg):
apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
agg_result = df_grouped.agg(f, q=80)
apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
tm.assert_frame_equal(agg_result, expected)
tm.assert_frame_equal(apply_result, expected, check_names=False)

Expand Down
13 changes: 1 addition & 12 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,11 +552,6 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki
expected = expected.set_index(["x", "x2"])
else:
expected = expected.set_index("x")
elif index_kind != "range" and reduction_func != "size":
# size, unlike other methods, has the desired behavior in GH#49519
expected = expected.drop(columns="x")
if index_kind == "multi":
expected = expected.drop(columns="x2")
if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
# expected was computed with a RangeIndex; need to translate to index values
values = expected["y"].values.tolist()
Expand All @@ -572,13 +567,7 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki
if as_index:
expected = expected["size"].rename(None)

if as_index or index_kind == "range" or reduction_func == "size":
warn = None
else:
warn = FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb_keepna, reduction_func)(*args)
result = getattr(gb_keepna, reduction_func)(*args)

# size will return a Series, others are DataFrame
tm.assert_equal(result, expected)
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,12 +1125,8 @@ def test_grouping_by_key_is_in_axis():
assert not gb._grouper.groupings[0].in_axis
assert gb._grouper.groupings[1].in_axis

# Currently only in-axis groupings are included in the result when as_index=False;
# This is likely to change in the future.
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.sum()
expected = DataFrame({"b": [1, 2], "c": [7, 5]})
result = gb.sum()
expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]})
tm.assert_frame_equal(result, expected)


Expand Down