Skip to content

Commit d0470a5

Browse files
rhshadrach authored and TLouf committed
BUG: groupby sum, mean, var should always be floats (pandas-dev#41139)
1 parent 427ff79 commit d0470a5

File tree

15 files changed, with 91 additions and 50 deletions.

doc/source/whatsnew/v1.3.0.rst

+25
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,31 @@ values as measured by ``np.allclose``. Now no such casting occurs.
333333
334334
df.groupby('key').agg(lambda x: x.sum())
335335
336+
``float`` result for :meth:`.GroupBy.mean`, :meth:`.GroupBy.median`, and :meth:`.GroupBy.var`
337+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
338+
339+
Previously, these methods could result in different dtypes depending on the input values.
340+
Now, these methods will always return a float dtype. (:issue:`41137`)
341+
342+
.. ipython:: python
343+
344+
df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]})
345+
346+
*pandas 1.2.x*
347+
348+
.. code-block:: ipython
349+
350+
In [5]: df.groupby(df.index).mean()
351+
Out[5]:
352+
      a  b    c
353+
0  True  1  1.0
354+
355+
*pandas 1.3.0*
356+
357+
.. ipython:: python
358+
359+
df.groupby(df.index).mean()
360+
336361
Try operating inplace when setting values with ``loc`` and ``iloc``
337362
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
338363

pandas/core/groupby/groupby.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -1600,12 +1600,12 @@ def mean(self, numeric_only: bool = True):
16001600
Groupby two columns and return the mean of the remaining column.
16011601
16021602
>>> df.groupby(['A', 'B']).mean()
1603-
C
1603+
C
16041604
A B
1605-
1 2.0 2
1606-
4.0 1
1607-
2 3.0 1
1608-
5.0 2
1605+
1 2.0 2.0
1606+
4.0 1.0
1607+
2 3.0 1.0
1608+
5.0 2.0
16091609
16101610
Groupby one column and return the mean of only particular column in
16111611
the group.

pandas/core/groupby/grouper.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ class Grouper:
150150
>>> df.groupby(pd.Grouper(key="Animal")).mean()
151151
Speed
152152
Animal
153-
Falcon 200
154-
Parrot 10
153+
Falcon 200.0
154+
Parrot 10.0
155155
156156
Specify a resample operation on the column 'Publish date'
157157

pandas/core/groupby/ops.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
is_categorical_dtype,
5050
is_complex_dtype,
5151
is_datetime64_any_dtype,
52+
is_float_dtype,
5253
is_integer_dtype,
5354
is_numeric_dtype,
5455
is_sparse,
@@ -304,10 +305,13 @@ def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj:
304305
return np.dtype(np.int64)
305306
elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
306307
return Int64Dtype()
307-
elif how in ["mean", "median", "var"] and isinstance(
308-
dtype, (BooleanDtype, _IntegerDtype)
309-
):
310-
return Float64Dtype()
308+
elif how in ["mean", "median", "var"]:
309+
if isinstance(dtype, (BooleanDtype, _IntegerDtype)):
310+
return Float64Dtype()
311+
elif is_float_dtype(dtype):
312+
return dtype
313+
elif is_numeric_dtype(dtype):
314+
return np.dtype(np.float64)
311315
return dtype
312316

313317
def uses_mask(self) -> bool:

pandas/tests/extension/base/groupby.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
2525
_, index = pd.factorize(data_for_grouping, sort=True)
2626

2727
index = pd.Index(index, name="B")
28-
expected = pd.Series([3, 1, 4], index=index, name="A")
28+
expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
2929
if as_index:
3030
self.assert_series_equal(result, expected)
3131
else:
@@ -54,7 +54,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
5454
_, index = pd.factorize(data_for_grouping, sort=False)
5555

5656
index = pd.Index(index, name="B")
57-
expected = pd.Series([1, 3, 4], index=index, name="A")
57+
expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
5858
self.assert_series_equal(result, expected)
5959

6060
def test_groupby_extension_transform(self, data_for_grouping):

pandas/tests/extension/test_boolean.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
272272
_, index = pd.factorize(data_for_grouping, sort=True)
273273

274274
index = pd.Index(index, name="B")
275-
expected = pd.Series([3, 1], index=index, name="A")
275+
expected = pd.Series([3.0, 1.0], index=index, name="A")
276276
if as_index:
277277
self.assert_series_equal(result, expected)
278278
else:
@@ -301,7 +301,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
301301
_, index = pd.factorize(data_for_grouping, sort=False)
302302

303303
index = pd.Index(index, name="B")
304-
expected = pd.Series([1, 3], index=index, name="A")
304+
expected = pd.Series([1.0, 3.0], index=index, name="A")
305305
self.assert_series_equal(result, expected)
306306

307307
def test_groupby_extension_transform(self, data_for_grouping):

pandas/tests/groupby/aggregate/test_aggregate.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,9 @@ def test_uint64_type_handling(dtype, how):
512512
expected = df.groupby("y").agg({"x": how})
513513
df.x = df.x.astype(dtype)
514514
result = df.groupby("y").agg({"x": how})
515-
result.x = result.x.astype(np.int64)
515+
if how not in ("mean", "median"):
516+
# mean and median always result in floats
517+
result.x = result.x.astype(np.int64)
516518
tm.assert_frame_equal(result, expected, check_exact=True)
517519

518520

pandas/tests/groupby/aggregate/test_cython.py

-3
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,6 @@ def test_cython_agg_empty_buckets(op, targop, observed):
196196

197197
g = df.groupby(pd.cut(df[0], grps), observed=observed)
198198
expected = g.agg(lambda x: targop(x))
199-
if observed and op not in ("min", "max"):
200-
# TODO: GH 41137
201-
expected = expected.astype("int64")
202199
tm.assert_frame_equal(result, expected)
203200

204201

pandas/tests/groupby/test_categorical.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,6 @@ def test_apply(ordered):
285285
result = grouped.apply(lambda x: np.mean(x))
286286
tm.assert_frame_equal(result, expected)
287287

288-
# we coerce back to ints
289-
expected = expected.astype("int")
290288
result = grouped.mean()
291289
tm.assert_frame_equal(result, expected)
292290

@@ -371,7 +369,7 @@ def test_observed(observed, using_array_manager):
371369
result = groups_double_key.agg("mean")
372370
expected = DataFrame(
373371
{
374-
"val": [10, 30, 20, 40],
372+
"val": [10.0, 30.0, 20.0, 40.0],
375373
"cat": Categorical(
376374
["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
377375
),
@@ -418,7 +416,9 @@ def test_observed_codes_remap(observed):
418416
groups_double_key = df.groupby([values, "C2"], observed=observed)
419417

420418
idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
421-
expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx)
419+
expected = DataFrame(
420+
{"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
421+
)
422422
if not observed:
423423
expected = cartesian_product_for_groupers(
424424
expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
@@ -1515,7 +1515,9 @@ def test_read_only_category_no_sort():
15151515
df = DataFrame(
15161516
{"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
15171517
)
1518-
expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b"))
1518+
expected = DataFrame(
1519+
data={"a": [2.0, 6.0]}, index=CategoricalIndex([1, 2], name="b")
1520+
)
15191521
result = df.groupby("b", sort=False).mean()
15201522
tm.assert_frame_equal(result, expected)
15211523

pandas/tests/groupby/test_groupby.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1257,7 +1257,7 @@ def test_groupby_keys_same_size_as_index():
12571257
)
12581258
df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
12591259
result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
1260-
expected = df.set_index([df.index, "metric"])
1260+
expected = df.set_index([df.index, "metric"]).astype(float)
12611261

12621262
tm.assert_frame_equal(result, expected)
12631263

@@ -1350,7 +1350,7 @@ def test_groupby_2d_malformed():
13501350
d["ones"] = [1, 1]
13511351
d["label"] = ["l1", "l2"]
13521352
tmp = d.groupby(["group"]).mean()
1353-
res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
1353+
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
13541354
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
13551355
tm.assert_numpy_array_equal(tmp.values, res_values)
13561356

@@ -2114,7 +2114,7 @@ def test_groupby_crash_on_nunique(axis):
21142114

21152115
def test_groupby_list_level():
21162116
# GH 9790
2117-
expected = DataFrame(np.arange(0, 9).reshape(3, 3))
2117+
expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
21182118
result = expected.groupby(level=[0]).mean()
21192119
tm.assert_frame_equal(result, expected)
21202120

pandas/tests/io/formats/test_to_csv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ def test_to_csv_date_format(self):
274274
df_sec["B"] = 0
275275
df_sec["C"] = 1
276276

277-
expected_rows = ["A,B,C", "2013-01-01,0,1"]
277+
expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
278278
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
279279

280280
df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])

pandas/tests/resample/test_datetime_index.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -1064,7 +1064,7 @@ def test_nanosecond_resample_error():
10641064
result = r.agg("mean")
10651065

10661066
exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n")
1067-
exp = Series(range(len(exp_indx)), index=exp_indx)
1067+
exp = Series(range(len(exp_indx)), index=exp_indx, dtype=float)
10681068

10691069
tm.assert_series_equal(result, exp)
10701070

@@ -1636,15 +1636,15 @@ def test_resample_with_nat():
16361636
index_1s = DatetimeIndex(
16371637
["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"]
16381638
)
1639-
frame_1s = DataFrame([3, 7, 11], index=index_1s)
1639+
frame_1s = DataFrame([3.0, 7.0, 11.0], index=index_1s)
16401640
tm.assert_frame_equal(frame.resample("1s").mean(), frame_1s)
16411641

16421642
index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"])
1643-
frame_2s = DataFrame([5, 11], index=index_2s)
1643+
frame_2s = DataFrame([5.0, 11.0], index=index_2s)
16441644
tm.assert_frame_equal(frame.resample("2s").mean(), frame_2s)
16451645

16461646
index_3s = DatetimeIndex(["1970-01-01 00:00:00"])
1647-
frame_3s = DataFrame([7], index=index_3s)
1647+
frame_3s = DataFrame([7.0], index=index_3s)
16481648
tm.assert_frame_equal(frame.resample("3s").mean(), frame_3s)
16491649

16501650
tm.assert_frame_equal(frame.resample("60s").mean(), frame_3s)
@@ -1687,7 +1687,7 @@ def f(data, add_arg):
16871687

16881688
# Testing dataframe
16891689
df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10))
1690-
result = df.groupby("A").resample("D").agg(f, multiplier)
1690+
result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
16911691
expected = df.groupby("A").resample("D").mean().multiply(multiplier)
16921692
# TODO: GH 41137
16931693
expected = expected.astype("float64")

pandas/tests/resample/test_period_index.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def test_with_local_timezone_pytz(self):
269269
# Index is moved back a day with the timezone conversion from UTC to
270270
# Pacific
271271
expected_index = period_range(start=start, end=end, freq="D") - offsets.Day()
272-
expected = Series(1, index=expected_index)
272+
expected = Series(1.0, index=expected_index)
273273
tm.assert_series_equal(result, expected)
274274

275275
def test_resample_with_pytz(self):
@@ -279,7 +279,7 @@ def test_resample_with_pytz(self):
279279
)
280280
result = s.resample("D").mean()
281281
expected = Series(
282-
2,
282+
2.0,
283283
index=pd.DatetimeIndex(
284284
["2017-01-01", "2017-01-02"], tz="US/Eastern", freq="D"
285285
),
@@ -312,7 +312,7 @@ def test_with_local_timezone_dateutil(self):
312312
expected_index = (
313313
period_range(start=start, end=end, freq="D", name="idx") - offsets.Day()
314314
)
315-
expected = Series(1, index=expected_index)
315+
expected = Series(1.0, index=expected_index)
316316
tm.assert_series_equal(result, expected)
317317

318318
def test_resample_nonexistent_time_bin_edge(self):
@@ -777,8 +777,8 @@ def test_upsampling_ohlc(self, freq, period_mult, kind):
777777
"freq, expected_values",
778778
[
779779
("1s", [3, np.NaN, 7, 11]),
780-
("2s", [3, int((7 + 11) / 2)]),
781-
("3s", [int((3 + 7) / 2), 11]),
780+
("2s", [3, (7 + 11) / 2]),
781+
("3s", [(3 + 7) / 2, 11]),
782782
],
783783
)
784784
def test_resample_with_nat(self, periods, values, freq, expected_values):
@@ -798,7 +798,7 @@ def test_resample_with_only_nat(self):
798798
pi = PeriodIndex([pd.NaT] * 3, freq="S")
799799
frame = DataFrame([2, 3, 5], index=pi, columns=["a"])
800800
expected_index = PeriodIndex(data=[], freq=pi.freq)
801-
expected = DataFrame(index=expected_index, columns=["a"], dtype="int64")
801+
expected = DataFrame(index=expected_index, columns=["a"], dtype="float64")
802802
result = frame.resample("1s").mean()
803803
tm.assert_frame_equal(result, expected)
804804

pandas/tests/resample/test_timedelta.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def test_resample_timedelta_idempotency():
7777
index = timedelta_range("0", periods=9, freq="10L")
7878
series = Series(range(9), index=index)
7979
result = series.resample("10L").mean()
80-
expected = series
80+
expected = series.astype(float)
8181
tm.assert_series_equal(result, expected)
8282

8383

pandas/tests/reshape/test_pivot.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -240,13 +240,13 @@ def test_pivot_with_non_observable_dropna(self, dropna):
240240
categories=["low", "high"],
241241
ordered=True,
242242
),
243-
"B": range(5),
243+
"B": [0.0, 1.0, 2.0, 3.0, 4.0],
244244
}
245245
)
246246

247247
result = df.pivot_table(index="A", values="B", dropna=dropna)
248248
expected = DataFrame(
249-
{"B": [2, 3]},
249+
{"B": [2.0, 3.0]},
250250
index=Index(
251251
Categorical.from_codes(
252252
[0, 1], categories=["low", "high"], ordered=True
@@ -279,6 +279,8 @@ def test_pivot_with_non_observable_dropna(self, dropna):
279279
name="A",
280280
),
281281
)
282+
if not dropna:
283+
expected["B"] = expected["B"].astype(float)
282284

283285
tm.assert_frame_equal(result, expected)
284286

@@ -287,6 +289,8 @@ def test_pivot_with_interval_index(self, interval_values, dropna):
287289
df = DataFrame({"A": interval_values, "B": 1})
288290
result = df.pivot_table(index="A", values="B", dropna=dropna)
289291
expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A"))
292+
if not dropna:
293+
expected = expected.astype(float)
290294
tm.assert_frame_equal(result, expected)
291295

292296
def test_pivot_with_interval_index_margins(self):
@@ -388,10 +392,7 @@ def test_pivot_preserve_dtypes(self, columns, values):
388392
)
389393

390394
result = dict(df_res.dtypes)
391-
expected = {
392-
col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64")
393-
for col in df_res
394-
}
395+
expected = {col: np.dtype("float64") for col in df_res}
395396
assert result == expected
396397

397398
def test_pivot_no_values(self):
@@ -1711,8 +1712,13 @@ def test_pivot_table_margins_name_with_aggfunc_list(self):
17111712
expected = DataFrame(table.values, index=ix, columns=cols)
17121713
tm.assert_frame_equal(table, expected)
17131714

1714-
@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)")
1715-
def test_categorical_margins(self, observed):
1715+
def test_categorical_margins(self, observed, request):
1716+
if observed:
1717+
request.node.add_marker(
1718+
pytest.mark.xfail(
1719+
reason="GH#17035 (np.mean of ints is casted back to ints)"
1720+
)
1721+
)
17161722
# GH 10989
17171723
df = DataFrame(
17181724
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
@@ -1725,8 +1731,13 @@ def test_categorical_margins(self, observed):
17251731
table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
17261732
tm.assert_frame_equal(table, expected)
17271733

1728-
@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)")
1729-
def test_categorical_margins_category(self, observed):
1734+
def test_categorical_margins_category(self, observed, request):
1735+
if observed:
1736+
request.node.add_marker(
1737+
pytest.mark.xfail(
1738+
reason="GH#17035 (np.mean of ints is casted back to ints)"
1739+
)
1740+
)
17301741
df = DataFrame(
17311742
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
17321743
)

0 commit comments

Comments
 (0)