
Commit 37e5f78

gwpdt authored and jreback committed
BUG: Group-by numeric type-coercion with datetime
closes #14423, closes #15421, closes #15670

During a group-by/apply on a DataFrame, in the presence of one or more DateTime-like columns, pandas would incorrectly coerce the type of all other columns to numeric. E.g. a String column would be coerced to numeric, producing NaNs.

Author: Greg Williams <[email protected]>

Closes #15680 from gwpdt/bugfix14423 and squashes the following commits:

e1ed104 [Greg Williams] TST: Rename and expand test_numeric_coercion
0a15674 [Greg Williams] CLN: move import, add whatsnew entry
c8844e0 [Greg Williams] CLN: PEP8 (whitespace fixes)
46d12c2 [Greg Williams] BUG: Group-by numeric type-coericion with datetime
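For context, a minimal reproduction sketch of the reported behaviour (not part of the commit; the frame and column names are made up here, mirroring GH 15670). Before this fix, the string column came back coerced to numeric whenever the frame also held a datetime-like column; after it, the strings survive.

import pandas as pd

# Hypothetical reproduction (not from the commit): mix a datetime column
# with a string column and take the first row of each group.
df = pd.DataFrame({'Number': [1, 2],
                   'Date': pd.to_datetime(['2017-03-02'] * 2),
                   'Str': ['foo', 'inf']})

result = df.groupby('Number').apply(lambda x: x.iloc[0])
# Before the fix: result['Str'] was coerced to numeric, producing NaN/inf.
# After the fix: result['Str'] keeps the original strings 'foo' and 'inf'.
print(result['Str'])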
1 parent e7956c4 · commit 37e5f78

File tree: 3 files changed, +54 −2 lines changed

Diff for: doc/source/whatsnew/v0.20.0.txt (+2 −1)

@@ -850,7 +850,8 @@ Bug Fixes
 - Bug in ``SparseSeries.reindex`` on single level with list of length 1 (:issue:`15447`)


-- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
+- Bug in groupby operations with timedelta64 when passing ``numeric_only=False`` (:issue:`5724`)
+- Bug in ``groupby.apply()`` coercing ``object`` dtypes to numeric types, when not all values were numeric (:issue:`14423`, :issue:`15421`, :issue:`15670`)


 - Bug in ``DataFrame.to_html`` with ``index=False`` and ``max_rows`` raising in ``IndexError`` (:issue:`14998`)

Diff for: pandas/core/groupby.py (+4 −1)

@@ -10,6 +10,7 @@
     zip, range, lzip,
     callable, map
 )
+
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat.numpy import _np_version_under1p8

@@ -3424,6 +3425,7 @@ def _decide_output_index(self, output, labels):

     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         from pandas.core.index import _all_indexes_same
+        from pandas.tools.util import to_numeric

         if len(keys) == 0:
             return DataFrame(index=keys)

@@ -3566,7 +3568,8 @@ def first_non_None_value(values):
             # as we are stacking can easily have object dtypes here
             so = self._selected_obj
             if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
-                result = result._convert(numeric=True)
+                result = result.apply(
+                    lambda x: to_numeric(x, errors='ignore'))
                 date_cols = self._selected_obj.select_dtypes(
                     include=['datetime', 'timedelta']).columns
                 date_cols = date_cols.intersection(result.columns)
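The substance of the fix is the last hunk: instead of the frame-wide soft coercion ``result._convert(numeric=True)``, which could turn a partly numeric ``object`` column into floats (producing NaNs), the result is now converted column by column with ``to_numeric(errors='ignore')``, which leaves a column untouched unless every value in it can be converted. A rough illustration of that behaviour, using an invented frame and the public ``pd.to_numeric`` API rather than the internal import used above:

import pandas as pd

# Invented example frame: 'Num' is fully convertible, 'Str' is not.
df = pd.DataFrame({'Num': ['1', '2'],
                   'Str': ['foo', 'inf']}, dtype=object)

# Per-column conversion, as in the fixed code path: a column is converted
# only when every value parses as a number; otherwise it is returned as-is.
converted = df.apply(lambda col: pd.to_numeric(col, errors='ignore'))
print(converted.dtypes)  # Num -> int64, Str -> object (unchanged)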

Diff for: pandas/tests/groupby/test_groupby.py (+48 −0)

@@ -4314,6 +4314,54 @@ def test_cummin_cummax(self):
         expected = pd.Series([1, 2, 1], name='b')
         tm.assert_series_equal(result, expected)

+    def test_apply_numeric_coercion_when_datetime(self):
+        # In the past, group-by/apply operations have been over-eager
+        # in converting dtypes to numeric, in the presence of datetime
+        # columns. Various GH issues were filed, the reproductions
+        # for which are here.
+
+        # GH 15670
+        df = pd.DataFrame({'Number': [1, 2],
+                           'Date': ["2017-03-02"] * 2,
+                           'Str': ["foo", "inf"]})
+        expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        df.Date = pd.to_datetime(df.Date)
+        result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        tm.assert_series_equal(result['Str'], expected['Str'])
+
+        # GH 15421
+        df = pd.DataFrame({'A': [10, 20, 30],
+                           'B': ['foo', '3', '4'],
+                           'T': [pd.Timestamp("12:31:22")] * 3})
+
+        def get_B(g):
+            return g.iloc[0][['B']]
+        result = df.groupby('A').apply(get_B)['B']
+        expected = df.B
+        expected.index = df.A
+        tm.assert_series_equal(result, expected)
+
+        # GH 14423
+        def predictions(tool):
+            out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
+            if 'step1' in list(tool.State):
+                out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
+            if 'step2' in list(tool.State):
+                out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
+                out['useTime'] = str(
+                    tool[tool.State == 'step2'].oTime.values[0])
+            return out
+        df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
+                            'State': ['step1', 'step2', 'step1', 'step2'],
+                            'oTime': ['', '2016-09-19 05:24:33',
+                                      '', '2016-09-19 23:59:04'],
+                            'Machine': ['23', '36L', '36R', '36R']})
+        df2 = df1.copy()
+        df2.oTime = pd.to_datetime(df2.oTime)
+        expected = df1.groupby('Key').apply(predictions).p1
+        result = df2.groupby('Key').apply(predictions).p1
+        tm.assert_series_equal(expected, result)
+

 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
