From 46d12c29d25dfbc02510de25b6dcff8e6d4f3306 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Tue, 14 Mar 2017 06:27:04 -0400
Subject: [PATCH 1/4] BUG: Group-by numeric type-coercion with datetime

GH Bug #14423

During a group-by/apply on a DataFrame, in the presence of one or more
DateTime-like columns, Pandas would incorrectly coerce the type of all
other columns to numeric. E.g. a String column would be coerced to
numeric, producing NaNs.

Fix the issue, and add a test.
---
 pandas/core/groupby.py               | 4 +++-
 pandas/tests/groupby/test_groupby.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a10be078a8f96..fff28c329494b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -10,6 +10,8 @@
     zip, range, lzip,
     callable, map
 )
+
+import pandas as pd
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat.numpy import _np_version_under1p8
@@ -3566,7 +3568,7 @@ def first_non_None_value(values):
                 # as we are stacking can easily have object dtypes here
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
-                    result = result._convert(numeric=True)
+                    result = result.apply(lambda x: pd.to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index d7fa3beda0abf..fb0a103caef59 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4314,7 +4314,14 @@ def test_cummin_cummax(self):
         expected = pd.Series([1, 2, 1], name='b')
         tm.assert_series_equal(result, expected)
 
-
+    def test_numeric_coercion(self):
+        # GH 14423
+        df = pd.DataFrame({'Number' : [1, 2], 'Date' : ["2017-03-02"] * 2, 'Str' : ["foo", "inf"]})
+        expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        df.Date = pd.to_datetime(df.Date)
+        result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        tm.assert_series_equal(result['Str'], expected['Str'])
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)

From c8844e085654dd462a91b56ec2bf0d3dc5fdbfa6 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Tue, 14 Mar 2017 07:08:10 -0400
Subject: [PATCH 2/4] CLN: PEP8 (whitespace fixes)

---
 pandas/core/groupby.py               | 3 ++-
 pandas/tests/groupby/test_groupby.py | 7 +++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index fff28c329494b..5a0bc590eb4f4 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3568,7 +3568,8 @@ def first_non_None_value(values):
                 # as we are stacking can easily have object dtypes here
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
-                    result = result.apply(lambda x: pd.to_numeric(x, errors='ignore'))
+                    result = result.apply(
+                        lambda x: pd.to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index fb0a103caef59..263eb6497e990 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4316,12 +4316,15 @@ def test_cummin_cummax(self):
 
     def test_numeric_coercion(self):
         # GH 14423
-        df = pd.DataFrame({'Number' : [1, 2], 'Date' : ["2017-03-02"] * 2, 'Str' : ["foo", "inf"]})
+        df = pd.DataFrame({'Number': [1, 2],
+                           'Date': ["2017-03-02"] * 2,
+                           'Str': ["foo", "inf"]})
         expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
         df.Date = pd.to_datetime(df.Date)
         result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
         tm.assert_series_equal(result['Str'], expected['Str'])
-
+
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)

From 0a156745e39595fe125081282da2acc7f28cec13 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Tue, 14 Mar 2017 14:42:43 -0400
Subject: [PATCH 3/4] CLN: move import, add whatsnew entry

---
 doc/source/whatsnew/v0.20.0.txt | 1 +
 pandas/core/groupby.py          | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8a4f2f47b9853..ce5111e2725e2 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -886,3 +886,4 @@ Bug Fixes
 - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
 - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
 - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
+- Bug in ``groupby.apply()`` coercing ``object`` series to numeric types, when not all values were numeric (:issue:`15680`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 5a0bc590eb4f4..7a017ffae284c 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -11,7 +11,6 @@
     callable, map
 )
 
-import pandas as pd
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat.numpy import _np_version_under1p8
@@ -3426,6 +3425,7 @@ def _decide_output_index(self, output, labels):
 
     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         from pandas.core.index import _all_indexes_same
+        from pandas.tools.util import to_numeric
 
         if len(keys) == 0:
             return DataFrame(index=keys)
@@ -3569,7 +3569,7 @@ def first_non_None_value(values):
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
                     result = result.apply(
-                        lambda x: pd.to_numeric(x, errors='ignore'))
+                        lambda x: to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)

From e1ed10401ab1255ad168afeaf744955a25069c59 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Thu, 16 Mar 2017 07:22:20 -0400
Subject: [PATCH 4/4] TST: Rename and expand test_numeric_coercion

Rename test_numeric_coercion to test_apply_numeric_coercion_when_datetime,
and add tests for GH #15421 and #14423
---
 pandas/tests/groupby/test_groupby.py | 42 ++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 263eb6497e990..c25974c94bfd1 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4314,8 +4314,13 @@ def test_cummin_cummax(self):
         expected = pd.Series([1, 2, 1], name='b')
         tm.assert_series_equal(result, expected)
 
-    def test_numeric_coercion(self):
-        # GH 14423
+    def test_apply_numeric_coercion_when_datetime(self):
+        # In the past, group-by/apply operations have been over-eager
+        # in converting dtypes to numeric, in the presence of datetime
+        # columns. Various GH issues were filed, the reproductions
+        # for which are here.
+
+        # GH 15670
         df = pd.DataFrame({'Number': [1, 2],
                            'Date': ["2017-03-02"] * 2,
                            'Str': ["foo", "inf"]})
@@ -4324,6 +4329,39 @@ def test_apply_numeric_coercion_when_datetime(self):
         result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
         tm.assert_series_equal(result['Str'], expected['Str'])
 
+        # GH 15421
+        df = pd.DataFrame({'A': [10, 20, 30],
+                           'B': ['foo', '3', '4'],
+                           'T': [pd.Timestamp("12:31:22")] * 3})
+
+        def get_B(g):
+            return g.iloc[0][['B']]
+        result = df.groupby('A').apply(get_B)['B']
+        expected = df.B
+        expected.index = df.A
+        tm.assert_series_equal(result, expected)
+
+        # GH 14423
+        def predictions(tool):
+            out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
+            if 'step1' in list(tool.State):
+                out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
+            if 'step2' in list(tool.State):
+                out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
+                out['useTime'] = str(
+                    tool[tool.State == 'step2'].oTime.values[0])
+            return out
+        df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
+                            'State': ['step1', 'step2', 'step1', 'step2'],
+                            'oTime': ['', '2016-09-19 05:24:33',
+                                      '', '2016-09-19 23:59:04'],
+                            'Machine': ['23', '36L', '36R', '36R']})
+        df2 = df1.copy()
+        df2.oTime = pd.to_datetime(df2.oTime)
+        expected = df1.groupby('Key').apply(predictions).p1
+        result = df2.groupby('Key').apply(predictions).p1
+        tm.assert_series_equal(expected, result)
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
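
For reference, a minimal sketch of the behaviour these patches address, built
from the reproduction used in the new test (the data and column names come
from that test; the pre-fix outcome described in the comment is taken from the
patch 1 commit message, not re-run here):

    import pandas as pd

    df = pd.DataFrame({'Number': [1, 2],
                       'Date': ["2017-03-02"] * 2,
                       'Str': ["foo", "inf"]})
    df.Date = pd.to_datetime(df.Date)

    # Group-by/apply with a datetime-like column present: before the fix,
    # the object column 'Str' was coerced to numeric, so "foo" came back
    # as NaN; with the fix it round-trips unchanged.
    result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
    print(result['Str'])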