From 46d12c29d25dfbc02510de25b6dcff8e6d4f3306 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Tue, 14 Mar 2017 06:27:04 -0400
Subject: [PATCH 1/4] BUG: Group-by numeric type-coercion with datetime

GH Bug #14423

During a group-by/apply on a DataFrame, in the presence of one or more
DateTime-like columns, Pandas would incorrectly coerce the type of all
other columns to numeric. E.g. a String column would be coerced to
numeric, producing NaNs.

Fix the issue, and add a test.
---
 pandas/core/groupby.py               | 4 +++-
 pandas/tests/groupby/test_groupby.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a10be078a8f96..fff28c329494b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -10,6 +10,8 @@
     zip, range, lzip,
     callable, map
 )
+
+import pandas as pd
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat.numpy import _np_version_under1p8
@@ -3566,7 +3568,7 @@ def first_non_None_value(values):
                 # as we are stacking can easily have object dtypes here
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
-                    result = result._convert(numeric=True)
+                    result = result.apply(lambda x: pd.to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index d7fa3beda0abf..fb0a103caef59 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4314,7 +4314,14 @@ def test_cummin_cummax(self):
         expected = pd.Series([1, 2, 1], name='b')
         tm.assert_series_equal(result, expected)
 
-
+    def test_numeric_coercion(self):
+        # GH 14423
+        df = pd.DataFrame({'Number' : [1, 2], 'Date' : ["2017-03-02"] * 2, 'Str' : ["foo", "inf"]})
+        expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        df.Date = pd.to_datetime(df.Date)
+        result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+        tm.assert_series_equal(result['Str'], expected['Str'])
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)

From c8844e085654dd462a91b56ec2bf0d3dc5fdbfa6 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Tue, 14 Mar 2017 07:08:10 -0400
Subject: [PATCH 2/4] CLN: PEP8 (whitespace fixes)

---
 pandas/core/groupby.py               | 3 ++-
 pandas/tests/groupby/test_groupby.py | 7 +++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index fff28c329494b..5a0bc590eb4f4 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3568,7 +3568,8 @@ def first_non_None_value(values):
                 # as we are stacking can easily have object dtypes here
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
-                    result = result.apply(lambda x: pd.to_numeric(x, errors='ignore'))
+                    result = result.apply(
+                        lambda x: pd.to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index fb0a103caef59..263eb6497e990 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4316,12 +4316,15 @@ def test_cummin_cummax(self):
 
     def test_numeric_coercion(self):
         # GH 14423
-        df = pd.DataFrame({'Number' : [1, 2], 'Date' : ["2017-03-02"] * 2, 'Str' : ["foo", "inf"]})
+        df = pd.DataFrame({'Number': [1, 2],
+                           'Date': ["2017-03-02"] * 2,
+                           'Str': ["foo", "inf"]})
         expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
         df.Date = pd.to_datetime(df.Date)
         result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
         tm.assert_series_equal(result['Str'], expected['Str'])
-
+
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)

From 0a156745e39595fe125081282da2acc7f28cec13 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Tue, 14 Mar 2017 14:42:43 -0400
Subject: [PATCH 3/4] CLN: move import, add whatsnew entry

---
 doc/source/whatsnew/v0.20.0.txt | 1 +
 pandas/core/groupby.py          | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8a4f2f47b9853..ce5111e2725e2 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -886,3 +886,4 @@ Bug Fixes
 - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
 - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
 - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
+- Bug in ``groupby.apply()`` coercing ``object`` series to numeric types, when not all values were numeric (:issue:`15680`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 5a0bc590eb4f4..7a017ffae284c 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -11,7 +11,6 @@
     callable, map
 )
 
-import pandas as pd
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat.numpy import _np_version_under1p8
@@ -3426,6 +3425,7 @@ def _decide_output_index(self, output, labels):
 
     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         from pandas.core.index import _all_indexes_same
+        from pandas.tools.util import to_numeric
 
         if len(keys) == 0:
             return DataFrame(index=keys)
@@ -3569,7 +3569,7 @@ def first_non_None_value(values):
                 so = self._selected_obj
                 if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
                     result = result.apply(
-                        lambda x: pd.to_numeric(x, errors='ignore'))
+                        lambda x: to_numeric(x, errors='ignore'))
                     date_cols = self._selected_obj.select_dtypes(
                         include=['datetime', 'timedelta']).columns
                     date_cols = date_cols.intersection(result.columns)

From e1ed10401ab1255ad168afeaf744955a25069c59 Mon Sep 17 00:00:00 2001
From: Greg Williams
Date: Thu, 16 Mar 2017 07:22:20 -0400
Subject: [PATCH 4/4] TST: Rename and expand test_numeric_coercion

Rename test_numeric_coercion to test_apply_numeric_coercion_when_datetime,
and add tests for GH #15421 and #14423
---
 pandas/tests/groupby/test_groupby.py | 42 ++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 263eb6497e990..c25974c94bfd1 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4314,8 +4314,13 @@ def test_cummin_cummax(self):
         expected = pd.Series([1, 2, 1], name='b')
         tm.assert_series_equal(result, expected)
 
-    def test_numeric_coercion(self):
-        # GH 14423
+    def test_apply_numeric_coercion_when_datetime(self):
+        # In the past, group-by/apply operations have been over-eager
+        # in converting dtypes to numeric, in the presence of datetime
+        # columns. Various GH issues were filed, the reproductions
+        # for which are here.
+
+        # GH 15670
         df = pd.DataFrame({'Number': [1, 2],
                            'Date': ["2017-03-02"] * 2,
                            'Str': ["foo", "inf"]})
@@ -4324,6 +4329,39 @@ def test_apply_numeric_coercion_when_datetime(self):
         result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
         tm.assert_series_equal(result['Str'], expected['Str'])
 
+        # GH 15421
+        df = pd.DataFrame({'A': [10, 20, 30],
+                           'B': ['foo', '3', '4'],
+                           'T': [pd.Timestamp("12:31:22")] * 3})
+
+        def get_B(g):
+            return g.iloc[0][['B']]
+        result = df.groupby('A').apply(get_B)['B']
+        expected = df.B
+        expected.index = df.A
+        tm.assert_series_equal(result, expected)
+
+        # GH 14423
+        def predictions(tool):
+            out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
+            if 'step1' in list(tool.State):
+                out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
+            if 'step2' in list(tool.State):
+                out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
+                out['useTime'] = str(
+                    tool[tool.State == 'step2'].oTime.values[0])
+            return out
+        df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
+                            'State': ['step1', 'step2', 'step1', 'step2'],
+                            'oTime': ['', '2016-09-19 05:24:33',
+                                      '', '2016-09-19 23:59:04'],
+                            'Machine': ['23', '36L', '36R', '36R']})
+        df2 = df1.copy()
+        df2.oTime = pd.to_datetime(df2.oTime)
+        expected = df1.groupby('Key').apply(predictions).p1
+        result = df2.groupby('Key').apply(predictions).p1
+        tm.assert_series_equal(expected, result)
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = lmap(tuple, df[keys].values)
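
For reference, a minimal sketch of the behaviour these patches address, built
from the reproduction used in the new test (the data and column names come
from that test; the pre-fix outcome described in the comment is taken from the
patch 1 commit message, not re-run here):

    import pandas as pd

    df = pd.DataFrame({'Number': [1, 2],
                       'Date': ["2017-03-02"] * 2,
                       'Str': ["foo", "inf"]})
    df.Date = pd.to_datetime(df.Date)

    # Group-by/apply with a datetime-like column present: before the fix,
    # the object column 'Str' was coerced to numeric, so "foo" came back
    # as NaN; with the fix it round-trips unchanged.
    result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
    print(result['Str'])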