From 90b797815289072194c05d60de49145924ab90d0 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 26 Feb 2018 19:57:26 +0100 Subject: [PATCH 1/9] Adding skipna as an option to groupby cumsum and cumprod --- pandas/_libs/groupby.pyx | 11 +++++++++-- pandas/core/groupby.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 22 ++++++++++++++++++++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3d208a915225..50581cae6e3c7 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -5,6 +5,7 @@ cimport numpy as cnp import numpy as np cimport cython +cimport util cnp.import_array() @@ -139,7 +140,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, def group_cumprod_float64(float64_t[:, :] out, float64_t[:, :] values, int64_t[:] labels, - bint is_datetimelike): + bint is_datetimelike, + bint skipna=True): """ Only transforms on axis=0 """ @@ -163,6 +165,8 @@ def group_cumprod_float64(float64_t[:, :] out, if val == val: accum[lab, j] *= val out[i, j] = accum[lab, j] + if val != val and skipna: + out[i, j] = accum[lab, j] @cython.boundscheck(False) @@ -170,7 +174,8 @@ def group_cumprod_float64(float64_t[:, :] out, def group_cumsum(numeric[:, :] out, numeric[:, :] values, int64_t[:] labels, - is_datetimelike): + is_datetimelike, + bint skipna=True): """ Only transforms on axis=0 """ @@ -196,6 +201,8 @@ def group_cumsum(numeric[:, :] out, if val == val: accum[lab, j] += val out[i, j] = accum[lab, j] + if val != val and skipna == True: + out[i, j] = accum[lab, j] else: accum[lab, j] += val out[i, j] = accum[lab, j] diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 00643614e8803..dbc0adfc15f02 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1839,7 +1839,7 @@ def rank(self, method='average', ascending=True, na_option='keep', @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" - nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) @@ -1849,7 +1849,7 @@ def cumprod(self, axis=0, *args, **kwargs): @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" - nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only']) + nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2429e9975fc8e..5ef5053685dc8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1353,11 +1353,17 @@ def test_cython_api2(self): ], columns=['A', 'B', 'C']) expected = DataFrame( [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) - result = df.groupby('A').cumsum() + result = df.groupby('A').cumsum(skipna=False) assert_frame_equal(result, expected) + # check cumsum with the default skipna value of True + skipna_expected = DataFrame( + [[2., 0.], [2., 9.], [4., 9.]], columns=['B', 'C']) + result = df.groupby('A').cumsum() + assert_frame_equal(result, skipna_expected) + # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby('A', as_index=False).cumsum() + result = df.groupby('A', as_index=False).cumsum(skipna=False) assert_frame_equal(result, expected) # GH 13994 @@ -2521,6 +2527,18 @@ def test_groupby_cumprod(self): expected.name = 'value' tm.assert_series_equal(actual, expected) + # make sure that skipna works + df = pd.concat( + [pd.DataFrame({'x': [1.0, 2.0, np.NaN], 'gp': 'a'}), + pd.DataFrame({'x': [2.0, 5.0, 6.0], 'gp': 'b'})]) + result = df.groupby('gp')['x'].cumprod(skipna=False) + expected = pd.Series([1.0, 2.0, np.NaN, 2.0, 10.0, 60.0], name='x', index=(0, 1, 2, 0, 1, 2)) + tm.assert_series_equal(result, expected) + + result = df.groupby('gp')['x'].cumprod() + expected = pd.Series([1.0, 2.0, 2.0, 2.0, 10.0, 60.0], name='x', index=(0, 1, 2, 0, 1, 2)) + tm.assert_series_equal(result, expected) + def test_ops_general(self): ops = [('mean', np.mean), ('median', np.median), From 84db3447375f10566f4b808567419a4bb14367e5 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 26 Feb 2018 20:04:47 +0100 Subject: [PATCH 2/9] Fixing flake8 flags --- pandas/core/groupby.py | 6 ++++-- pandas/tests/groupby/test_groupby.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index dbc0adfc15f02..0950f24746788 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1839,7 +1839,8 @@ def rank(self, method='average', ascending=True, na_option='keep', @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" - nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only', 'skipna']) + nv.validate_groupby_func('cumprod', args, kwargs, + ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) @@ -1849,7 +1850,8 @@ def cumprod(self, axis=0, *args, **kwargs): @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" - nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only', 'skipna']) + nv.validate_groupby_func('cumsum', args, kwargs, + ['numeric_only', 'skipna']) if axis != 0: return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5ef5053685dc8..eee5dcde2c567 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2532,11 +2532,13 @@ def test_groupby_cumprod(self): [pd.DataFrame({'x': [1.0, 2.0, np.NaN], 'gp': 'a'}), pd.DataFrame({'x': [2.0, 5.0, 6.0], 'gp': 'b'})]) result = df.groupby('gp')['x'].cumprod(skipna=False) - expected = pd.Series([1.0, 2.0, np.NaN, 2.0, 10.0, 60.0], name='x', index=(0, 1, 2, 0, 1, 2)) + expected = pd.Series([1.0, 2.0, np.NaN, 2.0, 10.0, 60.0], + name='x', index=(0, 1, 2, 0, 1, 2)) tm.assert_series_equal(result, expected) result = df.groupby('gp')['x'].cumprod() - expected = pd.Series([1.0, 2.0, 2.0, 2.0, 10.0, 60.0], name='x', index=(0, 1, 2, 0, 1, 2)) + expected = pd.Series([1.0, 2.0, 2.0, 2.0, 10.0, 60.0], + name='x', index=(0, 1, 2, 0, 1, 2)) tm.assert_series_equal(result, expected) def test_ops_general(self): From 6182352530f2f1180e83c1f0f37b6d63859f1f67 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 26 Feb 2018 20:11:22 +0100 Subject: [PATCH 3/9] Adding whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fb22dc40e335f..a96bb2aae3cae 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -890,6 +890,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) +- Bug in :func:`DataFrame.groupby.cumsum` and :func:`DataFrame.groupby.cumprod` where skipna is not an option Sparse ^^^^^^ From 2579eaa5402f4e9d0e5d8a01994a54c9229cfdaf Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 26 Feb 2018 23:52:59 +0100 Subject: [PATCH 4/9] Adding a better test --- pandas/_libs/groupby.pyx | 17 ++++++++++++----- pandas/tests/groupby/test_groupby.py | 22 ++++++++-------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 50581cae6e3c7..555a609b56fbe 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -5,7 +5,6 @@ cimport numpy as cnp import numpy as np cimport cython -cimport util cnp.import_array() @@ -165,8 +164,12 @@ def group_cumprod_float64(float64_t[:, :] out, if val == val: accum[lab, j] *= val out[i, j] = accum[lab, j] - if val != val and skipna: - out[i, j] = accum[lab, j] + if val != val: + if skipna: + out[i, j] = NaN + else: + accum[lab, j] = NaN + out[i, j] = accum[lab, j] @cython.boundscheck(False) @@ -201,8 +204,12 @@ def group_cumsum(numeric[:, :] out, if val == val: accum[lab, j] += val out[i, j] = accum[lab, j] - if val != val and skipna == True: - out[i, j] = accum[lab, j] + if val != val: + if skipna: + out[i, j] = NaN + else: + accum[lab, j] = NaN + out[i, j] = accum[lab, j] else: accum[lab, j] += val out[i, j] = accum[lab, j] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index eee5dcde2c567..b65e3edec3b8c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1353,17 +1353,11 @@ def test_cython_api2(self): ], columns=['A', 'B', 'C']) expected = DataFrame( [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) - result = df.groupby('A').cumsum(skipna=False) - assert_frame_equal(result, expected) - - # check cumsum with the default skipna value of True - skipna_expected = DataFrame( - [[2., 0.], [2., 9.], [4., 9.]], columns=['B', 'C']) result = df.groupby('A').cumsum() - assert_frame_equal(result, skipna_expected) + assert_frame_equal(result, expected) # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby('A', as_index=False).cumsum(skipna=False) + result = df.groupby('A', as_index=False).cumsum() assert_frame_equal(result, expected) # GH 13994 @@ -2529,16 +2523,16 @@ def test_groupby_cumprod(self): # make sure that skipna works df = pd.concat( - [pd.DataFrame({'x': [1.0, 2.0, np.NaN], 'gp': 'a'}), - pd.DataFrame({'x': [2.0, 5.0, 6.0], 'gp': 'b'})]) + [pd.DataFrame({'x': [1.0, 2.0, np.nan, np.nan, 3.0, 2.0], 'gp': 'a'}), + pd.DataFrame({'x': [2.0, 5.0, 6.0, 1.0, np.nan, 1.0], 'gp': 'b'})]) result = df.groupby('gp')['x'].cumprod(skipna=False) - expected = pd.Series([1.0, 2.0, np.NaN, 2.0, 10.0, 60.0], - name='x', index=(0, 1, 2, 0, 1, 2)) + expected = pd.Series([1.0, 2.0, np.nan, np.nan, np.nan, np.nan, 2.0, 10.0, 60.0, 60.0, np.nan, np.nan], + name='x', index=(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5)) tm.assert_series_equal(result, expected) result = df.groupby('gp')['x'].cumprod() - expected = pd.Series([1.0, 2.0, 2.0, 2.0, 10.0, 60.0], - name='x', index=(0, 1, 2, 0, 1, 2)) + expected = pd.Series([1.0, 2.0, np.nan, np.nan, 6.0, 12.0, 2.0, 10.0, 60.0, 60.0, np.nan, 60.0], + name='x', index=(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5)) tm.assert_series_equal(result, expected) def test_ops_general(self): From 75d2870857c4c67697f236042338389d9750d56d Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Mon, 26 Feb 2018 23:56:28 +0100 Subject: [PATCH 5/9] Fixing pep8 issues --- pandas/tests/groupby/test_groupby.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b65e3edec3b8c..4c8e6c1fa782c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2523,16 +2523,22 @@ def test_groupby_cumprod(self): # make sure that skipna works df = pd.concat( - [pd.DataFrame({'x': [1.0, 2.0, np.nan, np.nan, 3.0, 2.0], 'gp': 'a'}), - pd.DataFrame({'x': [2.0, 5.0, 6.0, 1.0, np.nan, 1.0], 'gp': 'b'})]) + [pd.DataFrame({'x': [1.0, 2.0, np.nan, np.nan, 3.0, 2.0], + 'gp': 'a'}), + pd.DataFrame({'x': [2.0, 5.0, 6.0, 1.0, np.nan, 1.0], + 'gp': 'b'})]) result = df.groupby('gp')['x'].cumprod(skipna=False) - expected = pd.Series([1.0, 2.0, np.nan, np.nan, np.nan, np.nan, 2.0, 10.0, 60.0, 60.0, np.nan, np.nan], - name='x', index=(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5)) + expected = pd.Series([1.0, 2.0, np.nan, np.nan, np.nan, np.nan, + 2.0, 10.0, 60.0, 60.0, np.nan, np.nan], + name='x', index=(0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5)) tm.assert_series_equal(result, expected) result = df.groupby('gp')['x'].cumprod() - expected = pd.Series([1.0, 2.0, np.nan, np.nan, 6.0, 12.0, 2.0, 10.0, 60.0, 60.0, np.nan, 60.0], - name='x', index=(0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5)) + expected = pd.Series([1.0, 2.0, np.nan, np.nan, 6.0, 12.0, + 2.0, 10.0, 60.0, 60.0, np.nan, 60.0], + name='x', index=(0, 1, 2, 3, 4, 5, + 0, 1, 2, 3, 4, 5)) tm.assert_series_equal(result, expected) def test_ops_general(self): From bb740fdc7fe51177725fe8c391aa99619b095764 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Wed, 28 Feb 2018 11:03:46 +0100 Subject: [PATCH 6/9] Making some changes based on review and altering tests --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/_libs/groupby.pyx | 18 ++++++------- pandas/tests/groupby/test_groupby.py | 40 +++++++++++++++++----------- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a96bb2aae3cae..bbeaffa5391b2 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -890,7 +890,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) -- Bug in :func:`DataFrame.groupby.cumsum` and :func:`DataFrame.groupby.cumprod` where skipna is not an option +- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` where `skipna` is not an option (:issue:`19806`) Sparse ^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 555a609b56fbe..f9ec2c5a851a0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -164,12 +164,11 @@ def group_cumprod_float64(float64_t[:, :] out, if val == val: accum[lab, j] *= val out[i, j] = accum[lab, j] - if val != val: - if skipna: - out[i, j] = NaN - else: + else: + out[i, j] = NaN + if not skipna: accum[lab, j] = NaN - out[i, j] = accum[lab, j] + break @cython.boundscheck(False) @@ -204,12 +203,11 @@ def group_cumsum(numeric[:, :] out, if val == val: accum[lab, j] += val out[i, j] = accum[lab, j] - if val != val: - if skipna: - out[i, j] = NaN - else: + else: + out[i, j] = NaN + if not skipna: accum[lab, j] = NaN - out[i, j] = accum[lab, j] + break else: accum[lab, j] += val out[i, j] = accum[lab, j] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c8e6c1fa782c..21334c1e61734 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2521,24 +2521,32 @@ def test_groupby_cumprod(self): expected.name = 'value' tm.assert_series_equal(actual, expected) - # make sure that skipna works - df = pd.concat( - [pd.DataFrame({'x': [1.0, 2.0, np.nan, np.nan, 3.0, 2.0], - 'gp': 'a'}), - pd.DataFrame({'x': [2.0, 5.0, 6.0, 1.0, np.nan, 1.0], - 'gp': 'b'})]) - result = df.groupby('gp')['x'].cumprod(skipna=False) - expected = pd.Series([1.0, 2.0, np.nan, np.nan, np.nan, np.nan, - 2.0, 10.0, 60.0, 60.0, np.nan, np.nan], - name='x', index=(0, 1, 2, 3, 4, 5, - 0, 1, 2, 3, 4, 5)) + def test_groupby_cum_skipna(self): + # GH 19806 + # make sure that skipna works for both cumsum and cumprod + df = pd.DataFrame({'key': ['b'] * 10 + ['a'] * 2, 'value': 3}) + df.at[3] = np.nan + df.at[3, 'key'] = 'b' + + result = df.groupby('key')['value'].cumprod(skipna=False) + expected = pd.Series([3.0, 9.0, 27.0, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan, np.nan, 3.0, 9.0], + name='value', index=range(12)) tm.assert_series_equal(result, expected) - result = df.groupby('gp')['x'].cumprod() - expected = pd.Series([1.0, 2.0, np.nan, np.nan, 6.0, 12.0, - 2.0, 10.0, 60.0, 60.0, np.nan, 60.0], - name='x', index=(0, 1, 2, 3, 4, 5, - 0, 1, 2, 3, 4, 5)) + result = df.groupby('key')['value'].cumsum(skipna=False) + expected = pd.Series([3.0, 6.0, 9.0, np.nan, np.nan, np.nan, np.nan, + np.nan, np.nan, np.nan, 3.0, 6.0], + name='value', index=range(12)) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({'key': ['b'] * 10, 'value': np.nan}) + result = df.groupby('key')['value'].cumprod(skipna=False) + expected = pd.Series([np.nan] * 10, name='value', index=range(10)) + tm.assert_series_equal(result, expected) + + result = df.groupby('key')['value'].cumsum(skipna=False) + expected = pd.Series([np.nan] * 10, name='value', index=range(10)) tm.assert_series_equal(result, expected) def test_ops_general(self): From 47fe8d6d24038053aa47dcfb26eaffa87685c3f5 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Thu, 1 Mar 2018 12:02:13 +0100 Subject: [PATCH 7/9] Making test changes based on review --- pandas/tests/groupby/test_groupby.py | 28 -------------------------- pandas/tests/groupby/test_transform.py | 24 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 21334c1e61734..2429e9975fc8e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2521,34 +2521,6 @@ def test_groupby_cumprod(self): expected.name = 'value' tm.assert_series_equal(actual, expected) - def test_groupby_cum_skipna(self): - # GH 19806 - # make sure that skipna works for both cumsum and cumprod - df = pd.DataFrame({'key': ['b'] * 10 + ['a'] * 2, 'value': 3}) - df.at[3] = np.nan - df.at[3, 'key'] = 'b' - - result = df.groupby('key')['value'].cumprod(skipna=False) - expected = pd.Series([3.0, 9.0, 27.0, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, 3.0, 9.0], - name='value', index=range(12)) - tm.assert_series_equal(result, expected) - - result = df.groupby('key')['value'].cumsum(skipna=False) - expected = pd.Series([3.0, 6.0, 9.0, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, 3.0, 6.0], - name='value', index=range(12)) - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({'key': ['b'] * 10, 'value': np.nan}) - result = df.groupby('key')['value'].cumprod(skipna=False) - expected = pd.Series([np.nan] * 10, name='value', index=range(10)) - tm.assert_series_equal(result, expected) - - result = df.groupby('key')['value'].cumsum(skipna=False) - expected = pd.Series([np.nan] * 10, name='value', index=range(10)) - tm.assert_series_equal(result, expected) - def test_ops_general(self): ops = [('mean', np.mean), ('median', np.median), diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 1be7dfdcc64e6..e49bd889d0a03 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -498,6 +498,30 @@ def test_cython_transform_series(self, op, args, targop): tm.assert_series_equal(expected, getattr( data.groupby(labels), op)(*args)) + @pytest.mark.parametrize("op", ['cumprod', 'cumsum']) + @pytest.mark.parametrize("kwargs", [{'skipna': False}, {'skipna': True}]) + @pytest.mark.parametrize('input, exp', [ + # When everything is NaN + ({'key': ['b'] * 10, 'value': np.nan}, [np.nan] * 10), + # When there is a single NaN + ({'key': ['b'] * 10 + ['a'] * 2, + 'value': [3] * 3 + [np.nan] + [3] * 8}, + {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., + 2187., 6561., 19683., 3.0, 9.0], + ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., + 21., 24., 27., 3.0, 6.0]})]) + def test_groupby_cum_skip(self, op, kwargs, input, exp): + df = pd.DataFrame(input) + result = df.groupby('key')['value'].transform(op, **kwargs) + if isinstance(exp, dict): + expected = pd.Series(exp[(op, kwargs['skipna'])], + name='value', index=range(12)) + else: + expected = pd.Series(exp, name='value', index=range(10)) + tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( "op, args, targop", [('cumprod', (), lambda x: x.cumprod()), From 107bc0b61ac2fc206eb1c4b5191f11b24f7bef01 Mon Sep 17 00:00:00 2001 From: Yian Shang Date: Thu, 1 Mar 2018 14:08:45 +0100 Subject: [PATCH 8/9] More changes to tests based on review --- pandas/tests/groupby/test_transform.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index e49bd889d0a03..b418bb0c5fea6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -499,10 +499,11 @@ def test_cython_transform_series(self, op, args, targop): data.groupby(labels), op)(*args)) @pytest.mark.parametrize("op", ['cumprod', 'cumsum']) - @pytest.mark.parametrize("kwargs", [{'skipna': False}, {'skipna': True}]) + @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize('input, exp', [ # When everything is NaN - ({'key': ['b'] * 10, 'value': np.nan}, [np.nan] * 10), + ({'key': ['b'] * 10, 'value': np.nan}, + pd.Series([np.nan] * 10, name='value')), # When there is a single NaN ({'key': ['b'] * 10 + ['a'] * 2, 'value': [3] * 3 + [np.nan] + [3] * 8}, @@ -512,14 +513,14 @@ def test_cython_transform_series(self, op, args, targop): ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., 21., 24., 27., 3.0, 6.0]})]) - def test_groupby_cum_skip(self, op, kwargs, input, exp): + def test_groupby_cum_skipna(self, op, skipna, input, exp): df = pd.DataFrame(input) - result = df.groupby('key')['value'].transform(op, **kwargs) + result = df.groupby('key')['value'].transform(op, skipna=skipna) if isinstance(exp, dict): - expected = pd.Series(exp[(op, kwargs['skipna'])], - name='value', index=range(12)) + expected = exp[(op, skipna)] else: - expected = pd.Series(exp, name='value', index=range(10)) + expected = exp + expected = pd.Series(expected, name='value') tm.assert_series_equal(expected, result) @pytest.mark.parametrize( From 3072df70986d85e7186c1070a84915e93b646b79 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 1 Mar 2018 18:11:49 -0500 Subject: [PATCH 9/9] doc --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 8e4f186f23c99..7a19f87051746 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -927,7 +927,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) -- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` where `skipna` is not an option (:issue:`19806`) +- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`) Sparse ^^^^^^