diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index f2c96ba3f53a8..7a19f87051746 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -927,6 +927,7 @@ Groupby/Resample/Rolling
 - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`)
 - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`)
 - Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`)
+- Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index d3fcd84e5f38d..43afd1e0f5969 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -139,7 +139,8 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           float64_t[:, :] values,
                           int64_t[:] labels,
-                          bint is_datetimelike):
+                          bint is_datetimelike,
+                          bint skipna=True):
     """
     Only transforms on axis=0
     """
@@ -163,6 +164,11 @@ def group_cumprod_float64(float64_t[:, :] out,
                 if val == val:
                     accum[lab, j] *= val
                     out[i, j] = accum[lab, j]
+                else:
+                    out[i, j] = NaN
+                    if not skipna:
+                        # sticky NaN; no break: each out[i, j] is set
+                        accum[lab, j] = NaN
 
 
 @cython.boundscheck(False)
@@ -170,7 +176,8 @@ def group_cumprod_float64(float64_t[:, :] out,
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  int64_t[:] labels,
-                 is_datetimelike):
+                 is_datetimelike,
+                 bint skipna=True):
     """
     Only transforms on axis=0
     """
@@ -196,6 +203,11 @@ def group_cumsum(numeric[:, :] out,
                     if val == val:
                         accum[lab, j] += val
                         out[i, j] = accum[lab, j]
+                    else:
+                        out[i, j] = NaN
+                        if not skipna:
+                            # sticky NaN; no break: each out[i, j] is set
+                            accum[lab, j] = NaN
                 else:
                     accum[lab, j] += val
                     out[i, j] = accum[lab, j]
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index b8ca104c4b2c7..4a09d636ee320 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1888,7 +1888,8 @@ def rank(self, method='average', ascending=True, na_option='keep',
     @Appender(_doc_template)
     def cumprod(self, axis=0, *args, **kwargs):
         """Cumulative product for each group"""
-        nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only'])
+        nv.validate_groupby_func('cumprod', args, kwargs,
+                                 ['numeric_only', 'skipna'])
         if axis != 0:
             return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
 
@@ -1898,7 +1899,8 @@ def cumprod(self, axis=0, *args, **kwargs):
     @Appender(_doc_template)
     def cumsum(self, axis=0, *args, **kwargs):
         """Cumulative sum for each group"""
-        nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only'])
+        nv.validate_groupby_func('cumsum', args, kwargs,
+                                 ['numeric_only', 'skipna'])
         if axis != 0:
             return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
 
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
index 1be7dfdcc64e6..b418bb0c5fea6 100644
--- a/pandas/tests/groupby/test_transform.py
+++ b/pandas/tests/groupby/test_transform.py
@@ -498,6 +498,31 @@ def test_cython_transform_series(self, op, args, targop):
             tm.assert_series_equal(expected, getattr(
                 data.groupby(labels), op)(*args))
 
+    @pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
+    @pytest.mark.parametrize("skipna", [False, True])
+    @pytest.mark.parametrize('input, exp', [
+        # When everything is NaN
+        ({'key': ['b'] * 10, 'value': np.nan},
+         pd.Series([np.nan] * 10, name='value')),
+        # When there is a single NaN
+        ({'key': ['b'] * 10 + ['a'] * 2,
+          'value': [3] * 3 + [np.nan] + [3] * 8},
+         {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
+          ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
+                              2187., 6561., 19683., 3.0, 9.0],
+          ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
+          ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
+                             21., 24., 27., 3.0, 6.0]})])
+    def test_groupby_cum_skipna(self, op, skipna, input, exp):
+        df = pd.DataFrame(input)
+        result = df.groupby('key')['value'].transform(op, skipna=skipna)
+        if isinstance(exp, dict):
+            expected = exp[(op, skipna)]
+        else:
+            expected = exp
+        expected = pd.Series(expected, name='value')
+        tm.assert_series_equal(expected, result)
+
     @pytest.mark.parametrize(
         "op, args, targop",
         [('cumprod', (), lambda x: x.cumprod()),