Fix cumprod to work properly with Integer columns. (#1750)

itholic · web-flow · commit 91210a2dabd6 · 2020-09-21T14:08:51.000+09:00
This PR addresses the review comment on #1739: `cumprod` for `DataFrame`, `Series`, and `GroupBy` does not work properly with integer columns. pandas keeps the integer dtype, whereas Koalas returns floats for an integer column (see column `C` below).

```python
>>> pdf
     A    B  C
0  2.0  1.0  1
1  5.0  NaN  2
2  1.0  1.0  3
3  2.0  4.0  4
4  4.0  9.0  5
>>> pdf.cumprod()
      A     B    C
0   2.0   1.0    1
1  10.0   NaN    2
2  10.0   1.0    6
3  20.0   4.0   24
4  80.0  36.0  120
>>> ks.from_pandas(pdf).cumprod()
      A     B      C
0   2.0   1.0    1.0
1  10.0   NaN    2.0
2  10.0   1.0    6.0
3  20.0   4.0   24.0
4  80.0  36.0  120.0
```

This PR fixes that behavior and updates the related tests. After the fix, Koalas matches pandas:

```python
>>> pdf.cumprod()
             A     B    C
0.986323   2.0   1.0    1
0.297507  10.0   NaN    2
0.617855  10.0   1.0    6
0.711719  20.0   4.0   24
0.290114  80.0  36.0  120
>>> ks.from_pandas(pdf).cumprod()
             A     B    C
0.986323   2.0   1.0    1
0.297507  10.0   NaN    2
0.617855  10.0   1.0    6
0.711719  20.0   4.0   24
0.290114  80.0  36.0  120
```
diff --git a/databricks/koalas/groupby.py b/databricks/koalas/groupby.py
@@ -824,11 +824,11 @@ def cumprod(self):
         By default, iterates over rows and finds the sum in each column.
 
         >>> df.groupby("A").cumprod().sort_index()
-              B     C
-        0   NaN   4.0
-        1   0.1  12.0
-        2   2.0  24.0
-        3  10.0   1.0
+              B   C
+        0   NaN   4
+        1   0.1  12
+        2   2.0  24
+        3  10.0   1
 
         It works as below in Series.
 
diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -5230,8 +5230,10 @@ def _cum(self, func, skipna, part_cols=(), ascending=True):
     def _cumprod(self, skipna, part_cols=()):
         from pyspark.sql.functions import pandas_udf
 
+        data_type = self.spark.data_type
+
         def cumprod(scol):
-            @pandas_udf(returnType=self.spark.data_type)
+            @pandas_udf(returnType=data_type)
             def negative_check(s):
                 assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), (
                     "values should be bigger than 0: %s" % s
@@ -5241,7 +5243,10 @@ def negative_check(s):
             return F.sum(F.log(negative_check(scol)))
 
         kser = self._cum(cumprod, skipna, part_cols)
-        return kser._with_new_scol(F.exp(kser.spark.column))
+        result = kser._with_new_scol(F.exp(kser.spark.column))
+        if isinstance(data_type, IntegralType):
+            result = result.spark.transform(lambda col: F.round(col).cast(LongType()))
+        return result
 
     # ----------------------------------------------------------------------
     # Accessor Methods
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -2273,13 +2273,22 @@ def _test_cumprod(self, pdf, kdf):
         self.assert_eq(pdf.cumprod().sum(), kdf.cumprod().sum(), almost=True)
 
     def test_cumprod(self):
-        pdf = pd.DataFrame(
-            [[2.0, 1.0], [5, None], [1.0, 1.0], [2.0, 4.0], [4.0, 9.0]],
-            columns=list("AB"),
-            index=np.random.rand(5),
-        )
-        kdf = ks.from_pandas(pdf)
-        self._test_cumprod(pdf, kdf)
+        if LooseVersion(pyspark.__version__) >= LooseVersion("2.4"):
+            pdf = pd.DataFrame(
+                [[2.0, 1.0, 1], [5, None, 2], [1.0, 1.0, 3], [2.0, 4.0, 4], [4.0, 9.0, 5]],
+                columns=list("ABC"),
+                index=np.random.rand(5),
+            )
+            kdf = ks.from_pandas(pdf)
+            self._test_cumprod(pdf, kdf)
+        else:
+            pdf = pd.DataFrame(
+                [[2, 1, 1], [5, 1, 2], [1, 1, 3], [2, 4, 4], [4, 9, 5]],
+                columns=list("ABC"),
+                index=np.random.rand(5),
+            )
+            kdf = ks.from_pandas(pdf)
+            self._test_cumprod(pdf, kdf)
 
     def test_cumprod_multiindex_columns(self):
         arrays = [np.array(["A", "A", "B", "B"]), np.array(["one", "two", "one", "two"])]
diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -979,6 +979,13 @@ def test_cumprod(self):
         self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
         self.assert_eq(pser.cumprod().sum(), kser.cumprod().sum())
 
+        # with integer type
+        pser = pd.Series([1, 10, 1, 4, 9])
+        kser = ks.from_pandas(pser)
+        self.assert_eq(pser.cumprod(), kser.cumprod())
+        self.assert_eq(pser.cumprod(skipna=False), kser.cumprod(skipna=False))
+        self.assert_eq(pser.cumprod().sum(), kser.cumprod().sum())
+
         # with reversed index
         pser.index = [4, 3, 2, 1, 0]
         kser = ks.from_pandas(pser)