Skip to content

Commit 1ccb7ea

Browse files
authored
Matching behavior with pandas 1.1.2 (#1688)
This should resolve #1685.

- [x] DataFrame.truncate
- [x] AtIndexer with MultiIndex
- [x] GroupBy.nunique
- [x] Index.monotonic
- [x] GroupByRolling.max (resolved in pandas-dev/pandas#36152)
- [x] GroupByRolling.mean (ditto)
- [x] GroupByRolling.min (ditto)
- [x] GroupByRolling.std (ditto)
- [x] GroupByRolling.sum (ditto)
- [x] GroupByRolling.var (ditto)
- [x] Series.truncate
1 parent 91210a2 commit 1ccb7ea

File tree

10 files changed

+123
-77
lines changed

10 files changed

+123
-77
lines changed

.github/workflows/master.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ jobs:
115115
pyarrow-version: 0.15.1
116116
- python-version: 3.8
117117
spark-version: 3.0.1
118-
pandas-version: 1.0.5
118+
pandas-version: 1.1.2
119119
pyarrow-version: 1.0.1
120120
default-index-type: 'distributed-sequence'
121121
env:

databricks/koalas/generic.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2246,10 +2246,16 @@ def truncate(self, before=None, after=None, axis=None, copy=True):
22462246
raise ValueError("Truncate: %s must be after %s" % (after, before))
22472247

22482248
if isinstance(self, ks.Series):
2249-
result = first_series(self.to_frame().loc[before:after]).rename(self.name)
2249+
if indexes_increasing:
2250+
result = first_series(self.to_frame().loc[before:after]).rename(self.name)
2251+
else:
2252+
result = first_series(self.to_frame().loc[after:before]).rename(self.name)
22502253
elif isinstance(self, ks.DataFrame):
22512254
if axis == 0:
2252-
result = self.loc[before:after]
2255+
if indexes_increasing:
2256+
result = self.loc[before:after]
2257+
else:
2258+
result = self.loc[after:before]
22532259
elif axis == 1:
22542260
result = self.loc[:, before:after]
22552261

databricks/koalas/groupby.py

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,12 +2082,12 @@ def nunique(self, dropna=True):
20822082
4 ham 5 x
20832083
5 ham 5 y
20842084
2085-
>>> df.groupby('id').nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
2086-
id value1 value2
2085+
>>> df.groupby('id').nunique().sort_index() # doctest: +SKIP
2086+
value1 value2
20872087
id
2088-
egg 1 1 1
2089-
ham 1 1 2
2090-
spam 1 2 1
2088+
egg 1 1
2089+
ham 1 2
2090+
spam 2 1
20912091
20922092
>>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE
20932093
id
@@ -2104,10 +2104,7 @@ def nunique(self, dropna=True):
21042104
+ F.when(F.count(F.when(col.isNull(), 1).otherwise(None)) >= 1, 1).otherwise(0)
21052105
)
21062106

2107-
should_include_groupkeys = isinstance(self, DataFrameGroupBy)
2108-
return self._reduce_for_stat_function(
2109-
stat_function, only_numeric=False, should_include_groupkeys=should_include_groupkeys
2110-
)
2107+
return self._reduce_for_stat_function(stat_function, only_numeric=False)
21112108

21122109
def rolling(self, window, min_periods=None):
21132110
"""
@@ -2158,13 +2155,9 @@ def expanding(self, min_periods=1):
21582155
"""
21592156
return ExpandingGroupby(self, min_periods=min_periods)
21602157

2161-
def _reduce_for_stat_function(self, sfun, only_numeric, should_include_groupkeys=False):
2162-
if should_include_groupkeys:
2163-
agg_columns = self._groupkeys + self._agg_columns
2164-
agg_columns_scols = self._groupkeys_scols + self._agg_columns_scols
2165-
else:
2166-
agg_columns = self._agg_columns
2167-
agg_columns_scols = self._agg_columns_scols
2158+
def _reduce_for_stat_function(self, sfun, only_numeric):
2159+
agg_columns = self._agg_columns
2160+
agg_columns_scols = self._agg_columns_scols
21682161

21692162
groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(self._groupkeys))]
21702163
groupkey_scols = [s.alias(name) for s, name in zip(self._groupkeys_scols, groupkey_names)]
@@ -2541,11 +2534,8 @@ def _kdf(self) -> DataFrame:
25412534
def _agg_columns(self):
25422535
return [self._kser]
25432536

2544-
def _reduce_for_stat_function(self, sfun, only_numeric, should_include_groupkeys=False):
2545-
assert not should_include_groupkeys, should_include_groupkeys
2546-
return first_series(
2547-
super()._reduce_for_stat_function(sfun, only_numeric, should_include_groupkeys)
2548-
)
2537+
def _reduce_for_stat_function(self, sfun, only_numeric):
2538+
return first_series(super()._reduce_for_stat_function(sfun, only_numeric))
25492539

25502540
def agg(self, *args, **kwargs):
25512541
return MissingPandasLikeSeriesGroupBy.agg(self, *args, **kwargs)

databricks/koalas/tests/test_dataframe.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3630,16 +3630,27 @@ def test_truncate(self):
36303630
self.assert_eq(kdf1.truncate(after=400), pdf1.truncate(after=400))
36313631
self.assert_eq(kdf1.truncate(copy=False), pdf1.truncate(copy=False))
36323632
self.assert_eq(kdf1.truncate(-20, 400, copy=False), pdf1.truncate(-20, 400, copy=False))
3633-
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
3634-
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
3633+
# The bug for these tests has been fixed in pandas 1.1.0.
3634+
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
3635+
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
3636+
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
3637+
else:
3638+
expected_kdf = ks.DataFrame(
3639+
{"A": ["b", "c", "d"], "B": ["i", "j", "k"], "C": ["p", "q", "r"],},
3640+
index=[550, 400, 0],
3641+
)
3642+
self.assert_eq(kdf2.truncate(0, 550), expected_kdf)
3643+
self.assert_eq(kdf2.truncate(0, 550, copy=False), expected_kdf)
3644+
36353645
# axis = 1
36363646
self.assert_eq(kdf1.truncate(axis=1), pdf1.truncate(axis=1))
36373647
self.assert_eq(kdf1.truncate(before="B", axis=1), pdf1.truncate(before="B", axis=1))
36383648
self.assert_eq(kdf1.truncate(after="A", axis=1), pdf1.truncate(after="A", axis=1))
36393649
self.assert_eq(kdf1.truncate(copy=False, axis=1), pdf1.truncate(copy=False, axis=1))
36403650
self.assert_eq(kdf2.truncate("B", "C", axis=1), pdf2.truncate("B", "C", axis=1))
36413651
self.assert_eq(
3642-
kdf1.truncate("B", "C", copy=False, axis=1), pdf1.truncate("B", "C", copy=False, axis=1)
3652+
kdf1.truncate("B", "C", copy=False, axis=1),
3653+
pdf1.truncate("B", "C", copy=False, axis=1),
36433654
)
36443655

36453656
# MultiIndex columns
@@ -3654,16 +3665,23 @@ def test_truncate(self):
36543665
self.assert_eq(kdf1.truncate(after=400), pdf1.truncate(after=400))
36553666
self.assert_eq(kdf1.truncate(copy=False), pdf1.truncate(copy=False))
36563667
self.assert_eq(kdf1.truncate(-20, 400, copy=False), pdf1.truncate(-20, 400, copy=False))
3657-
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
3658-
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
3668+
# The bug for these tests has been fixed in pandas 1.1.0.
3669+
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
3670+
self.assert_eq(kdf2.truncate(0, 550), pdf2.truncate(0, 550))
3671+
self.assert_eq(kdf2.truncate(0, 550, copy=False), pdf2.truncate(0, 550, copy=False))
3672+
else:
3673+
expected_kdf.columns = columns
3674+
self.assert_eq(kdf2.truncate(0, 550), expected_kdf)
3675+
self.assert_eq(kdf2.truncate(0, 550, copy=False), expected_kdf)
36593676
# axis = 1
36603677
self.assert_eq(kdf1.truncate(axis=1), pdf1.truncate(axis=1))
36613678
self.assert_eq(kdf1.truncate(before="B", axis=1), pdf1.truncate(before="B", axis=1))
36623679
self.assert_eq(kdf1.truncate(after="A", axis=1), pdf1.truncate(after="A", axis=1))
36633680
self.assert_eq(kdf1.truncate(copy=False, axis=1), pdf1.truncate(copy=False, axis=1))
36643681
self.assert_eq(kdf2.truncate("B", "C", axis=1), pdf2.truncate("B", "C", axis=1))
36653682
self.assert_eq(
3666-
kdf1.truncate("B", "C", copy=False, axis=1), pdf1.truncate("B", "C", copy=False, axis=1)
3683+
kdf1.truncate("B", "C", copy=False, axis=1),
3684+
pdf1.truncate("B", "C", copy=False, axis=1),
36673685
)
36683686

36693687
# Exceptions

databricks/koalas/tests/test_groupby.py

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -674,13 +674,20 @@ def test_nunique(self):
674674
kdf.groupby("a").agg({"b": "nunique"}).sort_index(),
675675
pdf.groupby("a").agg({"b": "nunique"}).sort_index(),
676676
)
677-
self.assert_eq(
678-
kdf.groupby("a").nunique().sort_index(), pdf.groupby("a").nunique().sort_index()
679-
)
680-
self.assert_eq(
681-
kdf.groupby("a").nunique(dropna=False).sort_index(),
682-
pdf.groupby("a").nunique(dropna=False).sort_index(),
683-
)
677+
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
678+
expected = ks.DataFrame({"b": [2, 2]}, index=pd.Index([0, 1], name="a"))
679+
self.assert_eq(kdf.groupby("a").nunique().sort_index(), expected)
680+
self.assert_eq(
681+
kdf.groupby("a").nunique(dropna=False).sort_index(), expected,
682+
)
683+
else:
684+
self.assert_eq(
685+
kdf.groupby("a").nunique().sort_index(), pdf.groupby("a").nunique().sort_index()
686+
)
687+
self.assert_eq(
688+
kdf.groupby("a").nunique(dropna=False).sort_index(),
689+
pdf.groupby("a").nunique(dropna=False).sort_index(),
690+
)
684691
self.assert_eq(
685692
kdf.groupby("a")["b"].nunique().sort_index(),
686693
pdf.groupby("a")["b"].nunique().sort_index(),
@@ -702,14 +709,23 @@ def test_nunique(self):
702709
pdf.columns = columns
703710
kdf.columns = columns
704711

705-
self.assert_eq(
706-
kdf.groupby(("x", "a")).nunique().sort_index(),
707-
pdf.groupby(("x", "a")).nunique().sort_index(),
708-
)
709-
self.assert_eq(
710-
kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
711-
pdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
712-
)
712+
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
713+
expected = ks.DataFrame({("y", "b"): [2, 2]}, index=pd.Index([0, 1], name=("x", "a")))
714+
self.assert_eq(
715+
kdf.groupby(("x", "a")).nunique().sort_index(), expected,
716+
)
717+
self.assert_eq(
718+
kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(), expected,
719+
)
720+
else:
721+
self.assert_eq(
722+
kdf.groupby(("x", "a")).nunique().sort_index(),
723+
pdf.groupby(("x", "a")).nunique().sort_index(),
724+
)
725+
self.assert_eq(
726+
kdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
727+
pdf.groupby(("x", "a")).nunique(dropna=False).sort_index(),
728+
)
713729

714730
def test_unique(self):
715731
for pdf in [

databricks/koalas/tests/test_indexes.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -916,9 +916,12 @@ def test_monotonic(self):
916916
datas.append([(5, 100), (4, 200), (3, None), (2, 400), (1, 500)])
917917
datas.append([(5, 100), (4, 200), (3, 300), (2, 400), (1, None)])
918918
datas.append([(1, 100), (2, 200), (None, None), (4, 400), (5, 500)])
919-
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
920-
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
921-
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])
919+
# The data below cannot be used as arguments for `MultiIndex.from_tuples` in pandas >= 1.1.0.
920+
# Refer to https://github.com/databricks/koalas/pull/1688#issuecomment-667156560 for details.
921+
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
922+
datas.append([(-5, None), (-4, None), (-3, None), (-2, None), (-1, None)])
923+
datas.append([(None, "e"), (None, "c"), (None, "b"), (None, "d"), (None, "a")])
924+
datas.append([(None, None), (None, None), (None, None), (None, None), (None, None)])
922925

923926
# duplicated index value tests
924927
datas.append([("x", "d"), ("y", "c"), ("y", "b"), ("z", "a")])

databricks/koalas/tests/test_indexing.py

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,17 @@ def test_at_multiindex(self):
183183
pdf = self.pdf.set_index("b", append=True)
184184
kdf = self.kdf.set_index("b", append=True)
185185

186-
self.assert_eq(kdf.at[(3, 6), "a"], pdf.at[(3, 6), "a"])
187-
self.assert_eq(kdf.at[(3,), "a"], pdf.at[(3,), "a"])
188-
self.assert_eq(list(kdf.at[(9, 0), "a"]), list(pdf.at[(9, 0), "a"]))
189-
self.assert_eq(list(kdf.at[(9,), "a"]), list(pdf.at[(9,), "a"]))
186+
# TODO: this seems to be a pandas bug in pandas>=1.1.0
187+
if LooseVersion(pd.__version__) < LooseVersion("1.1.0"):
188+
self.assert_eq(kdf.at[(3, 6), "a"], pdf.at[(3, 6), "a"])
189+
self.assert_eq(kdf.at[(3,), "a"], pdf.at[(3,), "a"])
190+
self.assert_eq(list(kdf.at[(9, 0), "a"]), list(pdf.at[(9, 0), "a"]))
191+
self.assert_eq(list(kdf.at[(9,), "a"]), list(pdf.at[(9,), "a"]))
192+
else:
193+
self.assert_eq(kdf.at[(3, 6), "a"], 3)
194+
self.assert_eq(kdf.at[(3,), "a"], np.array([3]))
195+
self.assert_eq(list(kdf.at[(9, 0), "a"]), [7, 8, 9])
196+
self.assert_eq(list(kdf.at[(9,), "a"]), [7, 8, 9])
190197

191198
with self.assertRaises(ValueError):
192199
kdf.at[3, "a"]
@@ -1127,38 +1134,38 @@ def test_index_operator_datetime(self):
11271134
kdf = ks.from_pandas(pdf)
11281135

11291136
# Positional iloc search
1130-
self.assert_eq(kdf[:4], pdf[:4])
1131-
self.assert_eq(kdf[:3], pdf[:3])
1132-
self.assert_eq(kdf[3:], pdf[3:])
1133-
self.assert_eq(kdf[2:], pdf[2:])
1134-
self.assert_eq(kdf[2:3], pdf[2:3])
1135-
self.assert_eq(kdf[2:-1], pdf[2:-1])
1136-
self.assert_eq(kdf[10:3], pdf[10:3])
1137+
self.assert_eq(kdf[:4], pdf[:4], almost=True)
1138+
self.assert_eq(kdf[:3], pdf[:3], almost=True)
1139+
self.assert_eq(kdf[3:], pdf[3:], almost=True)
1140+
self.assert_eq(kdf[2:], pdf[2:], almost=True)
1141+
self.assert_eq(kdf[2:3], pdf[2:3], almost=True)
1142+
self.assert_eq(kdf[2:-1], pdf[2:-1], almost=True)
1143+
self.assert_eq(kdf[10:3], pdf[10:3], almost=True)
11371144

11381145
# Index loc search
11391146
self.assert_eq(kdf.A[4], pdf.A[4])
11401147
self.assert_eq(kdf.A[3], pdf.A[3])
11411148

11421149
# Positional iloc search
1143-
self.assert_eq(kdf.A[:4], pdf.A[:4])
1144-
self.assert_eq(kdf.A[:3], pdf.A[:3])
1145-
self.assert_eq(kdf.A[3:], pdf.A[3:])
1146-
self.assert_eq(kdf.A[2:], pdf.A[2:])
1147-
self.assert_eq(kdf.A[2:3], pdf.A[2:3])
1148-
self.assert_eq(kdf.A[2:-1], pdf.A[2:-1])
1149-
self.assert_eq(kdf.A[10:3], pdf.A[10:3])
1150+
self.assert_eq(kdf.A[:4], pdf.A[:4], almost=True)
1151+
self.assert_eq(kdf.A[:3], pdf.A[:3], almost=True)
1152+
self.assert_eq(kdf.A[3:], pdf.A[3:], almost=True)
1153+
self.assert_eq(kdf.A[2:], pdf.A[2:], almost=True)
1154+
self.assert_eq(kdf.A[2:3], pdf.A[2:3], almost=True)
1155+
self.assert_eq(kdf.A[2:-1], pdf.A[2:-1], almost=True)
1156+
self.assert_eq(kdf.A[10:3], pdf.A[10:3], almost=True)
11501157

11511158
dt1 = datetime.datetime.strptime("2013-01-02", "%Y-%m-%d")
11521159
dt2 = datetime.datetime.strptime("2013-01-04", "%Y-%m-%d")
11531160

11541161
# Index loc search
1155-
self.assert_eq(kdf[:dt2], pdf[:dt2])
1156-
self.assert_eq(kdf[dt1:], pdf[dt1:])
1157-
self.assert_eq(kdf[dt1:dt2], pdf[dt1:dt2])
1158-
self.assert_eq(kdf.A[dt2], pdf.A[dt2])
1159-
self.assert_eq(kdf.A[:dt2], pdf.A[:dt2])
1160-
self.assert_eq(kdf.A[dt1:], pdf.A[dt1:])
1161-
self.assert_eq(kdf.A[dt1:dt2], pdf.A[dt1:dt2])
1162+
self.assert_eq(kdf[:dt2], pdf[:dt2], almost=True)
1163+
self.assert_eq(kdf[dt1:], pdf[dt1:], almost=True)
1164+
self.assert_eq(kdf[dt1:dt2], pdf[dt1:dt2], almost=True)
1165+
self.assert_eq(kdf.A[dt2], pdf.A[dt2], almost=True)
1166+
self.assert_eq(kdf.A[:dt2], pdf.A[:dt2], almost=True)
1167+
self.assert_eq(kdf.A[dt1:], pdf.A[dt1:], almost=True)
1168+
self.assert_eq(kdf.A[dt1:dt2], pdf.A[dt1:dt2], almost=True)
11621169

11631170
def test_index_operator_int(self):
11641171
pdf = pd.DataFrame(np.random.randn(6, 4), index=[1, 3, 5, 7, 9, 11], columns=list("ABCD"))

databricks/koalas/tests/test_series.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,8 +1268,14 @@ def test_truncate(self):
12681268
self.assert_eq(kser1.truncate(after=5), pser1.truncate(after=5))
12691269
self.assert_eq(kser1.truncate(copy=False), pser1.truncate(copy=False))
12701270
self.assert_eq(kser1.truncate(2, 5, copy=False), pser1.truncate(2, 5, copy=False))
1271-
self.assert_eq(kser2.truncate(4, 6), pser2.truncate(4, 6))
1272-
self.assert_eq(kser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False))
1271+
# The bug for these tests has been fixed in pandas 1.1.0.
1272+
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
1273+
self.assert_eq(kser2.truncate(4, 6), pser2.truncate(4, 6))
1274+
self.assert_eq(kser2.truncate(4, 6, copy=False), pser2.truncate(4, 6, copy=False))
1275+
else:
1276+
expected_kser = ks.Series([20, 30, 40], index=[6, 5, 4])
1277+
self.assert_eq(kser2.truncate(4, 6), expected_kser)
1278+
self.assert_eq(kser2.truncate(4, 6, copy=False), expected_kser)
12731279

12741280
kser = ks.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 3, 2, 1])
12751281
msg = "truncate requires a sorted index"

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Dependencies in Koalas. When you update don't forget to update setup.py and install.rst in docs.
2-
pandas>=0.23.2,<1.1.0
2+
pandas>=0.23.2
33
pyarrow>=0.10
44
matplotlib>=3.0.0,<3.3.0
55
numpy>=1.14,<1.19.0

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
},
4949
python_requires='>=3.5,<3.9',
5050
install_requires=[
51-
'pandas>=0.23.2,<1.1.0',
51+
'pandas>=0.23.2',
5252
'pyarrow>=0.10',
5353
'numpy>=1.14,<1.19.0',
5454
'matplotlib>=3.0.0,<3.3.0',

0 commit comments

Comments
 (0)