From 71edcce583aff1415c4de2df53d437adf9514ffa Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 15 Nov 2020 17:27:50 +0100 Subject: [PATCH 01/27] First cross merge draft for merge operation --- pandas/core/frame.py | 6 +++- pandas/core/reshape/merge.py | 36 ++++++++++++++++++++++++ pandas/tests/reshape/merge/test_merge.py | 31 ++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4310d23e4d7f3..67683075ba796 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -205,12 +205,14 @@ The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -221,6 +223,8 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + * cross: creates the karthesian product from both frames, preserves the order + of the left keys. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 918a894a27916..5a4346185dd5f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -591,6 +591,8 @@ def __init__( ): _left = _validate_operand(left) _right = _validate_operand(right) + if how == "cross": + _left, _right, how, on = self._create_cross_configuration(_left, _right, on) self.left = self.orig_left = _left self.right = self.orig_right = _right self.how = how @@ -690,8 +692,15 @@ def get_result(self): self._maybe_restore_index_levels(result) + self._maybe_drop_cross_column(result) + return result.__finalize__(self, method="merge") + def _maybe_drop_cross_column(self, result: "DataFrame"): + cross_col = getattr(self, "_cross", None) + if cross_col is not None: + result.drop(columns=cross_col, inplace=True) + def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" ) -> Tuple["DataFrame", "DataFrame"]: @@ -1200,7 +1209,34 @@ def _maybe_coerce_merge_keys(self): typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign(**{name: self.right[name].astype(typ)}) + def _create_cross_configuration( + self, _left, _right, on + ) -> Tuple["DataFrame", "DataFrame", str, str]: + if on is not None: + raise MergeError( + "Can not pass any merge columns when using cross as merge method" + ) + cross_col = f"{max([*_left.columns, *_right.columns])}_cross" + _left = _left.copy() + _right = _right.copy() + _left.insert(loc=0, value=1, column=cross_col) + _right.insert(loc=0, value=1, column=cross_col) + how = "inner" + on = cross_col + self._cross = cross_col + return _left, _right, how, on + def _validate_specification(self): + if hasattr(self, "_cross"): + if ( + self.left_index + or self.right_index + or self.right_on is not None + or self.left_on is not None + ): + raise MergeError( + "Can not pass any merge columns when using cross as merge method" + ) # Hm, any way to make this logic less complicated?? if self.on is None and self.left_on is None and self.right_on is None: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 999b827fe0571..5e28998d456aa 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2337,3 +2337,34 @@ def test_merge_join_cols_error_reporting_on_and_index(func, kwargs): ) with pytest.raises(MergeError, match=msg): getattr(pd, func)(left, right, on="a", **kwargs) + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_merge_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + result = merge(left, right, how="cross") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"left_index": True}, + {"right_index": True}, + {"on": "a"}, + {"left_on": "a"}, + {"right_on": "b"}, + ], +) +def test_merge_cross_error_reporting(kwargs): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"b": [3, 4]}) + msg = "Can not pass any merge columns when using cross as merge method" + with pytest.raises(MergeError, match=msg): + result = merge(left, right, how="cross", **kwargs) From f573ca46476fa28b320d25cd487429ebd2e996db Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 15 Nov 2020 17:46:49 +0100 Subject: [PATCH 02/27] Fix variable assignment --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5e28998d456aa..51f693f7e6927 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2367,4 +2367,4 @@ def test_merge_cross_error_reporting(kwargs): right = DataFrame({"b": [3, 4]}) msg = "Can not pass any merge columns when using cross as merge method" with pytest.raises(MergeError, match=msg): - result = merge(left, right, how="cross", **kwargs) + merge(left, right, how="cross", **kwargs) From 0acdd0052adb24964ca09160de6d7667bf9aa0ab Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:19:33 +0100 Subject: [PATCH 03/27] Adress review comments --- pandas/core/reshape/merge.py | 41 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5a4346185dd5f..1b9b77765605a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -4,6 +4,7 @@ import copy import datetime +import hashlib from functools import partial import string from typing import TYPE_CHECKING, Optional, Tuple, cast @@ -591,8 +592,6 @@ def __init__( ): _left = _validate_operand(left) _right = _validate_operand(right) - if how == "cross": - _left, _right, how, on = self._create_cross_configuration(_left, _right, on) self.left = self.orig_left = _left self.right = self.orig_right = _right self.how = how @@ -645,6 +644,16 @@ def __init__( self._validate_specification() + if self.how == "cross": + ( + self.left, + self.right, + self.how, + cross_col, + ) = self._create_cross_configuration(self.left, self.right) + self.left_on = self.right_on = [cross_col] + self._cross = cross_col + # note this function has side effects ( self.left_join_keys, @@ -1210,35 +1219,31 @@ def _maybe_coerce_merge_keys(self): self.right = self.right.assign(**{name: self.right[name].astype(typ)}) def _create_cross_configuration( - self, _left, _right, on + self, _left, _right ) -> Tuple["DataFrame", "DataFrame", str, str]: - if on is not None: - raise MergeError( - "Can not pass any merge columns when using cross as merge method" - ) - cross_col = f"{max([*_left.columns, *_right.columns])}_cross" - _left = _left.copy() - _right = _right.copy() - _left.insert(loc=0, value=1, column=cross_col) - _right.insert(loc=0, value=1, column=cross_col) + cross_col = f"_cross_{hashlib.md5().hexdigest()}" how = "inner" - on = cross_col - self._cross = cross_col - return _left, _right, how, on + return ( + _left.assign(**{cross_col: 1}), + _right.assign(**{cross_col: 1}), + how, + cross_col, + ) def _validate_specification(self): - if hasattr(self, "_cross"): + if self.how == "cross": if ( self.left_index or self.right_index or self.right_on is not None or self.left_on is not None + or self.on is not None ): raise MergeError( "Can not pass any merge columns when using cross as merge method" ) # Hm, any way to make this logic less complicated?? - if self.on is None and self.left_on is None and self.right_on is None: + elif self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () @@ -1302,7 +1307,7 @@ def _validate_specification(self): 'of levels in the index of "left"' ) self.left_on = [None] * n - if len(self.right_on) != len(self.left_on): + if self.how != "cross" and len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): From 949185e0bfec13c4ba71bbf8a185cb8343859960 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:26:24 +0100 Subject: [PATCH 04/27] Change function signature --- pandas/core/reshape/merge.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1b9b77765605a..9db686ec5d8bc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -653,6 +653,8 @@ def __init__( ) = self._create_cross_configuration(self.left, self.right) self.left_on = self.right_on = [cross_col] self._cross = cross_col + else: + self._cross is None # note this function has side effects ( @@ -701,12 +703,11 @@ def get_result(self): self._maybe_restore_index_levels(result) - self._maybe_drop_cross_column(result) + self._maybe_drop_cross_column(result, self._cross) return result.__finalize__(self, method="merge") - def _maybe_drop_cross_column(self, result: "DataFrame"): - cross_col = getattr(self, "_cross", None) + def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: str): if cross_col is not None: result.drop(columns=cross_col, inplace=True) From 2d5ccaa839c0c94568bfc864af2a2e13cace5a38 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:33:14 +0100 Subject: [PATCH 05/27] Add cross functionality for join --- pandas/core/frame.py | 9 +++++++++ pandas/tests/reshape/merge/test_join.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 67683075ba796..f472007d007b6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8069,6 +8069,15 @@ def _join_compat( other = DataFrame({other.name: other}) if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) return merge( self, other, diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 7db92eb55fa0b..a0fcb68324e64 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -4,6 +4,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge import pandas._testing as tm +from pandas.errors import MergeError from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data a_ = np.array @@ -803,3 +804,24 @@ def test_join_inner_multiindex_deterministic_order(): index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_join_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) + + +def test_join_cross_error_reporting(): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"a": [3, 4]}) + msg = "Can not pass any merge columns when using cross as merge method" + with pytest.raises(MergeError, match=msg): + left.join(right, how="cross", on="a") From 909885212615ca861d7affd8fb5132c45c3075c0 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:36:29 +0100 Subject: [PATCH 06/27] Change docs --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/frame.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e690334a36c5b..5870c18476ac0 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,6 +235,7 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f472007d007b6..db82555c0e273 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -225,6 +225,9 @@ join; preserve the order of the left keys. * cross: creates the karthesian product from both frames, preserves the order of the left keys. + + .. versionadded:: 1.2.0 + on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults From 60f6b2505ca3a6a61675416bc72bf2033de0a3b3 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:39:35 +0100 Subject: [PATCH 07/27] Add asvs --- asv_bench/benchmarks/join_merge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 1333b3a0f0560..cd65dfce28446 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort): def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) + def time_join_dataframes_cross(self): + self.df.join(self.df_key1, how="cross") + class JoinIndex: def setup(self): @@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) + def time_merge_dataframes_cross(self): + merge(self.left, self.right, how="cross") + class I8Merge: From 0601243cb100fafc067406d434728e38d54b0b8e Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:43:07 +0100 Subject: [PATCH 08/27] Move import --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9db686ec5d8bc..254948aa6638e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -4,8 +4,8 @@ import copy import datetime -import hashlib from functools import partial +import hashlib import string from typing import TYPE_CHECKING, Optional, Tuple, cast import warnings From c46eab315186943e62c00d58fd49891a8b22095a Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:44:18 +0100 Subject: [PATCH 09/27] Assign value --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 254948aa6638e..f6fd264f5fcfc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -654,7 +654,7 @@ def __init__( self.left_on = self.right_on = [cross_col] self._cross = cross_col else: - self._cross is None + self._cross = None # note this function has side effects ( From 627412040b53073cb01b631ed95f8c370109e42c Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 23:52:17 +0100 Subject: [PATCH 10/27] Reduce asvs --- asv_bench/benchmarks/join_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index cd65dfce28446..3b2f7ad4e1e25 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -133,7 +133,7 @@ def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) def time_join_dataframes_cross(self): - self.df.join(self.df_key1, how="cross") + self.df.loc[:2000].join(self.df_key1, how="cross") class JoinIndex: @@ -209,7 +209,7 @@ def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) def time_merge_dataframes_cross(self): - merge(self.left, self.right, how="cross") + merge(self.left.loc[:2000], self.right.loc[:2000], how="cross") class I8Merge: From 1f0a1c8f5c9bcd2b743b9b0e05a879bb5e3555a3 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 19 Nov 2020 00:13:34 +0100 Subject: [PATCH 11/27] Remove whitespaces --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f21ca59218de6..ff18401915ff5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -225,7 +225,7 @@ join; preserve the order of the left keys. * cross: creates the karthesian product from both frames, preserves the order of the left keys. - + .. versionadded:: 1.2.0 on : label or list From d7c1156971d90249d2e48225915e98767860f7ea Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 19 Nov 2020 00:28:36 +0100 Subject: [PATCH 12/27] Move import --- pandas/tests/reshape/merge/test_join.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index a0fcb68324e64..77c91a922a80e 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,10 +1,11 @@ import numpy as np import pytest +from pandas.errors import MergeError + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge import pandas._testing as tm -from pandas.errors import MergeError from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data a_ = np.array From 7c8d37adedc0442a4dac88584544f45792253719 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 19 Nov 2020 20:00:01 +0100 Subject: [PATCH 13/27] Adress review --- pandas/core/frame.py | 18 ++++++++++++++++++ pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/merge/test_join.py | 3 ++- pandas/tests/reshape/merge/test_merge.py | 3 ++- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ff18401915ff5..0d143a03ef680 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -348,6 +348,24 @@ ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'left_col': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right_col': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 """ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f6fd264f5fcfc..883539f23288d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1241,7 +1241,8 @@ def _validate_specification(self): or self.on is not None ): raise MergeError( - "Can not pass any merge columns when using cross as merge method" + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" ) # Hm, any way to make this logic less complicated?? elif self.on is None and self.left_on is None and self.right_on is None: diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 77c91a922a80e..8f2a0973746e0 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -823,6 +823,7 @@ def test_join_cross_error_reporting(): # GH#5401 left = DataFrame({"a": [1, 3]}) right = DataFrame({"a": [3, 4]}) - msg = "Can not pass any merge columns when using cross as merge method" + msg = "Can not pass on, right_on, left_on or set right_index=True or " \ + "left_index=True" with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 51f693f7e6927..d53dd4b85cf59 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2365,6 +2365,7 @@ def test_merge_cross_error_reporting(kwargs): # GH#5401 left = DataFrame({"a": [1, 3]}) right = DataFrame({"b": [3, 4]}) - msg = "Can not pass any merge columns when using cross as merge method" + msg = "Can not pass on, right_on, left_on or set right_index=True or " \ + "left_index=True" with pytest.raises(MergeError, match=msg): merge(left, right, how="cross", **kwargs) From 651540ef9cc8a21618ccf202a2de27c5db01ac14 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 19 Nov 2020 20:16:03 +0100 Subject: [PATCH 14/27] Fix doc checks --- asv_bench/benchmarks/join_merge.py | 8 ++++---- pandas/core/reshape/merge.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 3b2f7ad4e1e25..a572b8a70a680 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -132,8 +132,8 @@ def time_join_dataframe_index_single_key_small(self, sort): def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) - def time_join_dataframes_cross(self): - self.df.loc[:2000].join(self.df_key1, how="cross") + def time_join_dataframes_cross(self, sort): + self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort) class JoinIndex: @@ -208,8 +208,8 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) - def time_merge_dataframes_cross(self): - merge(self.left.loc[:2000], self.right.loc[:2000], how="cross") + def time_merge_dataframes_cross(self, sort): + merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) class I8Merge: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 883539f23288d..7b54daca5f3eb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -644,6 +644,7 @@ def __init__( self._validate_specification() + cross_col = None if self.how == "cross": ( self.left, @@ -652,9 +653,7 @@ def __init__( cross_col, ) = self._create_cross_configuration(self.left, self.right) self.left_on = self.right_on = [cross_col] - self._cross = cross_col - else: - self._cross = None + self._cross = cross_col # note this function has side effects ( @@ -707,7 +706,7 @@ def get_result(self): return result.__finalize__(self, method="merge") - def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: str): + def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): if cross_col is not None: result.drop(columns=cross_col, inplace=True) @@ -1244,6 +1243,7 @@ def _validate_specification(self): "Can not pass on, right_on, left_on or set right_index=True or " "left_index=True" ) + return # Hm, any way to make this logic less complicated?? elif self.on is None and self.left_on is None and self.right_on is None: From f7cdd4d85d7b046efe188c856a56ef7240bb5e29 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 19 Nov 2020 20:21:38 +0100 Subject: [PATCH 15/27] Add docstring --- pandas/core/reshape/merge.py | 16 ++++++++++++++++ pandas/tests/reshape/merge/test_join.py | 6 ++++-- pandas/tests/reshape/merge/test_merge.py | 6 ++++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7b54daca5f3eb..80d16c9764866 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1221,6 +1221,22 @@ def _maybe_coerce_merge_keys(self): def _create_cross_configuration( self, _left, _right ) -> Tuple["DataFrame", "DataFrame", str, str]: + """ + Creates the configuration to dispatch the cross operation to inner join, + e.g. adding a join column and resetting parameters. Join column is added + to a new object, no inplace modification + + Parameters + ---------- + _left: DataFrame + _right DataFrame + + Returns + ------- + a tuple (_left_df, _right_df, how, cross_col) representing the adjusted + DataFrames with cross_col, the merge operation set to inner and the column + to join over. + """ cross_col = f"_cross_{hashlib.md5().hexdigest()}" how = "inner" return ( diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 8f2a0973746e0..66053f325d45e 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -823,7 +823,9 @@ def test_join_cross_error_reporting(): # GH#5401 left = DataFrame({"a": [1, 3]}) right = DataFrame({"a": [3, 4]}) - msg = "Can not pass on, right_on, left_on or set right_index=True or " \ - "left_index=True" + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d53dd4b85cf59..7dcde7aae5586 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2365,7 +2365,9 @@ def test_merge_cross_error_reporting(kwargs): # GH#5401 left = DataFrame({"a": [1, 3]}) right = DataFrame({"b": [3, 4]}) - msg = "Can not pass on, right_on, left_on or set right_index=True or " \ - "left_index=True" + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) with pytest.raises(MergeError, match=msg): merge(left, right, how="cross", **kwargs) From a4d24a913a7aa9bae6f9d5b1fbab3609c960399e Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 19 Nov 2020 21:12:29 +0100 Subject: [PATCH 16/27] Change example --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0d143a03ef680..2d4ef56571966 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -349,8 +349,8 @@ ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') ->>> df1 = pd.DataFrame({'left_col': ['foo', 'bar']}) ->>> df2 = pd.DataFrame({'right_col': [7, 8]}) +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) >>> df1 left 0 foo From 741b4b79d949abcc2b6346c65c56279054e8532f Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 21 Nov 2020 02:47:51 +0100 Subject: [PATCH 17/27] Fix typos and rename variables --- pandas/core/frame.py | 2 +- pandas/core/reshape/merge.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d4ef56571966..3c41bd0778439 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -223,7 +223,7 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. - * cross: creates the karthesian product from both frames, preserves the order + * cross: creates the cartesian product from both frames, preserves the order of the left keys. .. versionadded:: 1.2.0 diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 80d16c9764866..9a39fd4f653fc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1219,7 +1219,7 @@ def _maybe_coerce_merge_keys(self): self.right = self.right.assign(**{name: self.right[name].astype(typ)}) def _create_cross_configuration( - self, _left, _right + self, left, right ) -> Tuple["DataFrame", "DataFrame", str, str]: """ Creates the configuration to dispatch the cross operation to inner join, @@ -1240,8 +1240,8 @@ def _create_cross_configuration( cross_col = f"_cross_{hashlib.md5().hexdigest()}" how = "inner" return ( - _left.assign(**{cross_col: 1}), - _right.assign(**{cross_col: 1}), + left.assign(**{cross_col: 1}), + right.assign(**{cross_col: 1}), how, cross_col, ) From 0ff78fc89fc41ad1b2e2e57ecc8b047ce421b0b2 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 21 Nov 2020 02:49:01 +0100 Subject: [PATCH 18/27] Check unmodified inputs --- pandas/tests/reshape/merge/test_merge.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7dcde7aae5586..199e0ccbb7d4d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2346,9 +2346,13 @@ def test_merge_cross(input_col, output_cols): # GH#5401 left = DataFrame({"a": [1, 3]}) right = DataFrame({input_col: [3, 4]}) + left_copy = left.copy() + right_copy = right.copy() result = merge(left, right, how="cross") expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(left, left_copy) + tm.assert_frame_equal(right, right_copy) @pytest.mark.parametrize( From 94316f34e019a18fe1a2c3eec51141cd1d184cf4 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 01:06:05 +0100 Subject: [PATCH 19/27] Add examples --- pandas/core/frame.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3c41bd0778439..e734a3502f173 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -349,6 +349,26 @@ ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 np.nan + >>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) >>> df2 = pd.DataFrame({'right': [7, 8]}) >>> df1 From 94b1367cee7bf33e19cc8ed7b3a737cc84a70f69 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 01:14:13 +0100 Subject: [PATCH 20/27] Add tests --- pandas/tests/reshape/merge/test_merge.py | 40 ++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 199e0ccbb7d4d..48be82754a529 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2375,3 +2375,43 @@ def test_merge_cross_error_reporting(kwargs): ) with pytest.raises(MergeError, match=msg): merge(left, right, how="cross", **kwargs) + + +def test_merge_cross_mixed_dtypes(): + # GH#5401 + left = DataFrame(["a", "b", "c"], columns=["A"]) + right = DataFrame(range(2), columns=["B"]) + result = merge(left, right, how="cross") + expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_more_than_one_column(): + # GH#5401 + left = DataFrame({"A": list("ab"), "B": [2, 1]}) + right = DataFrame({"C": range(2), "D": range(4, 6)}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "A": ["a", "a", "b", "b"], + "B": [2, 2, 1, 1], + "C": [0, 1, 0, 1], + "D": [4, 5, 4, 5], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_null_values(nulls_fixture): + # GH#5401 + left = DataFrame({"a": [1, nulls_fixture]}) + right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "a": [1, 1, nulls_fixture, nulls_fixture], + "b": ["a", "b", "a", "b"], + "c": [1.0, 2.0, 1.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) From 77a9e234bf9fc569dee789d69241f5a047aa0429 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 02:27:51 +0100 Subject: [PATCH 21/27] Fix doc --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e734a3502f173..b66f45db92c30 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -367,7 +367,7 @@ >>> df1.merge(df2, how='left', on='a') a b c 0 foo 1 3.0 -1 bar 2 np.nan +1 bar 2 NaN >>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) >>> df2 = pd.DataFrame({'right': [7, 8]}) From 459764240333d1cca128fecfe7d0421b13052036 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 23 Nov 2020 22:11:02 +0100 Subject: [PATCH 22/27] Change signature --- pandas/core/reshape/merge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9a39fd4f653fc..18a0c68d98839 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1228,12 +1228,12 @@ def _create_cross_configuration( Parameters ---------- - _left: DataFrame - _right DataFrame + left: DataFrame + right DataFrame Returns ------- - a tuple (_left_df, _right_df, how, cross_col) representing the adjusted + a tuple (left, right, how, cross_col) representing the adjusted DataFrames with cross_col, the merge operation set to inner and the column to join over. """ From 67a67a67f474dba60425febe2ce8ce9a14d81270 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 23 Nov 2020 22:11:46 +0100 Subject: [PATCH 23/27] Move test --- pandas/tests/reshape/merge/test_join.py | 12 ------------ pandas/tests/reshape/merge/test_merge.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 66053f325d45e..ec6b285b157d6 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -817,15 +817,3 @@ def test_join_cross(input_col, output_cols): result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y") expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) tm.assert_frame_equal(result, expected) - - -def test_join_cross_error_reporting(): - # GH#5401 - left = DataFrame({"a": [1, 3]}) - right = DataFrame({"a": [3, 4]}) - msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" - ) - with pytest.raises(MergeError, match=msg): - left.join(right, how="cross", on="a") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 48be82754a529..b50588a39c7fa 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2415,3 +2415,15 @@ def test_merge_cross_null_values(nulls_fixture): } ) tm.assert_frame_equal(result, expected) + + +def test_join_cross_error_reporting(): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"a": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + left.join(right, how="cross", on="a") From f7310819b4966068db9fb577b037765c7eb9f5d6 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 23 Nov 2020 22:26:43 +0100 Subject: [PATCH 24/27] Delete import --- pandas/tests/reshape/merge/test_join.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index ec6b285b157d6..00ef7a05f5902 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.errors import MergeError - import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge import pandas._testing as tm From 4fcde78b52ffb635f24d73d1f34255105e373b4f Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 24 Nov 2020 00:46:29 +0100 Subject: [PATCH 25/27] Create new file --- pandas/tests/reshape/merge/test_merge.py | 90 ------------------ .../tests/reshape/merge/test_merge_cross.py | 95 +++++++++++++++++++ 2 files changed, 95 insertions(+), 90 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_merge_cross.py diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b50588a39c7fa..999b827fe0571 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2337,93 +2337,3 @@ def test_merge_join_cols_error_reporting_on_and_index(func, kwargs): ) with pytest.raises(MergeError, match=msg): getattr(pd, func)(left, right, on="a", **kwargs) - - -@pytest.mark.parametrize( - ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] -) -def test_merge_cross(input_col, output_cols): - # GH#5401 - left = DataFrame({"a": [1, 3]}) - right = DataFrame({input_col: [3, 4]}) - left_copy = left.copy() - right_copy = right.copy() - result = merge(left, right, how="cross") - expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(left, left_copy) - tm.assert_frame_equal(right, right_copy) - - -@pytest.mark.parametrize( - "kwargs", - [ - {"left_index": True}, - {"right_index": True}, - {"on": "a"}, - {"left_on": "a"}, - {"right_on": "b"}, - ], -) -def test_merge_cross_error_reporting(kwargs): - # GH#5401 - left = DataFrame({"a": [1, 3]}) - right = DataFrame({"b": [3, 4]}) - msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" - ) - with pytest.raises(MergeError, match=msg): - merge(left, right, how="cross", **kwargs) - - -def test_merge_cross_mixed_dtypes(): - # GH#5401 - left = DataFrame(["a", "b", "c"], columns=["A"]) - right = DataFrame(range(2), columns=["B"]) - result = merge(left, right, how="cross") - expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]}) - tm.assert_frame_equal(result, expected) - - -def test_merge_cross_more_than_one_column(): - # GH#5401 - left = DataFrame({"A": list("ab"), "B": [2, 1]}) - right = DataFrame({"C": range(2), "D": range(4, 6)}) - result = merge(left, right, how="cross") - expected = DataFrame( - { - "A": ["a", "a", "b", "b"], - "B": [2, 2, 1, 1], - "C": [0, 1, 0, 1], - "D": [4, 5, 4, 5], - } - ) - tm.assert_frame_equal(result, expected) - - -def test_merge_cross_null_values(nulls_fixture): - # GH#5401 - left = DataFrame({"a": [1, nulls_fixture]}) - right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]}) - result = merge(left, right, how="cross") - expected = DataFrame( - { - "a": [1, 1, nulls_fixture, nulls_fixture], - "b": ["a", "b", "a", "b"], - "c": [1.0, 2.0, 1.0, 2.0], - } - ) - tm.assert_frame_equal(result, expected) - - -def test_join_cross_error_reporting(): - # GH#5401 - left = DataFrame({"a": [1, 3]}) - right = DataFrame({"a": [3, 4]}) - msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" - ) - with pytest.raises(MergeError, match=msg): - left.join(right, how="cross", on="a") diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py new file mode 100644 index 0000000000000..d6c29ea129027 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -0,0 +1,95 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.core.reshape.merge import MergeError, merge + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_merge_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + left_copy = left.copy() + right_copy = right.copy() + result = merge(left, right, how="cross") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(left, left_copy) + tm.assert_frame_equal(right, right_copy) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"left_index": True}, + {"right_index": True}, + {"on": "a"}, + {"left_on": "a"}, + {"right_on": "b"}, + ], +) +def test_merge_cross_error_reporting(kwargs): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"b": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + merge(left, right, how="cross", **kwargs) + + +def test_merge_cross_mixed_dtypes(): + # GH#5401 + left = DataFrame(["a", "b", "c"], columns=["A"]) + right = DataFrame(range(2), columns=["B"]) + result = merge(left, right, how="cross") + expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_more_than_one_column(): + # GH#5401 + left = DataFrame({"A": list("ab"), "B": [2, 1]}) + right = DataFrame({"C": range(2), "D": range(4, 6)}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "A": ["a", "a", "b", "b"], + "B": [2, 2, 1, 1], + "C": [0, 1, 0, 1], + "D": [4, 5, 4, 5], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_null_values(nulls_fixture): + # GH#5401 + left = DataFrame({"a": [1, nulls_fixture]}) + right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "a": [1, 1, nulls_fixture, nulls_fixture], + "b": ["a", "b", "a", "b"], + "c": [1.0, 2.0, 1.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_join_cross_error_reporting(): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"a": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + left.join(right, how="cross", on="a") From 45896517817b7f71479eef65830018425f9402ad Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 24 Nov 2020 19:31:10 +0100 Subject: [PATCH 26/27] Raise if duplicate on column --- pandas/core/reshape/merge.py | 5 +++++ pandas/tests/reshape/merge/test_merge_cross.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 18a0c68d98839..8dbcbb5124e62 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1238,6 +1238,11 @@ def _create_cross_configuration( to join over. """ cross_col = f"_cross_{hashlib.md5().hexdigest()}" + if cross_col in left.columns or cross_col in right.columns: + raise MergeError( + f"{cross_col} is the synthetic column to perform the " + f"cross merge. This column can not be an input column." + ) how = "inner" return ( left.assign(**{cross_col: 1}), diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py index d6c29ea129027..362f50a3f8d31 100644 --- a/pandas/tests/reshape/merge/test_merge_cross.py +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -1,3 +1,5 @@ +import hashlib + import pytest from pandas import DataFrame @@ -93,3 +95,15 @@ def test_join_cross_error_reporting(): ) with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") + + +def test_merge_cross_duplicate_on_column(): + # GH#5401 + left = DataFrame({"a": [1, 2], f"_cross_{hashlib.md5().hexdigest()}": [2, 3]}) + right = DataFrame({"b": [3]}) + msg = ( + f"_cross_{hashlib.md5().hexdigest()} is the synthetic column to perform " + f"the cross merge. This column can not be an input column." + ) + with pytest.raises(MergeError, match=msg): + merge(left, right, how="cross") From d964ef10298713340e8488c63aa01798988d349d Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 22:23:51 +0100 Subject: [PATCH 27/27] Revert "Raise if duplicate on column" This reverts commit 45896517 --- pandas/core/reshape/merge.py | 5 ----- pandas/tests/reshape/merge/test_merge_cross.py | 14 -------------- 2 files changed, 19 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8dbcbb5124e62..18a0c68d98839 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1238,11 +1238,6 @@ def _create_cross_configuration( to join over. """ cross_col = f"_cross_{hashlib.md5().hexdigest()}" - if cross_col in left.columns or cross_col in right.columns: - raise MergeError( - f"{cross_col} is the synthetic column to perform the " - f"cross merge. This column can not be an input column." - ) how = "inner" return ( left.assign(**{cross_col: 1}), diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py index 362f50a3f8d31..d6c29ea129027 100644 --- a/pandas/tests/reshape/merge/test_merge_cross.py +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -1,5 +1,3 @@ -import hashlib - import pytest from pandas import DataFrame @@ -95,15 +93,3 @@ def test_join_cross_error_reporting(): ) with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") - - -def test_merge_cross_duplicate_on_column(): - # GH#5401 - left = DataFrame({"a": [1, 2], f"_cross_{hashlib.md5().hexdigest()}": [2, 3]}) - right = DataFrame({"b": [3]}) - msg = ( - f"_cross_{hashlib.md5().hexdigest()} is the synthetic column to perform " - f"the cross merge. This column can not be an input column." - ) - with pytest.raises(MergeError, match=msg): - merge(left, right, how="cross")