diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index ac74e6a8e5f77..abb10ba245870 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -170,7 +170,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) - Sparse diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 68d61da0cf7dd..b8ad932abc2b0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -3058,17 +3058,19 @@ def renamer(x, suffix: str | None): llabels = left._transform_index(lrenamer) rlabels = right._transform_index(rrenamer) - dups = [] + dups = set() if not llabels.is_unique: # Only warn when duplicates are caused because of suffixes, already duplicated # columns in origin should not warn - dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() + dups.update(llabels[(llabels.duplicated()) & (~left.duplicated())]) if not rlabels.is_unique: - dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) + dups.update(rlabels[(rlabels.duplicated()) & (~right.duplicated())]) + # Suffix addition creates duplicate to pre-existing column name + dups.update(llabels.intersection(right.difference(to_rename))) + dups.update(rlabels.intersection(left.difference(to_rename))) if dups: raise MergeError( - f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " - f"not allowed.", + f"Passing 'suffixes' which cause duplicate columns {dups} is not allowed.", ) return llabels, rlabels diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0f67aebd85ec..b400934f48b04 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3060,3 +3060,18 @@ def 
test_merge_on_all_nan_column(): {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} ) tm.assert_frame_equal(result, expected) + + +def test_merge_for_suffix_collisions(): + # GH#61402 + # Case 1: left "col2" -> "col2_dup" collides with right's existing "col2_dup" + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]}) + with pytest.raises(MergeError, match="duplicate columns"): + merge(df1, df2, on="col1", suffixes=("_dup", "")) + + # Case 2: right "col2" -> "col2_dup" collides with right's own "col2_dup" + df1 = DataFrame({"col1": [1], "col2": [2]}) + df2 = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]}) + with pytest.raises(MergeError, match="duplicate columns"): + merge(df1, df2, on="col1", suffixes=("", "_dup"))