diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ee9d18d0c7ce2..eb061f16b5152 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -621,6 +621,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) +- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 948836bf6a51d..56f8adda93251 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1529,9 +1529,7 @@ def safe_sort( order2 = sorter.argsort() if verify: mask = (codes < -len(values)) | (codes >= len(values)) - codes[mask] = 0 - else: - mask = None + codes[mask] = -1 new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) @@ -1540,14 +1538,6 @@ def safe_sort( # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") - if use_na_sentinel: - mask = codes == -1 - if verify: - mask = mask | (codes < -len(values)) | (codes >= len(values)) - - if use_na_sentinel and mask is not None: - np.putmask(new_codes, mask, -1) - return ordered, ensure_platform_int(new_codes) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ad704d87a491b..cbee85f4aede9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2998,3 +2998,15 @@ def 
test_merge_datetime_and_timedelta(how): ) with pytest.raises(ValueError, match=re.escape(msg)): right.merge(left, on="key", how=how) + + +def test_merge_on_all_nan_column(): + # GH#59421 + left = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6]}) + right = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "zz": [4, 5, 6]}) + result = left.merge(right, on=["x", "y"], how="outer") + # Should not trigger array bounds error with bounds checking or asan enabled. + expected = DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 2a225bda953cf..869d41efa6c28 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -408,6 +408,13 @@ def test_codes_out_of_bound(self): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) + @pytest.mark.parametrize("codes", [[-1, -1], [2, -1], [2, 2]]) + def test_codes_empty_array_out_of_bound(self, codes): + empty_values = np.array([]) + expected_codes = -np.ones_like(codes, dtype=np.intp) + _, result_codes = safe_sort(empty_values, codes) + tm.assert_numpy_array_equal(result_codes, expected_codes) + def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) result = safe_sort(values)