Skip to content

Commit 50c40ff

Browse files
PetitLepton authored and TomAugspurger committed
BUG: Fix regression on DataFrame.replace for regex (#25266)
* BUG: Fix regression on DataFrame.replace for regex. This commit ensures that regex replacement is not confined to the beginning of the string but applies to matches anywhere within it. The behaviour is then consistent with versions prior to 0.24.0. One test has been added to cover replacement of a character that is not at the beginning of the string.
1 parent 64e5612 commit 50c40ff

File tree

3 files changed

+14
-6
lines changed

3 files changed

+14
-6
lines changed

Diff for: doc/source/whatsnew/v0.24.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Fixed Regressions
2323
- Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)
2424
- Fixed issue in ``DataFrame`` construction where passing a mixed list of mixed types could segfault. (:issue:`25075`)
2525
- Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`)
26+
- Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`)
2627

2728
- Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`)
2829
- Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ``Categorical`` data (:issue:`25299`)

Diff for: pandas/core/internals/managers.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -552,9 +552,9 @@ def comp(s, regex=False):
552552
if isna(s):
553553
return isna(values)
554554
if hasattr(s, 'asm8'):
555-
return _compare_or_regex_match(maybe_convert_objects(values),
556-
getattr(s, 'asm8'), regex)
557-
return _compare_or_regex_match(values, s, regex)
555+
return _compare_or_regex_search(maybe_convert_objects(values),
556+
getattr(s, 'asm8'), regex)
557+
return _compare_or_regex_search(values, s, regex)
558558

559559
masks = [comp(s, regex) for i, s in enumerate(src_list)]
560560

@@ -1897,11 +1897,11 @@ def _consolidate(blocks):
18971897
return new_blocks
18981898

18991899

1900-
def _compare_or_regex_match(a, b, regex=False):
1900+
def _compare_or_regex_search(a, b, regex=False):
19011901
"""
19021902
Compare two array_like inputs of the same shape or two scalar values
19031903
1904-
Calls operator.eq or re.match, depending on regex argument. If regex is
1904+
Calls operator.eq or re.search, depending on regex argument. If regex is
19051905
True, perform an element-wise regex matching.
19061906
19071907
Parameters
@@ -1917,7 +1917,7 @@ def _compare_or_regex_match(a, b, regex=False):
19171917
if not regex:
19181918
op = lambda x: operator.eq(x, b)
19191919
else:
1920-
op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str)
1920+
op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str)
19211921
else False)
19221922

19231923
is_a_array = isinstance(a, np.ndarray)

Diff for: pandas/tests/frame/test_replace.py

+7
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,13 @@ def test_regex_replace_dict_nested(self):
466466
assert_frame_equal(res3, expec)
467467
assert_frame_equal(res4, expec)
468468

469+
def test_regex_replace_dict_nested_non_first_character(self):
470+
# GH 25259
471+
df = pd.DataFrame({'first': ['abc', 'bca', 'cab']})
472+
expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']})
473+
result = df.replace({'a': '.'}, regex=True)
474+
assert_frame_equal(result, expected)
475+
469476
def test_regex_replace_dict_nested_gh4115(self):
470477
df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2})
471478
expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2})

0 commit comments

Comments
 (0)