BUG: Fix regression on DataFrame.replace for regex (#25266)

PetitLepton · TomAugspurger · commit 50c40ff1afa4 · 2019-02-28T07:55:33.000-06:00
* BUG: Fix regression on DataFrame.replace for regex

This commit ensures that regex replacement is no longer confined to matches
at the beginning of the string but applies to matches anywhere within it
(``re.search`` is now used instead of ``re.match``). The behaviour is then
consistent with versions prior to 0.24.0.

One test has been added to cover replacement of a character that is not at
the beginning of the string.
diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
@@ -23,6 +23,7 @@ Fixed Regressions
 - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)
 - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`)
 - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`)
+- Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`)
 
 - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`)
 - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`)
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -552,9 +552,9 @@ def comp(s, regex=False):
             if isna(s):
                 return isna(values)
             if hasattr(s, 'asm8'):
-                return _compare_or_regex_match(maybe_convert_objects(values),
-                                               getattr(s, 'asm8'), regex)
-            return _compare_or_regex_match(values, s, regex)
+                return _compare_or_regex_search(maybe_convert_objects(values),
+                                                getattr(s, 'asm8'), regex)
+            return _compare_or_regex_search(values, s, regex)
 
         masks = [comp(s, regex) for i, s in enumerate(src_list)]
 
@@ -1897,11 +1897,11 @@ def _consolidate(blocks):
     return new_blocks
 
 
-def _compare_or_regex_match(a, b, regex=False):
+def _compare_or_regex_search(a, b, regex=False):
     """
     Compare two array_like inputs of the same shape or two scalar values
 
-    Calls operator.eq or re.match, depending on regex argument. If regex is
+    Calls operator.eq or re.search, depending on regex argument. If regex is
     True, perform an element-wise regex matching.
 
     Parameters
@@ -1917,7 +1917,7 @@ def _compare_or_regex_match(a, b, regex=False):
     if not regex:
         op = lambda x: operator.eq(x, b)
     else:
-        op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str)
+        op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str)
                           else False)
 
     is_a_array = isinstance(a, np.ndarray)
diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py
@@ -466,6 +466,13 @@ def test_regex_replace_dict_nested(self):
         assert_frame_equal(res3, expec)
         assert_frame_equal(res4, expec)
 
+    def test_regex_replace_dict_nested_non_first_character(self):
+        # GH 25259
+        df = pd.DataFrame({'first': ['abc', 'bca', 'cab']})
+        expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']})
+        result = df.replace({'a': '.'}, regex=True)
+        assert_frame_equal(result, expected)
+
     def test_regex_replace_dict_nested_gh4115(self):
         df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2})
         expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2})