Skip to content

Commit 6a643ec

Browse files
merge master
1 parent de3a85c commit 6a643ec

File tree

4 files changed

+99
-30
lines changed

4 files changed

+99
-30
lines changed

Diff for: doc/source/whatsnew/v0.25.0.rst

+2-3
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ Other API Changes
222222
Deprecations
223223
~~~~~~~~~~~~
224224

225+
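- :func:`Series.str.replace`: when ``pat`` is a single special regex character (such as ``.``, ``|`` or ``\``) and ``regex`` is not specified, ``pat`` is currently interpreted as a literal string (as if ``regex=False``) and a ``FutureWarning`` is issued; in a future version it will be interpreted as a regular expression. (:issue:`24804`)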
225226
- Deprecated the ``M`` (months) and ``Y`` (year) values of the ``units`` parameter of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`)
226227
- The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64`/:meth:`Timedelta.to_timedelta64`. (:issue:`24416`)
227228

@@ -305,10 +306,8 @@ Conversion
305306

306307
Strings
307308
^^^^^^^
309+
- Bug in :func:`Series.str.replace` where a pattern of length 1 was not applied as a regular expression (:issue:`24804`)
308310

309-
-
310-
-
311-
-
312311

313312

314313
Interval

Diff for: pandas/core/reshape/melt.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):
420420
newdf = melt(df, id_vars=i, value_vars=value_vars,
421421
value_name=stub.rstrip(sep), var_name=j)
422422
newdf[j] = Categorical(newdf[j])
423-
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
423+
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "",
424+
regex=True)
424425

425426
# GH17627 Cast numerics suffixes to int/float
426427
newdf[j] = to_numeric(newdf[j], errors='ignore')

Diff for: pandas/core/strings.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ def str_endswith(arr, pat, na=np.nan):
424424
return _na_map(f, arr, na, dtype=bool)
425425

426426

427-
def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
427+
def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=None):
428428
r"""
429429
Replace occurrences of pattern/regex in the Series/Index with
430430
some other string. Equivalent to :meth:`str.replace` or
@@ -455,9 +455,13 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
455455
flags : int, default 0 (no flags)
456456
- re module flags, e.g. re.IGNORECASE
457457
- Cannot be set if `pat` is a compiled regex
458-
regex : bool, default True
458+
regex : boolean, default None
459459
- If True, assumes the passed-in pattern is a regular expression.
460460
- If False, treats the pattern as a literal string
461+
- If `pat` is a single character and `regex` is not specified, `pat`
462+
is interpreted as a string literal. If `pat` is also a regular
463+
expression symbol, a warning is issued that in the future `pat`
464+
will be interpreted as a regex, rather than a literal.
461465
- Cannot be set to False if `pat` is a compiled regex or `repl` is
462466
a callable.
463467
@@ -564,7 +568,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
564568
# add case flag, if provided
565569
if case is False:
566570
flags |= re.IGNORECASE
567-
if is_compiled_re or len(pat) > 1 or flags or callable(repl):
571+
if is_compiled_re or pat or flags or callable(repl):
568572
n = n if n >= 0 else 0
569573
compiled = re.compile(pat, flags=flags)
570574
f = lambda x: compiled.sub(repl=repl, string=x, count=n)
@@ -577,6 +581,12 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
577581
if callable(repl):
578582
raise ValueError("Cannot use a callable replacement when "
579583
"regex=False")
584+
# if regex is default None, and a single special character is given
585+
# in pat, still take it as a literal, and emit a FutureWarning
586+
if regex is None and len(pat) == 1 and pat in list(r"[\^$.|?*+()]"):
587+
warnings.warn("'{}' is interpreted as a literal in ".format(pat) +
588+
"default, not regex. It will change in the future.",
589+
FutureWarning)
580590
f = lambda x: x.replace(pat, repl, n)
581591

582592
return _na_map(f, arr)
@@ -2538,7 +2548,7 @@ def match(self, pat, case=True, flags=0, na=np.nan):
25382548
return self._wrap_result(result, fill_value=na)
25392549

25402550
@copy(str_replace)
2541-
def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
2551+
def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
25422552
result = str_replace(self._parent, pat, repl, n=n, case=case,
25432553
flags=flags, regex=regex)
25442554
return self._wrap_result(result)

Diff for: pandas/tests/test_strings.py

+81-22
Original file line numberDiff line numberDiff line change
@@ -904,27 +904,39 @@ def test_casemethods(self):
904904
def test_replace(self):
905905
values = Series(['fooBAD__barBAD', NA])
906906

907-
result = values.str.replace('BAD[_]*', '')
907+
result = values.str.replace('BAD[_]*', '', regex=True)
908908
exp = Series(['foobar', NA])
909909
tm.assert_series_equal(result, exp)
910910

911-
result = values.str.replace('BAD[_]*', '', n=1)
911+
result = values.str.replace('BAD[_]*', '', regex=True, n=1)
912912
exp = Series(['foobarBAD', NA])
913913
tm.assert_series_equal(result, exp)
914914

915915
# mixed
916916
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
917917
None, 1, 2.])
918918

919-
rs = Series(mixed).str.replace('BAD[_]*', '')
919+
rs = Series(mixed).str.replace('BAD[_]*', '', regex=True)
920920
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
921921
assert isinstance(rs, Series)
922922
tm.assert_almost_equal(rs, xp)
923923

924+
# unicode
925+
values = Series([u'fooBAD__barBAD', NA])
926+
927+
result = values.str.replace('BAD[_]*', '', regex=True)
928+
exp = Series([u'foobar', NA])
929+
tm.assert_series_equal(result, exp)
930+
931+
result = values.str.replace('BAD[_]*', '', n=1, regex=True)
932+
exp = Series([u'foobarBAD', NA])
933+
tm.assert_series_equal(result, exp)
934+
924935
# flags + unicode
925936
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
926937
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
927-
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
938+
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", regex=True,
939+
flags=re.UNICODE)
928940
tm.assert_series_equal(result, exp)
929941

930942
# GH 13438
@@ -942,7 +954,7 @@ def test_replace_callable(self):
942954

943955
# test with callable
944956
repl = lambda m: m.group(0).swapcase()
945-
result = values.str.replace('[a-z][A-Z]{2}', repl, n=2)
957+
result = values.str.replace('[a-z][A-Z]{2}', repl, n=2, regex=True)
946958
exp = Series(['foObaD__baRbaD', NA])
947959
tm.assert_series_equal(result, exp)
948960

@@ -952,21 +964,21 @@ def test_replace_callable(self):
952964

953965
repl = lambda: None
954966
with pytest.raises(TypeError, match=p_err):
955-
values.str.replace('a', repl)
967+
values.str.replace('a', repl, regex=True)
956968

957969
repl = lambda m, x: None
958970
with pytest.raises(TypeError, match=p_err):
959-
values.str.replace('a', repl)
971+
values.str.replace('a', repl, regex=True)
960972

961973
repl = lambda m, x, y=None: None
962974
with pytest.raises(TypeError, match=p_err):
963-
values.str.replace('a', repl)
975+
values.str.replace('a', repl, regex=True)
964976

965977
# test regex named groups
966978
values = Series(['Foo Bar Baz', NA])
967979
pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
968980
repl = lambda m: m.group('middle').swapcase()
969-
result = values.str.replace(pat, repl)
981+
result = values.str.replace(pat, repl, regex=True)
970982
exp = Series(['bAR', NA])
971983
tm.assert_series_equal(result, exp)
972984

@@ -976,28 +988,39 @@ def test_replace_compiled_regex(self):
976988

977989
# test with compiled regex
978990
pat = re.compile(r'BAD[_]*')
979-
result = values.str.replace(pat, '')
991+
result = values.str.replace(pat, '', regex=True)
980992
exp = Series(['foobar', NA])
981993
tm.assert_series_equal(result, exp)
982994

983-
result = values.str.replace(pat, '', n=1)
995+
result = values.str.replace(pat, '', n=1, regex=True)
984996
exp = Series(['foobarBAD', NA])
985997
tm.assert_series_equal(result, exp)
986998

987999
# mixed
9881000
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
9891001
None, 1, 2.])
9901002

991-
rs = Series(mixed).str.replace(pat, '')
1003+
rs = Series(mixed).str.replace(pat, '', regex=True)
9921004
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
9931005
assert isinstance(rs, Series)
9941006
tm.assert_almost_equal(rs, xp)
9951007

1008+
# unicode
1009+
values = Series([u'fooBAD__barBAD', NA])
1010+
1011+
result = values.str.replace(pat, '', regex=True)
1012+
exp = Series([u'foobar', NA])
1013+
tm.assert_series_equal(result, exp)
1014+
1015+
result = values.str.replace(pat, '', n=1, regex=True)
1016+
exp = Series([u'foobarBAD', NA])
1017+
tm.assert_series_equal(result, exp)
1018+
9961019
# flags + unicode
9971020
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
9981021
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
9991022
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
1000-
result = values.str.replace(pat, ", ")
1023+
result = values.str.replace(pat, ", ", regex=True)
10011024
tm.assert_series_equal(result, exp)
10021025

10031026
# case and flags provided to str.replace will have no effect
@@ -1007,29 +1030,30 @@ def test_replace_compiled_regex(self):
10071030

10081031
with pytest.raises(ValueError,
10091032
match="case and flags cannot be"):
1010-
result = values.str.replace(pat, '', flags=re.IGNORECASE)
1033+
result = values.str.replace(pat, '', flags=re.IGNORECASE,
1034+
regex=True)
10111035

10121036
with pytest.raises(ValueError,
10131037
match="case and flags cannot be"):
1014-
result = values.str.replace(pat, '', case=False)
1038+
result = values.str.replace(pat, '', case=False, regex=True)
10151039

10161040
with pytest.raises(ValueError,
10171041
match="case and flags cannot be"):
1018-
result = values.str.replace(pat, '', case=True)
1042+
result = values.str.replace(pat, '', case=True, regex=True)
10191043

10201044
# test with callable
10211045
values = Series(['fooBAD__barBAD', NA])
10221046
repl = lambda m: m.group(0).swapcase()
10231047
pat = re.compile('[a-z][A-Z]{2}')
1024-
result = values.str.replace(pat, repl, n=2)
1048+
result = values.str.replace(pat, repl, n=2, regex=True)
10251049
exp = Series(['foObaD__baRbaD', NA])
10261050
tm.assert_series_equal(result, exp)
10271051

10281052
def test_replace_literal(self):
10291053
# GH16808 literal replace (regex=False vs regex=True)
10301054
values = Series(['f.o', 'foo', NA])
10311055
exp = Series(['bao', 'bao', NA])
1032-
result = values.str.replace('f.', 'ba')
1056+
result = values.str.replace('f.', 'ba', regex=True)
10331057
tm.assert_series_equal(result, exp)
10341058

10351059
exp = Series(['bao', 'foo', NA])
@@ -2726,6 +2750,7 @@ def test_partition_deprecation(self):
27262750
result = values.str.rpartition(pat='_')
27272751
tm.assert_frame_equal(result, expected)
27282752

2753+
@pytest.mark.filterwarnings("ignore: '|' is interpreted as a literal")
27292754
def test_pipe_failures(self):
27302755
# #2119
27312756
s = Series(['A|B|C'])
@@ -2735,7 +2760,7 @@ def test_pipe_failures(self):
27352760

27362761
tm.assert_series_equal(result, exp)
27372762

2738-
result = s.str.replace('|', ' ')
2763+
result = s.str.replace('|', ' ', regex=None)
27392764
exp = Series(['A B C'])
27402765

27412766
tm.assert_series_equal(result, exp)
@@ -2996,17 +3021,17 @@ def test_replace_moar(self):
29963021
s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA',
29973022
'dog', 'cat'])
29983023

2999-
result = s.str.replace('A', 'YYY')
3024+
result = s.str.replace('A', 'YYY', regex=True)
30003025
expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA,
30013026
'CYYYBYYY', 'dog', 'cat'])
30023027
assert_series_equal(result, expected)
30033028

3004-
result = s.str.replace('A', 'YYY', case=False)
3029+
result = s.str.replace('A', 'YYY', case=False, regex=True)
30053030
expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA,
30063031
'CYYYBYYY', 'dog', 'cYYYt'])
30073032
assert_series_equal(result, expected)
30083033

3009-
result = s.str.replace('^.a|dog', 'XX-XX ', case=False)
3034+
result = s.str.replace('^.a|dog', 'XX-XX ', case=False, regex=True)
30103035
expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA,
30113036
'XX-XX BA', 'XX-XX ', 'XX-XX t'])
30123037
assert_series_equal(result, expected)
@@ -3177,6 +3202,40 @@ def test_method_on_bytes(self):
31773202
with pytest.raises(TypeError, match="can't concat str to bytes"):
31783203
lhs.str.cat(rhs)
31793204

3205+
@pytest.mark.filterwarnings("ignore: '.' is interpreted as a literal")
3206+
@pytest.mark.parametrize("regex, expected_array", [
3207+
(True, ['foofoofoo', 'foofoofoo']),
3208+
(False, ['abc', '123']),
3209+
(None, ['abc', '123'])
3210+
])
3211+
def test_replace_single_pattern(self, regex, expected_array):
3212+
values = Series(['abc', '123'])
3213+
# GH: 24804
3214+
result = values.str.replace('.', 'foo', regex=regex)
3215+
expected = Series(expected_array)
3216+
tm.assert_series_equal(result, expected)
3217+
3218+
@pytest.mark.parametrize("input_array, single_char, replace_char, "
3219+
"expect_array, warn",
3220+
[("a.c", ".", "b", "abc", True),
3221+
("a@c", "@", "at", "aatc", False)]
3222+
)
3223+
def test_replace_warning_single_character(self, input_array,
3224+
single_char, replace_char,
3225+
expect_array, warn):
3226+
# GH: 24804
3227+
values = Series([input_array])
3228+
if warn:
3229+
with tm.assert_produces_warning(FutureWarning,
3230+
check_stacklevel=False):
3231+
result = values.str.replace(single_char, replace_char)
3232+
else:
3233+
result = values.str.replace(single_char, replace_char)
3234+
3235+
expected = Series([expect_array])
3236+
tm.assert_series_equal(result, expected)
3237+
3238+
@pytest.mark.skipif(compat.PY2, reason='not in python2')
31803239
def test_casefold(self):
31813240
# GH25405
31823241
expected = Series(['ss', NA, 'case', 'ssd'])

0 commit comments

Comments
 (0)