Skip to content

Commit 193d838

Browse files
committed
Revert "bpo-23689: re module, fix memory leak when a match is terminated by a signal or memory allocation failure (pythonGH-32283)"
This reverts commit 6e3eee5. Manual fixups were needed to increase the MAGIC number and to resolve conflicts with a couple of changes that landed after that commit.
1 parent 8ba1c7f commit 193d838

File tree

8 files changed

+109
-144
lines changed

8 files changed

+109
-144
lines changed

Lib/re/_compiler.py

+60-37
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,54 @@
2828
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
2929
}
3030

31-
class _CompileData:
32-
__slots__ = ('code', 'repeat_count')
33-
def __init__(self):
34-
self.code = []
35-
self.repeat_count = 0
31+
# Sets of lowercase characters which have the same uppercase.
32+
_equivalences = (
33+
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
34+
(0x69, 0x131), # iı
35+
# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
36+
(0x73, 0x17f), # sſ
37+
# MICRO SIGN, GREEK SMALL LETTER MU
38+
(0xb5, 0x3bc), # µμ
39+
# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
40+
(0x345, 0x3b9, 0x1fbe), # \u0345ιι
41+
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
42+
(0x390, 0x1fd3), # ΐΐ
43+
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
44+
(0x3b0, 0x1fe3), # ΰΰ
45+
# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
46+
(0x3b2, 0x3d0), # βϐ
47+
# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
48+
(0x3b5, 0x3f5), # εϵ
49+
# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
50+
(0x3b8, 0x3d1), # θϑ
51+
# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
52+
(0x3ba, 0x3f0), # κϰ
53+
# GREEK SMALL LETTER PI, GREEK PI SYMBOL
54+
(0x3c0, 0x3d6), # πϖ
55+
# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
56+
(0x3c1, 0x3f1), # ρϱ
57+
# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
58+
(0x3c2, 0x3c3), # ςσ
59+
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
60+
(0x3c6, 0x3d5), # φϕ
61+
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
62+
(0x1e61, 0x1e9b), # ṡẛ
63+
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
64+
(0xfb05, 0xfb06), # ﬅﬆ
65+
)
66+
67+
# Maps the lowercase code to lowercase codes which have the same uppercase.
68+
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
69+
for t in _equivalences for i in t}
3670

3771
def _combine_flags(flags, add_flags, del_flags,
3872
TYPE_FLAGS=_parser.TYPE_FLAGS):
3973
if add_flags & TYPE_FLAGS:
4074
flags &= ~TYPE_FLAGS
4175
return (flags | add_flags) & ~del_flags
4276

43-
def _compile(data, pattern, flags):
77+
def _compile(code, pattern, flags):
4478
# internal: compile a (sub)pattern
45-
code = data.code
4679
emit = code.append
4780
_len = len
4881
LITERAL_CODES = _LITERAL_CODES
@@ -115,19 +148,15 @@ def _compile(data, pattern, flags):
115148
skip = _len(code); emit(0)
116149
emit(av[0])
117150
emit(av[1])
118-
_compile(data, av[2], flags)
151+
_compile(code, av[2], flags)
119152
emit(SUCCESS)
120153
code[skip] = _len(code) - skip
121154
else:
122155
emit(REPEATING_CODES[op][0])
123156
skip = _len(code); emit(0)
124157
emit(av[0])
125158
emit(av[1])
126-
# now op is in (MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT)
127-
if op != POSSESSIVE_REPEAT:
128-
emit(data.repeat_count)
129-
data.repeat_count += 1
130-
_compile(data, av[2], flags)
159+
_compile(code, av[2], flags)
131160
code[skip] = _len(code) - skip
132161
emit(REPEATING_CODES[op][1])
133162
elif op is SUBPATTERN:
@@ -136,7 +165,7 @@ def _compile(data, pattern, flags):
136165
emit(MARK)
137166
emit((group-1)*2)
138167
# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
139-
_compile(data, p, _combine_flags(flags, add_flags, del_flags))
168+
_compile(code, p, _combine_flags(flags, add_flags, del_flags))
140169
if group:
141170
emit(MARK)
142171
emit((group-1)*2+1)
@@ -148,7 +177,7 @@ def _compile(data, pattern, flags):
148177
# pop their stack if they reach it
149178
emit(ATOMIC_GROUP)
150179
skip = _len(code); emit(0)
151-
_compile(data, av, flags)
180+
_compile(code, av, flags)
152181
emit(SUCCESS)
153182
code[skip] = _len(code) - skip
154183
elif op in SUCCESS_CODES:
@@ -163,7 +192,7 @@ def _compile(data, pattern, flags):
163192
if lo != hi:
164193
raise error("look-behind requires fixed-width pattern")
165194
emit(lo) # look behind
166-
_compile(data, av[1], flags)
195+
_compile(code, av[1], flags)
167196
emit(SUCCESS)
168197
code[skip] = _len(code) - skip
169198
elif op is AT:
@@ -182,7 +211,7 @@ def _compile(data, pattern, flags):
182211
for av in av[1]:
183212
skip = _len(code); emit(0)
184213
# _compile_info(code, av, flags)
185-
_compile(data, av, flags)
214+
_compile(code, av, flags)
186215
emit(JUMP)
187216
tailappend(_len(code)); emit(0)
188217
code[skip] = _len(code) - skip
@@ -210,12 +239,12 @@ def _compile(data, pattern, flags):
210239
emit(op)
211240
emit(av[0]-1)
212241
skipyes = _len(code); emit(0)
213-
_compile(data, av[1], flags)
242+
_compile(code, av[1], flags)
214243
if av[2]:
215244
emit(JUMP)
216245
skipno = _len(code); emit(0)
217246
code[skipyes] = _len(code) - skipyes + 1
218-
_compile(data, av[2], flags)
247+
_compile(code, av[2], flags)
219248
code[skipno] = _len(code) - skipno
220249
else:
221250
code[skipyes] = _len(code) - skipyes + 1
@@ -582,17 +611,17 @@ def isstring(obj):
582611
def _code(p, flags):
583612

584613
flags = p.state.flags | flags
585-
data = _CompileData()
614+
code = []
586615

587616
# compile info block
588-
_compile_info(data.code, p, flags)
617+
_compile_info(code, p, flags)
589618

590619
# compile the pattern
591-
_compile(data, p.data, flags)
620+
_compile(code, p.data, flags)
592621

593-
data.code.append(SUCCESS)
622+
code.append(SUCCESS)
594623

595-
return data
624+
return code
596625

597626
def _hex_code(code):
598627
return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
@@ -693,21 +722,14 @@ def print_2(*args):
693722
else:
694723
print_(FAILURE)
695724
i += 1
696-
elif op in (REPEAT_ONE, MIN_REPEAT_ONE,
725+
elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
697726
POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
698727
skip, min, max = code[i: i+3]
699728
if max == MAXREPEAT:
700729
max = 'MAXREPEAT'
701730
print_(op, skip, min, max, to=i+skip)
702731
dis_(i+3, i+skip)
703732
i += skip
704-
elif op is REPEAT:
705-
skip, min, max, repeat_index = code[i: i+4]
706-
if max == MAXREPEAT:
707-
max = 'MAXREPEAT'
708-
print_(op, skip, min, max, repeat_index, to=i+skip)
709-
dis_(i+4, i+skip)
710-
i += skip
711733
elif op is GROUPREF_EXISTS:
712734
arg, skip = code[i: i+2]
713735
print_(op, arg, skip, to=i+skip)
@@ -762,11 +784,11 @@ def compile(p, flags=0):
762784
else:
763785
pattern = None
764786

765-
data = _code(p, flags)
787+
code = _code(p, flags)
766788

767789
if flags & SRE_FLAG_DEBUG:
768790
print()
769-
dis(data.code)
791+
dis(code)
770792

771793
# map in either direction
772794
groupindex = p.state.groupdict
@@ -775,6 +797,7 @@ def compile(p, flags=0):
775797
indexgroup[i] = k
776798

777799
return _sre.compile(
778-
pattern, flags | p.state.flags, data.code,
779-
p.state.groups-1, groupindex, tuple(indexgroup),
780-
data.repeat_count)
800+
pattern, flags | p.state.flags, code,
801+
p.state.groups-1,
802+
groupindex, tuple(indexgroup)
803+
)

Lib/re/_constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
# update when constants are added or removed
1515

16-
MAGIC = 20220423
16+
MAGIC = 20220615
1717

1818
from _sre import MAXREPEAT, MAXGROUPS
1919

Lib/test/test_re.py

+2-26
Original file line numberDiff line numberDiff line change
@@ -1765,12 +1765,9 @@ def test_dealloc(self):
17651765
long_overflow = 2**128
17661766
self.assertRaises(TypeError, re.finditer, "a", {})
17671767
with self.assertRaises(OverflowError):
1768-
_sre.compile("abc", 0, [long_overflow], 0, {}, (), 0)
1768+
_sre.compile("abc", 0, [long_overflow], 0, {}, ())
17691769
with self.assertRaises(TypeError):
1770-
_sre.compile({}, 0, [], 0, [], [], 0)
1771-
with self.assertRaises(RuntimeError):
1772-
# invalid repeat_count -1
1773-
_sre.compile("abc", 0, [1], 0, {}, (), -1)
1770+
_sre.compile({}, 0, [], 0, [], [])
17741771

17751772
def test_search_dot_unicode(self):
17761773
self.assertTrue(re.search("123.*-", '123abc-'))
@@ -2509,27 +2506,6 @@ def test_possesive_repeat(self):
25092506
14. SUCCESS
25102507
''')
25112508

2512-
def test_repeat_index(self):
2513-
self.assertEqual(get_debug_out(r'(?:ab)*?(?:cd)*'), '''\
2514-
MIN_REPEAT 0 MAXREPEAT
2515-
LITERAL 97
2516-
LITERAL 98
2517-
MAX_REPEAT 0 MAXREPEAT
2518-
LITERAL 99
2519-
LITERAL 100
2520-
2521-
0. INFO 4 0b0 0 MAXREPEAT (to 5)
2522-
5: REPEAT 8 0 MAXREPEAT 0 (to 14)
2523-
10. LITERAL 0x61 ('a')
2524-
12. LITERAL 0x62 ('b')
2525-
14: MIN_UNTIL
2526-
15. REPEAT 8 0 MAXREPEAT 1 (to 24)
2527-
20. LITERAL 0x63 ('c')
2528-
22. LITERAL 0x64 ('d')
2529-
24: MAX_UNTIL
2530-
25. SUCCESS
2531-
''')
2532-
25332509

25342510
class PatternReprTests(unittest.TestCase):
25352511
def check(self, pattern, expected):

Modules/_sre/clinic/sre.c.h

+6-19
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)