Skip to content

Commit d8e163b

Browse files
Ansuel and dpgeorge
authored and committed
unix-ffi/re: Convert to PCRE2.
PCRE is marked as EOL and won't receive any new security updates. Convert the re module to the PCRE2 API so that it stays on a maintained, secure library. An additional dependency on uctypes is now needed, because PCRE2 returns the match data through a pointer, which requires special handling. The converted module was tested with test_re.py and shows no regressions. Signed-off-by: Christian Marangi <[email protected]>
1 parent 0620d02 commit d8e163b

File tree

1 file changed

+48
-25
lines changed

1 file changed

+48
-25
lines changed

unix-ffi/re/re.py

+48-25
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,55 @@
11
import sys
22
import ffilib
33
import array
4+
import uctypes
45

6+
pcre2 = ffilib.open("libpcre2-8")
57

6-
pcre = ffilib.open("libpcre")
8+
# pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
9+
# uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
10+
# pcre2_compile_context *ccontext);
11+
pcre2_compile = pcre2.func("p", "pcre2_compile_8", "siippp")
712

8-
# pcre *pcre_compile(const char *pattern, int options,
9-
# const char **errptr, int *erroffset,
10-
# const unsigned char *tableptr);
11-
pcre_compile = pcre.func("p", "pcre_compile", "sipps")
13+
# int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
14+
# PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options,
15+
# pcre2_match_data *match_data, pcre2_match_context *mcontext);
16+
pcre2_match = pcre2.func("i", "pcre2_match_8", "Psiiipp")
1217

13-
# int pcre_exec(const pcre *code, const pcre_extra *extra,
14-
# const char *subject, int length, int startoffset,
15-
# int options, int *ovector, int ovecsize);
16-
pcre_exec = pcre.func("i", "pcre_exec", "PPsiiipi")
18+
# int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
19+
# void *where);
20+
pcre2_pattern_info = pcre2.func("i", "pcre2_pattern_info_8", "Pip")
1721

18-
# int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
19-
# int what, void *where);
20-
pcre_fullinfo = pcre.func("i", "pcre_fullinfo", "PPip")
22+
# PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
23+
pcre2_get_ovector_pointer = pcre2.func("p", "pcre2_get_ovector_pointer_8", "p")
2124

25+
# pcre2_match_data *pcre2_match_data_create_from_pattern(const pcre2_code *code,
26+
# pcre2_general_context *gcontext);
27+
pcre2_match_data_create_from_pattern = pcre2.func(
28+
"p", "pcre2_match_data_create_from_pattern_8", "Pp"
29+
)
2230

23-
IGNORECASE = I = 1
24-
MULTILINE = M = 2
25-
DOTALL = S = 4
26-
VERBOSE = X = 8
27-
PCRE_ANCHORED = 0x10
31+
# PCRE2_SIZE is of type size_t.
32+
# Use ULONG as type to support both 32bit and 64bit.
33+
PCRE2_SIZE_SIZE = uctypes.sizeof({"field": 0 | uctypes.ULONG})
34+
PCRE2_SIZE_TYPE = "L"
35+
36+
# Real value in pcre2.h is 0xFFFFFFFF for 32bit and
37+
# 0xFFFFFFFFFFFFFFFF for 64bit that is equivalent
38+
# to -1
39+
PCRE2_ZERO_TERMINATED = -1
40+
41+
42+
IGNORECASE = I = 0x8
43+
MULTILINE = M = 0x400
44+
DOTALL = S = 0x20
45+
VERBOSE = X = 0x80
46+
PCRE2_ANCHORED = 0x80000000
2847

2948
# TODO. Note that Python3 has unicode by default
3049
ASCII = A = 0
3150
UNICODE = U = 0
3251

33-
PCRE_INFO_CAPTURECOUNT = 2
52+
PCRE2_INFO_CAPTURECOUNT = 0x4
3453

3554

3655
class PCREMatch:
@@ -67,19 +86,23 @@ def __init__(self, compiled_ptn):
6786
def search(self, s, pos=0, endpos=-1, _flags=0):
6887
assert endpos == -1, "pos: %d, endpos: %d" % (pos, endpos)
6988
buf = array.array("i", [0])
70-
pcre_fullinfo(self.obj, None, PCRE_INFO_CAPTURECOUNT, buf)
89+
pcre2_pattern_info(self.obj, PCRE2_INFO_CAPTURECOUNT, buf)
7190
cap_count = buf[0]
72-
ov = array.array("i", [0, 0, 0] * (cap_count + 1))
73-
num = pcre_exec(self.obj, None, s, len(s), pos, _flags, ov, len(ov))
91+
match_data = pcre2_match_data_create_from_pattern(self.obj, None)
92+
num = pcre2_match(self.obj, s, len(s), pos, _flags, match_data, None)
7493
if num == -1:
7594
# No match
7695
return None
96+
ov_ptr = pcre2_get_ovector_pointer(match_data)
97+
# pcre2_get_ovector_pointer returns a pointer to PCRE2_SIZE values
98+
ov_buf = uctypes.bytearray_at(ov_ptr, PCRE2_SIZE_SIZE * (cap_count + 1) * 2)
99+
ov = array.array(PCRE2_SIZE_TYPE, ov_buf)
77100
# We don't care how many matching subexpressions we got, we
78101
# care only about total # of capturing ones (including empty)
79102
return PCREMatch(s, cap_count + 1, ov)
80103

81104
def match(self, s, pos=0, endpos=-1):
82-
return self.search(s, pos, endpos, PCRE_ANCHORED)
105+
return self.search(s, pos, endpos, PCRE2_ANCHORED)
83106

84107
def sub(self, repl, s, count=0):
85108
if not callable(repl):
@@ -141,9 +164,9 @@ def findall(self, s):
141164

142165

143166
def compile(pattern, flags=0):
144-
errptr = bytes(4)
167+
errcode = bytes(4)
145168
erroffset = bytes(4)
146-
regex = pcre_compile(pattern, flags, errptr, erroffset, None)
169+
regex = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, flags, errcode, erroffset, None)
147170
assert regex
148171
return PCREPattern(regex)
149172

@@ -154,7 +177,7 @@ def search(pattern, string, flags=0):
154177

155178

156179
def match(pattern, string, flags=0):
157-
r = compile(pattern, flags | PCRE_ANCHORED)
180+
r = compile(pattern, flags | PCRE2_ANCHORED)
158181
return r.search(string)
159182

160183

0 commit comments

Comments
 (0)