Commit aee7760

Grab string/character lexer constants from pycparser
1 parent 03c24a2 commit aee7760

4 files changed: +282 -15 lines


Diff for: LICENSE.txt

+29 -1

@@ -1,6 +1,6 @@
 cxxheaderparser license:

-Copyright (c) 2020 Dustin Spicuzza <[email protected]>
+Copyright (c) 2020-2022 Dustin Spicuzza <[email protected]>
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@@ -102,3 +102,31 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 -----------------------------------------------------------------------------
+
+pycparser -- A C parser in Python
+
+Copyright (c) 2008-2022, Eli Bendersky
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Diff for: cxxheaderparser/lexer.py

+236 -10

@@ -61,6 +61,10 @@ class LexToken(Protocol):


 class Lexer:
+    """
+    This lexer is a combination of pieces from the PLY lexers that CppHeaderParser
+    and pycparser have.
+    """

     keywords = {
         "__attribute__",
@@ -144,15 +148,33 @@ class Lexer:
     }

     tokens = [
-        "NUMBER",
-        "FLOAT_NUMBER",
+        # constants
+        "FLOAT_CONST",
+        "HEX_FLOAT_CONST",
+        "INT_CONST_HEX",
+        "INT_CONST_BIN",
+        "INT_CONST_OCT",
+        "INT_CONST_DEC",
+        "INT_CONST_CHAR",
+        "CHAR_CONST",
+        "WCHAR_CONST",
+        "U8CHAR_CONST",
+        "U16CHAR_CONST",
+        "U32CHAR_CONST",
+        # String literals
+        "STRING_LITERAL",
+        "WSTRING_LITERAL",
+        "U8STRING_LITERAL",
+        "U16STRING_LITERAL",
+        "U32STRING_LITERAL",
+        #
         "NAME",
+        # Comments
         "COMMENT_SINGLELINE",
         "COMMENT_MULTILINE",
         "PRECOMP_MACRO",
+        # misc
         "DIVIDE",
-        "CHAR_LITERAL",
-        "STRING_LITERAL",
         "NEWLINE",
         "ELLIPSIS",
         "DBL_LBRACKET",
@@ -189,9 +211,216 @@ class Lexer:
         ".",
     ]

+    #
+    # Regexes for use in tokens (taken from pycparser)
+    #
+
+    hex_prefix = "0[xX]"
+    hex_digits = "[0-9a-fA-F]+"
+    bin_prefix = "0[bB]"
+    bin_digits = "[01]+"
+
+    # integer constants (K&R2: A.2.5.1)
+    integer_suffix_opt = (
+        r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
+    )
+    decimal_constant = (
+        "(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")"
+    )
+    octal_constant = "0[0-7]*" + integer_suffix_opt
+    hex_constant = hex_prefix + hex_digits + integer_suffix_opt
+    bin_constant = bin_prefix + bin_digits + integer_suffix_opt
+
+    bad_octal_constant = "0[0-7]*[89]"
+
+    # character constants (K&R2: A.2.5.2)
+    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
+    # directives with Windows paths as filenames (..\..\dir\file)
+    # For the same reason, decimal_escape allows all digit sequences. We want to
+    # parse all correct code, even if it means to sometimes parse incorrect
+    # code.
+    #
+    # The original regexes were taken verbatim from the C syntax definition,
+    # and were later modified to avoid worst-case exponential running time.
+    #
+    # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
+    # decimal_escape = r"""(\d+)"""
+    # hex_escape = r"""(x[0-9a-fA-F]+)"""
+    # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
+    #
+    # The following modifications were made to avoid the ambiguity that allowed backtracking:
+    # (https://github.com/eliben/pycparser/issues/61)
+    #
+    # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
+    # - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex
+    # - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal
+    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
+    #
+    # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
+    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
+
+    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
+    decimal_escape = r"""(\d+)(?!\d)"""
+    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
+    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
+
+    escape_sequence = (
+        r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
+    )
+
+    # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
+    # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to
+
+    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
+
+    cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")"
+    char_const = "'" + cconst_char + "'"
+    wchar_const = "L" + char_const
+    u8char_const = "u8" + char_const
+    u16char_const = "u" + char_const
+    u32char_const = "U" + char_const
+    multicharacter_constant = "'" + cconst_char + "{2,4}'"
+    unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)"
+    bad_char_const = (
+        r"""('"""
+        + cconst_char
+        + """[^'\n]+')|('')|('"""
+        + bad_escape
+        + r"""[^'\n]*')"""
+    )
+
+    # string literals (K&R2: A.2.6)
+    string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")"
+    string_literal = '"' + string_char + '*"'
+    wstring_literal = "L" + string_literal
+    u8string_literal = "u8" + string_literal
+    u16string_literal = "u" + string_literal
+    u32string_literal = "U" + string_literal
+    bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"'
+
+    # floating constants (K&R2: A.2.5.3)
+    exponent_part = r"""([eE][-+]?[0-9]+)"""
+    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
+    floating_constant = (
+        "(((("
+        + fractional_constant
+        + ")"
+        + exponent_part
+        + "?)|([0-9]+"
+        + exponent_part
+        + "))[FfLl]?)"
+    )
+    binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
+    hex_fractional_constant = (
+        "(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))"""
+    )
+    hex_floating_constant = (
+        "("
+        + hex_prefix
+        + "("
+        + hex_digits
+        + "|"
+        + hex_fractional_constant
+        + ")"
+        + binary_exponent_part
+        + "[FfLl]?)"
+    )
+
     t_ignore = " \t\r?@\f"
-    t_NUMBER = r"[0-9][0-9XxA-Fa-f]*"
-    t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?"
+
+    # The following floating and integer constants are defined as
+    # functions to impose a strict order (otherwise, decimal
+    # is placed before the others because its regex is longer,
+    # and this is bad)
+    #
+    @TOKEN(floating_constant)
+    def t_FLOAT_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(hex_floating_constant)
+    def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(hex_constant)
+    def t_INT_CONST_HEX(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(bin_constant)
+    def t_INT_CONST_BIN(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(bad_octal_constant)
+    def t_BAD_CONST_OCT(self, t: LexToken) -> None:
+        msg = "Invalid octal constant"
+        self._error(msg, t)
+
+    @TOKEN(octal_constant)
+    def t_INT_CONST_OCT(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(decimal_constant)
+    def t_INT_CONST_DEC(self, t: LexToken) -> LexToken:
+        return t
+
+    # Must come before bad_char_const, to prevent it from
+    # catching valid char constants as invalid
+    #
+    @TOKEN(multicharacter_constant)
+    def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(char_const)
+    def t_CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(wchar_const)
+    def t_WCHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u8char_const)
+    def t_U8CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u16char_const)
+    def t_U16CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u32char_const)
+    def t_U32CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(unmatched_quote)
+    def t_UNMATCHED_QUOTE(self, t: LexToken) -> None:
+        msg = "Unmatched '"
+        self._error(msg, t)
+
+    @TOKEN(bad_char_const)
+    def t_BAD_CHAR_CONST(self, t: LexToken) -> None:
+        msg = "Invalid char constant %s" % t.value
+        self._error(msg, t)
+
+    @TOKEN(wstring_literal)
+    def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u8string_literal)
+    def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u16string_literal)
+    def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u32string_literal)
+    def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    # unmatched string literals are caught by the preprocessor
+
+    @TOKEN(bad_string_literal)
+    def t_BAD_STRING_LITERAL(self, t):
+        msg = "String contains invalid escape code"
+        self._error(msg, t)

     @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*")
     def t_NAME(self, t: LexToken) -> LexToken:
@@ -222,7 +451,6 @@ def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
         return t

     t_DIVIDE = r"/(?!/)"
-    t_CHAR_LITERAL = "'.'"
     t_ELLIPSIS = r"\.\.\."
     t_DBL_LBRACKET = r"\[\["
     t_DBL_RBRACKET = r"\]\]"
@@ -232,9 +460,7 @@ def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
     t_SHIFT_LEFT = r"<<"
     # SHIFT_RIGHT introduces ambiguity

-    # found at http://wordaligned.org/articles/string-literals-and-regular-expressions
-    # TODO: This does not work with the string "bla \" bla"
-    t_STRING_LITERAL = r'"([^"\\]|\\.)*"'
+    t_STRING_LITERAL = string_literal

     # Found at http://ostermiller.org/findcomment.html
     @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?")

Diff for: cxxheaderparser/parser.py

+1 -1

@@ -1171,7 +1171,7 @@ def _discard_ctor_initializer(self) -> None:

     def _parse_bitfield(self) -> int:
         # is a integral constant expression... for now, just do integers
-        tok = self._next_token_must_be("NUMBER")
+        tok = self._next_token_must_be("INT_CONST_DEC")
         return int(tok.value)

     def _parse_field(
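
With the old NUMBER token gone, _parse_bitfield now consumes the bitfield width as an INT_CONST_DEC token. A minimal usage sketch follows, assuming the public parse_string helper from cxxheaderparser.simple; the struct is just an example input.

from cxxheaderparser.simple import parse_string

content = """
struct Flags {
    unsigned int enabled : 1;
    unsigned int mode : 3;
};
"""

# The widths "1" and "3" are lexed as INT_CONST_DEC and consumed by
# _parse_bitfield via _next_token_must_be("INT_CONST_DEC").
print(parse_string(content))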

Diff for: cxxheaderparser/tokfmt.py

+16 -3

@@ -5,11 +5,24 @@

 # key: token type, value: (left spacing, right spacing)
 _want_spacing = {
-    "NUMBER": (2, 2),
-    "FLOAT_NUMBER": (2, 2),
+    "FLOAT_CONST": (2, 2),
+    "HEX_FLOAT_CONST": (2, 2),
+    "INT_CONST_HEX": (2, 2),
+    "INT_CONST_BIN": (2, 2),
+    "INT_CONST_OCT": (2, 2),
+    "INT_CONST_DEC": (2, 2),
+    "INT_CONST_CHAR": (2, 2),
     "NAME": (2, 2),
-    "CHAR_LITERAL": (2, 2),
+    "CHAR_CONST": (2, 2),
+    "WCHAR_CONST": (2, 2),
+    "U8CHAR_CONST": (2, 2),
+    "U16CHAR_CONST": (2, 2),
+    "U32CHAR_CONST": (2, 2),
     "STRING_LITERAL": (2, 2),
+    "WSTRING_LITERAL": (2, 2),
+    "U8STRING_LITERAL": (2, 2),
+    "U16STRING_LITERAL": (2, 2),
+    "U32STRING_LITERAL": (2, 2),
     "ELLIPSIS": (2, 2),
     ">": (0, 2),
     ")": (0, 1),
