@@ -61,6 +61,10 @@ class LexToken(Protocol):


class Lexer:
+    """
+    This lexer is a combination of pieces from the PLY lexers that CppHeaderParser
+    and pycparser have.
+    """

    keywords = {
        "__attribute__",
@@ -144,15 +148,33 @@ class Lexer:
    }

    tokens = [
-        "NUMBER",
-        "FLOAT_NUMBER",
+        # constants
+        "FLOAT_CONST",
+        "HEX_FLOAT_CONST",
+        "INT_CONST_HEX",
+        "INT_CONST_BIN",
+        "INT_CONST_OCT",
+        "INT_CONST_DEC",
+        "INT_CONST_CHAR",
+        "CHAR_CONST",
+        "WCHAR_CONST",
+        "U8CHAR_CONST",
+        "U16CHAR_CONST",
+        "U32CHAR_CONST",
+        # String literals
+        "STRING_LITERAL",
+        "WSTRING_LITERAL",
+        "U8STRING_LITERAL",
+        "U16STRING_LITERAL",
+        "U32STRING_LITERAL",
+        #
        "NAME",
+        # Comments
        "COMMENT_SINGLELINE",
        "COMMENT_MULTILINE",
        "PRECOMP_MACRO",
+        # misc
        "DIVIDE",
-        "CHAR_LITERAL",
-        "STRING_LITERAL",
        "NEWLINE",
        "ELLIPSIS",
        "DBL_LBRACKET",
@@ -189,9 +211,216 @@ class Lexer:
        ".",
    ]

+    #
+    # Regexes for use in tokens (taken from pycparser)
+    #
+
+    hex_prefix = "0[xX]"
+    hex_digits = "[0-9a-fA-F]+"
+    bin_prefix = "0[bB]"
+    bin_digits = "[01]+"
+
+    # integer constants (K&R2: A.2.5.1)
+    integer_suffix_opt = (
+        r"(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?"
+    )
+    decimal_constant = (
+        "(0" + integer_suffix_opt + ")|([1-9][0-9]*" + integer_suffix_opt + ")"
+    )
+    octal_constant = "0[0-7]*" + integer_suffix_opt
+    hex_constant = hex_prefix + hex_digits + integer_suffix_opt
+    bin_constant = bin_prefix + bin_digits + integer_suffix_opt
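+    # e.g. "42u" (INT_CONST_DEC), "0777L" (INT_CONST_OCT), "0x1Fu" (INT_CONST_HEX)
+    # and "0b1010ull" (INT_CONST_BIN) are all matched by the constants above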
+
+    bad_octal_constant = "0[0-7]*[89]"
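+    # an octal-looking literal that contains an 8 or 9, e.g. "0778", is reported
+    # as an error by t_BAD_CONST_OCT below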
+
+    # character constants (K&R2: A.2.5.2)
+    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
+    # directives with Windows paths as filenames (..\..\dir\file)
+    # For the same reason, decimal_escape allows all digit sequences. We want to
+    # parse all correct code, even if it means to sometimes parse incorrect
+    # code.
+    #
+    # The original regexes were taken verbatim from the C syntax definition,
+    # and were later modified to avoid worst-case exponential running time.
+    #
+    # simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
+    # decimal_escape = r"""(\d+)"""
+    # hex_escape = r"""(x[0-9a-fA-F]+)"""
+    # bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
+    #
+    # The following modifications were made to avoid the ambiguity that allowed backtracking:
+    # (https://github.com/eliben/pycparser/issues/61)
+    #
+    # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
+    # - hex_escape allows one or more hex characters, but requires that the next character (if any) is not hex
+    # - decimal_escape allows one or more decimal characters, but requires that the next character (if any) is not a decimal
+    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
+    #
+    # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
+    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
+
+    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
+    decimal_escape = r"""(\d+)(?!\d)"""
+    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
+    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""
+
+    escape_sequence = (
+        r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
+    )
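+    # with the lookaheads, shorter matches such as `\1` or `\12` inside `\123` are
+    # rejected immediately, which is what prevents the exponential backtracking above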
+
+    # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
+    # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to
+
+    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""
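+    # inside a string only the first escaped character matters: e.g. in "\x41BC" the
+    # "\x" matches here and "41BC" is consumed as ordinary string characters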
+
+    cconst_char = r"""([^'\\\n]|""" + escape_sequence + ")"
+    char_const = "'" + cconst_char + "'"
+    wchar_const = "L" + char_const
+    u8char_const = "u8" + char_const
+    u16char_const = "u" + char_const
+    u32char_const = "U" + char_const
+    multicharacter_constant = "'" + cconst_char + "{2,4}'"
+    unmatched_quote = "('" + cconst_char + "*\\n)|('" + cconst_char + "*$)"
+    bad_char_const = (
+        r"""('"""
+        + cconst_char
+        + """[^'\n]+')|('')|('"""
+        + bad_escape
+        + r"""[^'\n]*')"""
+    )
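+    # e.g. 'a' -> CHAR_CONST, L'a' -> WCHAR_CONST, 'ab' (2-4 chars) -> INT_CONST_CHAR;
+    # '' and over-long constants such as 'abcde' fall through to bad_char_const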
+
+    # string literals (K&R2: A.2.6)
+    string_char = r"""([^"\\\n]|""" + escape_sequence_start_in_string + ")"
+    string_literal = '"' + string_char + '*"'
+    wstring_literal = "L" + string_literal
+    u8string_literal = "u8" + string_literal
+    u16string_literal = "u" + string_literal
+    u32string_literal = "U" + string_literal
+    bad_string_literal = '"' + string_char + "*" + bad_escape + string_char + '*"'
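+    # e.g. "hi", L"wide", u8"x", u"x", U"x" match the literals above, while a string
+    # with an unrecognized escape such as "\%" is caught by bad_string_literal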
+
+    # floating constants (K&R2: A.2.5.3)
+    exponent_part = r"""([eE][-+]?[0-9]+)"""
+    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
+    floating_constant = (
+        "(((("
+        + fractional_constant
+        + ")"
+        + exponent_part
+        + "?)|([0-9]+"
+        + exponent_part
+        + "))[FfLl]?)"
+    )
+    binary_exponent_part = r"""([pP][+-]?[0-9]+)"""
+    hex_fractional_constant = (
+        "(((" + hex_digits + r""")?\.""" + hex_digits + ")|(" + hex_digits + r"""\.))"""
+    )
+    hex_floating_constant = (
+        "("
+        + hex_prefix
+        + "("
+        + hex_digits
+        + "|"
+        + hex_fractional_constant
+        + ")"
+        + binary_exponent_part
+        + "[FfLl]?)"
+    )
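+    # e.g. "1.5e-3f", ".25", "3." and "1e10L" match floating_constant, while
+    # "0x1.8p3" matches hex_floating_constant (the binary exponent is required)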
+
    t_ignore = " \t\r?@\f"
-    t_NUMBER = r"[0-9][0-9XxA-Fa-f]*"
-    t_FLOAT_NUMBER = r"[-+]?[0-9]*\.[0-9]+([eE][-+]?[0-9]+)?"
+
+    # The following floating and integer constants are defined as
+    # functions to impose a strict order (otherwise, decimal
+    # is placed before the others because its regex is longer,
+    # and this is bad)
+    #
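+    # (PLY tries function rules in their order of definition, but sorts plain string
+    # rules by decreasing regex length, which would try decimal_constant too early)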
+    @TOKEN(floating_constant)
+    def t_FLOAT_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(hex_floating_constant)
+    def t_HEX_FLOAT_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(hex_constant)
+    def t_INT_CONST_HEX(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(bin_constant)
+    def t_INT_CONST_BIN(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(bad_octal_constant)
+    def t_BAD_CONST_OCT(self, t: LexToken) -> None:
+        msg = "Invalid octal constant"
+        self._error(msg, t)
+
+    @TOKEN(octal_constant)
+    def t_INT_CONST_OCT(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(decimal_constant)
+    def t_INT_CONST_DEC(self, t: LexToken) -> LexToken:
+        return t
+
+    # Must come before bad_char_const, to prevent it from
+    # catching valid char constants as invalid
+    #
+    @TOKEN(multicharacter_constant)
+    def t_INT_CONST_CHAR(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(char_const)
+    def t_CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(wchar_const)
+    def t_WCHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u8char_const)
+    def t_U8CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u16char_const)
+    def t_U16CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u32char_const)
+    def t_U32CHAR_CONST(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(unmatched_quote)
+    def t_UNMATCHED_QUOTE(self, t: LexToken) -> None:
+        msg = "Unmatched '"
+        self._error(msg, t)
+
+    @TOKEN(bad_char_const)
+    def t_BAD_CHAR_CONST(self, t: LexToken) -> None:
+        msg = "Invalid char constant %s" % t.value
+        self._error(msg, t)
+
+    @TOKEN(wstring_literal)
+    def t_WSTRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u8string_literal)
+    def t_U8STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u16string_literal)
+    def t_U16STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    @TOKEN(u32string_literal)
+    def t_U32STRING_LITERAL(self, t: LexToken) -> LexToken:
+        return t
+
+    # unmatched string literals are caught by the preprocessor
+
+    @TOKEN(bad_string_literal)
+    def t_BAD_STRING_LITERAL(self, t):
+        msg = "String contains invalid escape code"
+        self._error(msg, t)

    @TOKEN(r"[A-Za-z_~][A-Za-z0-9_]*")
    def t_NAME(self, t: LexToken) -> LexToken:
@@ -222,7 +451,6 @@ def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
        return t

    t_DIVIDE = r"/(?!/)"
-    t_CHAR_LITERAL = "'.'"
    t_ELLIPSIS = r"\.\.\."
    t_DBL_LBRACKET = r"\[\["
    t_DBL_RBRACKET = r"\]\]"
@@ -232,9 +460,7 @@ def t_COMMENT_SINGLELINE(self, t: LexToken) -> LexToken:
    t_SHIFT_LEFT = r"<<"
    # SHIFT_RIGHT introduces ambiguity

-    # found at http://wordaligned.org/articles/string-literals-and-regular-expressions
-    # TODO: This does not work with the string "bla \" bla"
-    t_STRING_LITERAL = r'"([^"\\]|\\.)*"'
+    t_STRING_LITERAL = string_literal
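+    # string_literal (defined above) only accepts the escape sequences recognized by
+    # escape_sequence_start_in_string, rather than any backslash-escaped character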

    # Found at http://ostermiller.org/findcomment.html
    @TOKEN(r"/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/\n?")