From 008f8e5a067657de37f6d8adb3bd415e38c671cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Wed, 19 Apr 2023 17:25:22 +0200 Subject: [PATCH 01/20] First iteration --- Grammar/Tokens | 1 + Include/internal/pycore_token.h | 1 + Lib/test/test_tokenize.py | 22 ++++-- Lib/token.py | 1 + Lib/tokenize.py | 120 +++++++++++++++++++++++++++++++- 5 files changed, 139 insertions(+), 6 deletions(-) diff --git a/Grammar/Tokens b/Grammar/Tokens index 096876fdd130f8..8f13217ab1e100 100644 --- a/Grammar/Tokens +++ b/Grammar/Tokens @@ -64,6 +64,7 @@ SOFT_KEYWORD FSTRING_START FSTRING_MIDDLE FSTRING_END +FSTRING_EXPR ERRORTOKEN # These aren't used by the C tokenizer but are needed for tokenize.py diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h index b9df8766736adf..9f50bf05609809 100644 --- a/Include/internal/pycore_token.h +++ b/Include/internal/pycore_token.h @@ -78,6 +78,7 @@ extern "C" { #define FSTRING_MIDDLE 62 #define FSTRING_END 63 #define ERRORTOKEN 64 +#define FSTRING_EXPR 69 #define N_TOKENS 68 #define NT_OFFSET 256 diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 911b53e5816588..95bca9e1a129f7 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -381,21 +381,33 @@ def test_string(self): STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) """) self.check_tokenize('f"abc"', """\ - STRING 'f"abc"' (1, 0) (1, 6) + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc' (1, 2) (1, 5) + FSTRING_END \'"\' (1, 5) (1, 6) """) self.check_tokenize('fR"a{b}c"', """\ - STRING 'fR"a{b}c"' (1, 0) (1, 9) + FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_MIDDLE 'a' (1, 3) (1, 4) + FSTRING_EXPR '{b}' (1, 4) (1, 7) + FSTRING_MIDDLE 'c' (1, 7) (1, 8) + FSTRING_END \'"\' (1, 8) (1, 9) """) self.check_tokenize('f"""abc"""', """\ - STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10) + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE 'abc' (1, 4) (1, 7) + FSTRING_END '\"""' (1, 7) (1, 10) """) self.check_tokenize(r'f"abc\ def"', """\ - STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) + FSTRING_END \'"\' (2, 3) (2, 4) """) self.check_tokenize(r'Rf"abc\ def"', """\ - STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START \'Rf"\' (1, 0) (1, 3) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) + FSTRING_END \'"\' (2, 3) (2, 4) """) def test_function(self): diff --git a/Lib/token.py b/Lib/token.py index 1459d12b376f82..cdbdba9c091076 100644 --- a/Lib/token.py +++ b/Lib/token.py @@ -67,6 +67,7 @@ FSTRING_START = 61 FSTRING_MIDDLE = 62 FSTRING_END = 63 +FSTRING_EXPR = 69 # These aren't used by the C tokenizer but are needed for tokenize.py ERRORTOKEN = 64 COMMENT = 65 diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 46d2224f5cc083..8062fd4875d8a8 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -29,6 +29,7 @@ import collections import functools from io import TextIOWrapper +from io import BytesIO import itertools as _itertools import re import sys @@ -37,6 +38,14 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) +fstring_re = re.compile( + r''' + (?P^[fFrR]{1,2}(?P[\'\"]{1,3})) + (?P.*) + (?P=quote)$ + ''', + re.VERBOSE | re.DOTALL +) import token __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", @@ -430,7 +439,7 @@ def tokenize(readline): return _tokenize(rl_gen.__next__, encoding) -def _tokenize(readline, encoding): +def 
_tokenize_normal_mode(readline, encoding): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 @@ -613,6 +622,115 @@ def _tokenize(readline, encoding): yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') +def _tokenize_fstring_mode(line, tok_start): + line_number, start = tok_start + + parts = fstring_re.match(line) + end = line_number, start + len(parts.group('start')) + yield TokenInfo( + type=FSTRING_START, + string=parts.group('start'), + start=(line_number, start), + end=end, + line=line) + + middle = parts.group('middle') + mid_token, mid_expr = '', '' + curly_brackets = [] + start, i = end[1], 0 + + for c in middle: + match c: + case '{': + # TODO: handle {{ and \{ + curly_brackets.append(c) + mid_expr += c + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start + i), + line=line) + mid_token = '' + end = line_number, start + i + case '}': + curly_brackets.pop() + mid_expr += c + yield TokenInfo( + type=FSTRING_EXPR, + string=mid_expr, + # +1 is needed here since this token is yielded when + # reading the }, before incrementing i. + start=end, + end=(line_number, start + i + 1), + line=line) + mid_expr = '' + end = line_number, start + i + 1 + case '\n': + if mid_expr: + mid_expr += c + else: + mid_token += c + line_number += 1 + start = 0 + i = -1 + case _: + if mid_expr: + mid_expr += c + else: + mid_token += c + i += 1 + + # once the end of the expression is reached, release what's left of + # mid_token + start += i + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start), + line=line) + end = line_number, start + + if curly_brackets: + # TODO: handle syntax error of not matching {} + pass + + yield TokenInfo( + type=FSTRING_END, + string=parts.group('quote'), + start=end, + end=(line_number, start + len(parts.group('quote'))), + line=line) + + +def _is_fstring(tok): + """Checks whether a STRING token is a fstring or not. + + Args: + tok: TokenInfo object of type STRING. + + Returns: + bool + """ + return tok.string.lower().startswith(('f', 'rf', 'fr')) + + +def _tokenize(readline, encoding): + """Tokenize Python code implementing the string mode and the normal mode. + + See PEP701 por more details. + """ + tokens = _tokenize_normal_mode(readline, encoding) + + for tok in tokens: + if tok.type != STRING or not _is_fstring(tok): + yield tok + else: + for t in _tokenize_fstring_mode(tok.string, tok.start): + yield t + + def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. 
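Note on PATCH 01: the pure-Python tokenizer keeps its normal mode untouched and re-tokenizes any STRING token that looks like an f-string through a new f-string mode, emitting FSTRING_START / FSTRING_MIDDLE / FSTRING_EXPR / FSTRING_END instead of a single STRING token. A minimal sketch of how that token stream can be inspected, assuming an interpreter built with this patch applied (FSTRING_EXPR is an intermediate token that later patches in this series remove):

    import tokenize
    from io import BytesIO

    def dump(source):
        # With PATCH 01 applied, tokenize() re-routes f-string STRING tokens
        # through _tokenize_fstring_mode(); on an unpatched build this prints
        # a single STRING token instead.
        readline = BytesIO(source.encode("utf-8")).readline
        for tok in tokenize.tokenize(readline):
            print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

    dump('fR"a{b}c"')
    # Expected, per the test added above:
    #   FSTRING_START  'fR"'  (1, 0) (1, 3)
    #   FSTRING_MIDDLE 'a'    (1, 3) (1, 4)
    #   FSTRING_EXPR   '{b}'  (1, 4) (1, 7)
    #   FSTRING_MIDDLE 'c'    (1, 7) (1, 8)
    #   FSTRING_END    '"'    (1, 8) (1, 9)
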
From 67a6ad6b66bd827d6e4ab7a0cd2e3fb7e6b6b5d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Thu, 27 Apr 2023 17:08:05 +0200 Subject: [PATCH 02/20] Handle escaping { --- Lib/test/test_tokenize.py | 5 ++++ Lib/tokenize.py | 58 +++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 95bca9e1a129f7..c66ae34f73ab54 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -391,6 +391,11 @@ def test_string(self): FSTRING_EXPR '{b}' (1, 4) (1, 7) FSTRING_MIDDLE 'c' (1, 7) (1, 8) FSTRING_END \'"\' (1, 8) (1, 9) + """) + self.check_tokenize('fR"a{{b}c"', """\ + FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_MIDDLE 'a{{b}c' (1, 3) (1, 9) + FSTRING_END \'"\' (1, 9) (1, 10) """) self.check_tokenize('f"""abc"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 8062fd4875d8a8..16c6f0d668fae0 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -639,33 +639,43 @@ def _tokenize_fstring_mode(line, tok_start): curly_brackets = [] start, i = end[1], 0 - for c in middle: + for position, c in enumerate(middle): match c: case '{': - # TODO: handle {{ and \{ - curly_brackets.append(c) - mid_expr += c - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start + i), - line=line) - mid_token = '' - end = line_number, start + i + # check out next position, if it's another {, then it is + # escaping the { character + if ((len(middle) >= position + 1 and middle[position + 1] == '{') + or (position > 0 and middle[position - 1] in ('\\', '{'))): + mid_token += c + else: + curly_brackets.append(c) + mid_expr += c + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start + i), + line=line) + mid_token = '' + end = line_number, start + i case '}': - curly_brackets.pop() - mid_expr += c - yield TokenInfo( - type=FSTRING_EXPR, - string=mid_expr, - # +1 is needed here since this token is yielded when - # reading the }, before incrementing i. - start=end, - end=(line_number, start + i + 1), - line=line) - mid_expr = '' - end = line_number, start + i + 1 + # if no opening { is seen before, this character is taken + # as part of the fstring middle token + if mid_expr: + curly_brackets.pop() + mid_expr += c + yield TokenInfo( + type=FSTRING_EXPR, + string=mid_expr, + # +1 is needed here since this token is yielded when + # reading the }, before incrementing i. 
+ start=end, + end=(line_number, start + i + 1), + line=line) + mid_expr = '' + end = line_number, start + i + 1 + else: + mid_token += c case '\n': if mid_expr: mid_expr += c From f58104d20269ba6878da404be0973e375473d271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sat, 29 Apr 2023 18:22:52 +0200 Subject: [PATCH 03/20] nested expressions --- Lib/test/test_tokenize.py | 25 +++++++++++++++---------- Lib/tokenize.py | 28 +++++++++++++++++++--------- lel.py | 12 ++++++++++++ 3 files changed, 46 insertions(+), 19 deletions(-) create mode 100644 lel.py diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index c66ae34f73ab54..7d2d200033221b 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -381,21 +381,26 @@ def test_string(self): STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) """) self.check_tokenize('f"abc"', """\ - FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_START 'f"' (1, 0) (1, 2) FSTRING_MIDDLE 'abc' (1, 2) (1, 5) - FSTRING_END \'"\' (1, 5) (1, 6) + FSTRING_END '"' (1, 5) (1, 6) """) self.check_tokenize('fR"a{b}c"', """\ - FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_START 'fR"' (1, 0) (1, 3) FSTRING_MIDDLE 'a' (1, 3) (1, 4) FSTRING_EXPR '{b}' (1, 4) (1, 7) FSTRING_MIDDLE 'c' (1, 7) (1, 8) - FSTRING_END \'"\' (1, 8) (1, 9) + FSTRING_END '"' (1, 8) (1, 9) """) self.check_tokenize('fR"a{{b}c"', """\ - FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_START 'fR"' (1, 0) (1, 3) FSTRING_MIDDLE 'a{{b}c' (1, 3) (1, 9) - FSTRING_END \'"\' (1, 9) (1, 10) + FSTRING_END '"' (1, 9) (1, 10) + """) + self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_EXPR '{f'''{f'{f"{1+1}"}'}'''}' (1, 4) (1, 28) + FSTRING_END '\"""' (1, 28) (1, 31) """) self.check_tokenize('f"""abc"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) @@ -404,15 +409,15 @@ def test_string(self): """) self.check_tokenize(r'f"abc\ def"', """\ - FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_START 'f"' (1, 0) (1, 2) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) - FSTRING_END \'"\' (2, 3) (2, 4) + FSTRING_END '"' (2, 3) (2, 4) """) self.check_tokenize(r'Rf"abc\ def"', """\ - FSTRING_START \'Rf"\' (1, 0) (1, 3) + FSTRING_START 'Rf"' (1, 0) (1, 3) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) - FSTRING_END \'"\' (2, 3) (2, 4) + FSTRING_END '"' (2, 3) (2, 4) """) def test_function(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 16c6f0d668fae0..1a8ec9412459fd 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -647,7 +647,7 @@ def _tokenize_fstring_mode(line, tok_start): if ((len(middle) >= position + 1 and middle[position + 1] == '{') or (position > 0 and middle[position - 1] in ('\\', '{'))): mid_token += c - else: + elif mid_token: curly_brackets.append(c) mid_expr += c yield TokenInfo( @@ -658,11 +658,17 @@ def _tokenize_fstring_mode(line, tok_start): line=line) mid_token = '' end = line_number, start + i + else: + curly_brackets.append(c) + mid_expr += c case '}': # if no opening { is seen before, this character is taken # as part of the fstring middle token - if mid_expr: + # if there are remaining elements in the curly_brackets queue + # then the expression is not done yet + if curly_brackets: curly_brackets.pop() + if mid_expr and not curly_brackets: mid_expr += c yield TokenInfo( type=FSTRING_EXPR, @@ -675,7 +681,10 @@ def _tokenize_fstring_mode(line, tok_start): mid_expr = '' end = line_number, start + i + 1 else: - mid_token += c + if mid_expr: + mid_expr += c + else: + mid_token += c case 
'\n': if mid_expr: mid_expr += c @@ -694,12 +703,13 @@ def _tokenize_fstring_mode(line, tok_start): # once the end of the expression is reached, release what's left of # mid_token start += i - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start), - line=line) + if mid_token: + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start), + line=line) end = line_number, start if curly_brackets: diff --git a/lel.py b/lel.py new file mode 100644 index 00000000000000..c1c0274876054b --- /dev/null +++ b/lel.py @@ -0,0 +1,12 @@ +import tokenize +from pprint import pprint +from io import BytesIO + +def t(s): + pprint(list(tokenize.tokenize(BytesIO(s.encode()).readline))) + + +a = r'f"abc\ +def"' + +t(a) From 26102cca0cd85450b00ee2a3c82123bb6a1dd3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Tue, 2 May 2023 23:15:24 +0200 Subject: [PATCH 04/20] Recursive expression tokenization --- Lib/test/test_tokenize.py | 57 ++++++++++++++-- Lib/tokenize.py | 140 +++++++++++++++++++++++++++++--------- lel.py | 12 ---- 3 files changed, 160 insertions(+), 49 deletions(-) delete mode 100644 lel.py diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 7d2d200033221b..317e1191728d55 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -388,18 +388,47 @@ def test_string(self): self.check_tokenize('fR"a{b}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) FSTRING_MIDDLE 'a' (1, 3) (1, 4) - FSTRING_EXPR '{b}' (1, 4) (1, 7) + OP '{' (1, 4) (1, 5) + NAME 'b' (1, 5) (1, 6) + OP '}' (1, 6) (1, 7) FSTRING_MIDDLE 'c' (1, 7) (1, 8) FSTRING_END '"' (1, 8) (1, 9) """) - self.check_tokenize('fR"a{{b}c"', """\ + self.check_tokenize('fR"a{{{b!r}}}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) - FSTRING_MIDDLE 'a{{b}c' (1, 3) (1, 9) + FSTRING_MIDDLE 'a{b}c' (1, 3) (1, 7) + FSTRING_END '"' (1, 7) (1, 8) + """) + self.check_tokenize('f"{{{1+1}}}"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE '{' (1, 2) (1, 3) + OP '{' (1, 3) (1, 4) + NUMBER '1' (1, 4) (1, 5) + OP '+' (1, 5) (1, 6) + NUMBER '1' (1, 6) (1, 7) + OP '}' (1, 7) (1, 8) + FSTRING_MIDDLE '}' (1, 8) (1, 9) FSTRING_END '"' (1, 9) (1, 10) """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) - FSTRING_EXPR '{f'''{f'{f"{1+1}"}'}'''}' (1, 4) (1, 28) + OP '{' (1, 4) (1, 5) + FSTRING_START "f'''" (1, 5) (1, 9) + OP '{' (1, 9) (1, 10) + FSTRING_START "f'" (1, 10) (1, 12) + OP '{' (1, 12) (1, 13) + FSTRING_START 'f"' (1, 13) (1, 15) + OP '{' (1, 15) (1, 16) + NUMBER '1' (1, 16) (1, 17) + OP '+' (1, 17) (1, 18) + NUMBER '1' (1, 18) (1, 19) + OP '}' (1, 19) (1, 20) + FSTRING_END '"' (1, 20) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_END "'" (1, 22) (1, 23) + OP '}' (1, 23) (1, 24) + FSTRING_END "'''" (1, 24) (1, 27) + OP '}' (1, 27) (1, 28) FSTRING_END '\"""' (1, 28) (1, 31) """) self.check_tokenize('f"""abc"""', """\ @@ -418,6 +447,26 @@ def test_string(self): FSTRING_START 'Rf"' (1, 0) (1, 3) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) FSTRING_END '"' (2, 3) (2, 4) + """) + self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\ + FSTRING_START "f'" (1, 0) (1, 2) + FSTRING_MIDDLE 'some words ' (1, 2) (1, 13) + OP '{' (1, 13) (1, 14) + NAME 'a' (1, 14) (1, 15) + OP '+' (1, 15) (1, 16) + NAME 'b' (1, 16) (1, 17) + OP ':' (1, 17) (1, 18) + FSTRING_MIDDLE '.3f' (1, 18) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_MIDDLE ' more words ' (1, 22) 
(1, 34) + OP '{' (1, 34) (1, 35) + NAME 'c' (1, 35) (1, 36) + OP '+' (1, 36) (1, 37) + NAME 'd' (1, 37) (1, 38) + OP '=' (1, 38) (1, 39) + OP '}' (1, 39) (1, 40) + FSTRING_MIDDLE ' final words' (1, 40) (1, 52) + FSTRING_END "'" (1, 52) (1, 53) """) def test_function(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 1a8ec9412459fd..3cee3d99721f07 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -30,6 +30,7 @@ import functools from io import TextIOWrapper from io import BytesIO +from io import StringIO import itertools as _itertools import re import sys @@ -439,14 +440,14 @@ def tokenize(readline): return _tokenize(rl_gen.__next__, encoding) -def _tokenize_normal_mode(readline, encoding): +def _tokenize_normal_mode(readline, encoding, fstring_mode=False): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] - if encoding is not None: + if encoding is not None and not fstring_mode: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" @@ -614,6 +615,8 @@ def _tokenize_normal_mode(readline, encoding): (lnum, pos), (lnum, pos+1), line) pos += 1 + if fstring_mode: + return # Add an implicit NEWLINE if the input doesn't end in one if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') @@ -622,42 +625,60 @@ def _tokenize_normal_mode(readline, encoding): yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') -def _tokenize_fstring_mode(line, tok_start): +def _tokenize_fstring_mode(line, tok_start, encoding): line_number, start = tok_start - parts = fstring_re.match(line) - end = line_number, start + len(parts.group('start')) + end_col = start + len(parts.group('start')) yield TokenInfo( type=FSTRING_START, string=parts.group('start'), start=(line_number, start), - end=end, + end=(line_number, end_col), line=line) middle = parts.group('middle') + + for token in _tokenize_fstring_middle(middle, end_col, line_number, line, encoding): + yield token + + last_line, last_col = token.end + yield TokenInfo( + type=FSTRING_END, + string=parts.group('quote'), + start=token.end, + end=(last_line, last_col + len(parts.group('quote'))), + line=line) + + + +def _tokenize_fstring_middle(middle, start, line_number, line, encoding): + n_chars_in_curr_line = 0 mid_token, mid_expr = '', '' curly_brackets = [] - start, i = end[1], 0 - + end = (line_number, start) + escaping = False for position, c in enumerate(middle): match c: case '{': # check out next position, if it's another {, then it is # escaping the { character - if ((len(middle) >= position + 1 and middle[position + 1] == '{') - or (position > 0 and middle[position - 1] in ('\\', '{'))): - mid_token += c - elif mid_token: + if len(middle) >= position + 1 and middle[position + 1] == '{' and not escaping: + escaping = True + continue + elif mid_token and not escaping: curly_brackets.append(c) mid_expr += c yield TokenInfo( type=FSTRING_MIDDLE, string=mid_token, start=end, - end=(line_number, start + i), + end=(line_number, start + n_chars_in_curr_line), line=line) mid_token = '' - end = line_number, start + i + end = line_number, start + n_chars_in_curr_line + elif escaping: + escaping = False + mid_token += c else: curly_brackets.append(c) mid_expr += c @@ -666,20 +687,80 @@ def _tokenize_fstring_mode(line, tok_start): # as part of the fstring middle token # if there are remaining elements in the curly_brackets queue # then the 
expression is not done yet + if escaping: + escaping = False + mid_token += c + continue + elif len(middle) >= position + 1 and middle[position + 1] == '}': + escaping = True + continue if curly_brackets: curly_brackets.pop() if mid_expr and not curly_brackets: + yield TokenInfo( + type=OP, + string='{', + start=end, + end=(line_number, end[1] + 1), + line=line) + end = line_number, end[1] + 1 + mid_expr += c + + mid_expr = mid_expr[1:-1] + + # Find any first level : or ! + curly_level = 0 + break_char_index = -1 + for char_index, char in enumerate(mid_expr): + if char == '{': + curly_level += 1 + elif char == '}': + curly_level -= 1 + elif char in {'!', ':'} and not curly_level: + break_char_index = char_index + break + + expression_chunk = mid_expr + if break_char_index != -1: + expression_chunk = mid_expr[:break_char_index+1] + + if encoding is not None: + buffer = BytesIO(expression_chunk.encode()).readline + else: + buffer = StringIO(expression_chunk).readline + for t in _tokenize(buffer, encoding, fstring_mode=True): + yield TokenInfo( + type=t.type, + string=t.string, + start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), + end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), + line=line + ) + + end = t.end[0] - 1 + end[0], t.end[1] + end[1] + + if break_char_index != -1: + formatting_chunk = mid_expr[break_char_index+1:] + for t in _tokenize_fstring_middle( + middle=formatting_chunk, + start=end[1], + line_number=line_number, + line=line, + encoding=encoding): + + yield t + end = t.end + yield TokenInfo( - type=FSTRING_EXPR, - string=mid_expr, - # +1 is needed here since this token is yielded when - # reading the }, before incrementing i. + type=OP, + string='}', start=end, - end=(line_number, start + i + 1), + end=(end[0], end[1] + 1), line=line) + mid_expr = '' - end = line_number, start + i + 1 + end = line_number, start + n_chars_in_curr_line + 1 else: if mid_expr: mid_expr += c @@ -692,17 +773,17 @@ def _tokenize_fstring_mode(line, tok_start): mid_token += c line_number += 1 start = 0 - i = -1 + n_chars_in_curr_line = -1 case _: if mid_expr: mid_expr += c else: mid_token += c - i += 1 + n_chars_in_curr_line += 1 # once the end of the expression is reached, release what's left of # mid_token - start += i + start += n_chars_in_curr_line if mid_token: yield TokenInfo( type=FSTRING_MIDDLE, @@ -716,13 +797,6 @@ def _tokenize_fstring_mode(line, tok_start): # TODO: handle syntax error of not matching {} pass - yield TokenInfo( - type=FSTRING_END, - string=parts.group('quote'), - start=end, - end=(line_number, start + len(parts.group('quote'))), - line=line) - def _is_fstring(tok): """Checks whether a STRING token is a fstring or not. @@ -736,18 +810,18 @@ def _is_fstring(tok): return tok.string.lower().startswith(('f', 'rf', 'fr')) -def _tokenize(readline, encoding): +def _tokenize(readline, encoding, fstring_mode=False): """Tokenize Python code implementing the string mode and the normal mode. See PEP701 por more details. 
""" - tokens = _tokenize_normal_mode(readline, encoding) + tokens = _tokenize_normal_mode(readline, encoding, fstring_mode) for tok in tokens: if tok.type != STRING or not _is_fstring(tok): yield tok else: - for t in _tokenize_fstring_mode(tok.string, tok.start): + for t in _tokenize_fstring_mode(tok.string, tok.start, encoding): yield t diff --git a/lel.py b/lel.py deleted file mode 100644 index c1c0274876054b..00000000000000 --- a/lel.py +++ /dev/null @@ -1,12 +0,0 @@ -import tokenize -from pprint import pprint -from io import BytesIO - -def t(s): - pprint(list(tokenize.tokenize(BytesIO(s.encode()).readline))) - - -a = r'f"abc\ -def"' - -t(a) From a5f4b408aed4a4006b8eb77a6d741029db630783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Tue, 2 May 2023 23:23:56 +0200 Subject: [PATCH 05/20] Remove intermediate token created for dev purposes --- Grammar/Tokens | 1 - Include/internal/pycore_token.h | 1 - 2 files changed, 2 deletions(-) diff --git a/Grammar/Tokens b/Grammar/Tokens index 8f13217ab1e100..096876fdd130f8 100644 --- a/Grammar/Tokens +++ b/Grammar/Tokens @@ -64,7 +64,6 @@ SOFT_KEYWORD FSTRING_START FSTRING_MIDDLE FSTRING_END -FSTRING_EXPR ERRORTOKEN # These aren't used by the C tokenizer but are needed for tokenize.py diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h index 9f50bf05609809..b9df8766736adf 100644 --- a/Include/internal/pycore_token.h +++ b/Include/internal/pycore_token.h @@ -78,7 +78,6 @@ extern "C" { #define FSTRING_MIDDLE 62 #define FSTRING_END 63 #define ERRORTOKEN 64 -#define FSTRING_EXPR 69 #define N_TOKENS 68 #define NT_OFFSET 256 From 598bab44633ff90d313846b3a0ed1a6498909751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Wed, 3 May 2023 23:50:38 +0200 Subject: [PATCH 06/20] More improvements --- Lib/test/test_tokenize.py | 33 ++++++++++++++++++++---------- Lib/tokenize.py | 43 ++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 317e1191728d55..d53064b3aa7387 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -396,19 +396,30 @@ def test_string(self): """) self.check_tokenize('fR"a{{{b!r}}}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) - FSTRING_MIDDLE 'a{b}c' (1, 3) (1, 7) - FSTRING_END '"' (1, 7) (1, 8) + FSTRING_MIDDLE 'a{' (1, 3) (1, 6) + OP '{' (1, 6) (1, 7) + NAME 'b' (1, 7) (1, 8) + OP '!' 
(1, 8) (1, 9) + FSTRING_MIDDLE 'r' (1, 9) (1, 10) + OP '}' (1, 10) (1, 12) + FSTRING_MIDDLE '}c' (1, 12) (1, 14) + FSTRING_END '"' (1, 14) (1, 15) """) self.check_tokenize('f"{{{1+1}}}"', """\ - FSTRING_START 'f"' (1, 0) (1, 2) - FSTRING_MIDDLE '{' (1, 2) (1, 3) - OP '{' (1, 3) (1, 4) - NUMBER '1' (1, 4) (1, 5) - OP '+' (1, 5) (1, 6) - NUMBER '1' (1, 6) (1, 7) - OP '}' (1, 7) (1, 8) - FSTRING_MIDDLE '}' (1, 8) (1, 9) - FSTRING_END '"' (1, 9) (1, 10) + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE '{' (1, 2) (1, 4) + OP '{' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + OP '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + OP '}' (1, 8) (1, 10) + FSTRING_MIDDLE '}' (1, 10) (1, 11) + FSTRING_END '"' (1, 11) (1, 12) + """) + self.check_tokenize('f"{1+1"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + ERRORTOKEN '{' (1, 2) (1, 3) + FSTRING_END '"' (1, 3) (1, 4) """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 3cee3d99721f07..cf6a140eb287bf 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -652,7 +652,6 @@ def _tokenize_fstring_mode(line, tok_start, encoding): def _tokenize_fstring_middle(middle, start, line_number, line, encoding): - n_chars_in_curr_line = 0 mid_token, mid_expr = '', '' curly_brackets = [] end = (line_number, start) @@ -664,36 +663,35 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): # escaping the { character if len(middle) >= position + 1 and middle[position + 1] == '{' and not escaping: escaping = True - continue elif mid_token and not escaping: - curly_brackets.append(c) + curly_brackets.append((line_number, start)) mid_expr += c yield TokenInfo( type=FSTRING_MIDDLE, string=mid_token, start=end, - end=(line_number, start + n_chars_in_curr_line), + end=(line_number, start), line=line) mid_token = '' - end = line_number, start + n_chars_in_curr_line + end = line_number, start elif escaping: escaping = False mid_token += c else: - curly_brackets.append(c) + curly_brackets.append((line_number, start)) mid_expr += c case '}': - # if no opening { is seen before, this character is taken - # as part of the fstring middle token - # if there are remaining elements in the curly_brackets queue - # then the expression is not done yet + # If two }} are seen, then the first one is skipped and the + # second is added as part of the fstring_middle token if escaping: escaping = False mid_token += c continue - elif len(middle) >= position + 1 and middle[position + 1] == '}': + elif len(middle) > position + 1 and middle[position + 1] == '}': escaping = True + start += 1 continue + if curly_brackets: curly_brackets.pop() if mid_expr and not curly_brackets: @@ -756,11 +754,11 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): type=OP, string='}', start=end, - end=(end[0], end[1] + 1), + end=(line_number, start + 1), line=line) mid_expr = '' - end = line_number, start + n_chars_in_curr_line + 1 + end = line_number, start + 1 else: if mid_expr: mid_expr += c @@ -772,30 +770,33 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): else: mid_token += c line_number += 1 - start = 0 - n_chars_in_curr_line = -1 + start = -1 case _: if mid_expr: mid_expr += c else: mid_token += c - n_chars_in_curr_line += 1 + start += 1 # once the end of the expression is reached, release what's left of # mid_token - start += n_chars_in_curr_line if mid_token: yield TokenInfo( type=FSTRING_MIDDLE, string=mid_token, start=end, - 
end=(line_number, start), + end=(line_number, end[1] + len(mid_token)), line=line) - end = line_number, start + end = line_number, end[1] + len(mid_token) if curly_brackets: - # TODO: handle syntax error of not matching {} - pass + lnum, pos = curly_brackets.pop() + yield TokenInfo( + type=ERRORTOKEN, + string=line.split('\n')[lnum - 1][pos], + start=(lnum, pos), + end=(lnum, pos+1), + line=line) def _is_fstring(tok): From a0ed8162d1bf26183be56a9feb5f05080cfc484b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Mon, 8 May 2023 00:35:06 +0200 Subject: [PATCH 07/20] fix handling of } tokens --- Lib/test/test_tokenize.py | 10 +-- Lib/tokenize.py | 158 +++++++++++++++++++------------------- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index d53064b3aa7387..b13b2c0f65ce79 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -400,9 +400,9 @@ def test_string(self): OP '{' (1, 6) (1, 7) NAME 'b' (1, 7) (1, 8) OP '!' (1, 8) (1, 9) - FSTRING_MIDDLE 'r' (1, 9) (1, 10) - OP '}' (1, 10) (1, 12) - FSTRING_MIDDLE '}c' (1, 12) (1, 14) + NAME 'r' (1, 9) (1, 10) + OP '}' (1, 10) (1, 11) + FSTRING_MIDDLE '}c' (1, 11) (1, 14) FSTRING_END '"' (1, 14) (1, 15) """) self.check_tokenize('f"{{{1+1}}}"', """\ @@ -412,8 +412,8 @@ def test_string(self): NUMBER '1' (1, 5) (1, 6) OP '+' (1, 6) (1, 7) NUMBER '1' (1, 7) (1, 8) - OP '}' (1, 8) (1, 10) - FSTRING_MIDDLE '}' (1, 10) (1, 11) + OP '}' (1, 8) (1, 9) + FSTRING_MIDDLE '}' (1, 9) (1, 11) FSTRING_END '"' (1, 11) (1, 12) """) self.check_tokenize('f"{1+1"', """\ diff --git a/Lib/tokenize.py b/Lib/tokenize.py index cf6a140eb287bf..951e3bfb308224 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -682,88 +682,88 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): mid_expr += c case '}': # If two }} are seen, then the first one is skipped and the - # second is added as part of the fstring_middle token - if escaping: - escaping = False - mid_token += c - continue - elif len(middle) > position + 1 and middle[position + 1] == '}': - escaping = True - start += 1 - continue - - if curly_brackets: - curly_brackets.pop() - if mid_expr and not curly_brackets: - yield TokenInfo( - type=OP, - string='{', - start=end, - end=(line_number, end[1] + 1), - line=line) - end = line_number, end[1] + 1 - - mid_expr += c - - mid_expr = mid_expr[1:-1] - - # Find any first level : or ! - curly_level = 0 - break_char_index = -1 - for char_index, char in enumerate(mid_expr): - if char == '{': - curly_level += 1 - elif char == '}': - curly_level -= 1 - elif char in {'!', ':'} and not curly_level: - break_char_index = char_index - break - - expression_chunk = mid_expr - if break_char_index != -1: - expression_chunk = mid_expr[:break_char_index+1] - - if encoding is not None: - buffer = BytesIO(expression_chunk.encode()).readline + # second is added as part of the fstring_middle token. + # This is only applied when parsing fstring_middle tokens, + # not when parsing an expression. 
+ if not mid_expr: + if escaping: + escaping = False + mid_token += c + elif len(middle) > position + 1 and middle[position + 1] == '}': + escaping = True else: - buffer = StringIO(expression_chunk).readline - for t in _tokenize(buffer, encoding, fstring_mode=True): + mid_token += c + else: + # parsing an expression + if curly_brackets: + curly_brackets.pop() + if not curly_brackets: yield TokenInfo( - type=t.type, - string=t.string, - start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), - end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), - line=line - ) - - end = t.end[0] - 1 + end[0], t.end[1] + end[1] - - if break_char_index != -1: - formatting_chunk = mid_expr[break_char_index+1:] - for t in _tokenize_fstring_middle( - middle=formatting_chunk, - start=end[1], - line_number=line_number, - line=line, - encoding=encoding): - - yield t - end = t.end - - yield TokenInfo( - type=OP, - string='}', - start=end, - end=(line_number, start + 1), - line=line) + type=OP, + string='{', + start=end, + end=(line_number, end[1] + 1), + line=line) + end = line_number, end[1] + 1 - mid_expr = '' - end = line_number, start + 1 - else: - if mid_expr: mid_expr += c + + mid_expr = mid_expr[1:-1] + + # Find any first level : or ! + curly_level = 0 + break_char_index = -1 + for char_index, char in enumerate(mid_expr): + if char == '{': + curly_level += 1 + elif char == '}': + curly_level -= 1 + elif char in {':'} and not curly_level: + break_char_index = char_index + break + + expression_chunk = mid_expr + if break_char_index != -1: + expression_chunk = mid_expr[:break_char_index+1] + + if encoding is not None: + buffer = BytesIO(expression_chunk.encode()).readline + else: + buffer = StringIO(expression_chunk).readline + for t in _tokenize(buffer, encoding, fstring_mode=True): + yield TokenInfo( + type=t.type, + string=t.string, + start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), + end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), + line=line + ) + + end = t.end[0] - 1 + end[0], t.end[1] + end[1] + + if break_char_index != -1: + formatting_chunk = mid_expr[break_char_index+1:] + for t in _tokenize_fstring_middle( + middle=formatting_chunk, + start=end[1], + line_number=line_number, + line=line, + encoding=encoding): + + yield t + end = t.end + + yield TokenInfo( + type=OP, + string='}', + start=end, + end=(line_number, start + 1), + line=line) + + mid_expr = '' + end = line_number, start + 1 else: - mid_token += c + mid_expr += c case '\n': if mid_expr: mid_expr += c @@ -785,9 +785,9 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): type=FSTRING_MIDDLE, string=mid_token, start=end, - end=(line_number, end[1] + len(mid_token)), + end=(line_number, start), line=line) - end = line_number, end[1] + len(mid_token) + end = line_number, start if curly_brackets: lnum, pos = curly_brackets.pop() From 90b4ab1ff746115859e3b873f8b58dc1ec98e555 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Tue, 16 May 2023 21:38:14 +0100 Subject: [PATCH 08/20] other tokenizer --- Doc/library/token-list.inc | 4 + Grammar/Tokens | 4 +- .../pycore_global_objects_fini_generated.h | 1 + Include/internal/pycore_global_strings.h | 1 + .../internal/pycore_runtime_init_generated.h | 1 + Include/internal/pycore_token.h | 4 +- .../internal/pycore_unicodeobject_generated.h | 3 + Lib/inspect.py | 1 + Lib/test/test_tokenize.py | 38 +-- Lib/token.py | 7 +- Lib/tokenize.py | 255 +++--------------- Lib/trace.py | 1 + Parser/token.c | 4 +- Parser/tokenizer.c | 35 ++- Parser/tokenizer.h | 2 + Python/Python-tokenize.c | 15 +- 
Python/clinic/Python-tokenize.c.h | 22 +- lel.py | 16 ++ 18 files changed, 156 insertions(+), 258 deletions(-) create mode 100644 lel.py diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc index 3b345099bf54b5..e885de88cad9ae 100644 --- a/Doc/library/token-list.inc +++ b/Doc/library/token-list.inc @@ -223,6 +223,10 @@ .. data:: FSTRING_END +.. data:: COMMENT + +.. data:: NL + .. data:: ERRORTOKEN .. data:: N_TOKENS diff --git a/Grammar/Tokens b/Grammar/Tokens index 096876fdd130f8..618ae811d824b0 100644 --- a/Grammar/Tokens +++ b/Grammar/Tokens @@ -64,9 +64,9 @@ SOFT_KEYWORD FSTRING_START FSTRING_MIDDLE FSTRING_END +COMMENT +NL ERRORTOKEN # These aren't used by the C tokenizer but are needed for tokenize.py -COMMENT -NL ENCODING diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 24a268ac8c43ec..d28ab4aa81b962 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -918,6 +918,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index c1005d05155271..d964c7134146c9 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -406,6 +406,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(exception) STRUCT_FOR_ID(exp) STRUCT_FOR_ID(extend) + STRUCT_FOR_ID(extra_tokens) STRUCT_FOR_ID(facility) STRUCT_FOR_ID(factory) STRUCT_FOR_ID(false) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index ff1dee6eacfe5d..a2568074e19301 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -912,6 +912,7 @@ extern "C" { INIT_ID(exception), \ INIT_ID(exp), \ INIT_ID(extend), \ + INIT_ID(extra_tokens), \ INIT_ID(facility), \ INIT_ID(factory), \ INIT_ID(false), \ diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h index b9df8766736adf..c02e637fee1ee2 100644 --- a/Include/internal/pycore_token.h +++ b/Include/internal/pycore_token.h @@ -77,7 +77,9 @@ extern "C" { #define FSTRING_START 61 #define FSTRING_MIDDLE 62 #define FSTRING_END 63 -#define ERRORTOKEN 64 +#define COMMENT 64 +#define NL 65 +#define ERRORTOKEN 66 #define N_TOKENS 68 #define NT_OFFSET 256 diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index ba6b37f1bf55b3..4f2634c7f2029f 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1059,6 +1059,9 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { string = &_Py_ID(extend); assert(_PyUnicode_CheckConsistency(string, 1)); _PyUnicode_InternInPlace(interp, &string); + string = &_Py_ID(extra_tokens); + assert(_PyUnicode_CheckConsistency(string, 1)); + _PyUnicode_InternInPlace(interp, &string); string = &_Py_ID(facility); assert(_PyUnicode_CheckConsistency(string, 1)); 
_PyUnicode_InternInPlace(interp, &string); diff --git a/Lib/inspect.py b/Lib/inspect.py index 63f5aa91d270b7..e413274c98458a 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -2215,6 +2215,7 @@ def _signature_fromstr(cls, obj, s, skip_bound_arg=True): module = None if not isinstance(module, ast.Module): + breakpoint() raise ValueError("{!r} builtin has invalid signature".format(obj)) f = module.body[0] diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index b13b2c0f65ce79..5398139e155012 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,6 +1,6 @@ from test import support from test.support import os_helper -from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, +from tokenize import (tokenize, tokenize2, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT) @@ -46,7 +46,7 @@ def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. f = BytesIO(s.encode('utf-8')) - result = stringify_tokens_from_source(tokenize(f.readline), s) + result = stringify_tokens_from_source(tokenize2(f.readline), s) self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) @@ -396,31 +396,33 @@ def test_string(self): """) self.check_tokenize('fR"a{{{b!r}}}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) - FSTRING_MIDDLE 'a{' (1, 3) (1, 6) + FSTRING_MIDDLE 'a{' (1, 3) (1, 5) OP '{' (1, 6) (1, 7) NAME 'b' (1, 7) (1, 8) OP '!' (1, 8) (1, 9) NAME 'r' (1, 9) (1, 10) OP '}' (1, 10) (1, 11) - FSTRING_MIDDLE '}c' (1, 11) (1, 14) + FSTRING_MIDDLE '}' (1, 11) (1, 12) + FSTRING_MIDDLE 'c' (1, 13) (1, 14) FSTRING_END '"' (1, 14) (1, 15) """) self.check_tokenize('f"{{{1+1}}}"', """\ FSTRING_START 'f"' (1, 0) (1, 2) - FSTRING_MIDDLE '{' (1, 2) (1, 4) + FSTRING_MIDDLE '{' (1, 2) (1, 3) OP '{' (1, 4) (1, 5) NUMBER '1' (1, 5) (1, 6) OP '+' (1, 6) (1, 7) NUMBER '1' (1, 7) (1, 8) OP '}' (1, 8) (1, 9) - FSTRING_MIDDLE '}' (1, 9) (1, 11) + FSTRING_MIDDLE '}' (1, 9) (1, 10) FSTRING_END '"' (1, 11) (1, 12) """) - self.check_tokenize('f"{1+1"', """\ - FSTRING_START 'f"' (1, 0) (1, 2) - ERRORTOKEN '{' (1, 2) (1, 3) - FSTRING_END '"' (1, 3) (1, 4) - """) + # TODO: I don't think is is correct now (ERRORTOKEN) + # self.check_tokenize('f"{1+1"', """\ + # FSTRING_START 'f"' (1, 0) (1, 2) + # ERRORTOKEN '{' (1, 2) (1, 3) + # FSTRING_END '"' (1, 3) (1, 4) + # """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) OP '{' (1, 4) (1, 5) @@ -2578,13 +2580,13 @@ async def bar(): pass def test_unicode(self): self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ - NAME 'Örter' (1, 0) (1, 6) - EQUAL '=' (1, 7) (1, 8) - STRING "u'places'" (1, 9) (1, 18) - NEWLINE '' (1, 18) (1, 18) - NAME 'grün' (2, 0) (2, 5) - EQUAL '=' (2, 6) (2, 7) - STRING "U'green'" (2, 8) (2, 16) + NAME 'Örter' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + STRING "u'places'" (1, 8) (1, 17) + NEWLINE '' (1, 17) (1, 17) + NAME 'grün' (2, 0) (2, 4) + EQUAL '=' (2, 5) (2, 6) + STRING "U'green'" (2, 7) (2, 15) """) def test_invalid_syntax(self): diff --git a/Lib/token.py b/Lib/token.py index cdbdba9c091076..487f6edd3c951c 100644 --- a/Lib/token.py +++ b/Lib/token.py @@ -67,11 +67,10 @@ FSTRING_START = 61 FSTRING_MIDDLE = 62 FSTRING_END = 63 -FSTRING_EXPR = 69 +COMMENT = 64 +NL = 65 # These aren't used by the C tokenizer 
but are needed for tokenize.py -ERRORTOKEN = 64 -COMMENT = 65 -NL = 66 +ERRORTOKEN = 66 ENCODING = 67 N_TOKENS = 68 # Special definitions for cooperation with parser diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 951e3bfb308224..b986076527aba8 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -29,8 +29,6 @@ import collections import functools from io import TextIOWrapper -from io import BytesIO -from io import StringIO import itertools as _itertools import re import sys @@ -39,14 +37,6 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) -fstring_re = re.compile( - r''' - (?P^[fFrR]{1,2}(?P[\'\"]{1,3})) - (?P.*) - (?P=quote)$ - ''', - re.VERBOSE | re.DOTALL -) import token __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", @@ -414,6 +404,32 @@ def open(filename): buffer.close() raise +def tokenize2(readline): + encoding, consumed = detect_encoding(readline) + + rl_gen = _itertools.chain(consumed, iter(readline, b"")) + if encoding is not None: + if encoding == "utf-8-sig": + # BOM will already have been stripped. + encoding = "utf-8" + yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') + yield from _tokenize2(rl_gen, encoding) + +def _tokenize2(rl_gen, encoding): + source = b"".join(rl_gen) + for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True): + # TODO: Marta -> limpiar esto + if 6 < token.type <= 54: + token = token._replace(type=OP) + if token.type in {ASYNC, AWAIT}: + token = token._replace(type=NAME) + if token.type == NEWLINE: + l_start, c_start = token.start + l_end, c_end = token.end + token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) + + yield token + def tokenize(readline): """ @@ -440,14 +456,14 @@ def tokenize(readline): return _tokenize(rl_gen.__next__, encoding) -def _tokenize_normal_mode(readline, encoding, fstring_mode=False): +def _tokenize(readline, encoding): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] - if encoding is not None and not fstring_mode: + if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. 
encoding = "utf-8" @@ -615,8 +631,6 @@ def _tokenize_normal_mode(readline, encoding, fstring_mode=False): (lnum, pos), (lnum, pos+1), line) pos += 1 - if fstring_mode: - return # Add an implicit NEWLINE if the input doesn't end in one if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') @@ -625,214 +639,19 @@ def _tokenize_normal_mode(readline, encoding, fstring_mode=False): yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') -def _tokenize_fstring_mode(line, tok_start, encoding): - line_number, start = tok_start - parts = fstring_re.match(line) - end_col = start + len(parts.group('start')) - yield TokenInfo( - type=FSTRING_START, - string=parts.group('start'), - start=(line_number, start), - end=(line_number, end_col), - line=line) - - middle = parts.group('middle') - - for token in _tokenize_fstring_middle(middle, end_col, line_number, line, encoding): - yield token - - last_line, last_col = token.end - yield TokenInfo( - type=FSTRING_END, - string=parts.group('quote'), - start=token.end, - end=(last_line, last_col + len(parts.group('quote'))), - line=line) - - - -def _tokenize_fstring_middle(middle, start, line_number, line, encoding): - mid_token, mid_expr = '', '' - curly_brackets = [] - end = (line_number, start) - escaping = False - for position, c in enumerate(middle): - match c: - case '{': - # check out next position, if it's another {, then it is - # escaping the { character - if len(middle) >= position + 1 and middle[position + 1] == '{' and not escaping: - escaping = True - elif mid_token and not escaping: - curly_brackets.append((line_number, start)) - mid_expr += c - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start), - line=line) - mid_token = '' - end = line_number, start - elif escaping: - escaping = False - mid_token += c - else: - curly_brackets.append((line_number, start)) - mid_expr += c - case '}': - # If two }} are seen, then the first one is skipped and the - # second is added as part of the fstring_middle token. - # This is only applied when parsing fstring_middle tokens, - # not when parsing an expression. - if not mid_expr: - if escaping: - escaping = False - mid_token += c - elif len(middle) > position + 1 and middle[position + 1] == '}': - escaping = True - else: - mid_token += c - else: - # parsing an expression - if curly_brackets: - curly_brackets.pop() - if not curly_brackets: - yield TokenInfo( - type=OP, - string='{', - start=end, - end=(line_number, end[1] + 1), - line=line) - end = line_number, end[1] + 1 - - mid_expr += c - - mid_expr = mid_expr[1:-1] - - # Find any first level : or ! 
- curly_level = 0 - break_char_index = -1 - for char_index, char in enumerate(mid_expr): - if char == '{': - curly_level += 1 - elif char == '}': - curly_level -= 1 - elif char in {':'} and not curly_level: - break_char_index = char_index - break - - expression_chunk = mid_expr - if break_char_index != -1: - expression_chunk = mid_expr[:break_char_index+1] - - if encoding is not None: - buffer = BytesIO(expression_chunk.encode()).readline - else: - buffer = StringIO(expression_chunk).readline - for t in _tokenize(buffer, encoding, fstring_mode=True): - yield TokenInfo( - type=t.type, - string=t.string, - start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), - end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), - line=line - ) - - end = t.end[0] - 1 + end[0], t.end[1] + end[1] - - if break_char_index != -1: - formatting_chunk = mid_expr[break_char_index+1:] - for t in _tokenize_fstring_middle( - middle=formatting_chunk, - start=end[1], - line_number=line_number, - line=line, - encoding=encoding): - - yield t - end = t.end - - yield TokenInfo( - type=OP, - string='}', - start=end, - end=(line_number, start + 1), - line=line) - - mid_expr = '' - end = line_number, start + 1 - else: - mid_expr += c - case '\n': - if mid_expr: - mid_expr += c - else: - mid_token += c - line_number += 1 - start = -1 - case _: - if mid_expr: - mid_expr += c - else: - mid_token += c - start += 1 - - # once the end of the expression is reached, release what's left of - # mid_token - if mid_token: - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start), - line=line) - end = line_number, start - - if curly_brackets: - lnum, pos = curly_brackets.pop() - yield TokenInfo( - type=ERRORTOKEN, - string=line.split('\n')[lnum - 1][pos], - start=(lnum, pos), - end=(lnum, pos+1), - line=line) - - -def _is_fstring(tok): - """Checks whether a STRING token is a fstring or not. - - Args: - tok: TokenInfo object of type STRING. - - Returns: - bool - """ - return tok.string.lower().startswith(('f', 'rf', 'fr')) - - -def _tokenize(readline, encoding, fstring_mode=False): - """Tokenize Python code implementing the string mode and the normal mode. - - See PEP701 por more details. - """ - tokens = _tokenize_normal_mode(readline, encoding, fstring_mode) - - for tok in tokens: - if tok.type != STRING or not _is_fstring(tok): - yield tok - else: - for t in _tokenize_fstring_mode(tok.string, tok.start, encoding): - yield t - - def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. This has the same API as tokenize(), except that it expects the *readline* callable to return str objects instead of bytes. 
""" - return _tokenize(readline, None) + def _gen(): + while True: + line = readline() + if not line: + return + yield line.encode() + return _tokenize2(_gen(), 'utf-8') def main(): import argparse @@ -895,10 +714,10 @@ def error(message, filename=None, location=None): perror("unexpected error: %s" % err) raise -def _generate_tokens_from_c_tokenizer(source): +def _generate_tokens_from_c_tokenizer(source, extra_tokens=False): """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" import _tokenize as c_tokenizer - for info in c_tokenizer.TokenizerIter(source): + for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens): tok, type, lineno, end_lineno, col_off, end_col_off, line = info yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line) diff --git a/Lib/trace.py b/Lib/trace.py index fb9a423ea09fce..a3e4c30b6a5354 100755 --- a/Lib/trace.py +++ b/Lib/trace.py @@ -360,6 +360,7 @@ def _find_strings(filename, encoding=None): # Add this special case so that the test in the loop passes. prev_ttype = token.INDENT with open(filename, encoding=encoding) as f: + print(filename) tok = tokenize.generate_tokens(f.readline) for ttype, tstr, start, end, line in tok: if ttype == token.STRING: diff --git a/Parser/token.c b/Parser/token.c index 82267fbfcd0c54..2bc963a91c7701 100644 --- a/Parser/token.c +++ b/Parser/token.c @@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = { "FSTRING_START", "FSTRING_MIDDLE", "FSTRING_END", + "COMMENT", + "NL", "", - "", - "", "", "", }; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 91ffabac56c7b3..fbf44af3bbc60f 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -111,6 +111,8 @@ tok_new(void) tok->interactive_underflow = IUNDERFLOW_NORMAL; tok->str = NULL; tok->report_warnings = 1; + tok->tok_extra_tokens = 0; + tok->comment_newline = 0; tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0}; tok->tok_mode_stack_index = 0; tok->tok_report_warnings = 1; @@ -1649,6 +1651,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->starting_col_offset = -1; blankline = 0; + + const char* starting_indent = NULL; /* Get indentation level */ if (tok->atbol) { int col = 0; @@ -1745,11 +1749,14 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } + starting_indent = tok->start; tok->start = tok->cur; tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ - if (tok->pendin != 0) { + if (tok->pendin != 0) { + p_start = tok->buf; + p_end = tok->cur; if (tok->pendin < 0) { tok->pendin++; return MAKE_TOKEN(DEDENT); @@ -1806,10 +1813,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t const char *prefix, *p, *type_start; int current_starting_col_offset; + // if (tok->tok_extra_tokens) { + // p = tok->start; + // } + while (c != EOF && c != '\n') { c = tok_nextc(tok); } + if (tok->tok_extra_tokens) { + p = tok->start; + } + if (tok->type_comments) { p = tok->start; current_starting_col_offset = tok->starting_col_offset; @@ -1864,6 +1879,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } } + if (tok->tok_extra_tokens) { + tok_backup(tok, c); /* don't eat the newline or EOF */ + p_start = p; + p_end = tok->cur; + tok->comment_newline = 1; + return MAKE_TOKEN(COMMENT); + } } if (tok->done == E_INTERACT_STOP) { @@ -1976,8 +1998,19 @@ 
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c == '\n') { tok->atbol = 1; if (blankline || tok->level > 0) { + if (tok->tok_extra_tokens) { + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NL); + } goto nextline; } + if (tok->comment_newline && tok->tok_extra_tokens) { + tok->comment_newline = 0; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NL); + } p_start = tok->start; p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 5e2171885ac75b..444498458f2510 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -127,6 +127,8 @@ struct tok_state { tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL]; int tok_mode_stack_index; int tok_report_warnings; + int tok_extra_tokens; + int comment_newline; #ifdef Py_DEBUG int debug; #endif diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 3394a5108cb535..ce629749cb1f3a 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -1,5 +1,6 @@ #include "Python.h" #include "../Parser/tokenizer.h" +#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() static struct PyModuleDef _tokenizemodule; @@ -34,11 +35,14 @@ typedef struct _tokenizer.tokenizeriter.__new__ as tokenizeriter_new source: str + * + extra_tokens: bool [clinic start generated code]*/ static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source) -/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/ +tokenizeriter_new_impl(PyTypeObject *type, const char *source, + int extra_tokens) +/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/ { tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0); if (self == NULL) { @@ -54,6 +58,9 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source) return NULL; } self->tok->filename = filename; + if (extra_tokens) { + self->tok->tok_extra_tokens = 1; + } return (PyObject *)self; } @@ -92,10 +99,10 @@ tokenizeriter_next(tokenizeriterobject *it) int col_offset = -1; int end_col_offset = -1; if (token.start != NULL && token.start >= line_start) { - col_offset = (int)(token.start - line_start); + col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start); } if (token.end != NULL && token.end >= it->tok->line_start) { - end_col_offset = (int)(token.end - it->tok->line_start); + end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); } return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h index 6af93743f40dab..7e779388a92dbf 100644 --- a/Python/clinic/Python-tokenize.c.h +++ b/Python/clinic/Python-tokenize.c.h @@ -9,7 +9,8 @@ preserve static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source); +tokenizeriter_new_impl(PyTypeObject *type, const char *source, + int extra_tokens); static PyObject * tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) @@ -17,14 +18,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 1 + #define NUM_KEYWORDS 2 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD PyObject *ob_item[NUM_KEYWORDS]; } _kwtuple = { .ob_base = 
PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(source), }, + .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -33,19 +34,20 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"source", NULL}; + static const char * const _keywords[] = {"source", "extra_tokens", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "tokenizeriter", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[1]; + PyObject *argsbuf[2]; PyObject * const *fastargs; Py_ssize_t nargs = PyTuple_GET_SIZE(args); const char *source; + int extra_tokens; - fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf); + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf); if (!fastargs) { goto exit; } @@ -62,9 +64,13 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_ValueError, "embedded null character"); goto exit; } - return_value = tokenizeriter_new_impl(type, source); + extra_tokens = PyObject_IsTrue(fastargs[1]); + if (extra_tokens < 0) { + goto exit; + } + return_value = tokenizeriter_new_impl(type, source, extra_tokens); exit: return return_value; } -/*[clinic end generated code: output=8c2c09f651961986 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/ diff --git a/lel.py b/lel.py new file mode 100644 index 00000000000000..dadbd5ffd1a709 --- /dev/null +++ b/lel.py @@ -0,0 +1,16 @@ +import tokenize +import io +import pprint + +data = """\ +if False:\n # NL\n \n True = False # NEWLINE\n +""" +b = io.BytesIO(data.encode()) +pprint.pprint(list(tokenize.tokenize(b.readline))) +print() +print() +b = io.BytesIO(data.encode()) +pprint.pprint(list(tokenize.tokenize2(b.readline))) +print() +print() +pprint.pprint(list(tokenize._generate_tokens_from_c_tokenizer(data))) From 63ef1c16284f845d1e080d1d0e357d64693da0da Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Wed, 17 May 2023 17:46:57 +0100 Subject: [PATCH 09/20] Some progress --- Lib/inspect.py | 6 ++-- Lib/test/test_tokenize.py | 25 +++++++++------- Lib/tokenize.py | 11 +++++-- Parser/tokenizer.c | 15 ++++++---- Python/Python-tokenize.c | 60 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 95 insertions(+), 22 deletions(-) diff --git a/Lib/inspect.py b/Lib/inspect.py index e413274c98458a..03ce00fec5d574 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -2187,7 +2187,8 @@ def _signature_strip_non_python_syntax(signature): if string == ',': current_parameter += 1 - if (type == ERRORTOKEN) and (string == '$'): + # if (type == ERRORTOKEN) and (string == '$'): + if (type == OP) and (string == '$'): assert self_parameter is None self_parameter = current_parameter continue @@ -2195,7 +2196,7 @@ def _signature_strip_non_python_syntax(signature): add(string) if (string == ','): add(' ') - clean_signature = ''.join(text) + clean_signature = ''.join(text).strip() return clean_signature, self_parameter @@ -2215,7 +2216,6 @@ def _signature_fromstr(cls, obj, s, skip_bound_arg=True): module = None if not isinstance(module, ast.Module): - breakpoint() raise ValueError("{!r} builtin has invalid signature".format(obj)) f = module.body[0] diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 
5398139e155012..f0a6e1e9873eef 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -82,7 +82,7 @@ def test_basic(self): NAME 'False' (4, 11) (4, 16) COMMENT '# NEWLINE' (4, 17) (4, 26) NEWLINE '\\n' (4, 26) (4, 27) - DEDENT '' (5, 0) (5, 0) + DEDENT '' (4, 27) (4, 27) """) indent_error_file = b"""\ def k(x): @@ -230,6 +230,10 @@ def number_token(s): continue self.assertEqual(number_token(lit), lit) for lit in INVALID_UNDERSCORE_LITERALS: + try: + number_token(lit) + except SyntaxError: + continue self.assertNotEqual(number_token(lit), lit) def test_string(self): @@ -728,8 +732,8 @@ def test_tabs(self): NEWLINE '\\n' (2, 5) (2, 6) INDENT ' \\t' (3, 0) (3, 9) NAME 'pass' (3, 9) (3, 13) - DEDENT '' (4, 0) (4, 0) - DEDENT '' (4, 0) (4, 0) + DEDENT '' (3, 14) (3, 14) + DEDENT '' (3, 14) (3, 14) """) def test_non_ascii_identifiers(self): @@ -941,7 +945,7 @@ async def foo(): NUMBER '1' (2, 17) (2, 18) OP ':' (2, 18) (2, 19) NAME 'pass' (2, 20) (2, 24) - DEDENT '' (3, 0) (3, 0) + DEDENT '' (2, 25) (2, 25) """) self.check_tokenize('''async def foo(async): await''', """\ @@ -989,7 +993,7 @@ async def bar(): pass NAME 'await' (6, 2) (6, 7) OP '=' (6, 8) (6, 9) NUMBER '2' (6, 10) (6, 11) - DEDENT '' (7, 0) (7, 0) + DEDENT '' (6, 12) (6, 12) """) self.check_tokenize('''\ @@ -1027,7 +1031,7 @@ async def bar(): pass NAME 'await' (6, 2) (6, 7) OP '=' (6, 8) (6, 9) NUMBER '2' (6, 10) (6, 11) - DEDENT '' (7, 0) (7, 0) + DEDENT '' (6, 12) (6, 12) """) class GenerateTokensTest(TokenizeTest): @@ -1052,7 +1056,7 @@ def decistmt(s): ]) else: result.append((toknum, tokval)) - return untokenize(result).decode('utf-8') + return untokenize(result).decode('utf-8').strip() class TestMisc(TestCase): @@ -1408,9 +1412,9 @@ def test_open_error(self): class TestTokenize(TestCase): - def test_tokenize(self): + def test_tokenizee(self): import tokenize as tokenize_module - encoding = object() + encoding = "utf-8" encoding_used = None def mock_detect_encoding(readline): return encoding, [b'first', b'second'] @@ -2643,8 +2647,7 @@ def generate_source(indents): compile(valid, "", "exec") invalid = generate_source(MAXINDENT) - tokens = list(_generate_tokens_from_c_tokenizer(invalid)) - self.assertEqual(tokens[-1].type, NEWLINE) + self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid))) self.assertRaises( IndentationError, compile, invalid, "", "exec" ) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index b986076527aba8..c1ef71c2529a65 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -406,7 +406,6 @@ def open(filename): def tokenize2(readline): encoding, consumed = detect_encoding(readline) - rl_gen = _itertools.chain(consumed, iter(readline, b"")) if encoding is not None: if encoding == "utf-8-sig": @@ -417,6 +416,7 @@ def tokenize2(readline): def _tokenize2(rl_gen, encoding): source = b"".join(rl_gen) + token = None for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True): # TODO: Marta -> limpiar esto if 6 < token.type <= 54: @@ -429,6 +429,9 @@ def _tokenize2(rl_gen, encoding): token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) yield token + if token is not None: + last_line, _ = token.start + yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') def tokenize(readline): @@ -638,6 +641,7 @@ def _tokenize(readline, encoding): yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') +tokenize = tokenize2 def 
generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -647,7 +651,10 @@ def generate_tokens(readline): """ def _gen(): while True: - line = readline() + try: + line = readline() + except StopIteration: + return if not line: return yield line.encode() diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index fbf44af3bbc60f..92d617e4f63b0b 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1638,6 +1638,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st return type; } + static int tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) { @@ -1652,7 +1653,6 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t blankline = 0; - const char* starting_indent = NULL; /* Get indentation level */ if (tok->atbol) { int col = 0; @@ -1749,19 +1749,24 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } - starting_indent = tok->start; tok->start = tok->cur; tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ if (tok->pendin != 0) { - p_start = tok->buf; - p_end = tok->cur; if (tok->pendin < 0) { + if (tok->tok_extra_tokens) { + p_start = tok->cur; + p_end = tok->cur; + } tok->pendin++; return MAKE_TOKEN(DEDENT); } else { + if (tok->tok_extra_tokens) { + p_start = tok->buf; + p_end = tok->cur; + } tok->pendin--; return MAKE_TOKEN(INDENT); } @@ -1883,7 +1888,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok_backup(tok, c); /* don't eat the newline or EOF */ p_start = p; p_end = tok->cur; - tok->comment_newline = 1; + tok->comment_newline = blankline; return MAKE_TOKEN(COMMENT); } } diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index ce629749cb1f3a..5eafba56f7c7c4 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -1,6 +1,8 @@ #include "Python.h" +#include "errcode.h" #include "../Parser/tokenizer.h" #include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() +#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() static struct PyModuleDef _tokenizemodule; @@ -64,12 +66,68 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source, return (PyObject *)self; } +static int +_tokenizer_error(struct tok_state *tok) +{ + if (PyErr_Occurred()) { + return -1; + } + + const char *msg = NULL; + PyObject* errtype = PyExc_SyntaxError; + switch (tok->done) { + case E_TOKEN: + msg = "invalid token"; + break; + case E_EOF: + if (tok->level) { + PyErr_Format(PyExc_SyntaxError, + "parenthesis '%c' was never closed", + tok->parenstack[tok->level-1]); + } else { + PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing"); + } + return -1; + case E_DEDENT: + PyErr_SetString(PyExc_IndentationError, + "unindent does not match any outer indentation level"); + return -1; + case E_INTR: + if (!PyErr_Occurred()) { + PyErr_SetNone(PyExc_KeyboardInterrupt); + } + return -1; + case E_NOMEM: + PyErr_NoMemory(); + return -1; + case E_TABSPACE: + errtype = PyExc_TabError; + msg = "inconsistent use of tabs and spaces in indentation"; + break; + case E_TOODEEP: + errtype = PyExc_IndentationError; + msg = "too many levels of indentation"; + break; + case E_LINECONT: { + msg = "unexpected character after line continuation character"; + break; + } + default: + msg = "unknown parsing error"; + } + PyErr_SetString(errtype, msg); + return -1; +} + static PyObject * tokenizeriter_next(tokenizeriterobject *it) 
{ struct token token; int type = _PyTokenizer_Get(it->tok, &token); - if (type == ERRORTOKEN && PyErr_Occurred()) { + if (type == ERRORTOKEN) { + if(!PyErr_Occurred()) { + _tokenizer_error(it->tok); + } return NULL; } if (type == ERRORTOKEN || type == ENDMARKER) { From 6833b1aea88632fb9d3c28bf9f617bb4d1faef96 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 14:24:18 +0100 Subject: [PATCH 10/20] Fix more bugs --- Lib/inspect.py | 1 - Lib/test/test_tokenize.py | 42 +++---- Lib/tokenize.py | 246 ++++++-------------------------------- Parser/tokenizer.c | 3 + Python/Python-tokenize.c | 3 +- 5 files changed, 53 insertions(+), 242 deletions(-) diff --git a/Lib/inspect.py b/Lib/inspect.py index 03ce00fec5d574..7709a95003efbd 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -2187,7 +2187,6 @@ def _signature_strip_non_python_syntax(signature): if string == ',': current_parameter += 1 - # if (type == ERRORTOKEN) and (string == '$'): if (type == OP) and (string == '$'): assert self_parameter is None self_parameter = current_parameter diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index f0a6e1e9873eef..7eb7e54726150a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,9 +1,9 @@ from test import support from test.support import os_helper -from tokenize import (tokenize, tokenize2, _tokenize, untokenize, NUMBER, NAME, OP, +from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, - NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT) + NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo) from io import BytesIO, StringIO import unittest from textwrap import dedent @@ -46,7 +46,7 @@ def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. 
f = BytesIO(s.encode('utf-8')) - result = stringify_tokens_from_source(tokenize2(f.readline), s) + result = stringify_tokens_from_source(tokenize(f.readline), s) self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) @@ -1128,33 +1128,16 @@ def readline(): nonlocal first if not first: first = True - return line + yield line else: - return b'' + yield b'' # skip the initial encoding token and the end tokens - tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2] + expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") - def test__tokenize_does_not_decode_with_encoding_none(self): - literal = '"ЉЊЈЁЂ"' - first = False - def readline(): - nonlocal first - if not first: - first = True - return literal - else: - return b'' - - # skip the end tokens - tokens = list(_tokenize(readline, encoding=None))[:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] - self.assertEqual(tokens, expected_tokens, - "string not tokenized when encoding is None") - class TestDetectEncoding(TestCase): @@ -1412,7 +1395,7 @@ def test_open_error(self): class TestTokenize(TestCase): - def test_tokenizee(self): + def test_tokenize(self): import tokenize as tokenize_module encoding = "utf-8" encoding_used = None @@ -1424,7 +1407,10 @@ def mock__tokenize(readline, encoding): encoding_used = encoding out = [] while True: - next_line = readline() + try: + next_line = next(readline) + except StopIteration: + return out if next_line: out.append(next_line) continue @@ -1444,7 +1430,7 @@ def mock_readline(): tokenize_module._tokenize = mock__tokenize try: results = tokenize(mock_readline) - self.assertEqual(list(results), + self.assertEqual(list(results)[1:], [b'first', b'second', b'1', b'2', b'3', b'4']) finally: tokenize_module.detect_encoding = orig_detect_encoding @@ -1740,7 +1726,7 @@ def test_random_files(self): if support.verbose >= 2: print('tokenize', testfile) with open(testfile, 'rb') as f: - with self.subTest(file=testfile): + # with self.subTest(file=testfile): self.check_roundtrip(f) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index c1ef71c2529a65..7df2f69ea251f6 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -213,6 +213,14 @@ def untokenize(self, iterable): self.tokens.append(indent) self.prev_col = len(indent) startline = False + elif tok_type == FSTRING_MIDDLE: + if '{' in token or '}' in token: + end_line, end_col = end + end = (end_line, end_col + token.count('{') + token.count('}')) + token = re.sub('{', '{{', token) + token = re.sub('}', '}}', token) + + self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end @@ -255,6 +263,11 @@ def compat(self, token, iterable): elif startline and indents: toks_append(indents[-1]) startline = False + elif toknum == FSTRING_MIDDLE: + if '{' in tokval or '}' in tokval: + tokval = re.sub('{', '{{', tokval) + tokval = re.sub('}', '}}', tokval) + toks_append(tokval) @@ -404,36 +417,6 @@ def open(filename): buffer.close() raise -def tokenize2(readline): - encoding, consumed = detect_encoding(readline) - rl_gen = _itertools.chain(consumed, iter(readline, b"")) - if encoding is not None: - if encoding == "utf-8-sig": - # BOM will already have been stripped. 
- encoding = "utf-8" - yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - yield from _tokenize2(rl_gen, encoding) - -def _tokenize2(rl_gen, encoding): - source = b"".join(rl_gen) - token = None - for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True): - # TODO: Marta -> limpiar esto - if 6 < token.type <= 54: - token = token._replace(type=OP) - if token.type in {ASYNC, AWAIT}: - token = token._replace(type=NAME) - if token.type == NEWLINE: - l_start, c_start = token.start - l_end, c_end = token.end - token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) - - yield token - if token is not None: - last_line, _ = token.start - yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') - - def tokenize(readline): """ The tokenize() generator requires one argument, readline, which @@ -454,194 +437,33 @@ def tokenize(readline): which tells you which encoding was used to decode the bytes stream. """ encoding, consumed = detect_encoding(readline) - empty = _itertools.repeat(b"") - rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) - return _tokenize(rl_gen.__next__, encoding) - - -def _tokenize(readline, encoding): - lnum = parenlev = continued = 0 - numchars = '0123456789' - contstr, needcont = '', 0 - contline = None - indents = [0] - + rl_gen = _itertools.chain(consumed, iter(readline, b"")) if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - last_line = b'' - line = b'' - while True: # loop over lines in stream - try: - # We capture the value of the line variable here because - # readline uses the empty string '' to signal end of input, - # hence `line` itself will always be overwritten at the end - # of this loop. 
- last_line = line - line = readline() - except StopIteration: - line = b'' - - if encoding is not None: - line = line.decode(encoding) - lnum += 1 - pos, max = 0, len(line) - - if contstr: # continued string - if not line: - raise TokenError("EOF in multi-line string", strstart) - endmatch = endprog.match(line) - if endmatch: - pos = end = endmatch.end(0) - yield TokenInfo(STRING, contstr + line[:end], - strstart, (lnum, end), contline + line) - contstr, needcont = '', 0 - contline = None - elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': - yield TokenInfo(ERRORTOKEN, contstr + line, - strstart, (lnum, len(line)), contline) - contstr = '' - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - elif parenlev == 0 and not continued: # new statement - if not line: break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == ' ': - column += 1 - elif line[pos] == '\t': - column = (column//tabsize + 1)*tabsize - elif line[pos] == '\f': - column = 0 - else: - break - pos += 1 - if pos == max: - break - - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - yield TokenInfo(COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - pos += len(comment_token) - - yield TokenInfo(NL, line[pos:], - (lnum, pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents or dedents - indents.append(column) - yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line)) - indents = indents[:-1] - - yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - pseudomatch = _compile(PseudoToken).match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - if start == end: - continue - token, initial = line[start:end], line[start] - - if (initial in numchars or # ordinary number - (initial == '.' and token != '.' and token != '...')): - yield TokenInfo(NUMBER, token, spos, epos, line) - elif initial in '\r\n': - if parenlev > 0: - yield TokenInfo(NL, token, spos, epos, line) - else: - yield TokenInfo(NEWLINE, token, spos, epos, line) - - elif initial == '#': - assert not token.endswith("\n") - yield TokenInfo(COMMENT, token, spos, epos, line) - - elif token in triple_quoted: - endprog = _compile(endpats[token]) - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield TokenInfo(STRING, token, spos, (lnum, pos), line) - else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - - # Check up to the first 3 chars of the token to see if - # they're in the single_quoted set. If so, they start - # a string. - # We're using the first 3, because we're looking for - # "rb'" (for example) at the start of the token. If - # we switch to longer prefixes, this needs to be - # adjusted. - # Note that initial == token[:1]. - # Also note that single quote checking must come after - # triple quote checking (above). 
- elif (initial in single_quoted or - token[:2] in single_quoted or - token[:3] in single_quoted): - if token[-1] == '\n': # continued string - strstart = (lnum, start) - # Again, using the first 3 chars of the - # token. This is looking for the matching end - # regex for the correct type of quote - # character. So it's really looking for - # endpats["'"] or endpats['"'], by trying to - # skip string prefix characters, if any. - endprog = _compile(endpats.get(initial) or - endpats.get(token[1]) or - endpats.get(token[2])) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - yield TokenInfo(STRING, token, spos, epos, line) - - elif initial.isidentifier(): # ordinary name - yield TokenInfo(NAME, token, spos, epos, line) - elif initial == '\\': # continued stmt - continued = 1 - else: - if initial in '([{': - parenlev += 1 - elif initial in ')]}': - parenlev -= 1 - yield TokenInfo(OP, token, spos, epos, line) - else: - yield TokenInfo(ERRORTOKEN, line[pos], - (lnum, pos), (lnum, pos+1), line) - pos += 1 + yield from _tokenize(rl_gen, encoding) + +def _tokenize(rl_gen, encoding): + source = b"".join(rl_gen).decode(encoding) + token = None + for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True): + # TODO: Marta -> limpiar esto + if 6 < token.type <= 54: + token = token._replace(type=OP) + if token.type in {ASYNC, AWAIT}: + token = token._replace(type=NAME) + if token.type == NEWLINE: + l_start, c_start = token.start + l_end, c_end = token.end + token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) - # Add an implicit NEWLINE if the input doesn't end in one - if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): - yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') - for indent in indents[1:]: # pop remaining indent levels - yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') - yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') + yield token + if token is not None: + last_line, _ = token.start + yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') -tokenize = tokenize2 def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -658,7 +480,7 @@ def _gen(): if not line: return yield line.encode() - return _tokenize2(_gen(), 'utf-8') + return _tokenize(_gen(), 'utf-8') def main(): import argparse diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 92d617e4f63b0b..d48e9af8df1410 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2600,6 +2600,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct f_string_middle: + // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle + // this. 
+ tok->multi_line_start = tok->line_start; while (end_quote_size != current_tok->f_string_quote_size) { int c = tok_nextc(tok); if (tok->done == E_ERROR) { diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 5eafba56f7c7c4..a45bd0553994aa 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -113,7 +113,7 @@ _tokenizer_error(struct tok_state *tok) break; } default: - msg = "unknown parsing error"; + msg = "unknown tokenization error"; } PyErr_SetString(errtype, msg); return -1; @@ -127,6 +127,7 @@ tokenizeriter_next(tokenizeriterobject *it) if (type == ERRORTOKEN) { if(!PyErr_Occurred()) { _tokenizer_error(it->tok); + assert(PyErr_Occurred()); } return NULL; } From 90da796a9dd191845ad10b5a8570591a8ef37e0c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:05:03 +0100 Subject: [PATCH 11/20] Fix more problems --- Lib/tabnanny.py | 10 ++++++++++ Lib/test/test_tabnanny.py | 4 ++-- Lib/tokenize.py | 5 ++++- Lib/trace.py | 1 - Python/Python-tokenize.c | 39 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 52 insertions(+), 7 deletions(-) diff --git a/Lib/tabnanny.py b/Lib/tabnanny.py index 9d2df59d36ff47..e2ac6837f157d5 100755 --- a/Lib/tabnanny.py +++ b/Lib/tabnanny.py @@ -107,6 +107,10 @@ def check(file): errprint("%r: Token Error: %s" % (file, msg)) return + except SyntaxError as msg: + errprint("%r: Token Error: %s" % (file, msg)) + return + except IndentationError as msg: errprint("%r: Indentation Error: %s" % (file, msg)) return @@ -272,6 +276,12 @@ def format_witnesses(w): return prefix + " " + ', '.join(firsts) def process_tokens(tokens): + try: + _process_tokens(tokens) + except TabError as e: + raise NannyNag(e.lineno, e.msg, e.text) + +def _process_tokens(tokens): INDENT = tokenize.INDENT DEDENT = tokenize.DEDENT NEWLINE = tokenize.NEWLINE diff --git a/Lib/test/test_tabnanny.py b/Lib/test/test_tabnanny.py index afb8da719b0eed..dac47318011d9d 100644 --- a/Lib/test/test_tabnanny.py +++ b/Lib/test/test_tabnanny.py @@ -223,7 +223,7 @@ def test_when_nannynag_error_verbose(self): with TemporaryPyFile(SOURCE_CODES["nannynag_errored"]) as file_path: out = f"{file_path!r}: *** Line 3: trouble in tab city! ***\n" out += "offending line: '\\tprint(\"world\")\\n'\n" - out += "indent not equal e.g. 
at tab size 1\n" + out += "inconsistent use of tabs and spaces in indentation\n" tabnanny.verbose = 1 self.verify_tabnanny_check(file_path, out=out) @@ -315,7 +315,7 @@ def validate_cmd(self, *args, stdout="", stderr="", partial=False, expect_failur def test_with_errored_file(self): """Should displays error when errored python file is given.""" with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path: - stderr = f"{file_path!r}: Indentation Error: " + stderr = f"{file_path!r}: Token Error: " stderr += ('unindent does not match any outer indentation level' ' (, line 3)') self.validate_cmd(file_path, stderr=stderr, expect_failure=True) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 7df2f69ea251f6..a41f61641de522 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -517,7 +517,10 @@ def error(message, filename=None, location=None): tokens = list(tokenize(f.readline)) else: filename = "" - tokens = _tokenize(sys.stdin.readline, None) + tokens = _tokenize( + (x.encode('utf-8') for x in iter(sys.stdin.readline, "") + ), "utf-8") + # Output the tokenization for token in tokens: diff --git a/Lib/trace.py b/Lib/trace.py index a3e4c30b6a5354..fb9a423ea09fce 100755 --- a/Lib/trace.py +++ b/Lib/trace.py @@ -360,7 +360,6 @@ def _find_strings(filename, encoding=None): # Add this special case so that the test in the loop passes. prev_ttype = token.INDENT with open(filename, encoding=encoding) as f: - print(filename) tok = tokenize.generate_tokens(f.readline) for ttype, tstr, start, end, line in tok: if ttype == token.STRING: diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index a45bd0553994aa..0c5cff21450b3f 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -89,8 +89,10 @@ _tokenizer_error(struct tok_state *tok) } return -1; case E_DEDENT: - PyErr_SetString(PyExc_IndentationError, - "unindent does not match any outer indentation level"); + PyErr_Format(PyExc_IndentationError, + "unindent does not match any outer indentation level " + "(, line %d)", + tok->lineno); return -1; case E_INTR: if (!PyErr_Occurred()) { @@ -115,7 +117,38 @@ _tokenizer_error(struct tok_state *tok) default: msg = "unknown tokenization error"; } - PyErr_SetString(errtype, msg); + + // TODO: Clean up this code and factor out common error paths + + PyObject* errstr = NULL; + PyObject* error_line = NULL; + + Py_ssize_t size = tok->inp - tok->buf; + error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); + if (!error_line) { + goto error; + } + PyObject *tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0); + if (!tmp) { + goto error; + } + Py_CLEAR(error_line); + errstr = PyUnicode_FromString(msg); + if (!errstr) { + goto error; + } + PyObject* value = PyTuple_Pack(2, errstr, tmp); + Py_DECREF(errstr); + Py_DECREF(tmp); + if (!value) { + goto error; + } + PyErr_SetObject(errtype, value); + Py_DECREF(value); + return 0; +error: + Py_XDECREF(errstr); + Py_XDECREF(error_line); return -1; } From b5ccd94e10a2d680b058cbbeb37128a12b42d356 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:11:53 +0100 Subject: [PATCH 12/20] Use IA to clean code --- Python/Python-tokenize.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 0c5cff21450b3f..1ced485a1e9c04 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -118,38 +118,45 @@ _tokenizer_error(struct tok_state *tok) msg = "unknown tokenization error"; } - // 
TODO: Clean up this code and factor out common error paths - PyObject* errstr = NULL; PyObject* error_line = NULL; + PyObject* tmp = NULL; + PyObject* value = NULL; + int result = 0; Py_ssize_t size = tok->inp - tok->buf; error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); if (!error_line) { - goto error; + result = -1; + goto exit; } - PyObject *tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0); + + tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0); if (!tmp) { - goto error; + result = -1; + goto exit; } - Py_CLEAR(error_line); + errstr = PyUnicode_FromString(msg); if (!errstr) { - goto error; + result = -1; + goto exit; } - PyObject* value = PyTuple_Pack(2, errstr, tmp); - Py_DECREF(errstr); - Py_DECREF(tmp); + + value = PyTuple_Pack(2, errstr, tmp); if (!value) { - goto error; + result = -1; + goto exit; } + PyErr_SetObject(errtype, value); - Py_DECREF(value); - return 0; -error: + +exit: Py_XDECREF(errstr); Py_XDECREF(error_line); - return -1; + Py_XDECREF(tmp); + Py_XDECREF(value); + return result; } static PyObject * From b1c3b2ae56f7d6df7fa43cddef9a9c15b69ad7eb Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:19:14 +0100 Subject: [PATCH 13/20] Remove lel --- lel.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 lel.py diff --git a/lel.py b/lel.py deleted file mode 100644 index dadbd5ffd1a709..00000000000000 --- a/lel.py +++ /dev/null @@ -1,16 +0,0 @@ -import tokenize -import io -import pprint - -data = """\ -if False:\n # NL\n \n True = False # NEWLINE\n -""" -b = io.BytesIO(data.encode()) -pprint.pprint(list(tokenize.tokenize(b.readline))) -print() -print() -b = io.BytesIO(data.encode()) -pprint.pprint(list(tokenize.tokenize2(b.readline))) -print() -print() -pprint.pprint(list(tokenize._generate_tokens_from_c_tokenizer(data))) From e941f12ec8c83d3d3a1fc2fdf5e472653be5354a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:20:46 +0100 Subject: [PATCH 14/20] Remove whitespace --- Lib/test/test_tokenize.py | 2 +- Lib/tokenize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 7eb7e54726150a..efcbce88194312 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1727,7 +1727,7 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: # with self.subTest(file=testfile): - self.check_roundtrip(f) + self.check_roundtrip(f) def roundtrip(self, code): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index a41f61641de522..06b14baf291ebb 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -444,7 +444,7 @@ def tokenize(readline): encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') yield from _tokenize(rl_gen, encoding) - + def _tokenize(rl_gen, encoding): source = b"".join(rl_gen).decode(encoding) token = None From 67a0239232c3d1458deb9b94132c81646c51738b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Thu, 18 May 2023 21:05:23 +0200 Subject: [PATCH 15/20] Fix docs --- Doc/library/token.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Doc/library/token.rst b/Doc/library/token.rst index a1aceba96ce030..903847bb206d62 100644 --- a/Doc/library/token.rst +++ b/Doc/library/token.rst @@ -50,11 +50,13 @@ The following token type values aren't used by the C tokenizer but are needed fo the :mod:`tokenize` module. .. 
data:: COMMENT + :noindex: Token value used to indicate a comment. .. data:: NL + :noindex: Token value used to indicate a non-terminating newline. The :data:`NEWLINE` token indicates the end of a logical line of Python code; From dcd221f566a38ed622149a2d0835a743a987ef1f Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 19 May 2023 15:33:09 +0100 Subject: [PATCH 16/20] Moar tests and fix location error --- Lib/test/test_tokenize.py | 26 ++++++++++++++++++++------ Parser/tokenizer.c | 2 +- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index efcbce88194312..dda7243bfa19fe 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -421,12 +421,6 @@ def test_string(self): FSTRING_MIDDLE '}' (1, 9) (1, 10) FSTRING_END '"' (1, 11) (1, 12) """) - # TODO: I don't think is is correct now (ERRORTOKEN) - # self.check_tokenize('f"{1+1"', """\ - # FSTRING_START 'f"' (1, 0) (1, 2) - # ERRORTOKEN '{' (1, 2) (1, 3) - # FSTRING_END '"' (1, 3) (1, 4) - # """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) OP '{' (1, 4) (1, 5) @@ -447,6 +441,26 @@ def test_string(self): FSTRING_END "'''" (1, 24) (1, 27) OP '}' (1, 27) (1, 28) FSTRING_END '\"""' (1, 28) (1, 31) + """) + self.check_tokenize('f""" x\nstr(data, encoding={invalid!r})\n"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE ' x\\nstr(data, encoding=' (1, 4) (2, 19) + OP '{' (2, 19) (2, 20) + NAME 'invalid' (2, 20) (2, 27) + OP '!' (2, 27) (2, 28) + NAME 'r' (2, 28) (2, 29) + OP '}' (2, 29) (2, 30) + FSTRING_MIDDLE ')\\n' (2, 30) (3, 0) + FSTRING_END '\"""' (3, 0) (3, 3) + """) + self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9) + OP '{' (2, 9) (2, 10) + NAME 'None' (2, 10) (2, 14) + OP '}' (2, 14) (2, 15) + FSTRING_MIDDLE 'bad' (2, 15) (2, 18) + FSTRING_END '\"""' (2, 18) (2, 21) """) self.check_tokenize('f"""abc"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index d48e9af8df1410..a531ac7505a83d 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1126,7 +1126,7 @@ tok_underflow_interactive(struct tok_state *tok) { static int tok_underflow_file(struct tok_state *tok) { - if (tok->start == NULL) { + if (tok->start == NULL && !INSIDE_FSTRING(tok)) { tok->cur = tok->inp = tok->buf; } if (tok->decoding_state == STATE_INIT) { From fd8b60aeb2a04bfbdc8a9e1bc63dd4563704b748 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 19 May 2023 17:04:22 +0100 Subject: [PATCH 17/20] Some cleanups --- Parser/tokenizer.c | 7 ++----- Python/Python-tokenize.c | 10 +++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index a531ac7505a83d..887ec9483df7b4 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1815,13 +1815,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'")); } - const char *prefix, *p, *type_start; + const char* p = NULL; + const char *prefix, *type_start; int current_starting_col_offset; - // if (tok->tok_extra_tokens) { - // p = tok->start; - // } - while (c != EOF && c != '\n') { c = tok_nextc(tok); } diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 1ced485a1e9c04..0b9eeae2af816b 100644 --- a/Python/Python-tokenize.c +++ 
b/Python/Python-tokenize.c @@ -193,10 +193,10 @@ tokenizeriter_next(tokenizeriterobject *it) return NULL; } const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; - int lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; - int end_lineno = it->tok->lineno; - int col_offset = -1; - int end_col_offset = -1; + Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; + Py_ssize_t end_lineno = it->tok->lineno; + Py_ssize_t col_offset = -1; + Py_ssize_t end_col_offset = -1; if (token.start != NULL && token.start >= line_start) { col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start); } @@ -204,7 +204,7 @@ tokenizeriter_next(tokenizeriterobject *it) end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); } - return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); + return Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); } static void From f1a5090172fd2a9de541854d4a4c1de87703ddd5 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 19 May 2023 17:12:21 +0100 Subject: [PATCH 18/20] pass the vacuum cleaner --- Lib/tokenize.py | 103 +----------------------------------------------- 1 file changed, 1 insertion(+), 102 deletions(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 06b14baf291ebb..bfe40c627fde57 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -56,112 +56,11 @@ def exact_type(self): else: return self.type -def group(*choices): return '(' + '|'.join(choices) + ')' -def any(*choices): return group(*choices) + '*' -def maybe(*choices): return group(*choices) + '?' - -# Note: we use unicode matching for names ("\w") but ascii matching for -# number literals. -Whitespace = r'[ \f\t]*' -Comment = r'#[^\r\n]*' -Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'\w+' - -Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' -Binnumber = r'0[bB](?:_?[01])+' -Octnumber = r'0[oO](?:_?[0-7])+' -Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' -Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) -Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' -Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', - r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) -Expfloat = r'[0-9](?:_?[0-9])*' + Exponent -Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') -Number = group(Imagnumber, Floatnumber, Intnumber) - -# Return the empty string, plus all of the valid string prefixes. -def _all_string_prefixes(): - # The valid string prefixes. Only contain the lower case versions, - # and don't contain any permutations (include 'fr', but not - # 'rf'). The various permutations will be generated. - _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr'] - # if we add binary f-strings, add: ['fb', 'fbr'] - result = {''} - for prefix in _valid_string_prefixes: - for t in _itertools.permutations(prefix): - # create a list with upper and lower versions of each - # character - for u in _itertools.product(*[(c, c.upper()) for c in t]): - result.add(''.join(u)) - return result - -@functools.lru_cache -def _compile(expr): - return re.compile(expr, re.UNICODE) - -# Note that since _all_string_prefixes includes the empty string, -# StringPrefix can be the empty string (making it optional). -StringPrefix = group(*_all_string_prefixes()) - -# Tail end of ' string. 
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'" -# Tail end of " string. -Double = r'[^"\\]*(?:\\.[^"\\]*)*"' -# Tail end of ''' string. -Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" -# Tail end of """ string. -Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -Triple = group(StringPrefix + "'''", StringPrefix + '"""') -# Single-line ' or " string. -String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", - StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') - -# Sorting in reverse order puts the long operators before their prefixes. -# Otherwise if = came before ==, == would get recognized as two instances -# of =. -Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True))) -Funny = group(r'\r?\n', Special) - -PlainToken = group(Number, Funny, String, Name) -Token = Ignore + PlainToken - -# First (or only) line of ' or " string. -ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + - group("'", r'\\\r?\n'), - StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + - group('"', r'\\\r?\n')) -PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) -PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) - -# For a given string prefix plus quotes, endpats maps it to a regex -# to match the remainder of that string. _prefix can be empty, for -# a normal single or triple quoted string (with no prefix). -endpats = {} -for _prefix in _all_string_prefixes(): - endpats[_prefix + "'"] = Single - endpats[_prefix + '"'] = Double - endpats[_prefix + "'''"] = Single3 - endpats[_prefix + '"""'] = Double3 -del _prefix - -# A set of all of the single and triple quoted string prefixes, -# including the opening quotes. -single_quoted = set() -triple_quoted = set() -for t in _all_string_prefixes(): - for u in (t + '"', t + "'"): - single_quoted.add(u) - for u in (t + '"""', t + "'''"): - triple_quoted.add(u) -del t, u - -tabsize = 8 class TokenError(Exception): pass -class StopTokenizing(Exception): pass +class StopTokenizing(Exception): pass class Untokenizer: From 7fb58b0db4dc2514d95131f35c14e28f6022d28a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sat, 20 May 2023 20:03:04 +0200 Subject: [PATCH 19/20] Fix refleaks --- Parser/pegen.c | 4 ++-- Parser/pegen_errors.c | 4 ++-- Parser/tokenizer.c | 15 +++++++++++++++ Parser/tokenizer.h | 2 ++ Python/Python-tokenize.c | 16 +++++++++++----- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index da410ea84ecb8e..b031a6f5d440e8 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -208,7 +208,7 @@ int _PyPegen_fill_token(Parser *p) { struct token new_token; - new_token.metadata = NULL; + _PyToken_Init(&new_token); int type = _PyTokenizer_Get(p->tok, &new_token); // Record and skip '# type: ignore' comments @@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p) Token *t = p->tokens[p->fill]; return initialize_token(p, t, &new_token, type); error: - Py_XDECREF(new_token.metadata); + _PyToken_Free(&new_token); return -1; } diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 1f227da0194e3c..af529057f50e70 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { int ret = 0; struct token new_token; - new_token.metadata = NULL; + _PyToken_Init(&new_token); for (;;) { switch (_PyTokenizer_Get(p->tok, &new_token)) { @@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { exit: - Py_XDECREF(new_token.metadata); + _PyToken_Free(&new_token); 
// If we're in an f-string, we want the syntax error in the expression part // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards // do not swallow it. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 887ec9483df7b4..090814ab9cd9f5 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -982,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok) PyMem_Free(tok); } +void +_PyToken_Free(struct token *token) { + Py_XDECREF(token->metadata); +} + +void +_PyToken_Init(struct token *token) { + token->metadata = NULL; +} + static int tok_readline_raw(struct tok_state *tok) { @@ -1973,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t struct tok_state ahead_tok; struct token ahead_token; + _PyToken_Init(&ahead_token); int ahead_tok_kind; memcpy(&ahead_tok, tok, sizeof(ahead_tok)); @@ -1988,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t returning a plain NAME token, return ASYNC. */ tok->async_def_indent = tok->indent; tok->async_def = 1; + _PyToken_Free(&ahead_token); return MAKE_TOKEN(ASYNC); } + _PyToken_Free(&ahead_token); } } @@ -2823,7 +2836,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) // if fetching the encoding shows a warning. tok->report_warnings = 0; while (tok->lineno < 2 && tok->done == E_OK) { + _PyToken_Init(&token); _PyTokenizer_Get(tok, &token); + _PyToken_Free(&token); } fclose(fp); if (tok->encoding) { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 444498458f2510..b96cb0d9754fae 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -139,6 +139,8 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int); extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void _PyTokenizer_Free(struct tok_state *); +extern void _PyToken_Free(struct token *); +extern void _PyToken_Init(struct token *); extern int _PyTokenizer_Get(struct tok_state *, struct token *); #define tok_dump _Py_tok_dump diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 0b9eeae2af816b..ece238672e34fd 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -162,18 +162,21 @@ _tokenizer_error(struct tok_state *tok) static PyObject * tokenizeriter_next(tokenizeriterobject *it) { + PyObject* result = NULL; struct token token; + _PyToken_Init(&token); + int type = _PyTokenizer_Get(it->tok, &token); if (type == ERRORTOKEN) { if(!PyErr_Occurred()) { _tokenizer_error(it->tok); assert(PyErr_Occurred()); } - return NULL; + goto exit; } if (type == ERRORTOKEN || type == ENDMARKER) { PyErr_SetString(PyExc_StopIteration, "EOF"); - return NULL; + goto exit; } PyObject *str = NULL; if (token.start == NULL || token.end == NULL) { @@ -183,14 +186,14 @@ tokenizeriter_next(tokenizeriterobject *it) str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); } if (str == NULL) { - return NULL; + goto exit; } Py_ssize_t size = it->tok->inp - it->tok->buf; PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace"); if (line == NULL) { Py_DECREF(str); - return NULL; + goto exit; } const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; Py_ssize_t lineno = ISSTRINGLIT(type) ? 
it->tok->first_lineno : it->tok->lineno; @@ -204,7 +207,10 @@ tokenizeriter_next(tokenizeriterobject *it) end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); } - return Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); + result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); +exit: + _PyToken_Free(&token); + return result; } static void From e1b5d352c07994da3e3d85ab94086c1f1c068409 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 20 May 2023 23:08:49 +0000 Subject: [PATCH 20/20] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst new file mode 100644 index 00000000000000..ff831c9f935db3 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst @@ -0,0 +1 @@ +Implement PEP 701 changes in the :mod:`tokenize` module. Patch by Marta Gómez Macías and Pablo Galindo Salgado
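
As a rough usage sketch of the behaviour this series aims for (based on the expectations in the updated Lib/test/test_tokenize.py hunks above, and assuming an interpreter built with these patches applied), tokenizing a small f-string through the rewritten tokenize.tokenize() should now go through the C tokenizer and yield FSTRING_START/FSTRING_MIDDLE/FSTRING_END tokens, with the replacement-field contents emitted as ordinary OP/NAME tokens; the exact positions printed below are illustrative, not verbatim interpreter output:

import io
import tokenize

# Feed bytes through readline, as tokenize.tokenize() expects.
source = 'f"abc{x}def"\n'
readline = io.BytesIO(source.encode("utf-8")).readline

for tok in tokenize.tokenize(readline):
    print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

# Expected token kinds for the f-string, per the updated tests:
#   ENCODING, FSTRING_START 'f"', FSTRING_MIDDLE 'abc',
#   OP '{', NAME 'x', OP '}', FSTRING_MIDDLE 'def',
#   FSTRING_END '"', NEWLINE, ENDMARKER
#
# Malformed input no longer surfaces as ERRORTOKEN: per the
# _tokenizer_error() changes above, it should raise SyntaxError,
# IndentationError, or TabError as appropriate.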