From 008f8e5a067657de37f6d8adb3bd415e38c671cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Wed, 19 Apr 2023 17:25:22 +0200 Subject: [PATCH 01/20] First iteration --- Grammar/Tokens | 1 + Include/internal/pycore_token.h | 1 + Lib/test/test_tokenize.py | 22 ++++-- Lib/token.py | 1 + Lib/tokenize.py | 120 +++++++++++++++++++++++++++++++- 5 files changed, 139 insertions(+), 6 deletions(-) diff --git a/Grammar/Tokens b/Grammar/Tokens index 096876fdd130f8..8f13217ab1e100 100644 --- a/Grammar/Tokens +++ b/Grammar/Tokens @@ -64,6 +64,7 @@ SOFT_KEYWORD FSTRING_START FSTRING_MIDDLE FSTRING_END +FSTRING_EXPR ERRORTOKEN # These aren't used by the C tokenizer but are needed for tokenize.py diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h index b9df8766736adf..9f50bf05609809 100644 --- a/Include/internal/pycore_token.h +++ b/Include/internal/pycore_token.h @@ -78,6 +78,7 @@ extern "C" { #define FSTRING_MIDDLE 62 #define FSTRING_END 63 #define ERRORTOKEN 64 +#define FSTRING_EXPR 69 #define N_TOKENS 68 #define NT_OFFSET 256 diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 911b53e5816588..95bca9e1a129f7 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -381,21 +381,33 @@ def test_string(self): STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) """) self.check_tokenize('f"abc"', """\ - STRING 'f"abc"' (1, 0) (1, 6) + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc' (1, 2) (1, 5) + FSTRING_END \'"\' (1, 5) (1, 6) """) self.check_tokenize('fR"a{b}c"', """\ - STRING 'fR"a{b}c"' (1, 0) (1, 9) + FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_MIDDLE 'a' (1, 3) (1, 4) + FSTRING_EXPR '{b}' (1, 4) (1, 7) + FSTRING_MIDDLE 'c' (1, 7) (1, 8) + FSTRING_END \'"\' (1, 8) (1, 9) """) self.check_tokenize('f"""abc"""', """\ - STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10) + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE 'abc' (1, 4) (1, 7) + FSTRING_END '\"""' (1, 7) (1, 10) """) self.check_tokenize(r'f"abc\ def"', """\ - STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) + FSTRING_END \'"\' (2, 3) (2, 4) """) self.check_tokenize(r'Rf"abc\ def"', """\ - STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START \'Rf"\' (1, 0) (1, 3) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) + FSTRING_END \'"\' (2, 3) (2, 4) """) def test_function(self): diff --git a/Lib/token.py b/Lib/token.py index 1459d12b376f82..cdbdba9c091076 100644 --- a/Lib/token.py +++ b/Lib/token.py @@ -67,6 +67,7 @@ FSTRING_START = 61 FSTRING_MIDDLE = 62 FSTRING_END = 63 +FSTRING_EXPR = 69 # These aren't used by the C tokenizer but are needed for tokenize.py ERRORTOKEN = 64 COMMENT = 65 diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 46d2224f5cc083..8062fd4875d8a8 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -29,6 +29,7 @@ import collections import functools from io import TextIOWrapper +from io import BytesIO import itertools as _itertools import re import sys @@ -37,6 +38,14 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) +fstring_re = re.compile( + r''' + (?P^[fFrR]{1,2}(?P[\'\"]{1,3})) + (?P.*) + (?P=quote)$ + ''', + re.VERBOSE | re.DOTALL +) import token __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", @@ -430,7 +439,7 @@ def tokenize(readline): return _tokenize(rl_gen.__next__, encoding) -def _tokenize(readline, encoding): +def 
_tokenize_normal_mode(readline, encoding): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 @@ -613,6 +622,115 @@ def _tokenize(readline, encoding): yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') +def _tokenize_fstring_mode(line, tok_start): + line_number, start = tok_start + + parts = fstring_re.match(line) + end = line_number, start + len(parts.group('start')) + yield TokenInfo( + type=FSTRING_START, + string=parts.group('start'), + start=(line_number, start), + end=end, + line=line) + + middle = parts.group('middle') + mid_token, mid_expr = '', '' + curly_brackets = [] + start, i = end[1], 0 + + for c in middle: + match c: + case '{': + # TODO: handle {{ and \{ + curly_brackets.append(c) + mid_expr += c + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start + i), + line=line) + mid_token = '' + end = line_number, start + i + case '}': + curly_brackets.pop() + mid_expr += c + yield TokenInfo( + type=FSTRING_EXPR, + string=mid_expr, + # +1 is needed here since this token is yielded when + # reading the }, before incrementing i. + start=end, + end=(line_number, start + i + 1), + line=line) + mid_expr = '' + end = line_number, start + i + 1 + case '\n': + if mid_expr: + mid_expr += c + else: + mid_token += c + line_number += 1 + start = 0 + i = -1 + case _: + if mid_expr: + mid_expr += c + else: + mid_token += c + i += 1 + + # once the end of the expression is reached, release what's left of + # mid_token + start += i + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start), + line=line) + end = line_number, start + + if curly_brackets: + # TODO: handle syntax error of not matching {} + pass + + yield TokenInfo( + type=FSTRING_END, + string=parts.group('quote'), + start=end, + end=(line_number, start + len(parts.group('quote'))), + line=line) + + +def _is_fstring(tok): + """Checks whether a STRING token is a fstring or not. + + Args: + tok: TokenInfo object of type STRING. + + Returns: + bool + """ + return tok.string.lower().startswith(('f', 'rf', 'fr')) + + +def _tokenize(readline, encoding): + """Tokenize Python code implementing the string mode and the normal mode. + + See PEP701 por more details. + """ + tokens = _tokenize_normal_mode(readline, encoding) + + for tok in tokens: + if tok.type != STRING or not _is_fstring(tok): + yield tok + else: + for t in _tokenize_fstring_mode(tok.string, tok.start): + yield t + + def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. 
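Note on PATCH 01: the pure-Python tokenizer keeps its normal mode untouched and re-tokenizes any STRING token that looks like an f-string through a new f-string mode, emitting FSTRING_START / FSTRING_MIDDLE / FSTRING_EXPR / FSTRING_END instead of a single STRING token. A minimal sketch of how that token stream can be inspected, assuming an interpreter built with this patch applied (FSTRING_EXPR is an intermediate token that later patches in this series remove):

    import tokenize
    from io import BytesIO

    def dump(source):
        # With PATCH 01 applied, tokenize() re-routes f-string STRING tokens
        # through _tokenize_fstring_mode(); on an unpatched build this prints
        # a single STRING token instead.
        readline = BytesIO(source.encode("utf-8")).readline
        for tok in tokenize.tokenize(readline):
            print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

    dump('fR"a{b}c"')
    # Expected, per the test added above:
    #   FSTRING_START  'fR"'  (1, 0) (1, 3)
    #   FSTRING_MIDDLE 'a'    (1, 3) (1, 4)
    #   FSTRING_EXPR   '{b}'  (1, 4) (1, 7)
    #   FSTRING_MIDDLE 'c'    (1, 7) (1, 8)
    #   FSTRING_END    '"'    (1, 8) (1, 9)
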
From 67a6ad6b66bd827d6e4ab7a0cd2e3fb7e6b6b5d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Thu, 27 Apr 2023 17:08:05 +0200 Subject: [PATCH 02/20] Handle escaping { --- Lib/test/test_tokenize.py | 5 ++++ Lib/tokenize.py | 58 +++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 95bca9e1a129f7..c66ae34f73ab54 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -391,6 +391,11 @@ def test_string(self): FSTRING_EXPR '{b}' (1, 4) (1, 7) FSTRING_MIDDLE 'c' (1, 7) (1, 8) FSTRING_END \'"\' (1, 8) (1, 9) + """) + self.check_tokenize('fR"a{{b}c"', """\ + FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_MIDDLE 'a{{b}c' (1, 3) (1, 9) + FSTRING_END \'"\' (1, 9) (1, 10) """) self.check_tokenize('f"""abc"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 8062fd4875d8a8..16c6f0d668fae0 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -639,33 +639,43 @@ def _tokenize_fstring_mode(line, tok_start): curly_brackets = [] start, i = end[1], 0 - for c in middle: + for position, c in enumerate(middle): match c: case '{': - # TODO: handle {{ and \{ - curly_brackets.append(c) - mid_expr += c - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start + i), - line=line) - mid_token = '' - end = line_number, start + i + # check out next position, if it's another {, then it is + # escaping the { character + if ((len(middle) >= position + 1 and middle[position + 1] == '{') + or (position > 0 and middle[position - 1] in ('\\', '{'))): + mid_token += c + else: + curly_brackets.append(c) + mid_expr += c + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start + i), + line=line) + mid_token = '' + end = line_number, start + i case '}': - curly_brackets.pop() - mid_expr += c - yield TokenInfo( - type=FSTRING_EXPR, - string=mid_expr, - # +1 is needed here since this token is yielded when - # reading the }, before incrementing i. - start=end, - end=(line_number, start + i + 1), - line=line) - mid_expr = '' - end = line_number, start + i + 1 + # if no opening { is seen before, this character is taken + # as part of the fstring middle token + if mid_expr: + curly_brackets.pop() + mid_expr += c + yield TokenInfo( + type=FSTRING_EXPR, + string=mid_expr, + # +1 is needed here since this token is yielded when + # reading the }, before incrementing i. 
+ start=end, + end=(line_number, start + i + 1), + line=line) + mid_expr = '' + end = line_number, start + i + 1 + else: + mid_token += c case '\n': if mid_expr: mid_expr += c From f58104d20269ba6878da404be0973e375473d271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sat, 29 Apr 2023 18:22:52 +0200 Subject: [PATCH 03/20] nested expressions --- Lib/test/test_tokenize.py | 25 +++++++++++++++---------- Lib/tokenize.py | 28 +++++++++++++++++++--------- lel.py | 12 ++++++++++++ 3 files changed, 46 insertions(+), 19 deletions(-) create mode 100644 lel.py diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index c66ae34f73ab54..7d2d200033221b 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -381,21 +381,26 @@ def test_string(self): STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) """) self.check_tokenize('f"abc"', """\ - FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_START 'f"' (1, 0) (1, 2) FSTRING_MIDDLE 'abc' (1, 2) (1, 5) - FSTRING_END \'"\' (1, 5) (1, 6) + FSTRING_END '"' (1, 5) (1, 6) """) self.check_tokenize('fR"a{b}c"', """\ - FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_START 'fR"' (1, 0) (1, 3) FSTRING_MIDDLE 'a' (1, 3) (1, 4) FSTRING_EXPR '{b}' (1, 4) (1, 7) FSTRING_MIDDLE 'c' (1, 7) (1, 8) - FSTRING_END \'"\' (1, 8) (1, 9) + FSTRING_END '"' (1, 8) (1, 9) """) self.check_tokenize('fR"a{{b}c"', """\ - FSTRING_START \'fR"\' (1, 0) (1, 3) + FSTRING_START 'fR"' (1, 0) (1, 3) FSTRING_MIDDLE 'a{{b}c' (1, 3) (1, 9) - FSTRING_END \'"\' (1, 9) (1, 10) + FSTRING_END '"' (1, 9) (1, 10) + """) + self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_EXPR '{f'''{f'{f"{1+1}"}'}'''}' (1, 4) (1, 28) + FSTRING_END '\"""' (1, 28) (1, 31) """) self.check_tokenize('f"""abc"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) @@ -404,15 +409,15 @@ def test_string(self): """) self.check_tokenize(r'f"abc\ def"', """\ - FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_START 'f"' (1, 0) (1, 2) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) - FSTRING_END \'"\' (2, 3) (2, 4) + FSTRING_END '"' (2, 3) (2, 4) """) self.check_tokenize(r'Rf"abc\ def"', """\ - FSTRING_START \'Rf"\' (1, 0) (1, 3) + FSTRING_START 'Rf"' (1, 0) (1, 3) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) - FSTRING_END \'"\' (2, 3) (2, 4) + FSTRING_END '"' (2, 3) (2, 4) """) def test_function(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 16c6f0d668fae0..1a8ec9412459fd 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -647,7 +647,7 @@ def _tokenize_fstring_mode(line, tok_start): if ((len(middle) >= position + 1 and middle[position + 1] == '{') or (position > 0 and middle[position - 1] in ('\\', '{'))): mid_token += c - else: + elif mid_token: curly_brackets.append(c) mid_expr += c yield TokenInfo( @@ -658,11 +658,17 @@ def _tokenize_fstring_mode(line, tok_start): line=line) mid_token = '' end = line_number, start + i + else: + curly_brackets.append(c) + mid_expr += c case '}': # if no opening { is seen before, this character is taken # as part of the fstring middle token - if mid_expr: + # if there are remaining elements in the curly_brackets queue + # then the expression is not done yet + if curly_brackets: curly_brackets.pop() + if mid_expr and not curly_brackets: mid_expr += c yield TokenInfo( type=FSTRING_EXPR, @@ -675,7 +681,10 @@ def _tokenize_fstring_mode(line, tok_start): mid_expr = '' end = line_number, start + i + 1 else: - mid_token += c + if mid_expr: + mid_expr += c + else: + mid_token += c case 
'\n': if mid_expr: mid_expr += c @@ -694,12 +703,13 @@ def _tokenize_fstring_mode(line, tok_start): # once the end of the expression is reached, release what's left of # mid_token start += i - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start), - line=line) + if mid_token: + yield TokenInfo( + type=FSTRING_MIDDLE, + string=mid_token, + start=end, + end=(line_number, start), + line=line) end = line_number, start if curly_brackets: diff --git a/lel.py b/lel.py new file mode 100644 index 00000000000000..c1c0274876054b --- /dev/null +++ b/lel.py @@ -0,0 +1,12 @@ +import tokenize +from pprint import pprint +from io import BytesIO + +def t(s): + pprint(list(tokenize.tokenize(BytesIO(s.encode()).readline))) + + +a = r'f"abc\ +def"' + +t(a) From 26102cca0cd85450b00ee2a3c82123bb6a1dd3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Tue, 2 May 2023 23:15:24 +0200 Subject: [PATCH 04/20] Recursive expression tokenization --- Lib/test/test_tokenize.py | 57 ++++++++++++++-- Lib/tokenize.py | 140 +++++++++++++++++++++++++++++--------- lel.py | 12 ---- 3 files changed, 160 insertions(+), 49 deletions(-) delete mode 100644 lel.py diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 7d2d200033221b..317e1191728d55 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -388,18 +388,47 @@ def test_string(self): self.check_tokenize('fR"a{b}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) FSTRING_MIDDLE 'a' (1, 3) (1, 4) - FSTRING_EXPR '{b}' (1, 4) (1, 7) + OP '{' (1, 4) (1, 5) + NAME 'b' (1, 5) (1, 6) + OP '}' (1, 6) (1, 7) FSTRING_MIDDLE 'c' (1, 7) (1, 8) FSTRING_END '"' (1, 8) (1, 9) """) - self.check_tokenize('fR"a{{b}c"', """\ + self.check_tokenize('fR"a{{{b!r}}}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) - FSTRING_MIDDLE 'a{{b}c' (1, 3) (1, 9) + FSTRING_MIDDLE 'a{b}c' (1, 3) (1, 7) + FSTRING_END '"' (1, 7) (1, 8) + """) + self.check_tokenize('f"{{{1+1}}}"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE '{' (1, 2) (1, 3) + OP '{' (1, 3) (1, 4) + NUMBER '1' (1, 4) (1, 5) + OP '+' (1, 5) (1, 6) + NUMBER '1' (1, 6) (1, 7) + OP '}' (1, 7) (1, 8) + FSTRING_MIDDLE '}' (1, 8) (1, 9) FSTRING_END '"' (1, 9) (1, 10) """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) - FSTRING_EXPR '{f'''{f'{f"{1+1}"}'}'''}' (1, 4) (1, 28) + OP '{' (1, 4) (1, 5) + FSTRING_START "f'''" (1, 5) (1, 9) + OP '{' (1, 9) (1, 10) + FSTRING_START "f'" (1, 10) (1, 12) + OP '{' (1, 12) (1, 13) + FSTRING_START 'f"' (1, 13) (1, 15) + OP '{' (1, 15) (1, 16) + NUMBER '1' (1, 16) (1, 17) + OP '+' (1, 17) (1, 18) + NUMBER '1' (1, 18) (1, 19) + OP '}' (1, 19) (1, 20) + FSTRING_END '"' (1, 20) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_END "'" (1, 22) (1, 23) + OP '}' (1, 23) (1, 24) + FSTRING_END "'''" (1, 24) (1, 27) + OP '}' (1, 27) (1, 28) FSTRING_END '\"""' (1, 28) (1, 31) """) self.check_tokenize('f"""abc"""', """\ @@ -418,6 +447,26 @@ def test_string(self): FSTRING_START 'Rf"' (1, 0) (1, 3) FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) FSTRING_END '"' (2, 3) (2, 4) + """) + self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\ + FSTRING_START "f'" (1, 0) (1, 2) + FSTRING_MIDDLE 'some words ' (1, 2) (1, 13) + OP '{' (1, 13) (1, 14) + NAME 'a' (1, 14) (1, 15) + OP '+' (1, 15) (1, 16) + NAME 'b' (1, 16) (1, 17) + OP ':' (1, 17) (1, 18) + FSTRING_MIDDLE '.3f' (1, 18) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_MIDDLE ' more words ' (1, 22) 
(1, 34) + OP '{' (1, 34) (1, 35) + NAME 'c' (1, 35) (1, 36) + OP '+' (1, 36) (1, 37) + NAME 'd' (1, 37) (1, 38) + OP '=' (1, 38) (1, 39) + OP '}' (1, 39) (1, 40) + FSTRING_MIDDLE ' final words' (1, 40) (1, 52) + FSTRING_END "'" (1, 52) (1, 53) """) def test_function(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 1a8ec9412459fd..3cee3d99721f07 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -30,6 +30,7 @@ import functools from io import TextIOWrapper from io import BytesIO +from io import StringIO import itertools as _itertools import re import sys @@ -439,14 +440,14 @@ def tokenize(readline): return _tokenize(rl_gen.__next__, encoding) -def _tokenize_normal_mode(readline, encoding): +def _tokenize_normal_mode(readline, encoding, fstring_mode=False): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] - if encoding is not None: + if encoding is not None and not fstring_mode: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" @@ -614,6 +615,8 @@ def _tokenize_normal_mode(readline, encoding): (lnum, pos), (lnum, pos+1), line) pos += 1 + if fstring_mode: + return # Add an implicit NEWLINE if the input doesn't end in one if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') @@ -622,42 +625,60 @@ def _tokenize_normal_mode(readline, encoding): yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') -def _tokenize_fstring_mode(line, tok_start): +def _tokenize_fstring_mode(line, tok_start, encoding): line_number, start = tok_start - parts = fstring_re.match(line) - end = line_number, start + len(parts.group('start')) + end_col = start + len(parts.group('start')) yield TokenInfo( type=FSTRING_START, string=parts.group('start'), start=(line_number, start), - end=end, + end=(line_number, end_col), line=line) middle = parts.group('middle') + + for token in _tokenize_fstring_middle(middle, end_col, line_number, line, encoding): + yield token + + last_line, last_col = token.end + yield TokenInfo( + type=FSTRING_END, + string=parts.group('quote'), + start=token.end, + end=(last_line, last_col + len(parts.group('quote'))), + line=line) + + + +def _tokenize_fstring_middle(middle, start, line_number, line, encoding): + n_chars_in_curr_line = 0 mid_token, mid_expr = '', '' curly_brackets = [] - start, i = end[1], 0 - + end = (line_number, start) + escaping = False for position, c in enumerate(middle): match c: case '{': # check out next position, if it's another {, then it is # escaping the { character - if ((len(middle) >= position + 1 and middle[position + 1] == '{') - or (position > 0 and middle[position - 1] in ('\\', '{'))): - mid_token += c - elif mid_token: + if len(middle) >= position + 1 and middle[position + 1] == '{' and not escaping: + escaping = True + continue + elif mid_token and not escaping: curly_brackets.append(c) mid_expr += c yield TokenInfo( type=FSTRING_MIDDLE, string=mid_token, start=end, - end=(line_number, start + i), + end=(line_number, start + n_chars_in_curr_line), line=line) mid_token = '' - end = line_number, start + i + end = line_number, start + n_chars_in_curr_line + elif escaping: + escaping = False + mid_token += c else: curly_brackets.append(c) mid_expr += c @@ -666,20 +687,80 @@ def _tokenize_fstring_mode(line, tok_start): # as part of the fstring middle token # if there are remaining elements in the curly_brackets queue # then the 
expression is not done yet + if escaping: + escaping = False + mid_token += c + continue + elif len(middle) >= position + 1 and middle[position + 1] == '}': + escaping = True + continue if curly_brackets: curly_brackets.pop() if mid_expr and not curly_brackets: + yield TokenInfo( + type=OP, + string='{', + start=end, + end=(line_number, end[1] + 1), + line=line) + end = line_number, end[1] + 1 + mid_expr += c + + mid_expr = mid_expr[1:-1] + + # Find any first level : or ! + curly_level = 0 + break_char_index = -1 + for char_index, char in enumerate(mid_expr): + if char == '{': + curly_level += 1 + elif char == '}': + curly_level -= 1 + elif char in {'!', ':'} and not curly_level: + break_char_index = char_index + break + + expression_chunk = mid_expr + if break_char_index != -1: + expression_chunk = mid_expr[:break_char_index+1] + + if encoding is not None: + buffer = BytesIO(expression_chunk.encode()).readline + else: + buffer = StringIO(expression_chunk).readline + for t in _tokenize(buffer, encoding, fstring_mode=True): + yield TokenInfo( + type=t.type, + string=t.string, + start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), + end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), + line=line + ) + + end = t.end[0] - 1 + end[0], t.end[1] + end[1] + + if break_char_index != -1: + formatting_chunk = mid_expr[break_char_index+1:] + for t in _tokenize_fstring_middle( + middle=formatting_chunk, + start=end[1], + line_number=line_number, + line=line, + encoding=encoding): + + yield t + end = t.end + yield TokenInfo( - type=FSTRING_EXPR, - string=mid_expr, - # +1 is needed here since this token is yielded when - # reading the }, before incrementing i. + type=OP, + string='}', start=end, - end=(line_number, start + i + 1), + end=(end[0], end[1] + 1), line=line) + mid_expr = '' - end = line_number, start + i + 1 + end = line_number, start + n_chars_in_curr_line + 1 else: if mid_expr: mid_expr += c @@ -692,17 +773,17 @@ def _tokenize_fstring_mode(line, tok_start): mid_token += c line_number += 1 start = 0 - i = -1 + n_chars_in_curr_line = -1 case _: if mid_expr: mid_expr += c else: mid_token += c - i += 1 + n_chars_in_curr_line += 1 # once the end of the expression is reached, release what's left of # mid_token - start += i + start += n_chars_in_curr_line if mid_token: yield TokenInfo( type=FSTRING_MIDDLE, @@ -716,13 +797,6 @@ def _tokenize_fstring_mode(line, tok_start): # TODO: handle syntax error of not matching {} pass - yield TokenInfo( - type=FSTRING_END, - string=parts.group('quote'), - start=end, - end=(line_number, start + len(parts.group('quote'))), - line=line) - def _is_fstring(tok): """Checks whether a STRING token is a fstring or not. @@ -736,18 +810,18 @@ def _is_fstring(tok): return tok.string.lower().startswith(('f', 'rf', 'fr')) -def _tokenize(readline, encoding): +def _tokenize(readline, encoding, fstring_mode=False): """Tokenize Python code implementing the string mode and the normal mode. See PEP701 por more details. 
""" - tokens = _tokenize_normal_mode(readline, encoding) + tokens = _tokenize_normal_mode(readline, encoding, fstring_mode) for tok in tokens: if tok.type != STRING or not _is_fstring(tok): yield tok else: - for t in _tokenize_fstring_mode(tok.string, tok.start): + for t in _tokenize_fstring_mode(tok.string, tok.start, encoding): yield t diff --git a/lel.py b/lel.py deleted file mode 100644 index c1c0274876054b..00000000000000 --- a/lel.py +++ /dev/null @@ -1,12 +0,0 @@ -import tokenize -from pprint import pprint -from io import BytesIO - -def t(s): - pprint(list(tokenize.tokenize(BytesIO(s.encode()).readline))) - - -a = r'f"abc\ -def"' - -t(a) From a5f4b408aed4a4006b8eb77a6d741029db630783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Tue, 2 May 2023 23:23:56 +0200 Subject: [PATCH 05/20] Remove intermediate token created for dev purposes --- Grammar/Tokens | 1 - Include/internal/pycore_token.h | 1 - 2 files changed, 2 deletions(-) diff --git a/Grammar/Tokens b/Grammar/Tokens index 8f13217ab1e100..096876fdd130f8 100644 --- a/Grammar/Tokens +++ b/Grammar/Tokens @@ -64,7 +64,6 @@ SOFT_KEYWORD FSTRING_START FSTRING_MIDDLE FSTRING_END -FSTRING_EXPR ERRORTOKEN # These aren't used by the C tokenizer but are needed for tokenize.py diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h index 9f50bf05609809..b9df8766736adf 100644 --- a/Include/internal/pycore_token.h +++ b/Include/internal/pycore_token.h @@ -78,7 +78,6 @@ extern "C" { #define FSTRING_MIDDLE 62 #define FSTRING_END 63 #define ERRORTOKEN 64 -#define FSTRING_EXPR 69 #define N_TOKENS 68 #define NT_OFFSET 256 From 598bab44633ff90d313846b3a0ed1a6498909751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Wed, 3 May 2023 23:50:38 +0200 Subject: [PATCH 06/20] More improvements --- Lib/test/test_tokenize.py | 33 ++++++++++++++++++++---------- Lib/tokenize.py | 43 ++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 317e1191728d55..d53064b3aa7387 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -396,19 +396,30 @@ def test_string(self): """) self.check_tokenize('fR"a{{{b!r}}}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) - FSTRING_MIDDLE 'a{b}c' (1, 3) (1, 7) - FSTRING_END '"' (1, 7) (1, 8) + FSTRING_MIDDLE 'a{' (1, 3) (1, 6) + OP '{' (1, 6) (1, 7) + NAME 'b' (1, 7) (1, 8) + OP '!' 
(1, 8) (1, 9) + FSTRING_MIDDLE 'r' (1, 9) (1, 10) + OP '}' (1, 10) (1, 12) + FSTRING_MIDDLE '}c' (1, 12) (1, 14) + FSTRING_END '"' (1, 14) (1, 15) """) self.check_tokenize('f"{{{1+1}}}"', """\ - FSTRING_START 'f"' (1, 0) (1, 2) - FSTRING_MIDDLE '{' (1, 2) (1, 3) - OP '{' (1, 3) (1, 4) - NUMBER '1' (1, 4) (1, 5) - OP '+' (1, 5) (1, 6) - NUMBER '1' (1, 6) (1, 7) - OP '}' (1, 7) (1, 8) - FSTRING_MIDDLE '}' (1, 8) (1, 9) - FSTRING_END '"' (1, 9) (1, 10) + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE '{' (1, 2) (1, 4) + OP '{' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + OP '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + OP '}' (1, 8) (1, 10) + FSTRING_MIDDLE '}' (1, 10) (1, 11) + FSTRING_END '"' (1, 11) (1, 12) + """) + self.check_tokenize('f"{1+1"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + ERRORTOKEN '{' (1, 2) (1, 3) + FSTRING_END '"' (1, 3) (1, 4) """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 3cee3d99721f07..cf6a140eb287bf 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -652,7 +652,6 @@ def _tokenize_fstring_mode(line, tok_start, encoding): def _tokenize_fstring_middle(middle, start, line_number, line, encoding): - n_chars_in_curr_line = 0 mid_token, mid_expr = '', '' curly_brackets = [] end = (line_number, start) @@ -664,36 +663,35 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): # escaping the { character if len(middle) >= position + 1 and middle[position + 1] == '{' and not escaping: escaping = True - continue elif mid_token and not escaping: - curly_brackets.append(c) + curly_brackets.append((line_number, start)) mid_expr += c yield TokenInfo( type=FSTRING_MIDDLE, string=mid_token, start=end, - end=(line_number, start + n_chars_in_curr_line), + end=(line_number, start), line=line) mid_token = '' - end = line_number, start + n_chars_in_curr_line + end = line_number, start elif escaping: escaping = False mid_token += c else: - curly_brackets.append(c) + curly_brackets.append((line_number, start)) mid_expr += c case '}': - # if no opening { is seen before, this character is taken - # as part of the fstring middle token - # if there are remaining elements in the curly_brackets queue - # then the expression is not done yet + # If two }} are seen, then the first one is skipped and the + # second is added as part of the fstring_middle token if escaping: escaping = False mid_token += c continue - elif len(middle) >= position + 1 and middle[position + 1] == '}': + elif len(middle) > position + 1 and middle[position + 1] == '}': escaping = True + start += 1 continue + if curly_brackets: curly_brackets.pop() if mid_expr and not curly_brackets: @@ -756,11 +754,11 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): type=OP, string='}', start=end, - end=(end[0], end[1] + 1), + end=(line_number, start + 1), line=line) mid_expr = '' - end = line_number, start + n_chars_in_curr_line + 1 + end = line_number, start + 1 else: if mid_expr: mid_expr += c @@ -772,30 +770,33 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): else: mid_token += c line_number += 1 - start = 0 - n_chars_in_curr_line = -1 + start = -1 case _: if mid_expr: mid_expr += c else: mid_token += c - n_chars_in_curr_line += 1 + start += 1 # once the end of the expression is reached, release what's left of # mid_token - start += n_chars_in_curr_line if mid_token: yield TokenInfo( type=FSTRING_MIDDLE, string=mid_token, start=end, - 
end=(line_number, start), + end=(line_number, end[1] + len(mid_token)), line=line) - end = line_number, start + end = line_number, end[1] + len(mid_token) if curly_brackets: - # TODO: handle syntax error of not matching {} - pass + lnum, pos = curly_brackets.pop() + yield TokenInfo( + type=ERRORTOKEN, + string=line.split('\n')[lnum - 1][pos], + start=(lnum, pos), + end=(lnum, pos+1), + line=line) def _is_fstring(tok): From a0ed8162d1bf26183be56a9feb5f05080cfc484b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Mon, 8 May 2023 00:35:06 +0200 Subject: [PATCH 07/20] fix handling of } tokens --- Lib/test/test_tokenize.py | 10 +-- Lib/tokenize.py | 158 +++++++++++++++++++------------------- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index d53064b3aa7387..b13b2c0f65ce79 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -400,9 +400,9 @@ def test_string(self): OP '{' (1, 6) (1, 7) NAME 'b' (1, 7) (1, 8) OP '!' (1, 8) (1, 9) - FSTRING_MIDDLE 'r' (1, 9) (1, 10) - OP '}' (1, 10) (1, 12) - FSTRING_MIDDLE '}c' (1, 12) (1, 14) + NAME 'r' (1, 9) (1, 10) + OP '}' (1, 10) (1, 11) + FSTRING_MIDDLE '}c' (1, 11) (1, 14) FSTRING_END '"' (1, 14) (1, 15) """) self.check_tokenize('f"{{{1+1}}}"', """\ @@ -412,8 +412,8 @@ def test_string(self): NUMBER '1' (1, 5) (1, 6) OP '+' (1, 6) (1, 7) NUMBER '1' (1, 7) (1, 8) - OP '}' (1, 8) (1, 10) - FSTRING_MIDDLE '}' (1, 10) (1, 11) + OP '}' (1, 8) (1, 9) + FSTRING_MIDDLE '}' (1, 9) (1, 11) FSTRING_END '"' (1, 11) (1, 12) """) self.check_tokenize('f"{1+1"', """\ diff --git a/Lib/tokenize.py b/Lib/tokenize.py index cf6a140eb287bf..951e3bfb308224 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -682,88 +682,88 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): mid_expr += c case '}': # If two }} are seen, then the first one is skipped and the - # second is added as part of the fstring_middle token - if escaping: - escaping = False - mid_token += c - continue - elif len(middle) > position + 1 and middle[position + 1] == '}': - escaping = True - start += 1 - continue - - if curly_brackets: - curly_brackets.pop() - if mid_expr and not curly_brackets: - yield TokenInfo( - type=OP, - string='{', - start=end, - end=(line_number, end[1] + 1), - line=line) - end = line_number, end[1] + 1 - - mid_expr += c - - mid_expr = mid_expr[1:-1] - - # Find any first level : or ! - curly_level = 0 - break_char_index = -1 - for char_index, char in enumerate(mid_expr): - if char == '{': - curly_level += 1 - elif char == '}': - curly_level -= 1 - elif char in {'!', ':'} and not curly_level: - break_char_index = char_index - break - - expression_chunk = mid_expr - if break_char_index != -1: - expression_chunk = mid_expr[:break_char_index+1] - - if encoding is not None: - buffer = BytesIO(expression_chunk.encode()).readline + # second is added as part of the fstring_middle token. + # This is only applied when parsing fstring_middle tokens, + # not when parsing an expression. 
+ if not mid_expr: + if escaping: + escaping = False + mid_token += c + elif len(middle) > position + 1 and middle[position + 1] == '}': + escaping = True else: - buffer = StringIO(expression_chunk).readline - for t in _tokenize(buffer, encoding, fstring_mode=True): + mid_token += c + else: + # parsing an expression + if curly_brackets: + curly_brackets.pop() + if not curly_brackets: yield TokenInfo( - type=t.type, - string=t.string, - start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), - end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), - line=line - ) - - end = t.end[0] - 1 + end[0], t.end[1] + end[1] - - if break_char_index != -1: - formatting_chunk = mid_expr[break_char_index+1:] - for t in _tokenize_fstring_middle( - middle=formatting_chunk, - start=end[1], - line_number=line_number, - line=line, - encoding=encoding): - - yield t - end = t.end - - yield TokenInfo( - type=OP, - string='}', - start=end, - end=(line_number, start + 1), - line=line) + type=OP, + string='{', + start=end, + end=(line_number, end[1] + 1), + line=line) + end = line_number, end[1] + 1 - mid_expr = '' - end = line_number, start + 1 - else: - if mid_expr: mid_expr += c + + mid_expr = mid_expr[1:-1] + + # Find any first level : or ! + curly_level = 0 + break_char_index = -1 + for char_index, char in enumerate(mid_expr): + if char == '{': + curly_level += 1 + elif char == '}': + curly_level -= 1 + elif char in {':'} and not curly_level: + break_char_index = char_index + break + + expression_chunk = mid_expr + if break_char_index != -1: + expression_chunk = mid_expr[:break_char_index+1] + + if encoding is not None: + buffer = BytesIO(expression_chunk.encode()).readline + else: + buffer = StringIO(expression_chunk).readline + for t in _tokenize(buffer, encoding, fstring_mode=True): + yield TokenInfo( + type=t.type, + string=t.string, + start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), + end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), + line=line + ) + + end = t.end[0] - 1 + end[0], t.end[1] + end[1] + + if break_char_index != -1: + formatting_chunk = mid_expr[break_char_index+1:] + for t in _tokenize_fstring_middle( + middle=formatting_chunk, + start=end[1], + line_number=line_number, + line=line, + encoding=encoding): + + yield t + end = t.end + + yield TokenInfo( + type=OP, + string='}', + start=end, + end=(line_number, start + 1), + line=line) + + mid_expr = '' + end = line_number, start + 1 else: - mid_token += c + mid_expr += c case '\n': if mid_expr: mid_expr += c @@ -785,9 +785,9 @@ def _tokenize_fstring_middle(middle, start, line_number, line, encoding): type=FSTRING_MIDDLE, string=mid_token, start=end, - end=(line_number, end[1] + len(mid_token)), + end=(line_number, start), line=line) - end = line_number, end[1] + len(mid_token) + end = line_number, start if curly_brackets: lnum, pos = curly_brackets.pop() From 90b4ab1ff746115859e3b873f8b58dc1ec98e555 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Tue, 16 May 2023 21:38:14 +0100 Subject: [PATCH 08/20] other tokenizer --- Doc/library/token-list.inc | 4 + Grammar/Tokens | 4 +- .../pycore_global_objects_fini_generated.h | 1 + Include/internal/pycore_global_strings.h | 1 + .../internal/pycore_runtime_init_generated.h | 1 + Include/internal/pycore_token.h | 4 +- .../internal/pycore_unicodeobject_generated.h | 3 + Lib/inspect.py | 1 + Lib/test/test_tokenize.py | 38 +-- Lib/token.py | 7 +- Lib/tokenize.py | 255 +++--------------- Lib/trace.py | 1 + Parser/token.c | 4 +- Parser/tokenizer.c | 35 ++- Parser/tokenizer.h | 2 + Python/Python-tokenize.c | 15 +- 
Python/clinic/Python-tokenize.c.h | 22 +- lel.py | 16 ++ 18 files changed, 156 insertions(+), 258 deletions(-) create mode 100644 lel.py diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc index 3b345099bf54b5..e885de88cad9ae 100644 --- a/Doc/library/token-list.inc +++ b/Doc/library/token-list.inc @@ -223,6 +223,10 @@ .. data:: FSTRING_END +.. data:: COMMENT + +.. data:: NL + .. data:: ERRORTOKEN .. data:: N_TOKENS diff --git a/Grammar/Tokens b/Grammar/Tokens index 096876fdd130f8..618ae811d824b0 100644 --- a/Grammar/Tokens +++ b/Grammar/Tokens @@ -64,9 +64,9 @@ SOFT_KEYWORD FSTRING_START FSTRING_MIDDLE FSTRING_END +COMMENT +NL ERRORTOKEN # These aren't used by the C tokenizer but are needed for tokenize.py -COMMENT -NL ENCODING diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 24a268ac8c43ec..d28ab4aa81b962 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -918,6 +918,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index c1005d05155271..d964c7134146c9 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -406,6 +406,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(exception) STRUCT_FOR_ID(exp) STRUCT_FOR_ID(extend) + STRUCT_FOR_ID(extra_tokens) STRUCT_FOR_ID(facility) STRUCT_FOR_ID(factory) STRUCT_FOR_ID(false) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index ff1dee6eacfe5d..a2568074e19301 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -912,6 +912,7 @@ extern "C" { INIT_ID(exception), \ INIT_ID(exp), \ INIT_ID(extend), \ + INIT_ID(extra_tokens), \ INIT_ID(facility), \ INIT_ID(factory), \ INIT_ID(false), \ diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h index b9df8766736adf..c02e637fee1ee2 100644 --- a/Include/internal/pycore_token.h +++ b/Include/internal/pycore_token.h @@ -77,7 +77,9 @@ extern "C" { #define FSTRING_START 61 #define FSTRING_MIDDLE 62 #define FSTRING_END 63 -#define ERRORTOKEN 64 +#define COMMENT 64 +#define NL 65 +#define ERRORTOKEN 66 #define N_TOKENS 68 #define NT_OFFSET 256 diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index ba6b37f1bf55b3..4f2634c7f2029f 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1059,6 +1059,9 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { string = &_Py_ID(extend); assert(_PyUnicode_CheckConsistency(string, 1)); _PyUnicode_InternInPlace(interp, &string); + string = &_Py_ID(extra_tokens); + assert(_PyUnicode_CheckConsistency(string, 1)); + _PyUnicode_InternInPlace(interp, &string); string = &_Py_ID(facility); assert(_PyUnicode_CheckConsistency(string, 1)); 
_PyUnicode_InternInPlace(interp, &string); diff --git a/Lib/inspect.py b/Lib/inspect.py index 63f5aa91d270b7..e413274c98458a 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -2215,6 +2215,7 @@ def _signature_fromstr(cls, obj, s, skip_bound_arg=True): module = None if not isinstance(module, ast.Module): + breakpoint() raise ValueError("{!r} builtin has invalid signature".format(obj)) f = module.body[0] diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index b13b2c0f65ce79..5398139e155012 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,6 +1,6 @@ from test import support from test.support import os_helper -from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, +from tokenize import (tokenize, tokenize2, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT) @@ -46,7 +46,7 @@ def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. f = BytesIO(s.encode('utf-8')) - result = stringify_tokens_from_source(tokenize(f.readline), s) + result = stringify_tokens_from_source(tokenize2(f.readline), s) self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) @@ -396,31 +396,33 @@ def test_string(self): """) self.check_tokenize('fR"a{{{b!r}}}c"', """\ FSTRING_START 'fR"' (1, 0) (1, 3) - FSTRING_MIDDLE 'a{' (1, 3) (1, 6) + FSTRING_MIDDLE 'a{' (1, 3) (1, 5) OP '{' (1, 6) (1, 7) NAME 'b' (1, 7) (1, 8) OP '!' (1, 8) (1, 9) NAME 'r' (1, 9) (1, 10) OP '}' (1, 10) (1, 11) - FSTRING_MIDDLE '}c' (1, 11) (1, 14) + FSTRING_MIDDLE '}' (1, 11) (1, 12) + FSTRING_MIDDLE 'c' (1, 13) (1, 14) FSTRING_END '"' (1, 14) (1, 15) """) self.check_tokenize('f"{{{1+1}}}"', """\ FSTRING_START 'f"' (1, 0) (1, 2) - FSTRING_MIDDLE '{' (1, 2) (1, 4) + FSTRING_MIDDLE '{' (1, 2) (1, 3) OP '{' (1, 4) (1, 5) NUMBER '1' (1, 5) (1, 6) OP '+' (1, 6) (1, 7) NUMBER '1' (1, 7) (1, 8) OP '}' (1, 8) (1, 9) - FSTRING_MIDDLE '}' (1, 9) (1, 11) + FSTRING_MIDDLE '}' (1, 9) (1, 10) FSTRING_END '"' (1, 11) (1, 12) """) - self.check_tokenize('f"{1+1"', """\ - FSTRING_START 'f"' (1, 0) (1, 2) - ERRORTOKEN '{' (1, 2) (1, 3) - FSTRING_END '"' (1, 3) (1, 4) - """) + # TODO: I don't think is is correct now (ERRORTOKEN) + # self.check_tokenize('f"{1+1"', """\ + # FSTRING_START 'f"' (1, 0) (1, 2) + # ERRORTOKEN '{' (1, 2) (1, 3) + # FSTRING_END '"' (1, 3) (1, 4) + # """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) OP '{' (1, 4) (1, 5) @@ -2578,13 +2580,13 @@ async def bar(): pass def test_unicode(self): self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ - NAME 'Örter' (1, 0) (1, 6) - EQUAL '=' (1, 7) (1, 8) - STRING "u'places'" (1, 9) (1, 18) - NEWLINE '' (1, 18) (1, 18) - NAME 'grün' (2, 0) (2, 5) - EQUAL '=' (2, 6) (2, 7) - STRING "U'green'" (2, 8) (2, 16) + NAME 'Örter' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + STRING "u'places'" (1, 8) (1, 17) + NEWLINE '' (1, 17) (1, 17) + NAME 'grün' (2, 0) (2, 4) + EQUAL '=' (2, 5) (2, 6) + STRING "U'green'" (2, 7) (2, 15) """) def test_invalid_syntax(self): diff --git a/Lib/token.py b/Lib/token.py index cdbdba9c091076..487f6edd3c951c 100644 --- a/Lib/token.py +++ b/Lib/token.py @@ -67,11 +67,10 @@ FSTRING_START = 61 FSTRING_MIDDLE = 62 FSTRING_END = 63 -FSTRING_EXPR = 69 +COMMENT = 64 +NL = 65 # These aren't used by the C tokenizer 
but are needed for tokenize.py -ERRORTOKEN = 64 -COMMENT = 65 -NL = 66 +ERRORTOKEN = 66 ENCODING = 67 N_TOKENS = 68 # Special definitions for cooperation with parser diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 951e3bfb308224..b986076527aba8 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -29,8 +29,6 @@ import collections import functools from io import TextIOWrapper -from io import BytesIO -from io import StringIO import itertools as _itertools import re import sys @@ -39,14 +37,6 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) -fstring_re = re.compile( - r''' - (?P^[fFrR]{1,2}(?P[\'\"]{1,3})) - (?P.*) - (?P=quote)$ - ''', - re.VERBOSE | re.DOTALL -) import token __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", @@ -414,6 +404,32 @@ def open(filename): buffer.close() raise +def tokenize2(readline): + encoding, consumed = detect_encoding(readline) + + rl_gen = _itertools.chain(consumed, iter(readline, b"")) + if encoding is not None: + if encoding == "utf-8-sig": + # BOM will already have been stripped. + encoding = "utf-8" + yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') + yield from _tokenize2(rl_gen, encoding) + +def _tokenize2(rl_gen, encoding): + source = b"".join(rl_gen) + for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True): + # TODO: Marta -> limpiar esto + if 6 < token.type <= 54: + token = token._replace(type=OP) + if token.type in {ASYNC, AWAIT}: + token = token._replace(type=NAME) + if token.type == NEWLINE: + l_start, c_start = token.start + l_end, c_end = token.end + token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) + + yield token + def tokenize(readline): """ @@ -440,14 +456,14 @@ def tokenize(readline): return _tokenize(rl_gen.__next__, encoding) -def _tokenize_normal_mode(readline, encoding, fstring_mode=False): +def _tokenize(readline, encoding): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] - if encoding is not None and not fstring_mode: + if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. 
encoding = "utf-8" @@ -615,8 +631,6 @@ def _tokenize_normal_mode(readline, encoding, fstring_mode=False): (lnum, pos), (lnum, pos+1), line) pos += 1 - if fstring_mode: - return # Add an implicit NEWLINE if the input doesn't end in one if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') @@ -625,214 +639,19 @@ def _tokenize_normal_mode(readline, encoding, fstring_mode=False): yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') -def _tokenize_fstring_mode(line, tok_start, encoding): - line_number, start = tok_start - parts = fstring_re.match(line) - end_col = start + len(parts.group('start')) - yield TokenInfo( - type=FSTRING_START, - string=parts.group('start'), - start=(line_number, start), - end=(line_number, end_col), - line=line) - - middle = parts.group('middle') - - for token in _tokenize_fstring_middle(middle, end_col, line_number, line, encoding): - yield token - - last_line, last_col = token.end - yield TokenInfo( - type=FSTRING_END, - string=parts.group('quote'), - start=token.end, - end=(last_line, last_col + len(parts.group('quote'))), - line=line) - - - -def _tokenize_fstring_middle(middle, start, line_number, line, encoding): - mid_token, mid_expr = '', '' - curly_brackets = [] - end = (line_number, start) - escaping = False - for position, c in enumerate(middle): - match c: - case '{': - # check out next position, if it's another {, then it is - # escaping the { character - if len(middle) >= position + 1 and middle[position + 1] == '{' and not escaping: - escaping = True - elif mid_token and not escaping: - curly_brackets.append((line_number, start)) - mid_expr += c - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start), - line=line) - mid_token = '' - end = line_number, start - elif escaping: - escaping = False - mid_token += c - else: - curly_brackets.append((line_number, start)) - mid_expr += c - case '}': - # If two }} are seen, then the first one is skipped and the - # second is added as part of the fstring_middle token. - # This is only applied when parsing fstring_middle tokens, - # not when parsing an expression. - if not mid_expr: - if escaping: - escaping = False - mid_token += c - elif len(middle) > position + 1 and middle[position + 1] == '}': - escaping = True - else: - mid_token += c - else: - # parsing an expression - if curly_brackets: - curly_brackets.pop() - if not curly_brackets: - yield TokenInfo( - type=OP, - string='{', - start=end, - end=(line_number, end[1] + 1), - line=line) - end = line_number, end[1] + 1 - - mid_expr += c - - mid_expr = mid_expr[1:-1] - - # Find any first level : or ! 
- curly_level = 0 - break_char_index = -1 - for char_index, char in enumerate(mid_expr): - if char == '{': - curly_level += 1 - elif char == '}': - curly_level -= 1 - elif char in {':'} and not curly_level: - break_char_index = char_index - break - - expression_chunk = mid_expr - if break_char_index != -1: - expression_chunk = mid_expr[:break_char_index+1] - - if encoding is not None: - buffer = BytesIO(expression_chunk.encode()).readline - else: - buffer = StringIO(expression_chunk).readline - for t in _tokenize(buffer, encoding, fstring_mode=True): - yield TokenInfo( - type=t.type, - string=t.string, - start=(t.start[0] - 1 + end[0], t.start[1] + end[1]), - end=(t.end[0] - 1 + end[0], t.end[1] + end[1]), - line=line - ) - - end = t.end[0] - 1 + end[0], t.end[1] + end[1] - - if break_char_index != -1: - formatting_chunk = mid_expr[break_char_index+1:] - for t in _tokenize_fstring_middle( - middle=formatting_chunk, - start=end[1], - line_number=line_number, - line=line, - encoding=encoding): - - yield t - end = t.end - - yield TokenInfo( - type=OP, - string='}', - start=end, - end=(line_number, start + 1), - line=line) - - mid_expr = '' - end = line_number, start + 1 - else: - mid_expr += c - case '\n': - if mid_expr: - mid_expr += c - else: - mid_token += c - line_number += 1 - start = -1 - case _: - if mid_expr: - mid_expr += c - else: - mid_token += c - start += 1 - - # once the end of the expression is reached, release what's left of - # mid_token - if mid_token: - yield TokenInfo( - type=FSTRING_MIDDLE, - string=mid_token, - start=end, - end=(line_number, start), - line=line) - end = line_number, start - - if curly_brackets: - lnum, pos = curly_brackets.pop() - yield TokenInfo( - type=ERRORTOKEN, - string=line.split('\n')[lnum - 1][pos], - start=(lnum, pos), - end=(lnum, pos+1), - line=line) - - -def _is_fstring(tok): - """Checks whether a STRING token is a fstring or not. - - Args: - tok: TokenInfo object of type STRING. - - Returns: - bool - """ - return tok.string.lower().startswith(('f', 'rf', 'fr')) - - -def _tokenize(readline, encoding, fstring_mode=False): - """Tokenize Python code implementing the string mode and the normal mode. - - See PEP701 por more details. - """ - tokens = _tokenize_normal_mode(readline, encoding, fstring_mode) - - for tok in tokens: - if tok.type != STRING or not _is_fstring(tok): - yield tok - else: - for t in _tokenize_fstring_mode(tok.string, tok.start, encoding): - yield t - - def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. This has the same API as tokenize(), except that it expects the *readline* callable to return str objects instead of bytes. 
""" - return _tokenize(readline, None) + def _gen(): + while True: + line = readline() + if not line: + return + yield line.encode() + return _tokenize2(_gen(), 'utf-8') def main(): import argparse @@ -895,10 +714,10 @@ def error(message, filename=None, location=None): perror("unexpected error: %s" % err) raise -def _generate_tokens_from_c_tokenizer(source): +def _generate_tokens_from_c_tokenizer(source, extra_tokens=False): """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" import _tokenize as c_tokenizer - for info in c_tokenizer.TokenizerIter(source): + for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens): tok, type, lineno, end_lineno, col_off, end_col_off, line = info yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line) diff --git a/Lib/trace.py b/Lib/trace.py index fb9a423ea09fce..a3e4c30b6a5354 100755 --- a/Lib/trace.py +++ b/Lib/trace.py @@ -360,6 +360,7 @@ def _find_strings(filename, encoding=None): # Add this special case so that the test in the loop passes. prev_ttype = token.INDENT with open(filename, encoding=encoding) as f: + print(filename) tok = tokenize.generate_tokens(f.readline) for ttype, tstr, start, end, line in tok: if ttype == token.STRING: diff --git a/Parser/token.c b/Parser/token.c index 82267fbfcd0c54..2bc963a91c7701 100644 --- a/Parser/token.c +++ b/Parser/token.c @@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = { "FSTRING_START", "FSTRING_MIDDLE", "FSTRING_END", + "COMMENT", + "NL", "", - "", - "", "", "", }; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 91ffabac56c7b3..fbf44af3bbc60f 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -111,6 +111,8 @@ tok_new(void) tok->interactive_underflow = IUNDERFLOW_NORMAL; tok->str = NULL; tok->report_warnings = 1; + tok->tok_extra_tokens = 0; + tok->comment_newline = 0; tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0}; tok->tok_mode_stack_index = 0; tok->tok_report_warnings = 1; @@ -1649,6 +1651,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->starting_col_offset = -1; blankline = 0; + + const char* starting_indent = NULL; /* Get indentation level */ if (tok->atbol) { int col = 0; @@ -1745,11 +1749,14 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } + starting_indent = tok->start; tok->start = tok->cur; tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ - if (tok->pendin != 0) { + if (tok->pendin != 0) { + p_start = tok->buf; + p_end = tok->cur; if (tok->pendin < 0) { tok->pendin++; return MAKE_TOKEN(DEDENT); @@ -1806,10 +1813,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t const char *prefix, *p, *type_start; int current_starting_col_offset; + // if (tok->tok_extra_tokens) { + // p = tok->start; + // } + while (c != EOF && c != '\n') { c = tok_nextc(tok); } + if (tok->tok_extra_tokens) { + p = tok->start; + } + if (tok->type_comments) { p = tok->start; current_starting_col_offset = tok->starting_col_offset; @@ -1864,6 +1879,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } } + if (tok->tok_extra_tokens) { + tok_backup(tok, c); /* don't eat the newline or EOF */ + p_start = p; + p_end = tok->cur; + tok->comment_newline = 1; + return MAKE_TOKEN(COMMENT); + } } if (tok->done == E_INTERACT_STOP) { @@ -1976,8 +1998,19 @@ 
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c == '\n') { tok->atbol = 1; if (blankline || tok->level > 0) { + if (tok->tok_extra_tokens) { + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NL); + } goto nextline; } + if (tok->comment_newline && tok->tok_extra_tokens) { + tok->comment_newline = 0; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NL); + } p_start = tok->start; p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 5e2171885ac75b..444498458f2510 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -127,6 +127,8 @@ struct tok_state { tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL]; int tok_mode_stack_index; int tok_report_warnings; + int tok_extra_tokens; + int comment_newline; #ifdef Py_DEBUG int debug; #endif diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 3394a5108cb535..ce629749cb1f3a 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -1,5 +1,6 @@ #include "Python.h" #include "../Parser/tokenizer.h" +#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() static struct PyModuleDef _tokenizemodule; @@ -34,11 +35,14 @@ typedef struct _tokenizer.tokenizeriter.__new__ as tokenizeriter_new source: str + * + extra_tokens: bool [clinic start generated code]*/ static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source) -/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/ +tokenizeriter_new_impl(PyTypeObject *type, const char *source, + int extra_tokens) +/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/ { tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0); if (self == NULL) { @@ -54,6 +58,9 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source) return NULL; } self->tok->filename = filename; + if (extra_tokens) { + self->tok->tok_extra_tokens = 1; + } return (PyObject *)self; } @@ -92,10 +99,10 @@ tokenizeriter_next(tokenizeriterobject *it) int col_offset = -1; int end_col_offset = -1; if (token.start != NULL && token.start >= line_start) { - col_offset = (int)(token.start - line_start); + col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start); } if (token.end != NULL && token.end >= it->tok->line_start) { - end_col_offset = (int)(token.end - it->tok->line_start); + end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); } return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h index 6af93743f40dab..7e779388a92dbf 100644 --- a/Python/clinic/Python-tokenize.c.h +++ b/Python/clinic/Python-tokenize.c.h @@ -9,7 +9,8 @@ preserve static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source); +tokenizeriter_new_impl(PyTypeObject *type, const char *source, + int extra_tokens); static PyObject * tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) @@ -17,14 +18,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 1 + #define NUM_KEYWORDS 2 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD PyObject *ob_item[NUM_KEYWORDS]; } _kwtuple = { .ob_base = 
PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(source), }, + .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -33,19 +34,20 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"source", NULL}; + static const char * const _keywords[] = {"source", "extra_tokens", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "tokenizeriter", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[1]; + PyObject *argsbuf[2]; PyObject * const *fastargs; Py_ssize_t nargs = PyTuple_GET_SIZE(args); const char *source; + int extra_tokens; - fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf); + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf); if (!fastargs) { goto exit; } @@ -62,9 +64,13 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_ValueError, "embedded null character"); goto exit; } - return_value = tokenizeriter_new_impl(type, source); + extra_tokens = PyObject_IsTrue(fastargs[1]); + if (extra_tokens < 0) { + goto exit; + } + return_value = tokenizeriter_new_impl(type, source, extra_tokens); exit: return return_value; } -/*[clinic end generated code: output=8c2c09f651961986 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/ diff --git a/lel.py b/lel.py new file mode 100644 index 00000000000000..dadbd5ffd1a709 --- /dev/null +++ b/lel.py @@ -0,0 +1,16 @@ +import tokenize +import io +import pprint + +data = """\ +if False:\n # NL\n \n True = False # NEWLINE\n +""" +b = io.BytesIO(data.encode()) +pprint.pprint(list(tokenize.tokenize(b.readline))) +print() +print() +b = io.BytesIO(data.encode()) +pprint.pprint(list(tokenize.tokenize2(b.readline))) +print() +print() +pprint.pprint(list(tokenize._generate_tokens_from_c_tokenizer(data))) From 63ef1c16284f845d1e080d1d0e357d64693da0da Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Wed, 17 May 2023 17:46:57 +0100 Subject: [PATCH 09/20] Some progress --- Lib/inspect.py | 6 ++-- Lib/test/test_tokenize.py | 25 +++++++++------- Lib/tokenize.py | 11 +++++-- Parser/tokenizer.c | 15 ++++++---- Python/Python-tokenize.c | 60 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 95 insertions(+), 22 deletions(-) diff --git a/Lib/inspect.py b/Lib/inspect.py index e413274c98458a..03ce00fec5d574 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -2187,7 +2187,8 @@ def _signature_strip_non_python_syntax(signature): if string == ',': current_parameter += 1 - if (type == ERRORTOKEN) and (string == '$'): + # if (type == ERRORTOKEN) and (string == '$'): + if (type == OP) and (string == '$'): assert self_parameter is None self_parameter = current_parameter continue @@ -2195,7 +2196,7 @@ def _signature_strip_non_python_syntax(signature): add(string) if (string == ','): add(' ') - clean_signature = ''.join(text) + clean_signature = ''.join(text).strip() return clean_signature, self_parameter @@ -2215,7 +2216,6 @@ def _signature_fromstr(cls, obj, s, skip_bound_arg=True): module = None if not isinstance(module, ast.Module): - breakpoint() raise ValueError("{!r} builtin has invalid signature".format(obj)) f = module.body[0] diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 
5398139e155012..f0a6e1e9873eef 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -82,7 +82,7 @@ def test_basic(self): NAME 'False' (4, 11) (4, 16) COMMENT '# NEWLINE' (4, 17) (4, 26) NEWLINE '\\n' (4, 26) (4, 27) - DEDENT '' (5, 0) (5, 0) + DEDENT '' (4, 27) (4, 27) """) indent_error_file = b"""\ def k(x): @@ -230,6 +230,10 @@ def number_token(s): continue self.assertEqual(number_token(lit), lit) for lit in INVALID_UNDERSCORE_LITERALS: + try: + number_token(lit) + except SyntaxError: + continue self.assertNotEqual(number_token(lit), lit) def test_string(self): @@ -728,8 +732,8 @@ def test_tabs(self): NEWLINE '\\n' (2, 5) (2, 6) INDENT ' \\t' (3, 0) (3, 9) NAME 'pass' (3, 9) (3, 13) - DEDENT '' (4, 0) (4, 0) - DEDENT '' (4, 0) (4, 0) + DEDENT '' (3, 14) (3, 14) + DEDENT '' (3, 14) (3, 14) """) def test_non_ascii_identifiers(self): @@ -941,7 +945,7 @@ async def foo(): NUMBER '1' (2, 17) (2, 18) OP ':' (2, 18) (2, 19) NAME 'pass' (2, 20) (2, 24) - DEDENT '' (3, 0) (3, 0) + DEDENT '' (2, 25) (2, 25) """) self.check_tokenize('''async def foo(async): await''', """\ @@ -989,7 +993,7 @@ async def bar(): pass NAME 'await' (6, 2) (6, 7) OP '=' (6, 8) (6, 9) NUMBER '2' (6, 10) (6, 11) - DEDENT '' (7, 0) (7, 0) + DEDENT '' (6, 12) (6, 12) """) self.check_tokenize('''\ @@ -1027,7 +1031,7 @@ async def bar(): pass NAME 'await' (6, 2) (6, 7) OP '=' (6, 8) (6, 9) NUMBER '2' (6, 10) (6, 11) - DEDENT '' (7, 0) (7, 0) + DEDENT '' (6, 12) (6, 12) """) class GenerateTokensTest(TokenizeTest): @@ -1052,7 +1056,7 @@ def decistmt(s): ]) else: result.append((toknum, tokval)) - return untokenize(result).decode('utf-8') + return untokenize(result).decode('utf-8').strip() class TestMisc(TestCase): @@ -1408,9 +1412,9 @@ def test_open_error(self): class TestTokenize(TestCase): - def test_tokenize(self): + def test_tokenizee(self): import tokenize as tokenize_module - encoding = object() + encoding = "utf-8" encoding_used = None def mock_detect_encoding(readline): return encoding, [b'first', b'second'] @@ -2643,8 +2647,7 @@ def generate_source(indents): compile(valid, "", "exec") invalid = generate_source(MAXINDENT) - tokens = list(_generate_tokens_from_c_tokenizer(invalid)) - self.assertEqual(tokens[-1].type, NEWLINE) + self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid))) self.assertRaises( IndentationError, compile, invalid, "", "exec" ) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index b986076527aba8..c1ef71c2529a65 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -406,7 +406,6 @@ def open(filename): def tokenize2(readline): encoding, consumed = detect_encoding(readline) - rl_gen = _itertools.chain(consumed, iter(readline, b"")) if encoding is not None: if encoding == "utf-8-sig": @@ -417,6 +416,7 @@ def tokenize2(readline): def _tokenize2(rl_gen, encoding): source = b"".join(rl_gen) + token = None for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True): # TODO: Marta -> limpiar esto if 6 < token.type <= 54: @@ -429,6 +429,9 @@ def _tokenize2(rl_gen, encoding): token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) yield token + if token is not None: + last_line, _ = token.start + yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') def tokenize(readline): @@ -638,6 +641,7 @@ def _tokenize(readline, encoding): yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') +tokenize = tokenize2 def 
generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -647,7 +651,10 @@ def generate_tokens(readline): """ def _gen(): while True: - line = readline() + try: + line = readline() + except StopIteration: + return if not line: return yield line.encode() diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index fbf44af3bbc60f..92d617e4f63b0b 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1638,6 +1638,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st return type; } + static int tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) { @@ -1652,7 +1653,6 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t blankline = 0; - const char* starting_indent = NULL; /* Get indentation level */ if (tok->atbol) { int col = 0; @@ -1749,19 +1749,24 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } - starting_indent = tok->start; tok->start = tok->cur; tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ if (tok->pendin != 0) { - p_start = tok->buf; - p_end = tok->cur; if (tok->pendin < 0) { + if (tok->tok_extra_tokens) { + p_start = tok->cur; + p_end = tok->cur; + } tok->pendin++; return MAKE_TOKEN(DEDENT); } else { + if (tok->tok_extra_tokens) { + p_start = tok->buf; + p_end = tok->cur; + } tok->pendin--; return MAKE_TOKEN(INDENT); } @@ -1883,7 +1888,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok_backup(tok, c); /* don't eat the newline or EOF */ p_start = p; p_end = tok->cur; - tok->comment_newline = 1; + tok->comment_newline = blankline; return MAKE_TOKEN(COMMENT); } } diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index ce629749cb1f3a..5eafba56f7c7c4 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -1,6 +1,8 @@ #include "Python.h" +#include "errcode.h" #include "../Parser/tokenizer.h" #include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() +#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset() static struct PyModuleDef _tokenizemodule; @@ -64,12 +66,68 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source, return (PyObject *)self; } +static int +_tokenizer_error(struct tok_state *tok) +{ + if (PyErr_Occurred()) { + return -1; + } + + const char *msg = NULL; + PyObject* errtype = PyExc_SyntaxError; + switch (tok->done) { + case E_TOKEN: + msg = "invalid token"; + break; + case E_EOF: + if (tok->level) { + PyErr_Format(PyExc_SyntaxError, + "parenthesis '%c' was never closed", + tok->parenstack[tok->level-1]); + } else { + PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing"); + } + return -1; + case E_DEDENT: + PyErr_SetString(PyExc_IndentationError, + "unindent does not match any outer indentation level"); + return -1; + case E_INTR: + if (!PyErr_Occurred()) { + PyErr_SetNone(PyExc_KeyboardInterrupt); + } + return -1; + case E_NOMEM: + PyErr_NoMemory(); + return -1; + case E_TABSPACE: + errtype = PyExc_TabError; + msg = "inconsistent use of tabs and spaces in indentation"; + break; + case E_TOODEEP: + errtype = PyExc_IndentationError; + msg = "too many levels of indentation"; + break; + case E_LINECONT: { + msg = "unexpected character after line continuation character"; + break; + } + default: + msg = "unknown parsing error"; + } + PyErr_SetString(errtype, msg); + return -1; +} + static PyObject * tokenizeriter_next(tokenizeriterobject *it) 
{ struct token token; int type = _PyTokenizer_Get(it->tok, &token); - if (type == ERRORTOKEN && PyErr_Occurred()) { + if (type == ERRORTOKEN) { + if(!PyErr_Occurred()) { + _tokenizer_error(it->tok); + } return NULL; } if (type == ERRORTOKEN || type == ENDMARKER) { From 6833b1aea88632fb9d3c28bf9f617bb4d1faef96 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 14:24:18 +0100 Subject: [PATCH 10/20] Fix more bugs --- Lib/inspect.py | 1 - Lib/test/test_tokenize.py | 42 +++---- Lib/tokenize.py | 246 ++++++-------------------------------- Parser/tokenizer.c | 3 + Python/Python-tokenize.c | 3 +- 5 files changed, 53 insertions(+), 242 deletions(-) diff --git a/Lib/inspect.py b/Lib/inspect.py index 03ce00fec5d574..7709a95003efbd 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -2187,7 +2187,6 @@ def _signature_strip_non_python_syntax(signature): if string == ',': current_parameter += 1 - # if (type == ERRORTOKEN) and (string == '$'): if (type == OP) and (string == '$'): assert self_parameter is None self_parameter = current_parameter diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index f0a6e1e9873eef..7eb7e54726150a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,9 +1,9 @@ from test import support from test.support import os_helper -from tokenize import (tokenize, tokenize2, _tokenize, untokenize, NUMBER, NAME, OP, +from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, - NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT) + NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo) from io import BytesIO, StringIO import unittest from textwrap import dedent @@ -46,7 +46,7 @@ def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. 
f = BytesIO(s.encode('utf-8')) - result = stringify_tokens_from_source(tokenize2(f.readline), s) + result = stringify_tokens_from_source(tokenize(f.readline), s) self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) @@ -1128,33 +1128,16 @@ def readline(): nonlocal first if not first: first = True - return line + yield line else: - return b'' + yield b'' # skip the initial encoding token and the end tokens - tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2] + expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") - def test__tokenize_does_not_decode_with_encoding_none(self): - literal = '"ЉЊЈЁЂ"' - first = False - def readline(): - nonlocal first - if not first: - first = True - return literal - else: - return b'' - - # skip the end tokens - tokens = list(_tokenize(readline, encoding=None))[:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] - self.assertEqual(tokens, expected_tokens, - "string not tokenized when encoding is None") - class TestDetectEncoding(TestCase): @@ -1412,7 +1395,7 @@ def test_open_error(self): class TestTokenize(TestCase): - def test_tokenizee(self): + def test_tokenize(self): import tokenize as tokenize_module encoding = "utf-8" encoding_used = None @@ -1424,7 +1407,10 @@ def mock__tokenize(readline, encoding): encoding_used = encoding out = [] while True: - next_line = readline() + try: + next_line = next(readline) + except StopIteration: + return out if next_line: out.append(next_line) continue @@ -1444,7 +1430,7 @@ def mock_readline(): tokenize_module._tokenize = mock__tokenize try: results = tokenize(mock_readline) - self.assertEqual(list(results), + self.assertEqual(list(results)[1:], [b'first', b'second', b'1', b'2', b'3', b'4']) finally: tokenize_module.detect_encoding = orig_detect_encoding @@ -1740,7 +1726,7 @@ def test_random_files(self): if support.verbose >= 2: print('tokenize', testfile) with open(testfile, 'rb') as f: - with self.subTest(file=testfile): + # with self.subTest(file=testfile): self.check_roundtrip(f) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index c1ef71c2529a65..7df2f69ea251f6 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -213,6 +213,14 @@ def untokenize(self, iterable): self.tokens.append(indent) self.prev_col = len(indent) startline = False + elif tok_type == FSTRING_MIDDLE: + if '{' in token or '}' in token: + end_line, end_col = end + end = (end_line, end_col + token.count('{') + token.count('}')) + token = re.sub('{', '{{', token) + token = re.sub('}', '}}', token) + + self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end @@ -255,6 +263,11 @@ def compat(self, token, iterable): elif startline and indents: toks_append(indents[-1]) startline = False + elif toknum == FSTRING_MIDDLE: + if '{' in tokval or '}' in tokval: + tokval = re.sub('{', '{{', tokval) + tokval = re.sub('}', '}}', tokval) + toks_append(tokval) @@ -404,36 +417,6 @@ def open(filename): buffer.close() raise -def tokenize2(readline): - encoding, consumed = detect_encoding(readline) - rl_gen = _itertools.chain(consumed, iter(readline, b"")) - if encoding is not None: - if encoding == "utf-8-sig": - # BOM will already have been stripped. 
- encoding = "utf-8" - yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - yield from _tokenize2(rl_gen, encoding) - -def _tokenize2(rl_gen, encoding): - source = b"".join(rl_gen) - token = None - for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True): - # TODO: Marta -> limpiar esto - if 6 < token.type <= 54: - token = token._replace(type=OP) - if token.type in {ASYNC, AWAIT}: - token = token._replace(type=NAME) - if token.type == NEWLINE: - l_start, c_start = token.start - l_end, c_end = token.end - token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) - - yield token - if token is not None: - last_line, _ = token.start - yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') - - def tokenize(readline): """ The tokenize() generator requires one argument, readline, which @@ -454,194 +437,33 @@ def tokenize(readline): which tells you which encoding was used to decode the bytes stream. """ encoding, consumed = detect_encoding(readline) - empty = _itertools.repeat(b"") - rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) - return _tokenize(rl_gen.__next__, encoding) - - -def _tokenize(readline, encoding): - lnum = parenlev = continued = 0 - numchars = '0123456789' - contstr, needcont = '', 0 - contline = None - indents = [0] - + rl_gen = _itertools.chain(consumed, iter(readline, b"")) if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - last_line = b'' - line = b'' - while True: # loop over lines in stream - try: - # We capture the value of the line variable here because - # readline uses the empty string '' to signal end of input, - # hence `line` itself will always be overwritten at the end - # of this loop. 
- last_line = line - line = readline() - except StopIteration: - line = b'' - - if encoding is not None: - line = line.decode(encoding) - lnum += 1 - pos, max = 0, len(line) - - if contstr: # continued string - if not line: - raise TokenError("EOF in multi-line string", strstart) - endmatch = endprog.match(line) - if endmatch: - pos = end = endmatch.end(0) - yield TokenInfo(STRING, contstr + line[:end], - strstart, (lnum, end), contline + line) - contstr, needcont = '', 0 - contline = None - elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': - yield TokenInfo(ERRORTOKEN, contstr + line, - strstart, (lnum, len(line)), contline) - contstr = '' - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - elif parenlev == 0 and not continued: # new statement - if not line: break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == ' ': - column += 1 - elif line[pos] == '\t': - column = (column//tabsize + 1)*tabsize - elif line[pos] == '\f': - column = 0 - else: - break - pos += 1 - if pos == max: - break - - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - yield TokenInfo(COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - pos += len(comment_token) - - yield TokenInfo(NL, line[pos:], - (lnum, pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents or dedents - indents.append(column) - yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line)) - indents = indents[:-1] - - yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - pseudomatch = _compile(PseudoToken).match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - if start == end: - continue - token, initial = line[start:end], line[start] - - if (initial in numchars or # ordinary number - (initial == '.' and token != '.' and token != '...')): - yield TokenInfo(NUMBER, token, spos, epos, line) - elif initial in '\r\n': - if parenlev > 0: - yield TokenInfo(NL, token, spos, epos, line) - else: - yield TokenInfo(NEWLINE, token, spos, epos, line) - - elif initial == '#': - assert not token.endswith("\n") - yield TokenInfo(COMMENT, token, spos, epos, line) - - elif token in triple_quoted: - endprog = _compile(endpats[token]) - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield TokenInfo(STRING, token, spos, (lnum, pos), line) - else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - - # Check up to the first 3 chars of the token to see if - # they're in the single_quoted set. If so, they start - # a string. - # We're using the first 3, because we're looking for - # "rb'" (for example) at the start of the token. If - # we switch to longer prefixes, this needs to be - # adjusted. - # Note that initial == token[:1]. - # Also note that single quote checking must come after - # triple quote checking (above). 
- elif (initial in single_quoted or - token[:2] in single_quoted or - token[:3] in single_quoted): - if token[-1] == '\n': # continued string - strstart = (lnum, start) - # Again, using the first 3 chars of the - # token. This is looking for the matching end - # regex for the correct type of quote - # character. So it's really looking for - # endpats["'"] or endpats['"'], by trying to - # skip string prefix characters, if any. - endprog = _compile(endpats.get(initial) or - endpats.get(token[1]) or - endpats.get(token[2])) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - yield TokenInfo(STRING, token, spos, epos, line) - - elif initial.isidentifier(): # ordinary name - yield TokenInfo(NAME, token, spos, epos, line) - elif initial == '\\': # continued stmt - continued = 1 - else: - if initial in '([{': - parenlev += 1 - elif initial in ')]}': - parenlev -= 1 - yield TokenInfo(OP, token, spos, epos, line) - else: - yield TokenInfo(ERRORTOKEN, line[pos], - (lnum, pos), (lnum, pos+1), line) - pos += 1 + yield from _tokenize(rl_gen, encoding) + +def _tokenize(rl_gen, encoding): + source = b"".join(rl_gen).decode(encoding) + token = None + for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True): + # TODO: Marta -> limpiar esto + if 6 < token.type <= 54: + token = token._replace(type=OP) + if token.type in {ASYNC, AWAIT}: + token = token._replace(type=NAME) + if token.type == NEWLINE: + l_start, c_start = token.start + l_end, c_end = token.end + token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1)) - # Add an implicit NEWLINE if the input doesn't end in one - if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): - yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') - for indent in indents[1:]: # pop remaining indent levels - yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') - yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') + yield token + if token is not None: + last_line, _ = token.start + yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '') -tokenize = tokenize2 def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -658,7 +480,7 @@ def _gen(): if not line: return yield line.encode() - return _tokenize2(_gen(), 'utf-8') + return _tokenize(_gen(), 'utf-8') def main(): import argparse diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 92d617e4f63b0b..d48e9af8df1410 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2600,6 +2600,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct f_string_middle: + // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle + // this. 
+ tok->multi_line_start = tok->line_start; while (end_quote_size != current_tok->f_string_quote_size) { int c = tok_nextc(tok); if (tok->done == E_ERROR) { diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 5eafba56f7c7c4..a45bd0553994aa 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -113,7 +113,7 @@ _tokenizer_error(struct tok_state *tok) break; } default: - msg = "unknown parsing error"; + msg = "unknown tokenization error"; } PyErr_SetString(errtype, msg); return -1; @@ -127,6 +127,7 @@ tokenizeriter_next(tokenizeriterobject *it) if (type == ERRORTOKEN) { if(!PyErr_Occurred()) { _tokenizer_error(it->tok); + assert(PyErr_Occurred()); } return NULL; } From 90da796a9dd191845ad10b5a8570591a8ef37e0c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:05:03 +0100 Subject: [PATCH 11/20] Fix more problems --- Lib/tabnanny.py | 10 ++++++++++ Lib/test/test_tabnanny.py | 4 ++-- Lib/tokenize.py | 5 ++++- Lib/trace.py | 1 - Python/Python-tokenize.c | 39 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 52 insertions(+), 7 deletions(-) diff --git a/Lib/tabnanny.py b/Lib/tabnanny.py index 9d2df59d36ff47..e2ac6837f157d5 100755 --- a/Lib/tabnanny.py +++ b/Lib/tabnanny.py @@ -107,6 +107,10 @@ def check(file): errprint("%r: Token Error: %s" % (file, msg)) return + except SyntaxError as msg: + errprint("%r: Token Error: %s" % (file, msg)) + return + except IndentationError as msg: errprint("%r: Indentation Error: %s" % (file, msg)) return @@ -272,6 +276,12 @@ def format_witnesses(w): return prefix + " " + ', '.join(firsts) def process_tokens(tokens): + try: + _process_tokens(tokens) + except TabError as e: + raise NannyNag(e.lineno, e.msg, e.text) + +def _process_tokens(tokens): INDENT = tokenize.INDENT DEDENT = tokenize.DEDENT NEWLINE = tokenize.NEWLINE diff --git a/Lib/test/test_tabnanny.py b/Lib/test/test_tabnanny.py index afb8da719b0eed..dac47318011d9d 100644 --- a/Lib/test/test_tabnanny.py +++ b/Lib/test/test_tabnanny.py @@ -223,7 +223,7 @@ def test_when_nannynag_error_verbose(self): with TemporaryPyFile(SOURCE_CODES["nannynag_errored"]) as file_path: out = f"{file_path!r}: *** Line 3: trouble in tab city! ***\n" out += "offending line: '\\tprint(\"world\")\\n'\n" - out += "indent not equal e.g. 
at tab size 1\n" + out += "inconsistent use of tabs and spaces in indentation\n" tabnanny.verbose = 1 self.verify_tabnanny_check(file_path, out=out) @@ -315,7 +315,7 @@ def validate_cmd(self, *args, stdout="", stderr="", partial=False, expect_failur def test_with_errored_file(self): """Should displays error when errored python file is given.""" with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path: - stderr = f"{file_path!r}: Indentation Error: " + stderr = f"{file_path!r}: Token Error: " stderr += ('unindent does not match any outer indentation level' ' (, line 3)') self.validate_cmd(file_path, stderr=stderr, expect_failure=True) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 7df2f69ea251f6..a41f61641de522 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -517,7 +517,10 @@ def error(message, filename=None, location=None): tokens = list(tokenize(f.readline)) else: filename = "" - tokens = _tokenize(sys.stdin.readline, None) + tokens = _tokenize( + (x.encode('utf-8') for x in iter(sys.stdin.readline, "") + ), "utf-8") + # Output the tokenization for token in tokens: diff --git a/Lib/trace.py b/Lib/trace.py index a3e4c30b6a5354..fb9a423ea09fce 100755 --- a/Lib/trace.py +++ b/Lib/trace.py @@ -360,7 +360,6 @@ def _find_strings(filename, encoding=None): # Add this special case so that the test in the loop passes. prev_ttype = token.INDENT with open(filename, encoding=encoding) as f: - print(filename) tok = tokenize.generate_tokens(f.readline) for ttype, tstr, start, end, line in tok: if ttype == token.STRING: diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index a45bd0553994aa..0c5cff21450b3f 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -89,8 +89,10 @@ _tokenizer_error(struct tok_state *tok) } return -1; case E_DEDENT: - PyErr_SetString(PyExc_IndentationError, - "unindent does not match any outer indentation level"); + PyErr_Format(PyExc_IndentationError, + "unindent does not match any outer indentation level " + "(, line %d)", + tok->lineno); return -1; case E_INTR: if (!PyErr_Occurred()) { @@ -115,7 +117,38 @@ _tokenizer_error(struct tok_state *tok) default: msg = "unknown tokenization error"; } - PyErr_SetString(errtype, msg); + + // TODO: Clean up this code and factor out common error paths + + PyObject* errstr = NULL; + PyObject* error_line = NULL; + + Py_ssize_t size = tok->inp - tok->buf; + error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); + if (!error_line) { + goto error; + } + PyObject *tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0); + if (!tmp) { + goto error; + } + Py_CLEAR(error_line); + errstr = PyUnicode_FromString(msg); + if (!errstr) { + goto error; + } + PyObject* value = PyTuple_Pack(2, errstr, tmp); + Py_DECREF(errstr); + Py_DECREF(tmp); + if (!value) { + goto error; + } + PyErr_SetObject(errtype, value); + Py_DECREF(value); + return 0; +error: + Py_XDECREF(errstr); + Py_XDECREF(error_line); return -1; } From b5ccd94e10a2d680b058cbbeb37128a12b42d356 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:11:53 +0100 Subject: [PATCH 12/20] Use IA to clean code --- Python/Python-tokenize.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 0c5cff21450b3f..1ced485a1e9c04 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -118,38 +118,45 @@ _tokenizer_error(struct tok_state *tok) msg = "unknown tokenization error"; } - // 
TODO: Clean up this code and factor out common error paths - PyObject* errstr = NULL; PyObject* error_line = NULL; + PyObject* tmp = NULL; + PyObject* value = NULL; + int result = 0; Py_ssize_t size = tok->inp - tok->buf; error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace"); if (!error_line) { - goto error; + result = -1; + goto exit; } - PyObject *tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0); + + tmp = Py_BuildValue("(OnnOii)", tok->filename, tok->lineno, 0, error_line, 0, 0); if (!tmp) { - goto error; + result = -1; + goto exit; } - Py_CLEAR(error_line); + errstr = PyUnicode_FromString(msg); if (!errstr) { - goto error; + result = -1; + goto exit; } - PyObject* value = PyTuple_Pack(2, errstr, tmp); - Py_DECREF(errstr); - Py_DECREF(tmp); + + value = PyTuple_Pack(2, errstr, tmp); if (!value) { - goto error; + result = -1; + goto exit; } + PyErr_SetObject(errtype, value); - Py_DECREF(value); - return 0; -error: + +exit: Py_XDECREF(errstr); Py_XDECREF(error_line); - return -1; + Py_XDECREF(tmp); + Py_XDECREF(value); + return result; } static PyObject * From b1c3b2ae56f7d6df7fa43cddef9a9c15b69ad7eb Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:19:14 +0100 Subject: [PATCH 13/20] Remove lel --- lel.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 lel.py diff --git a/lel.py b/lel.py deleted file mode 100644 index dadbd5ffd1a709..00000000000000 --- a/lel.py +++ /dev/null @@ -1,16 +0,0 @@ -import tokenize -import io -import pprint - -data = """\ -if False:\n # NL\n \n True = False # NEWLINE\n -""" -b = io.BytesIO(data.encode()) -pprint.pprint(list(tokenize.tokenize(b.readline))) -print() -print() -b = io.BytesIO(data.encode()) -pprint.pprint(list(tokenize.tokenize2(b.readline))) -print() -print() -pprint.pprint(list(tokenize._generate_tokens_from_c_tokenizer(data))) From e941f12ec8c83d3d3a1fc2fdf5e472653be5354a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 18 May 2023 17:20:46 +0100 Subject: [PATCH 14/20] Remove whitespace --- Lib/test/test_tokenize.py | 2 +- Lib/tokenize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 7eb7e54726150a..efcbce88194312 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1727,7 +1727,7 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: # with self.subTest(file=testfile): - self.check_roundtrip(f) + self.check_roundtrip(f) def roundtrip(self, code): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index a41f61641de522..06b14baf291ebb 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -444,7 +444,7 @@ def tokenize(readline): encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') yield from _tokenize(rl_gen, encoding) - + def _tokenize(rl_gen, encoding): source = b"".join(rl_gen).decode(encoding) token = None From 67a0239232c3d1458deb9b94132c81646c51738b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Thu, 18 May 2023 21:05:23 +0200 Subject: [PATCH 15/20] Fix docs --- Doc/library/token.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Doc/library/token.rst b/Doc/library/token.rst index a1aceba96ce030..903847bb206d62 100644 --- a/Doc/library/token.rst +++ b/Doc/library/token.rst @@ -50,11 +50,13 @@ The following token type values aren't used by the C tokenizer but are needed fo the :mod:`tokenize` module. .. 
data:: COMMENT + :noindex: Token value used to indicate a comment. .. data:: NL + :noindex: Token value used to indicate a non-terminating newline. The :data:`NEWLINE` token indicates the end of a logical line of Python code; From dcd221f566a38ed622149a2d0835a743a987ef1f Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 19 May 2023 15:33:09 +0100 Subject: [PATCH 16/20] Moar tests and fix location error --- Lib/test/test_tokenize.py | 26 ++++++++++++++++++++------ Parser/tokenizer.c | 2 +- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index efcbce88194312..dda7243bfa19fe 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -421,12 +421,6 @@ def test_string(self): FSTRING_MIDDLE '}' (1, 9) (1, 10) FSTRING_END '"' (1, 11) (1, 12) """) - # TODO: I don't think is is correct now (ERRORTOKEN) - # self.check_tokenize('f"{1+1"', """\ - # FSTRING_START 'f"' (1, 0) (1, 2) - # ERRORTOKEN '{' (1, 2) (1, 3) - # FSTRING_END '"' (1, 3) (1, 4) - # """) self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) OP '{' (1, 4) (1, 5) @@ -447,6 +441,26 @@ def test_string(self): FSTRING_END "'''" (1, 24) (1, 27) OP '}' (1, 27) (1, 28) FSTRING_END '\"""' (1, 28) (1, 31) + """) + self.check_tokenize('f""" x\nstr(data, encoding={invalid!r})\n"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE ' x\\nstr(data, encoding=' (1, 4) (2, 19) + OP '{' (2, 19) (2, 20) + NAME 'invalid' (2, 20) (2, 27) + OP '!' (2, 27) (2, 28) + NAME 'r' (2, 28) (2, 29) + OP '}' (2, 29) (2, 30) + FSTRING_MIDDLE ')\\n' (2, 30) (3, 0) + FSTRING_END '\"""' (3, 0) (3, 3) + """) + self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9) + OP '{' (2, 9) (2, 10) + NAME 'None' (2, 10) (2, 14) + OP '}' (2, 14) (2, 15) + FSTRING_MIDDLE 'bad' (2, 15) (2, 18) + FSTRING_END '\"""' (2, 18) (2, 21) """) self.check_tokenize('f"""abc"""', """\ FSTRING_START 'f\"""' (1, 0) (1, 4) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index d48e9af8df1410..a531ac7505a83d 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1126,7 +1126,7 @@ tok_underflow_interactive(struct tok_state *tok) { static int tok_underflow_file(struct tok_state *tok) { - if (tok->start == NULL) { + if (tok->start == NULL && !INSIDE_FSTRING(tok)) { tok->cur = tok->inp = tok->buf; } if (tok->decoding_state == STATE_INIT) { From fd8b60aeb2a04bfbdc8a9e1bc63dd4563704b748 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 19 May 2023 17:04:22 +0100 Subject: [PATCH 17/20] Some cleanups --- Parser/tokenizer.c | 7 ++----- Python/Python-tokenize.c | 10 +++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index a531ac7505a83d..887ec9483df7b4 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1815,13 +1815,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'")); } - const char *prefix, *p, *type_start; + const char* p = NULL; + const char *prefix, *type_start; int current_starting_col_offset; - // if (tok->tok_extra_tokens) { - // p = tok->start; - // } - while (c != EOF && c != '\n') { c = tok_nextc(tok); } diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 1ced485a1e9c04..0b9eeae2af816b 100644 --- a/Python/Python-tokenize.c +++ 
b/Python/Python-tokenize.c @@ -193,10 +193,10 @@ tokenizeriter_next(tokenizeriterobject *it) return NULL; } const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; - int lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; - int end_lineno = it->tok->lineno; - int col_offset = -1; - int end_col_offset = -1; + Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; + Py_ssize_t end_lineno = it->tok->lineno; + Py_ssize_t col_offset = -1; + Py_ssize_t end_col_offset = -1; if (token.start != NULL && token.start >= line_start) { col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start); } @@ -204,7 +204,7 @@ tokenizeriter_next(tokenizeriterobject *it) end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); } - return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); + return Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); } static void From f1a5090172fd2a9de541854d4a4c1de87703ddd5 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 19 May 2023 17:12:21 +0100 Subject: [PATCH 18/20] pass the vacuum cleaner --- Lib/tokenize.py | 103 +----------------------------------------------- 1 file changed, 1 insertion(+), 102 deletions(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 06b14baf291ebb..bfe40c627fde57 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -56,112 +56,11 @@ def exact_type(self): else: return self.type -def group(*choices): return '(' + '|'.join(choices) + ')' -def any(*choices): return group(*choices) + '*' -def maybe(*choices): return group(*choices) + '?' - -# Note: we use unicode matching for names ("\w") but ascii matching for -# number literals. -Whitespace = r'[ \f\t]*' -Comment = r'#[^\r\n]*' -Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'\w+' - -Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' -Binnumber = r'0[bB](?:_?[01])+' -Octnumber = r'0[oO](?:_?[0-7])+' -Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' -Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) -Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' -Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', - r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) -Expfloat = r'[0-9](?:_?[0-9])*' + Exponent -Floatnumber = group(Pointfloat, Expfloat) -Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') -Number = group(Imagnumber, Floatnumber, Intnumber) - -# Return the empty string, plus all of the valid string prefixes. -def _all_string_prefixes(): - # The valid string prefixes. Only contain the lower case versions, - # and don't contain any permutations (include 'fr', but not - # 'rf'). The various permutations will be generated. - _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr'] - # if we add binary f-strings, add: ['fb', 'fbr'] - result = {''} - for prefix in _valid_string_prefixes: - for t in _itertools.permutations(prefix): - # create a list with upper and lower versions of each - # character - for u in _itertools.product(*[(c, c.upper()) for c in t]): - result.add(''.join(u)) - return result - -@functools.lru_cache -def _compile(expr): - return re.compile(expr, re.UNICODE) - -# Note that since _all_string_prefixes includes the empty string, -# StringPrefix can be the empty string (making it optional). -StringPrefix = group(*_all_string_prefixes()) - -# Tail end of ' string. 
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'" -# Tail end of " string. -Double = r'[^"\\]*(?:\\.[^"\\]*)*"' -# Tail end of ''' string. -Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" -# Tail end of """ string. -Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -Triple = group(StringPrefix + "'''", StringPrefix + '"""') -# Single-line ' or " string. -String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", - StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') - -# Sorting in reverse order puts the long operators before their prefixes. -# Otherwise if = came before ==, == would get recognized as two instances -# of =. -Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True))) -Funny = group(r'\r?\n', Special) - -PlainToken = group(Number, Funny, String, Name) -Token = Ignore + PlainToken - -# First (or only) line of ' or " string. -ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + - group("'", r'\\\r?\n'), - StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + - group('"', r'\\\r?\n')) -PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) -PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) - -# For a given string prefix plus quotes, endpats maps it to a regex -# to match the remainder of that string. _prefix can be empty, for -# a normal single or triple quoted string (with no prefix). -endpats = {} -for _prefix in _all_string_prefixes(): - endpats[_prefix + "'"] = Single - endpats[_prefix + '"'] = Double - endpats[_prefix + "'''"] = Single3 - endpats[_prefix + '"""'] = Double3 -del _prefix - -# A set of all of the single and triple quoted string prefixes, -# including the opening quotes. -single_quoted = set() -triple_quoted = set() -for t in _all_string_prefixes(): - for u in (t + '"', t + "'"): - single_quoted.add(u) - for u in (t + '"""', t + "'''"): - triple_quoted.add(u) -del t, u - -tabsize = 8 class TokenError(Exception): pass -class StopTokenizing(Exception): pass +class StopTokenizing(Exception): pass class Untokenizer: From 7fb58b0db4dc2514d95131f35c14e28f6022d28a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marta=20G=C3=B3mez=20Mac=C3=ADas?= Date: Sat, 20 May 2023 20:03:04 +0200 Subject: [PATCH 19/20] Fix refleaks --- Parser/pegen.c | 4 ++-- Parser/pegen_errors.c | 4 ++-- Parser/tokenizer.c | 15 +++++++++++++++ Parser/tokenizer.h | 2 ++ Python/Python-tokenize.c | 16 +++++++++++----- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index da410ea84ecb8e..b031a6f5d440e8 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -208,7 +208,7 @@ int _PyPegen_fill_token(Parser *p) { struct token new_token; - new_token.metadata = NULL; + _PyToken_Init(&new_token); int type = _PyTokenizer_Get(p->tok, &new_token); // Record and skip '# type: ignore' comments @@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p) Token *t = p->tokens[p->fill]; return initialize_token(p, t, &new_token, type); error: - Py_XDECREF(new_token.metadata); + _PyToken_Free(&new_token); return -1; } diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 1f227da0194e3c..af529057f50e70 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { int ret = 0; struct token new_token; - new_token.metadata = NULL; + _PyToken_Init(&new_token); for (;;) { switch (_PyTokenizer_Get(p->tok, &new_token)) { @@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { exit: - Py_XDECREF(new_token.metadata); + _PyToken_Free(&new_token); 
// If we're in an f-string, we want the syntax error in the expression part // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards // do not swallow it. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 887ec9483df7b4..090814ab9cd9f5 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -982,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok) PyMem_Free(tok); } +void +_PyToken_Free(struct token *token) { + Py_XDECREF(token->metadata); +} + +void +_PyToken_Init(struct token *token) { + token->metadata = NULL; +} + static int tok_readline_raw(struct tok_state *tok) { @@ -1973,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t struct tok_state ahead_tok; struct token ahead_token; + _PyToken_Init(&ahead_token); int ahead_tok_kind; memcpy(&ahead_tok, tok, sizeof(ahead_tok)); @@ -1988,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t returning a plain NAME token, return ASYNC. */ tok->async_def_indent = tok->indent; tok->async_def = 1; + _PyToken_Free(&ahead_token); return MAKE_TOKEN(ASYNC); } + _PyToken_Free(&ahead_token); } } @@ -2823,7 +2836,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) // if fetching the encoding shows a warning. tok->report_warnings = 0; while (tok->lineno < 2 && tok->done == E_OK) { + _PyToken_Init(&token); _PyTokenizer_Get(tok, &token); + _PyToken_Free(&token); } fclose(fp); if (tok->encoding) { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 444498458f2510..b96cb0d9754fae 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -139,6 +139,8 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int); extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void _PyTokenizer_Free(struct tok_state *); +extern void _PyToken_Free(struct token *); +extern void _PyToken_Init(struct token *); extern int _PyTokenizer_Get(struct tok_state *, struct token *); #define tok_dump _Py_tok_dump diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 0b9eeae2af816b..ece238672e34fd 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -162,18 +162,21 @@ _tokenizer_error(struct tok_state *tok) static PyObject * tokenizeriter_next(tokenizeriterobject *it) { + PyObject* result = NULL; struct token token; + _PyToken_Init(&token); + int type = _PyTokenizer_Get(it->tok, &token); if (type == ERRORTOKEN) { if(!PyErr_Occurred()) { _tokenizer_error(it->tok); assert(PyErr_Occurred()); } - return NULL; + goto exit; } if (type == ERRORTOKEN || type == ENDMARKER) { PyErr_SetString(PyExc_StopIteration, "EOF"); - return NULL; + goto exit; } PyObject *str = NULL; if (token.start == NULL || token.end == NULL) { @@ -183,14 +186,14 @@ tokenizeriter_next(tokenizeriterobject *it) str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); } if (str == NULL) { - return NULL; + goto exit; } Py_ssize_t size = it->tok->inp - it->tok->buf; PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace"); if (line == NULL) { Py_DECREF(str); - return NULL; + goto exit; } const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; Py_ssize_t lineno = ISSTRINGLIT(type) ? 
it->tok->first_lineno : it->tok->lineno; @@ -204,7 +207,10 @@ tokenizeriter_next(tokenizeriterobject *it) end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start); } - return Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); + result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); +exit: + _PyToken_Free(&token); + return result; } static void From e1b5d352c07994da3e3d85ab94086c1f1c068409 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 20 May 2023 23:08:49 +0000 Subject: [PATCH 20/20] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst new file mode 100644 index 00000000000000..ff831c9f935db3 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst @@ -0,0 +1 @@ +Implement PEP 701 changes in the :mod:`tokenize` module. Patch by Marta Gómez Macías and Pablo Galindo Salgado
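
As a rough usage sketch of the behaviour this series aims for (based on the expectations in the updated Lib/test/test_tokenize.py hunks above, and assuming an interpreter built with these patches applied), tokenizing a small f-string through the rewritten tokenize.tokenize() should now go through the C tokenizer and yield FSTRING_START/FSTRING_MIDDLE/FSTRING_END tokens, with the replacement-field contents emitted as ordinary OP/NAME tokens; the exact positions printed below are illustrative, not verbatim interpreter output:

import io
import tokenize

# Feed bytes through readline, as tokenize.tokenize() expects.
source = 'f"abc{x}def"\n'
readline = io.BytesIO(source.encode("utf-8")).readline

for tok in tokenize.tokenize(readline):
    print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

# Expected token kinds for the f-string, per the updated tests:
#   ENCODING, FSTRING_START 'f"', FSTRING_MIDDLE 'abc',
#   OP '{', NAME 'x', OP '}', FSTRING_MIDDLE 'def',
#   FSTRING_END '"', NEWLINE, ENDMARKER
#
# Malformed input no longer surfaces as ERRORTOKEN: per the
# _tokenizer_error() changes above, it should raise SyntaxError,
# IndentationError, or TabError as appropriate.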