gh-121130: Fix f-string format specifiers with debug expressions (#121150)

pablogsal · web-flow · commit c46d64e0ef8e · 2024-07-16T19:57:22.000+01:00
diff --git a/Doc/library/ast.rst b/Doc/library/ast.rst
@@ -316,9 +316,7 @@ Literals
                             args=[
                                 Name(id='a', ctx=Load())]),
                         conversion=-1,
-                        format_spec=JoinedStr(
-                            values=[
-                                Constant(value='.3')]))]))
+                        format_spec=Constant(value='.3'))]))
 
 
 .. class:: List(elts, ctx)
diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py
@@ -3638,7 +3638,7 @@ def main():
 ('Expression', ('Subscript', (1, 0, 1, 10), ('List', (1, 0, 1, 3), [('Constant', (1, 1, 1, 2), 5, None)], ('Load',)), ('Slice', (1, 4, 1, 9), ('Constant', (1, 4, 1, 5), 1, None), ('Constant', (1, 6, 1, 7), 1, None), ('Constant', (1, 8, 1, 9), 1, None)), ('Load',))),
 ('Expression', ('IfExp', (1, 0, 1, 21), ('Name', (1, 9, 1, 10), 'x', ('Load',)), ('Call', (1, 0, 1, 5), ('Name', (1, 0, 1, 3), 'foo', ('Load',)), [], []), ('Call', (1, 16, 1, 21), ('Name', (1, 16, 1, 19), 'bar', ('Load',)), [], []))),
 ('Expression', ('JoinedStr', (1, 0, 1, 6), [('FormattedValue', (1, 2, 1, 5), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, None)])),
-('Expression', ('JoinedStr', (1, 0, 1, 10), [('FormattedValue', (1, 2, 1, 9), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, ('JoinedStr', (1, 4, 1, 8), [('Constant', (1, 5, 1, 8), '.2f', None)]))])),
+('Expression', ('JoinedStr', (1, 0, 1, 10), [('FormattedValue', (1, 2, 1, 9), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, ('Constant', (1, 5, 1, 8), '.2f', None))])),
 ('Expression', ('JoinedStr', (1, 0, 1, 8), [('FormattedValue', (1, 2, 1, 7), ('Name', (1, 3, 1, 4), 'a', ('Load',)), 114, None)])),
 ('Expression', ('JoinedStr', (1, 0, 1, 11), [('Constant', (1, 2, 1, 6), 'foo(', None), ('FormattedValue', (1, 6, 1, 9), ('Name', (1, 7, 1, 8), 'a', ('Load',)), -1, None), ('Constant', (1, 9, 1, 10), ')', None)])),
 ]
diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py
@@ -8,6 +8,7 @@
 # Unicode identifiers in tests is allowed by PEP 3131.
 
 import ast
+import datetime
 import dis
 import os
 import re
@@ -1601,6 +1602,12 @@ def f(a):
         self.assertEqual(f'{f(a=4)}', '3=')
         self.assertEqual(x, 4)
 
+        # Check debug expressions in format spec
+        y = 20
+        self.assertEqual(f"{2:{y=}}", "yyyyyyyyyyyyyyyyyyy2")
+        self.assertEqual(f"{datetime.datetime.now():h1{y=}h2{y=}h3{y=}}",
+                         'h1y=20h2y=20h3y=20')
+
         # Make sure __format__ is being called.
         class C:
             def __format__(self, s):
@@ -1614,9 +1621,11 @@ def __repr__(self):
         self.assertEqual(f'{C()=: }', 'C()=FORMAT- ')
         self.assertEqual(f'{C()=:x}', 'C()=FORMAT-x')
         self.assertEqual(f'{C()=!r:*^20}', 'C()=********REPR********')
+        self.assertEqual(f"{C():{20=}}", 'FORMAT-20=20')
 
         self.assertRaises(SyntaxError, eval, "f'{C=]'")
 
+
         # Make sure leading and following text works.
         x = 'foo'
         self.assertEqual(f'X{x=}Y', 'Xx='+repr(x)+'Y')
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-29-10-46-14.gh-issue-121130.Rj66Xs.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-29-10-46-14.gh-issue-121130.Rj66Xs.rst
@@ -0,0 +1,2 @@
+Fix f-strings with debug expressions in format specifiers. Patch by Pablo
+Galindo.
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
@@ -969,6 +969,8 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
     return result_token_with_metadata(p, conv, conv_token->metadata);
 }
 
+static asdl_expr_seq *
+unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions);
 ResultTokenWithMetadata *
 _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset,
                                 int end_lineno, int end_col_offset, PyArena *arena)
@@ -1007,8 +1009,15 @@ _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, in
         assert(j == non_empty_count);
         spec = resized_spec;
     }
-    expr_ty res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
-                                   end_col_offset, p->arena);
+    expr_ty res;
+    if (asdl_seq_LEN(spec) == 0) {
+        res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
+                                    end_col_offset, p->arena);
+    } else {
+        res = _PyPegen_concatenate_strings(p, spec,
+                             lineno, col_offset, end_lineno,
+                             end_col_offset, arena);
+    }
     if (!res) {
         return NULL;
     }
@@ -1308,6 +1317,7 @@ unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions)
 
 expr_ty
 _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) {
+
     asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions);
     Py_ssize_t n_items = asdl_seq_LEN(expr);
 
@@ -1472,7 +1482,6 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, Re
             debug_end_offset = end_col_offset;
             debug_metadata = closing_brace->metadata;
         }
-
         expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line,
                                              debug_end_offset - 1, p->arena);
         if (!debug_text) {
@@ -1505,16 +1514,23 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
     Py_ssize_t n_flattened_elements = 0;
     for (i = 0; i < len; i++) {
         expr_ty elem = asdl_seq_GET(strings, i);
-        if (elem->kind == Constant_kind) {
-            if (PyBytes_CheckExact(elem->v.Constant.value)) {
-                bytes_found = 1;
-            } else {
-                unicode_string_found = 1;
-            }
-            n_flattened_elements++;
-        } else {
-            n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
-            f_string_found = 1;
+        switch(elem->kind) {
+            case Constant_kind:
+                if (PyBytes_CheckExact(elem->v.Constant.value)) {
+                    bytes_found = 1;
+                } else {
+                    unicode_string_found = 1;
+                }
+                n_flattened_elements++;
+                break;
+            case JoinedStr_kind:
+                n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
+                f_string_found = 1;
+                break;
+            default:
+                n_flattened_elements++;
+                f_string_found = 1;
+                break;
         }
     }
 
@@ -1556,16 +1572,19 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
     Py_ssize_t j = 0;
     for (i = 0; i < len; i++) {
         expr_ty elem = asdl_seq_GET(strings, i);
-        if (elem->kind == Constant_kind) {
-            asdl_seq_SET(flattened, current_pos++, elem);
-        } else {
-            for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
-                expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
-                if (subvalue == NULL) {
-                    return NULL;
+        switch(elem->kind) {
+            case JoinedStr_kind:
+                for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
+                    expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
+                    if (subvalue == NULL) {
+                        return NULL;
+                    }
+                    asdl_seq_SET(flattened, current_pos++, subvalue);
                 }
-                asdl_seq_SET(flattened, current_pos++, subvalue);
-            }
+                break;
+            default:
+                asdl_seq_SET(flattened, current_pos++, elem);
+                break;
         }
     }
 
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
@@ -989,6 +989,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         the_current_tok->last_expr_buffer = NULL;
         the_current_tok->last_expr_size = 0;
         the_current_tok->last_expr_end = -1;
+        the_current_tok->in_format_spec = 0;
         the_current_tok->f_string_debug = 0;
 
         switch (*tok->start) {
@@ -1137,15 +1138,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
          * by the `{` case, so for ensuring that we are on the 0th level, we need
          * to adjust it manually */
         int cursor = current_tok->curly_bracket_depth - (c != '{');
-        if (cursor == 0 && !_PyLexer_update_fstring_expr(tok, c)) {
+        int in_format_spec = current_tok->in_format_spec;
+         int cursor_in_format_with_debug =
+             cursor == 1 && (current_tok->f_string_debug || in_format_spec);
+         int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
+        if ((cursor_valid) && !_PyLexer_update_fstring_expr(tok, c)) {
             return MAKE_TOKEN(ENDMARKER);
         }
-        if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
+        if ((cursor_valid) && c != '{' && set_fstring_expr(tok, token, c)) {
             return MAKE_TOKEN(ERRORTOKEN);
         }
 
         if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
             current_tok->kind = TOK_FSTRING_MODE;
+            current_tok->in_format_spec = 1;
             p_start = tok->start;
             p_end = tok->cur;
             return MAKE_TOKEN(_PyToken_OneChar(c));
@@ -1235,6 +1241,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
                 current_tok->curly_bracket_expr_start_depth--;
                 current_tok->kind = TOK_FSTRING_MODE;
+                current_tok->in_format_spec = 0;
                 current_tok->f_string_debug = 0;
             }
         }
@@ -1317,11 +1324,11 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
     tok->multi_line_start = tok->line_start;
     while (end_quote_size != current_tok->f_string_quote_size) {
         int c = tok_nextc(tok);
-        if (tok->done == E_ERROR) {
+        if (tok->done == E_ERROR || tok->done == E_DECODE) {
             return MAKE_TOKEN(ERRORTOKEN);
         }
         int in_format_spec = (
-                current_tok->last_expr_end != -1
+                current_tok->in_format_spec
                 &&
                 INSIDE_FSTRING_EXPR(current_tok)
         );
@@ -1337,6 +1344,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
             if (in_format_spec && c == '\n') {
                 tok_backup(tok, c);
                 TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+                current_tok->in_format_spec = 0;
                 p_start = tok->start;
                 p_end = tok->cur;
                 return MAKE_TOKEN(FSTRING_MIDDLE);
@@ -1378,6 +1386,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
         }
 
         if (c == '{') {
+            if (!_PyLexer_update_fstring_expr(tok, c)) {
+                return MAKE_TOKEN(ENDMARKER);
+            }
             int peek = tok_nextc(tok);
             if (peek != '{' || in_format_spec) {
                 tok_backup(tok, peek);
@@ -1387,6 +1398,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
                     return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
                 }
                 TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+                current_tok->in_format_spec = 0;
                 p_start = tok->start;
                 p_end = tok->cur;
             } else {
@@ -1406,13 +1418,15 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
             // scanning (indicated by the end of the expression being set) and we are not at the top level
             // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
             // brackets, we can bypass it here.
-            if (peek == '}' && !in_format_spec) {
+            int cursor = current_tok->curly_bracket_depth;
+            if (peek == '}' && !in_format_spec && cursor == 0) {
                 p_start = tok->start;
                 p_end = tok->cur - 1;
             } else {
                 tok_backup(tok, peek);
                 tok_backup(tok, c);
                 TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
+                current_tok->in_format_spec = 0;
                 p_start = tok->start;
                 p_end = tok->cur;
             }
diff --git a/Parser/lexer/state.c b/Parser/lexer/state.c
@@ -74,6 +74,7 @@ free_fstring_expressions(struct tok_state *tok)
             mode->last_expr_buffer = NULL;
             mode->last_expr_size = 0;
             mode->last_expr_end = -1;
+            mode->in_format_spec = 0;
         }
     }
 }
diff --git a/Parser/lexer/state.h b/Parser/lexer/state.h
@@ -58,6 +58,7 @@ typedef struct _tokenizer_mode {
     Py_ssize_t last_expr_end;
     char* last_expr_buffer;
     int f_string_debug;
+    int in_format_spec;
 } tokenizer_mode;
 
 /* Tokenizer state */

Original file line number	Diff line number	Diff line change
`@@ -3638,7 +3638,7 @@ def main():`
`3638`	`3638`	`('Expression', ('Subscript', (1, 0, 1, 10), ('List', (1, 0, 1, 3), [('Constant', (1, 1, 1, 2), 5, None)], ('Load',)), ('Slice', (1, 4, 1, 9), ('Constant', (1, 4, 1, 5), 1, None), ('Constant', (1, 6, 1, 7), 1, None), ('Constant', (1, 8, 1, 9), 1, None)), ('Load',))),`
`3639`	`3639`	`('Expression', ('IfExp', (1, 0, 1, 21), ('Name', (1, 9, 1, 10), 'x', ('Load',)), ('Call', (1, 0, 1, 5), ('Name', (1, 0, 1, 3), 'foo', ('Load',)), [], []), ('Call', (1, 16, 1, 21), ('Name', (1, 16, 1, 19), 'bar', ('Load',)), [], []))),`
`3640`	`3640`	`('Expression', ('JoinedStr', (1, 0, 1, 6), [('FormattedValue', (1, 2, 1, 5), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, None)])),`
`3641`		`-('Expression', ('JoinedStr', (1, 0, 1, 10), [('FormattedValue', (1, 2, 1, 9), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, ('JoinedStr', (1, 4, 1, 8), [('Constant', (1, 5, 1, 8), '.2f', None)]))])),`
	`3641`	`+('Expression', ('JoinedStr', (1, 0, 1, 10), [('FormattedValue', (1, 2, 1, 9), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, ('Constant', (1, 5, 1, 8), '.2f', None))])),`
`3642`	`3642`	`('Expression', ('JoinedStr', (1, 0, 1, 8), [('FormattedValue', (1, 2, 1, 7), ('Name', (1, 3, 1, 4), 'a', ('Load',)), 114, None)])),`
`3643`	`3643`	`('Expression', ('JoinedStr', (1, 0, 1, 11), [('Constant', (1, 2, 1, 6), 'foo(', None), ('FormattedValue', (1, 6, 1, 9), ('Name', (1, 7, 1, 8), 'a', ('Load',)), -1, None), ('Constant', (1, 9, 1, 10), ')', None)])),`
`3644`	`3644`	`]`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Fix f-strings with debug expressions in format specifiers. Patch by Pablo`
	`2`	`+Galindo.`
Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,7 @@ free_fstring_expressions(struct tok_state *tok)`
`74`	`74`	`mode->last_expr_buffer = NULL;`
`75`	`75`	`mode->last_expr_size = 0;`
`76`	`76`	`mode->last_expr_end = -1;`
	`77`	`+ mode->in_format_spec = 0;`
`77`	`78`	`}`
`78`	`79`	`}`
`79`	`80`	`}`