Skip to content

Commit d4aa857

Browse files
authored
gh-102856: Clean some of the PEP 701 tokenizer implementation (#103634)
1 parent 5f7d68e commit d4aa857

File tree

2 files changed

+67
-74
lines changed

2 files changed

+67
-74
lines changed

Parser/tokenizer.c

+65-71
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@
1111
#include "tokenizer.h"
1212
#include "errcode.h"
1313

14-
#include "unicodeobject.h"
15-
#include "bytesobject.h"
16-
#include "fileobject.h"
17-
#include "abstract.h"
18-
1914
/* Alternate tab spacing */
2015
#define ALTTABSIZE 1
2116

@@ -43,6 +38,8 @@
4338
tok->lineno++; \
4439
tok->col_offset = 0;
4540

41+
/* True when tok->tok_mode_stack_index is greater than zero; callers use this
 * to detect that tokenization is currently happening inside an f-string.
 * The argument is parenthesized so that any pointer-valued expression
 * (e.g. `p + 1` or `&state`) expands correctly. */
#define INSIDE_FSTRING(tok) ((tok)->tok_mode_stack_index > 0)
42+
/* True when mode->curly_bracket_expr_start_depth is non-negative.  The field
 * is reset to -1 when an f-string starts and is incremented each time a `{`
 * opens an expression.  Note that the argument is a tokenizer_mode pointer
 * (callers pass `current_tok`), not a `struct tok_state *`.  The argument is
 * parenthesized so that any pointer-valued expression expands correctly. */
#define INSIDE_FSTRING_EXPR(mode) ((mode)->curly_bracket_expr_start_depth >= 0)
4643
#ifdef Py_DEBUG
4744
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
4845
assert(tok->tok_mode_stack_index >= 0);
@@ -54,15 +51,9 @@ static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
5451
assert(tok->tok_mode_stack_index < MAXLEVEL);
5552
return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
5653
}
57-
static inline int *TOK_GET_BRACKET_MARK(tokenizer_mode* mode) {
58-
assert(mode->bracket_mark_index >= 0);
59-
assert(mode->bracket_mark_index < MAX_EXPR_NESTING);
60-
return &(mode->bracket_mark[mode->bracket_mark_index]);
61-
}
6254
#else
6355
/* Non-debug variant: returns a pointer to the mode-stack entry at
 * tok->tok_mode_stack_index.  Unlike the Py_DEBUG inline function it performs
 * no bounds assertions.  The argument is parenthesized so pointer expressions
 * expand correctly; it is evaluated twice, so it must be free of side
 * effects. */
#define TOK_GET_MODE(tok) (&((tok)->tok_mode_stack[(tok)->tok_mode_stack_index]))
6456
/* Non-debug variant: pre-increments tok->tok_mode_stack_index and returns a
 * pointer to the newly selected mode-stack entry.  Unlike the Py_DEBUG inline
 * function it performs no bounds assertions.  The argument is parenthesized
 * so pointer expressions expand correctly; it is evaluated twice, so it must
 * be free of side effects. */
#define TOK_NEXT_MODE(tok) (&((tok)->tok_mode_stack[++(tok)->tok_mode_stack_index]))
65-
#define TOK_GET_BRACKET_MARK(mode) (&(mode->bracket_mark[mode->bracket_mark_index]))
6657
#endif
6758

6859
/* Forward */
@@ -398,20 +389,7 @@ update_fstring_expr(struct tok_state *tok, char cur)
398389
tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
399390

400391
switch (cur) {
401-
case '{':
402-
if (tok_mode->last_expr_buffer != NULL) {
403-
PyMem_Free(tok_mode->last_expr_buffer);
404-
}
405-
tok_mode->last_expr_buffer = PyMem_Malloc(size);
406-
if (tok_mode->last_expr_buffer == NULL) {
407-
tok->done = E_NOMEM;
408-
return 0;
409-
}
410-
tok_mode->last_expr_size = size;
411-
tok_mode->last_expr_end = -1;
412-
strncpy(tok_mode->last_expr_buffer, tok->cur, size);
413-
break;
414-
case 0:
392+
case 0:
415393
if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
416394
return 1;
417395
}
@@ -421,23 +399,38 @@ update_fstring_expr(struct tok_state *tok, char cur)
421399
);
422400
if (new_buffer == NULL) {
423401
PyMem_Free(tok_mode->last_expr_buffer);
424-
tok->done = E_NOMEM;
425-
return 0;
402+
goto error;
426403
}
427404
tok_mode->last_expr_buffer = new_buffer;
428405
strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
429406
tok_mode->last_expr_size += size;
430407
break;
408+
case '{':
409+
if (tok_mode->last_expr_buffer != NULL) {
410+
PyMem_Free(tok_mode->last_expr_buffer);
411+
}
412+
tok_mode->last_expr_buffer = PyMem_Malloc(size);
413+
if (tok_mode->last_expr_buffer == NULL) {
414+
goto error;
415+
}
416+
tok_mode->last_expr_size = size;
417+
tok_mode->last_expr_end = -1;
418+
strncpy(tok_mode->last_expr_buffer, tok->cur, size);
419+
break;
431420
case '}':
432421
case '!':
433422
case ':':
434423
if (tok_mode->last_expr_end == -1) {
435424
tok_mode->last_expr_end = strlen(tok->start);
436425
}
437426
break;
427+
default:
428+
Py_UNREACHABLE();
438429
}
439-
440430
return 1;
431+
error:
432+
tok->done = E_NOMEM;
433+
return 0;
441434
}
442435

443436
static void
@@ -1766,7 +1759,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
17661759
/* Skip comment, unless it's a type comment */
17671760
if (c == '#') {
17681761

1769-
if (tok->tok_mode_stack_index > 0) {
1762+
if (INSIDE_FSTRING(tok)) {
17701763
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
17711764
}
17721765

@@ -2208,32 +2201,31 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
22082201

22092202
p_start = tok->start;
22102203
p_end = tok->cur;
2211-
tokenizer_mode *current_tok = TOK_NEXT_MODE(tok);
2212-
current_tok->kind = TOK_FSTRING_MODE;
2213-
current_tok->f_string_quote = quote;
2214-
current_tok->f_string_quote_size = quote_size;
2215-
current_tok->f_string_start = tok->start;
2216-
current_tok->f_string_multi_line_start = tok->line_start;
2217-
current_tok->last_expr_buffer = NULL;
2218-
current_tok->last_expr_size = 0;
2219-
current_tok->last_expr_end = -1;
2204+
tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
2205+
the_current_tok->kind = TOK_FSTRING_MODE;
2206+
the_current_tok->f_string_quote = quote;
2207+
the_current_tok->f_string_quote_size = quote_size;
2208+
the_current_tok->f_string_start = tok->start;
2209+
the_current_tok->f_string_multi_line_start = tok->line_start;
2210+
the_current_tok->last_expr_buffer = NULL;
2211+
the_current_tok->last_expr_size = 0;
2212+
the_current_tok->last_expr_end = -1;
22202213

22212214
switch (*tok->start) {
22222215
case 'F':
22232216
case 'f':
2224-
current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
2217+
the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
22252218
break;
22262219
case 'R':
22272220
case 'r':
2228-
current_tok->f_string_raw = 1;
2221+
the_current_tok->f_string_raw = 1;
22292222
break;
22302223
default:
22312224
Py_UNREACHABLE();
22322225
}
22332226

2234-
current_tok->bracket_stack = 0;
2235-
current_tok->bracket_mark[0] = 0;
2236-
current_tok->bracket_mark_index = -1;
2227+
the_current_tok->curly_bracket_depth = 0;
2228+
the_current_tok->curly_bracket_expr_start_depth = -1;
22372229
return MAKE_TOKEN(FSTRING_START);
22382230
}
22392231

@@ -2282,15 +2274,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
22822274
int start = tok->lineno;
22832275
tok->lineno = tok->first_lineno;
22842276

2285-
if (tok->tok_mode_stack_index > 0) {
2277+
if (INSIDE_FSTRING(tok)) {
22862278
/* When we are in an f-string, before raising the
22872279
* unterminated string literal error, check whether
22882280
* the initial quote matches the f-string's quotes,
22892281
* and if it does, then this must be a missing '}' token
22902282
* so raise the proper error */
2291-
tokenizer_mode *current_tok = TOK_GET_MODE(tok);
2292-
if (current_tok->f_string_quote == quote &&
2293-
current_tok->f_string_quote_size == quote_size) {
2283+
tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
2284+
if (the_current_tok->f_string_quote == quote &&
2285+
the_current_tok->f_string_quote_size == quote_size) {
22942286
return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start));
22952287
}
22962288
}
@@ -2339,18 +2331,17 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
23392331

23402332
/* Punctuation character */
23412333
int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
2342-
if (is_punctuation && tok->tok_mode_stack_index > 0 && current_tok->bracket_mark_index >= 0) {
2343-
int mark = *TOK_GET_BRACKET_MARK(current_tok);
2344-
/* This code block gets executed before the bracket_stack is incremented
2334+
if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
2335+
/* This code block gets executed before the curly_bracket_depth is incremented
23452336
* by the `{` case, so for ensuring that we are on the 0th level, we need
23462337
* to adjust it manually */
2347-
int cursor = current_tok->bracket_stack - (c != '{');
2338+
int cursor = current_tok->curly_bracket_depth - (c != '{');
23482339

23492340
if (cursor == 0 && !update_fstring_expr(tok, c)) {
23502341
return MAKE_TOKEN(ENDMARKER);
23512342
}
23522343

2353-
if (c == ':' && cursor == mark) {
2344+
if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
23542345
current_tok->kind = TOK_FSTRING_MODE;
23552346
p_start = tok->start;
23562347
p_end = tok->cur;
@@ -2390,16 +2381,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
23902381
tok->parenlinenostack[tok->level] = tok->lineno;
23912382
tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
23922383
tok->level++;
2393-
2394-
if (tok->tok_mode_stack_index > 0) {
2395-
current_tok->bracket_stack++;
2384+
if (INSIDE_FSTRING(tok)) {
2385+
current_tok->curly_bracket_depth++;
23962386
}
23972387
break;
23982388
case ')':
23992389
case ']':
24002390
case '}':
24012391
if (!tok->level) {
2402-
if (tok->tok_mode_stack_index > 0 && !current_tok->bracket_stack && c == '}') {
2392+
if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
24032393
return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
24042394
}
24052395
return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
@@ -2415,10 +2405,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
24152405
nested expression, then instead of matching a different
24162406
syntactical construct with it; we'll throw an unmatched
24172407
parentheses error. */
2418-
if (tok->tok_mode_stack_index > 0 && opening == '{') {
2419-
assert(current_tok->bracket_stack >= 0);
2420-
int previous_bracket = current_tok->bracket_stack - 1;
2421-
if (previous_bracket == *TOK_GET_BRACKET_MARK(current_tok)) {
2408+
if (INSIDE_FSTRING(tok) && opening == '{') {
2409+
assert(current_tok->curly_bracket_depth >= 0);
2410+
int previous_bracket = current_tok->curly_bracket_depth - 1;
2411+
if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
24222412
return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
24232413
}
24242414
}
@@ -2436,14 +2426,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
24362426
}
24372427
}
24382428

2439-
if (tok->tok_mode_stack_index > 0) {
2440-
current_tok->bracket_stack--;
2441-
if (c == '}' && current_tok->bracket_stack == *TOK_GET_BRACKET_MARK(current_tok)) {
2442-
current_tok->bracket_mark_index--;
2429+
if (INSIDE_FSTRING(tok)) {
2430+
current_tok->curly_bracket_depth--;
2431+
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
2432+
current_tok->curly_bracket_expr_start_depth--;
24432433
current_tok->kind = TOK_FSTRING_MODE;
24442434
}
24452435
}
24462436
break;
2437+
default:
2438+
break;
24472439
}
24482440

24492441
if (!Py_UNICODE_ISPRINTABLE(c)) {
@@ -2479,11 +2471,10 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
24792471

24802472
if ((start_char == '{' && peek1 != '{') || (start_char == '}' && peek1 != '}')) {
24812473
if (start_char == '{') {
2482-
current_tok->bracket_mark_index++;
2483-
if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) {
2474+
current_tok->curly_bracket_expr_start_depth++;
2475+
if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
24842476
return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
24852477
}
2486-
*TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack;
24872478
}
24882479
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
24892480
return tok_get_normal_mode(tok, current_tok, token);
@@ -2544,17 +2535,20 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
25442535
end_quote_size = 0;
25452536
}
25462537

2547-
int in_format_spec = current_tok->last_expr_end != -1 && current_tok->bracket_mark_index >= 0;
2538+
int in_format_spec = (
2539+
current_tok->last_expr_end != -1
2540+
&&
2541+
INSIDE_FSTRING_EXPR(current_tok)
2542+
);
25482543
if (c == '{') {
25492544
int peek = tok_nextc(tok);
25502545
if (peek != '{' || in_format_spec) {
25512546
tok_backup(tok, peek);
25522547
tok_backup(tok, c);
2553-
current_tok->bracket_mark_index++;
2554-
if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) {
2548+
current_tok->curly_bracket_expr_start_depth++;
2549+
if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
25552550
return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
25562551
}
2557-
*TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack;
25582552
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
25592553
p_start = tok->start;
25602554
p_end = tok->cur;

Parser/tokenizer.h

+2-3
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,8 @@ enum tokenizer_mode_kind_t {
4343
typedef struct _tokenizer_mode {
4444
enum tokenizer_mode_kind_t kind;
4545

46-
int bracket_stack;
47-
int bracket_mark[MAX_EXPR_NESTING];
48-
int bracket_mark_index;
46+
int curly_bracket_depth;
47+
int curly_bracket_expr_start_depth;
4948

5049
char f_string_quote;
5150
int f_string_quote_size;

0 commit comments

Comments
 (0)