Skip to content

Commit 3de08ce

Browse files
authored
gh-97997: Add col_offset field to tokenizer and use that for AST nodes (#98000)
1 parent c062764 commit 3de08ce

File tree

3 files changed

+44
-11
lines changed

3 files changed

+44
-11
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic.

Parser/tokenizer.c

+41-11
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
#define TABSIZE 8
3838

3939
/* Finish the current token through token_setup().  Expects `tok`, `token`,
   `p_start` and `p_end` to be in scope at the expansion site. */
#define MAKE_TOKEN(token_type) \
    token_setup(tok, token, token_type, p_start, p_end)

/* Finish a type-comment token through type_comment_token_setup(), passing the
   caller-computed column offsets explicitly.  Expects the same names in scope
   as MAKE_TOKEN. */
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) \
    (type_comment_token_setup(tok, token, token_type, col_offset, \
                              end_col_offset, p_start, p_end))
42+
/* Move the tokenizer state `tok` to the next line: bump the line counter and
   restart the running column offset at 0.  The body is wrapped in
   do { } while (0) so the macro expands to exactly one statement; without the
   wrapper only the increment would be guarded when the macro is used as the
   body of an unbraced if/else or loop, and the column reset would always
   run.  Call it as `ADVANCE_LINENO();`. */
#define ADVANCE_LINENO() \
    do { \
        tok->lineno++; \
        tok->col_offset = 0; \
    } while (0)
4045

4146
/* Forward */
4247
static struct tok_state *tok_new(void);
@@ -73,6 +78,8 @@ tok_new(void)
7378
tok->pendin = 0;
7479
tok->prompt = tok->nextprompt = NULL;
7580
tok->lineno = 0;
81+
tok->starting_col_offset = -1;
82+
tok->col_offset = -1;
7683
tok->level = 0;
7784
tok->altindstack[0] = 0;
7885
tok->decoding_state = STATE_INIT;
@@ -871,7 +878,7 @@ tok_underflow_string(struct tok_state *tok) {
871878
tok->buf = tok->cur;
872879
}
873880
tok->line_start = tok->cur;
874-
tok->lineno++;
881+
ADVANCE_LINENO();
875882
tok->inp = end;
876883
return 1;
877884
}
@@ -930,7 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) {
930937
else if (tok->start != NULL) {
931938
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
932939
size_t size = strlen(newtok);
933-
tok->lineno++;
940+
ADVANCE_LINENO();
934941
if (!tok_reserve_buf(tok, size + 1)) {
935942
PyMem_Free(tok->buf);
936943
tok->buf = NULL;
@@ -943,7 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) {
943950
tok->multi_line_start = tok->buf + cur_multi_line_start;
944951
}
945952
else {
946-
tok->lineno++;
953+
ADVANCE_LINENO();
947954
PyMem_Free(tok->buf);
948955
tok->buf = newtok;
949956
tok->cur = tok->buf;
@@ -998,7 +1005,7 @@ tok_underflow_file(struct tok_state *tok) {
9981005
*tok->inp = '\0';
9991006
}
10001007

1001-
tok->lineno++;
1008+
ADVANCE_LINENO();
10021009
if (tok->decoding_state != STATE_NORMAL) {
10031010
if (tok->lineno > 2) {
10041011
tok->decoding_state = STATE_NORMAL;
@@ -1056,6 +1063,7 @@ tok_nextc(struct tok_state *tok)
10561063
int rc;
10571064
for (;;) {
10581065
if (tok->cur != tok->inp) {
1066+
tok->col_offset++;
10591067
return Py_CHARMASK(*tok->cur++); /* Fast path */
10601068
}
10611069
if (tok->done != E_OK) {
@@ -1104,6 +1112,7 @@ tok_backup(struct tok_state *tok, int c)
11041112
if ((int)(unsigned char)*tok->cur != c) {
11051113
Py_FatalError("tok_backup: wrong character");
11061114
}
1115+
tok->col_offset--;
11071116
}
11081117
}
11091118

@@ -1390,21 +1399,33 @@ tok_continuation_line(struct tok_state *tok) {
13901399
return c;
13911400
}
13921401

1402+
static int
1403+
type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
1404+
int end_col_offset, const char *start, const char *end)
1405+
{
1406+
token->level = tok->level;
1407+
token->lineno = token->end_lineno = tok->lineno;
1408+
token->col_offset = col_offset;
1409+
token->end_col_offset = end_col_offset;
1410+
token->start = start;
1411+
token->end = end;
1412+
return type;
1413+
}
1414+
13931415
static int
13941416
token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
13951417
{
13961418
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
13971419
token->level = tok->level;
13981420
token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
13991421
token->end_lineno = tok->lineno;
1400-
token->col_offset = -1;
1401-
token->end_col_offset = -1;
1422+
token->col_offset = token->end_col_offset = -1;
14021423
token->start = start;
14031424
token->end = end;
1425+
14041426
if (start != NULL && end != NULL) {
1405-
const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
1406-
token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
1407-
token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
1427+
token->col_offset = tok->starting_col_offset;
1428+
token->end_col_offset = tok->col_offset;
14081429
}
14091430
return type;
14101431
}
@@ -1419,6 +1440,7 @@ tok_get(struct tok_state *tok, struct token *token)
14191440
const char *p_end = NULL;
14201441
nextline:
14211442
tok->start = NULL;
1443+
tok->starting_col_offset = -1;
14221444
blankline = 0;
14231445

14241446
/* Get indentation level */
@@ -1518,6 +1540,7 @@ tok_get(struct tok_state *tok, struct token *token)
15181540
}
15191541

15201542
tok->start = tok->cur;
1543+
tok->starting_col_offset = tok->col_offset;
15211544

15221545
/* Return pending indents/dedents */
15231546
if (tok->pendin != 0) {
@@ -1565,25 +1588,30 @@ tok_get(struct tok_state *tok, struct token *token)
15651588

15661589
/* Set start of current token */
15671590
tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
1591+
tok->starting_col_offset = tok->col_offset - 1;
15681592

15691593
/* Skip comment, unless it's a type comment */
15701594
if (c == '#') {
15711595
const char *prefix, *p, *type_start;
1596+
int current_starting_col_offset;
15721597

15731598
while (c != EOF && c != '\n') {
15741599
c = tok_nextc(tok);
15751600
}
15761601

15771602
if (tok->type_comments) {
15781603
p = tok->start;
1604+
current_starting_col_offset = tok->starting_col_offset;
15791605
prefix = type_comment_prefix;
15801606
while (*prefix && p < tok->cur) {
15811607
if (*prefix == ' ') {
15821608
while (*p == ' ' || *p == '\t') {
15831609
p++;
1610+
current_starting_col_offset++;
15841611
}
15851612
} else if (*prefix == *p) {
15861613
p++;
1614+
current_starting_col_offset++;
15871615
} else {
15881616
break;
15891617
}
@@ -1594,7 +1622,9 @@ tok_get(struct tok_state *tok, struct token *token)
15941622
/* This is a type comment if we matched all of type_comment_prefix. */
15951623
if (!*prefix) {
15961624
int is_type_ignore = 1;
1625+
// +6 in order to skip the word 'ignore'
15971626
const char *ignore_end = p + 6;
1627+
const int ignore_end_col_offset = current_starting_col_offset + 6;
15981628
tok_backup(tok, c); /* don't eat the newline or EOF */
15991629

16001630
type_start = p;
@@ -1615,11 +1645,11 @@ tok_get(struct tok_state *tok, struct token *token)
16151645
tok_nextc(tok);
16161646
tok->atbol = 1;
16171647
}
1618-
return MAKE_TOKEN(TYPE_IGNORE);
1648+
return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
16191649
} else {
16201650
p_start = type_start;
16211651
p_end = tok->cur;
1622-
return MAKE_TOKEN(TYPE_COMMENT);
1652+
return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
16231653
}
16241654
}
16251655
}

Parser/tokenizer.h

+2
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ struct tok_state {
5757
int lineno; /* Current line number */
5858
int first_lineno; /* First line of a single line or multi line string
5959
expression (cf. issue 16806) */
60+
int starting_col_offset; /* The column offset at the beginning of a token */
61+
int col_offset; /* Current col offset */
6062
int level; /* () [] {} Parentheses nesting level */
6163
/* Used to allow free continuations inside them */
6264
char parenstack[MAXLEVEL];

0 commit comments

Comments
 (0)