Skip to content

Commit 01481f2

Browse files
lysnikolaou, pablogsal, and blurb-it[bot]
authored
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes. --------- Co-authored-by: Pablo Galindo <[email protected]> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
1 parent eb50cd3 commit 01481f2

29 files changed

+3185
-2988
lines changed

Makefile.pre.in

+20-2
Original file line numberDiff line numberDiff line change
@@ -347,20 +347,36 @@ PEGEN_OBJS= \
347347
Parser/string_parser.o \
348348
Parser/peg_api.o
349349

350+
TOKENIZER_OBJS= \
351+
Parser/lexer/buffer.o \
352+
Parser/lexer/lexer.o \
353+
Parser/lexer/state.o \
354+
Parser/tokenizer/file_tokenizer.o \
355+
Parser/tokenizer/readline_tokenizer.o \
356+
Parser/tokenizer/string_tokenizer.o \
357+
Parser/tokenizer/utf8_tokenizer.o \
358+
Parser/tokenizer/helpers.o
350359

351360
PEGEN_HEADERS= \
352361
$(srcdir)/Include/internal/pycore_parser.h \
353362
$(srcdir)/Parser/pegen.h \
354363
$(srcdir)/Parser/string_parser.h
355364

365+
TOKENIZER_HEADERS= \
366+
Parser/lexer/buffer.h \
367+
Parser/lexer/lexer.h \
368+
Parser/lexer/state.h \
369+
Parser/tokenizer/tokenizer.h \
370+
Parser/tokenizer/helpers.h
371+
356372
POBJS= \
357373
Parser/token.o \
358374

359-
PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
375+
PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o
360376

361377
PARSER_HEADERS= \
362378
$(PEGEN_HEADERS) \
363-
$(srcdir)/Parser/tokenizer.h
379+
$(TOKENIZER_HEADERS)
364380

365381
##########################################################################
366382
# Python
@@ -1397,6 +1413,8 @@ regen-pegen-metaparser:
13971413
.PHONY: regen-pegen
13981414
regen-pegen:
13991415
@$(MKDIR_P) $(srcdir)/Parser
1416+
@$(MKDIR_P) $(srcdir)/Parser/tokenizer
1417+
@$(MKDIR_P) $(srcdir)/Parser/lexer
14001418
PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -q c \
14011419
$(srcdir)/Grammar/python.gram \
14021420
$(srcdir)/Grammar/Tokens \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Split the tokenizer into two separate directories:
2+
- One part includes the actual lexeme-producing logic and lives in ``Parser/lexer``.
3+
- The second part wraps the lexer according to the different tokenization modes
4+
we have (string, utf-8, file, interactive, readline) and lives in ``Parser/tokenizer``.

PCbuild/_freeze_module.vcxproj

+8-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,14 @@
172172
<ClCompile Include="..\Parser\action_helpers.c" />
173173
<ClCompile Include="..\Parser\string_parser.c" />
174174
<ClCompile Include="..\Parser\token.c" />
175-
<ClCompile Include="..\Parser\tokenizer.c" />
175+
<ClCompile Include="..\Parser\lexer\buffer.c" />
176+
<ClCompile Include="..\Parser\lexer\state.c" />
177+
<ClCompile Include="..\Parser\lexer\lexer.c" />
178+
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
179+
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
180+
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
181+
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
182+
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
176183
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
177184
<ClCompile Include="..\PC\msvcrtmodule.c" />
178185
<ClCompile Include="..\PC\winreg.c" />

PCbuild/_freeze_module.vcxproj.filters

+22-1
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,28 @@
397397
<ClCompile Include="..\Parser\token.c">
398398
<Filter>Source Files</Filter>
399399
</ClCompile>
400-
<ClCompile Include="..\Parser\tokenizer.c">
400+
<ClCompile Include="..\Parser\lexer\lexer.c">
401+
<Filter>Source Files</Filter>
402+
</ClCompile>
403+
<ClCompile Include="..\Parser\lexer\buffer.c">
404+
<Filter>Source Files</Filter>
405+
</ClCompile>
406+
<ClCompile Include="..\Parser\lexer\state.c">
407+
<Filter>Source Files</Filter>
408+
</ClCompile>
409+
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
410+
<Filter>Source Files</Filter>
411+
</ClCompile>
412+
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
413+
<Filter>Source Files</Filter>
414+
</ClCompile>
415+
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
416+
<Filter>Source Files</Filter>
417+
</ClCompile>
418+
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
419+
<Filter>Source Files</Filter>
420+
</ClCompile>
421+
<ClCompile Include="..\Parser\tokenizer\helpers.c">
401422
<Filter>Source Files</Filter>
402423
</ClCompile>
403424
<ClCompile Include="..\Python\traceback.c">

PCbuild/pythoncore.vcxproj

+13-2
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,11 @@
362362
<ClInclude Include="..\Objects\stringlib\replace.h" />
363363
<ClInclude Include="..\Objects\stringlib\split.h" />
364364
<ClInclude Include="..\Objects\unicodetype_db.h" />
365-
<ClInclude Include="..\Parser\tokenizer.h" />
365+
<ClInclude Include="..\Parser\lexer\state.h" />
366+
<ClInclude Include="..\Parser\lexer\lexer.h" />
367+
<ClInclude Include="..\Parser\lexer\buffer.h" />
368+
<ClInclude Include="..\Parser\tokenizer\helpers.h" />
369+
<ClInclude Include="..\Parser\tokenizer\tokenizer.h" />
366370
<ClInclude Include="..\Parser\string_parser.h" />
367371
<ClInclude Include="..\Parser\pegen.h" />
368372
<ClInclude Include="..\PC\errmap.h" />
@@ -507,7 +511,14 @@
507511
<ClCompile Include="..\Objects\unionobject.c" />
508512
<ClCompile Include="..\Objects\weakrefobject.c" />
509513
<ClCompile Include="..\Parser\myreadline.c" />
510-
<ClCompile Include="..\Parser\tokenizer.c" />
514+
<ClCompile Include="..\Parser\lexer\state.c" />
515+
<ClCompile Include="..\Parser\lexer\lexer.c" />
516+
<ClCompile Include="..\Parser\lexer\buffer.c" />
517+
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
518+
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
519+
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
520+
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
521+
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
511522
<ClCompile Include="..\Parser\token.c" />
512523
<ClCompile Include="..\Parser\pegen.c" />
513524
<ClCompile Include="..\Parser\pegen_errors.c" />

PCbuild/pythoncore.vcxproj.filters

+35-2
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,19 @@
291291
<ClInclude Include="..\Objects\unicodetype_db.h">
292292
<Filter>Objects</Filter>
293293
</ClInclude>
294-
<ClInclude Include="..\Parser\tokenizer.h">
294+
<ClInclude Include="..\Parser\lexer\lexer.h">
295+
<Filter>Parser</Filter>
296+
</ClInclude>
297+
<ClInclude Include="..\Parser\lexer\state.h">
298+
<Filter>Parser</Filter>
299+
</ClInclude>
300+
<ClInclude Include="..\Parser\lexer\buffer.h">
301+
<Filter>Parser</Filter>
302+
</ClInclude>
303+
<ClInclude Include="..\Parser\tokenizer\tokenizer.h">
304+
<Filter>Parser</Filter>
305+
</ClInclude>
306+
<ClInclude Include="..\Parser\tokenizer\helpers.h">
295307
<Filter>Parser</Filter>
296308
</ClInclude>
297309
<ClInclude Include="..\PC\errmap.h">
@@ -1139,7 +1151,28 @@
11391151
<ClCompile Include="..\Parser\myreadline.c">
11401152
<Filter>Parser</Filter>
11411153
</ClCompile>
1142-
<ClCompile Include="..\Parser\tokenizer.c">
1154+
<ClCompile Include="..\Parser\lexer\lexer.c">
1155+
<Filter>Parser</Filter>
1156+
</ClCompile>
1157+
<ClCompile Include="..\Parser\lexer\state.c">
1158+
<Filter>Parser</Filter>
1159+
</ClCompile>
1160+
<ClCompile Include="..\Parser\lexer\buffer.c">
1161+
<Filter>Parser</Filter>
1162+
</ClCompile>
1163+
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
1164+
<Filter>Parser</Filter>
1165+
</ClCompile>
1166+
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
1167+
<Filter>Parser</Filter>
1168+
</ClCompile>
1169+
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
1170+
<Filter>Parser</Filter>
1171+
</ClCompile>
1172+
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
1173+
<Filter>Parser</Filter>
1174+
</ClCompile>
1175+
<ClCompile Include="..\Parser\tokenizer\helpers.c">
11431176
<Filter>Parser</Filter>
11441177
</ClCompile>
11451178
<ClCompile Include="..\Parser\token.c">

Parser/action_helpers.c

-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include <Python.h>
22

33
#include "pegen.h"
4-
#include "tokenizer.h"
54
#include "string_parser.h"
65
#include "pycore_runtime.h" // _PyRuntime
76

Parser/lexer/buffer.c

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#include "Python.h"
2+
#include "errcode.h"
3+
4+
#include "state.h"
5+
6+
/* Traverse and remember all f-string buffers, in order to be able to restore
7+
them after reallocating tok->buf */
8+
void
9+
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
10+
{
11+
int index;
12+
tokenizer_mode *mode;
13+
14+
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
15+
mode = &(tok->tok_mode_stack[index]);
16+
mode->f_string_start_offset = mode->f_string_start - tok->buf;
17+
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
18+
}
19+
}
20+
21+
/* Traverse and restore all f-string buffers after reallocating tok->buf */
22+
void
23+
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
24+
{
25+
int index;
26+
tokenizer_mode *mode;
27+
28+
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
29+
mode = &(tok->tok_mode_stack[index]);
30+
mode->f_string_start = tok->buf + mode->f_string_start_offset;
31+
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
32+
}
33+
}
34+
35+
/* Read a line of text from TOK into S, using the stream in TOK.
36+
Return NULL on failure, else S.
37+
38+
On entry, tok->decoding_buffer will be one of:
39+
1) NULL: need to call tok->decoding_readline to get a new line
40+
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
41+
stored the result in tok->decoding_buffer
42+
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
43+
(in the s buffer) to copy entire contents of the line read
44+
by tok->decoding_readline. tok->decoding_buffer has the overflow.
45+
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
46+
until the buffer ends with a '\n' (or until the end of the file is
47+
reached): see tok_nextc and its calls to tok_reserve_buf.
48+
*/
49+
int
50+
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
51+
{
52+
Py_ssize_t cur = tok->cur - tok->buf;
53+
Py_ssize_t oldsize = tok->inp - tok->buf;
54+
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
55+
if (newsize > tok->end - tok->buf) {
56+
char *newbuf = tok->buf;
57+
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
58+
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
59+
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
60+
_PyLexer_remember_fstring_buffers(tok);
61+
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
62+
if (newbuf == NULL) {
63+
tok->done = E_NOMEM;
64+
return 0;
65+
}
66+
tok->buf = newbuf;
67+
tok->cur = tok->buf + cur;
68+
tok->inp = tok->buf + oldsize;
69+
tok->end = tok->buf + newsize;
70+
tok->start = start < 0 ? NULL : tok->buf + start;
71+
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
72+
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
73+
_PyLexer_restore_fstring_buffers(tok);
74+
}
75+
return 1;
76+
}

Parser/lexer/buffer.h

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#ifndef _LEXER_BUFFER_H_
2+
#define _LEXER_BUFFER_H_
3+
4+
#include "pyport.h"
5+
6+
void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
7+
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
8+
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);
9+
10+
#endif

0 commit comments

Comments
 (0)