
gh-104169: Refactor tokenizer into lexer and wrappers #110684


Merged · 12 commits · Oct 11, 2023
22 changes: 20 additions & 2 deletions Makefile.pre.in
@@ -347,20 +347,36 @@
 PEGEN_OBJS= \
 	Parser/string_parser.o \
 	Parser/peg_api.o

+TOKENIZER_OBJS= \
+	Parser/lexer/buffer.o \
+	Parser/lexer/lexer.o \
+	Parser/lexer/state.o \
+	Parser/tokenizer/file_tokenizer.o \
+	Parser/tokenizer/readline_tokenizer.o \
+	Parser/tokenizer/string_tokenizer.o \
+	Parser/tokenizer/utf8_tokenizer.o \
+	Parser/tokenizer/helpers.o

 PEGEN_HEADERS= \
 	$(srcdir)/Include/internal/pycore_parser.h \
 	$(srcdir)/Parser/pegen.h \
 	$(srcdir)/Parser/string_parser.h

+TOKENIZER_HEADERS= \
+	Parser/lexer/buffer.h \
+	Parser/lexer/lexer.h \
+	Parser/lexer/state.h \
+	Parser/tokenizer/tokenizer.h \
+	Parser/tokenizer/helpers.h

 POBJS= \
 	Parser/token.o \

-PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
+PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o

 PARSER_HEADERS= \
 	$(PEGEN_HEADERS) \
-	$(srcdir)/Parser/tokenizer.h
+	$(TOKENIZER_HEADERS)

##########################################################################
# Python
@@ -1397,6 +1413,8 @@ regen-pegen-metaparser:
 .PHONY: regen-pegen
 regen-pegen:
 	@$(MKDIR_P) $(srcdir)/Parser
+	@$(MKDIR_P) $(srcdir)/Parser/tokenizer
+	@$(MKDIR_P) $(srcdir)/Parser/lexer
 	PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -q c \
 		$(srcdir)/Grammar/python.gram \
 		$(srcdir)/Grammar/Tokens \
@@ -0,0 +1,4 @@
Split the tokenizer into two separate directories:
- One part contains the actual lexeme-producing logic and lives in ``Parser/lexer``.
- The other wraps the lexer according to the different tokenization modes
  we have (string, utf-8, file, interactive, readline) and lives in ``Parser/tokenizer``.
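
The split mirrors a common front-end layout: one lexeme-producing core, plus thin per-mode wrappers that differ only in how they feed the core its input. The sketch below is a minimal, self-contained C illustration of that shape, not code from this PR; every identifier in it is invented, and only the division of labor (Parser/lexer vs. Parser/tokenizer) comes from the change.

/* Illustrative only: a lexer core that never touches I/O, plus a
 * "string tokenizer" wrapper whose sole job is input setup. */
#include <stdio.h>
#include <string.h>

struct lex_state {
    const char *cur;                 /* next character to scan */
};

/* The core lexer: produces lexemes, knows nothing about the input source. */
static int
next_word(struct lex_state *st, char *out, size_t outsize)
{
    while (*st->cur == ' ') {
        st->cur++;
    }
    if (*st->cur == '\0') {
        return 0;                    /* end of input */
    }
    size_t n = 0;
    while (*st->cur != '\0' && *st->cur != ' ' && n + 1 < outsize) {
        out[n++] = *st->cur++;
    }
    out[n] = '\0';
    return 1;
}

/* A "string tokenizer" wrapper: points the lexer at an in-memory buffer.
 * File/utf-8/interactive/readline wrappers would each replace only this
 * setup step, leaving next_word untouched. */
static struct lex_state
from_string(const char *s)
{
    struct lex_state st = { s };
    return st;
}

int
main(void)
{
    struct lex_state st = from_string("x = 1 + 2");
    char tok[32];
    while (next_word(&st, tok, sizeof(tok))) {
        printf("lexeme: %s\n", tok);
    }
    return 0;
}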
9 changes: 8 additions & 1 deletion PCbuild/_freeze_module.vcxproj
@@ -172,7 +172,14 @@
<ClCompile Include="..\Parser\action_helpers.c" />
<ClCompile Include="..\Parser\string_parser.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\lexer\buffer.c" />
<ClCompile Include="..\Parser\lexer\state.c" />
<ClCompile Include="..\Parser\lexer\lexer.c" />
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
<ClCompile Include="..\PC\msvcrtmodule.c" />
<ClCompile Include="..\PC\winreg.c" />
23 changes: 22 additions & 1 deletion PCbuild/_freeze_module.vcxproj.filters
@@ -397,7 +397,28 @@
<ClCompile Include="..\Parser\token.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer.c">
<ClCompile Include="..\Parser\lexer\lexer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\buffer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\state.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\helpers.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Python\traceback.c">
Expand Down
15 changes: 13 additions & 2 deletions PCbuild/pythoncore.vcxproj
@@ -362,7 +362,11 @@
<ClInclude Include="..\Objects\stringlib\replace.h" />
<ClInclude Include="..\Objects\stringlib\split.h" />
<ClInclude Include="..\Objects\unicodetype_db.h" />
<ClInclude Include="..\Parser\tokenizer.h" />
<ClInclude Include="..\Parser\lexer\state.h" />
<ClInclude Include="..\Parser\lexer\lexer.h" />
<ClInclude Include="..\Parser\lexer\buffer.h" />
<ClInclude Include="..\Parser\tokenizer\helpers.h" />
<ClInclude Include="..\Parser\tokenizer\tokenizer.h" />
<ClInclude Include="..\Parser\string_parser.h" />
<ClInclude Include="..\Parser\pegen.h" />
<ClInclude Include="..\PC\errmap.h" />
@@ -507,7 +511,14 @@
<ClCompile Include="..\Objects\unionobject.c" />
<ClCompile Include="..\Objects\weakrefobject.c" />
<ClCompile Include="..\Parser\myreadline.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\lexer\state.c" />
<ClCompile Include="..\Parser\lexer\lexer.c" />
<ClCompile Include="..\Parser\lexer\buffer.c" />
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\pegen.c" />
<ClCompile Include="..\Parser\pegen_errors.c" />
37 changes: 35 additions & 2 deletions PCbuild/pythoncore.vcxproj.filters
@@ -291,7 +291,19 @@
<ClInclude Include="..\Objects\unicodetype_db.h">
<Filter>Objects</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer.h">
<ClInclude Include="..\Parser\lexer\lexer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\lexer\state.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\lexer\buffer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer\tokenizer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer\helpers.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\PC\errmap.h">
Expand Down Expand Up @@ -1139,7 +1151,28 @@
<ClCompile Include="..\Parser\myreadline.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer.c">
<ClCompile Include="..\Parser\lexer\lexer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\state.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\buffer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\helpers.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\token.c">
Expand Down
1 change: 0 additions & 1 deletion Parser/action_helpers.c
@@ -1,7 +1,6 @@
#include <Python.h>

#include "pegen.h"
#include "tokenizer.h"
#include "string_parser.h"
#include "pycore_runtime.h" // _PyRuntime

76 changes: 76 additions & 0 deletions Parser/lexer/buffer.c
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Traverse and remember all f-string buffers, in order to be able to restore
them after reallocating tok->buf */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;

for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
mode->f_string_start_offset = mode->f_string_start - tok->buf;
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
}
}

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;

for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
mode->f_string_start = tok->buf + mode->f_string_start_offset;
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
}
}

/* Read a line of text from TOK into S, using the stream in TOK.
Return NULL on failure, else S.

On entry, tok->decoding_buffer will be one of:
1) NULL: need to call tok->decoding_readline to get a new line
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
stored the result in tok->decoding_buffer
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
(in the s buffer) to copy entire contents of the line read
by tok->decoding_readline. tok->decoding_buffer has the overflow.
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
until the buffer ends with a '\n' (or until the end of the file is
reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
Py_ssize_t cur = tok->cur - tok->buf;
Py_ssize_t oldsize = tok->inp - tok->buf;
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
if (newsize > tok->end - tok->buf) {
char *newbuf = tok->buf;
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
_PyLexer_remember_fstring_buffers(tok);
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
if (newbuf == NULL) {
tok->done = E_NOMEM;
return 0;
}
tok->buf = newbuf;
tok->cur = tok->buf + cur;
tok->inp = tok->buf + oldsize;
tok->end = tok->buf + newsize;
tok->start = start < 0 ? NULL : tok->buf + start;
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
_PyLexer_restore_fstring_buffers(tok);
}
return 1;
}
10 changes: 10 additions & 0 deletions Parser/lexer/buffer.h
@@ -0,0 +1,10 @@
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif
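
For context, a lexer routine calls _PyLexer_tok_reserve_buf before writing past tok->inp; on allocation failure the helper returns 0 after setting tok->done = E_NOMEM, so callers only propagate the failure. A hypothetical caller follows (the helper and struct tok_state are from this PR; append_byte itself is invented for illustration):

/* Hypothetical caller sketch, not part of the PR. */
static int
append_byte(struct tok_state *tok, char c)
{
    if (!_PyLexer_tok_reserve_buf(tok, 1)) {
        return 0;                    /* tok->done is already E_NOMEM */
    }
    *tok->inp++ = c;
    return 1;
}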