
Commit ff4d45c

gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively
1 parent 39f6a04 commit ff4d45c

File tree

6 files changed (+144, -68 lines)
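For context, the test changes below show how the private helper is now driven: a readline-style callable (for example `io.StringIO(...).readline`) is passed instead of a whole source string. A minimal sketch of that usage, assuming CPython at this commit (the helper is internal and not a stable API):

    # Illustration only: mirrors the updated tests' use of the private helper.
    from io import StringIO
    from tokenize import _generate_tokens_from_c_tokenizer  # internal helper

    source = "a = 1 + 2\n"
    reader = StringIO(source)
    # The C tokenizer now pulls input lazily, one line per call to the callable.
    for token in _generate_tokens_from_c_tokenizer(reader.readline):
        print(token)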

Lib/test/test_tokenize.py

+49-42
@@ -1154,7 +1154,8 @@ class TestTokenizerAdheresToPep0263(TestCase):
 
     def _testFile(self, filename):
         path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        with open(path, 'rb') as f:
+            TestRoundtrip.check_roundtrip(self, f)
 
     def test_utf8_coding_cookie_and_no_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1827,9 +1828,10 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())

@@ -2668,43 +2670,44 @@ def test_unicode(self):
 
     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2715,20 +2718,24 @@ def generate_source(indents):
             return source
 
         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
 
     def test_continuation_lines_indentation(self):
         def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]
 
         code = dedent("""
             def fib(n):

Lib/tokenize.py

+8-2
@@ -446,8 +446,14 @@ def tokenize(readline):
     yield from _tokenize(rl_gen, encoding)
 
 def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+    def gen(rl_gen):
+        while True:
+            try:
+                yield next(rl_gen).decode(encoding)
+            except StopIteration:
+                return
+    g = gen(rl_gen)
+    for token in _generate_tokens_from_c_tokenizer(g.__next__, extra_tokens=True):
         yield token
 
 def generate_tokens(readline):
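The rewritten `_tokenize` above no longer joins and decodes the whole input up front; it wraps the bytes-producing readline generator in a small decoding generator and hands that generator's `__next__` to the C tokenizer. A standalone sketch of the same pattern (hypothetical helper name, illustration only):

    import io

    def decoded_lines(readline_bytes, encoding="utf-8"):
        # Decode one line per request instead of reading everything first.
        while True:
            line = readline_bytes()
            if not line:           # b"" marks EOF for a file-like readline
                return             # generator exhaustion -> StopIteration
            yield line.decode(encoding)

    raw = io.BytesIO(b"x = 1\nprint(x)\n")
    g = decoded_lines(raw.readline)
    print(g.__next__())   # 'x = 1\n'
    print(next(g))        # 'print(x)\n'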

Parser/tokenizer.c

+77-4
@@ -103,6 +103,7 @@ tok_new(void)
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
+    tok->readline = NULL;
     tok->type_comments = 0;
     tok->async_hacks = 0;
     tok->async_def = 0;
@@ -900,6 +901,28 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
     return tok;
 }
 
+struct tok_state *
+_PyTokenizer_FromUTF8Readline(PyObject* readline, int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+
+    tok->enc = NULL;
+    tok->encoding = new_string("utf-8", 5, tok);
+    Py_INCREF(readline);
+    tok->readline = readline;
+    tok->decoding_state = STATE_NORMAL;
+    return tok;
+}
+
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
@@ -969,8 +992,9 @@ _PyTokenizer_Free(struct tok_state *tok)
     }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
     Py_XDECREF(tok->filename);
-    if (tok->fp != NULL && tok->buf != NULL) {
+    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
         PyMem_Free(tok->buf);
     }
     if (tok->input) {
@@ -1021,6 +1045,47 @@ tok_readline_raw(struct tok_state *tok)
     return 1;
 }
 
+static int
+tok_readline_string(struct tok_state* tok) {
+    PyObject* line = PyObject_CallNoArgs(tok->readline);
+    if (line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);
+        goto error;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);
+    return 1;
+error:
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
@@ -1136,7 +1201,7 @@ tok_underflow_interactive(struct tok_state *tok) {
 }
 
 static int
-tok_underflow_file(struct tok_state *tok) {
+tok_underflow_file(struct tok_state *tok, int use_readline) {
     if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
         tok->cur = tok->inp = tok->buf;
     }
@@ -1157,6 +1222,11 @@ tok_underflow_file(struct tok_state *tok) {
             return 0;
         }
     }
+    else if(use_readline) {
+        if (!tok_readline_string(tok)) {
+            return 0;
+        }
+    }
     else {
         /* We want a 'raw' read. */
         if (!tok_readline_raw(tok)) {
@@ -1238,14 +1308,17 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
            return EOF;
         }
-        if (tok->fp == NULL) {
+        if (tok->readline) {
+            rc = tok_underflow_file(tok, 1);
+        }
+        else if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
         }
         else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
         }
         else {
-            rc = tok_underflow_file(tok);
+            rc = tok_underflow_file(tok, 0);
         }
 #if defined(Py_DEBUG)
         if (tok->debug) {
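On the C side, `tok_readline_string` above calls the stored readline object once per buffer underflow and treats a StopIteration raised by that callable as end of input, which is why the Python layer passes a generator's `__next__` rather than a file-style readline. A Python-level illustration of that contract (hypothetical callable, not part of the patch):

    # The callable returns one str per call and raises StopIteration when done,
    # which is exactly what a generator's __next__ provides.
    def make_readline(lines):
        it = iter(lines)
        def readline():
            return next(it)    # raises StopIteration after the last line
        return readline

    rl = make_readline(["if x:\n", "    y = 1\n"])
    print(rl())   # 'if x:\n'
    print(rl())   # '    y = 1\n'
    # A third call raises StopIteration, which the tokenizer treats as EOF.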

Parser/tokenizer.h

+2
@@ -109,6 +109,7 @@ struct tok_state {
                                  expression (cf. issue 16806) */
     PyObject *decoding_readline; /* open(...).readline */
     PyObject *decoding_buffer;
+    PyObject *readline;
     const char* enc;        /* Encoding for the current str. */
     char* str;              /* Source string being tokenized (if tokenizing from a string)*/
     char* input;            /* Tokenizer's newline translated copy of the string. */
@@ -137,6 +138,7 @@ struct tok_state {
 
 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8Readline(PyObject*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);

Python/Python-tokenize.c

+4-4
@@ -37,15 +37,15 @@ typedef struct
 @classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
 
-    source: str
+    source: object
     *
     extra_tokens: bool
 [clinic start generated code]*/
 
 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *source,
                        int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+/*[clinic end generated code: output=f174f61e34b2c306 input=32ddfe6d52575938]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+    self->tok = _PyTokenizer_FromUTF8Readline(source, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
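Note that the public entry points in Lib/tokenize.py keep their signatures: `tokenize.tokenize()` still takes a readline callable returning bytes, and only the plumbing beneath it changed. A quick usage reminder (standard library behaviour, shown for context):

    import io
    import tokenize

    data = b"x = 1\nif x:\n    x += 1\n"
    # tokenize.tokenize() detects the encoding itself and yields TokenInfo tuples.
    for tok in tokenize.tokenize(io.BytesIO(data).readline):
        print(tok.type, tok.string)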

Python/clinic/Python-tokenize.c.h

+4-16
Some generated files are not rendered by default.
