
Commit bc6d2da

pythongh-105069: Add a readline-like callable to the tokenizer to consume input iteratively
Signed-off-by: Pablo Galindo <[email protected]>
1 parent 39f6a04 commit bc6d2da

7 files changed, +196 −73 lines changed

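In short: the C tokenizer can now pull source one line at a time from a readline-like callable instead of requiring the whole input up front, matching the interface the tokenize module already exposes. A minimal sketch of the public entry point that sits on top of this change (standard tokenize usage, shown only to illustrate the readline contract):

    from io import BytesIO
    from tokenize import tokenize

    # tokenize() drives the readline callable one line at a time, so the
    # source never has to be materialised as a single string.
    source = BytesIO(b"def add(a, b):\n    return a + b\n")
    for tok in tokenize(source.readline):
        print(tok)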

Lib/test/test_tokenize.py

+56 −42
@@ -50,6 +50,13 @@ def check_tokenize(self, s, expected):
             self.assertEqual(result,
                              [" ENCODING 'utf-8' (0, 0) (0, 0)"] +
                              expected.rstrip().splitlines())
+
+    def test_invalid_readline(self):
+        def gen():
+            yield "sdfosdg"
+            yield "sdfosdg"
+        with self.assertRaises(TypeError):
+            list(tokenize(gen().__next__))

     def test_implicit_newline(self):
         # Make sure that the tokenizer puts in an implicit NEWLINE
@@ -1154,7 +1161,8 @@ class TestTokenizerAdheresToPep0263(TestCase):

     def _testFile(self, filename):
         path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        with open(path, 'rb') as f:
+            TestRoundtrip.check_roundtrip(self, f)

     def test_utf8_coding_cookie_and_no_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1827,9 +1835,10 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = BytesIO(s.encode())
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())

@@ -2668,43 +2677,44 @@ def test_unicode(self):

     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = BytesIO(string.encode())
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)

     def test_max_indent(self):
         MAXINDENT = 100
@@ -2715,20 +2725,24 @@ def generate_source(indents):
             return source

         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = BytesIO(valid.encode())
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")

         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = BytesIO(invalid.encode())
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )

     def test_continuation_lines_indentation(self):
         def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = BytesIO(string.encode())
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]

         code = dedent("""
             def fib(n):
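The new test_invalid_readline case above captures the contract for the callable: tokenize() expects each call to return bytes, and a str-yielding callable is rejected with TypeError. A standalone sketch of the same behaviour (helper names here are illustrative, assuming an interpreter built with this commit):

    from io import BytesIO
    from tokenize import tokenize

    def str_lines():
        # yields str rather than bytes, which the C tokenizer rejects
        yield "x = 1\n"

    try:
        list(tokenize(str_lines().__next__))
    except TypeError as exc:
        print("rejected:", exc)

    # a bytes-producing readline keeps working as before
    print(list(tokenize(BytesIO(b"x = 1\n").readline))[:2])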

Lib/tokenize.py

+5 −4
@@ -446,8 +446,9 @@ def tokenize(readline):
     yield from _tokenize(rl_gen, encoding)

 def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+    for token in _generate_tokens_from_c_tokenizer(rl_gen.__next__,
+                                                    encoding=encoding,
+                                                    extra_tokens=True):
         yield token

 def generate_tokens(readline):
@@ -531,10 +532,10 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise

-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding="utf-8", extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    for info in c_tokenizer.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens):
         yield TokenInfo._make(info)
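After this change the private helper _generate_tokens_from_c_tokenizer accepts a readline-style callable plus an encoding instead of a pre-joined source string, and _tokenize() above simply hands it the generator's __next__. A sketch of driving the helper directly (a private API, shown only for illustration):

    from io import BytesIO
    from tokenize import _generate_tokens_from_c_tokenizer

    # With an encoding given, every call to the callable must return a bytes
    # line; the stream ends when it stops producing data or raises StopIteration.
    source = BytesIO(b"x = 1\nprint(x)\n")
    for tok in _generate_tokens_from_c_tokenizer(source.readline,
                                                 encoding="utf-8",
                                                 extra_tokens=True):
        print(tok)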

Parser/tokenizer.c

+101 −4
@@ -103,6 +103,7 @@ tok_new(void)
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
+    tok->readline = NULL;
     tok->type_comments = 0;
     tok->async_hacks = 0;
     tok->async_def = 0;
@@ -900,6 +901,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
     return tok;
 }

+struct tok_state *
+_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
+                          int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+    if (enc != NULL) {
+        tok->encoding = new_string(enc, strlen(enc), tok);
+        if (!tok->encoding) {
+            _PyTokenizer_Free(tok);
+            return NULL;
+        }
+    }
+    tok->decoding_state = STATE_NORMAL;
+    Py_INCREF(readline);
+    tok->readline = readline;
+    return tok;
+}
+
 /* Set up tokenizer for UTF-8 string */

 struct tok_state *
@@ -969,8 +997,9 @@ _PyTokenizer_Free(struct tok_state *tok)
     }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
     Py_XDECREF(tok->filename);
-    if (tok->fp != NULL && tok->buf != NULL) {
+    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
         PyMem_Free(tok->buf);
     }
     if (tok->input) {
@@ -1021,6 +1050,66 @@ tok_readline_raw(struct tok_state *tok)
     return 1;
 }

+static int
+tok_readline_string(struct tok_state* tok) {
+    PyObject* line = NULL;
+    PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
+    if (raw_line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);
+        goto error;
+    }
+    if(tok->encoding != NULL) {
+        if (!PyBytes_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
+            error_ret(tok);
+            goto error;
+        }
+        line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
+                                tok->encoding, "replace");
+        Py_CLEAR(raw_line);
+        if (line == NULL) {
+            error_ret(tok);
+            goto error;
+        }
+    } else {
+        line = raw_line;
+        raw_line = NULL;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);
+    return 1;
+error:
+    Py_XDECREF(raw_line);
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
@@ -1136,7 +1225,7 @@ tok_underflow_interactive(struct tok_state *tok) {
 }

 static int
-tok_underflow_file(struct tok_state *tok) {
+tok_underflow_file(struct tok_state *tok, int use_readline) {
     if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
         tok->cur = tok->inp = tok->buf;
     }
@@ -1157,6 +1246,11 @@ tok_underflow_file(struct tok_state *tok) {
             return 0;
         }
     }
+    else if(use_readline) {
+        if (!tok_readline_string(tok)) {
+            return 0;
+        }
+    }
     else {
         /* We want a 'raw' read. */
         if (!tok_readline_raw(tok)) {
@@ -1238,14 +1332,17 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->fp == NULL) {
+        if (tok->readline) {
+            rc = tok_underflow_file(tok, 1);
+        }
+        else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
         }
         else if (tok->prompt != NULL) {
             rc = tok_underflow_interactive(tok);
         }
        else {
-            rc = tok_underflow_file(tok);
+            rc = tok_underflow_file(tok, 0);
         }
 #if defined(Py_DEBUG)
         if (tok->debug) {
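tok_readline_string() above is where the callable meets the C tokenizer: each returned line is appended to the internal buffer, StopIteration is swallowed and treated as end of input, and when an encoding is set the callable must return bytes. At the Python level that means a plain iterator's next() can stand in for readline, as in this sketch (again using the private helper purely for illustration):

    from tokenize import _generate_tokens_from_c_tokenizer

    lines = iter([b"a = 1\n", b"b = a + 1\n"])
    # next() raises StopIteration once the iterator is exhausted, which the
    # tokenizer treats as end of input rather than as an error.
    for tok in _generate_tokens_from_c_tokenizer(lambda: next(lines), encoding="utf-8"):
        print(tok.type, repr(tok.string))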

Parser/tokenizer.h

+2 −0
@@ -109,6 +109,7 @@ struct tok_state {
                              expression (cf. issue 16806) */
     PyObject *decoding_readline; /* open(...).readline */
     PyObject *decoding_buffer;
+    PyObject *readline; /* readline() function */
     const char* enc; /* Encoding for the current str. */
     char* str; /* Source string being tokenized (if tokenizing from a string)*/
     char* input; /* Tokenizer's newline translated copy of the string. */
@@ -137,6 +138,7 @@ struct tok_state {

 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);

Python/Python-tokenize.c

+6 −5
@@ -37,15 +37,16 @@ typedef struct
 @classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new

-    source: str
+    readline: object
     *
     extra_tokens: bool
+    encoding: str = NULL
 [clinic start generated code]*/

 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
-                       int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+                       int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=92c429aa8f2e6714]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
@@ -55,7 +56,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
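With the clinic signature above, TokenizerIter is constructed from the readline callable, with extra_tokens and the optional encoding as keyword-only arguments; this mirrors how Lib/tokenize.py calls it. A sketch of the equivalent direct call to the internal module (internal API, subject to change, for illustration only):

    from io import BytesIO
    import _tokenize as c_tokenizer

    reader = BytesIO(b"pass\n")
    # each item is the raw tuple that tokenize.TokenInfo._make() wraps
    for info in c_tokenizer.TokenizerIter(reader.readline, extra_tokens=False, encoding="utf-8"):
        print(info)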
