Commit 9089ece

Blech
1 parent b3bae11 commit 9089ece

File tree: 3 files changed (+100, -55)

Lib/test/test_tokenize.py (+45, -40)
@@ -2669,43 +2669,44 @@ def test_unicode(self):
 
     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2716,20 +2717,24 @@ def generate_source(indents):
             return source
 
         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
 
    def test_continuation_lines_indentation(self):
        def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]
 
        code = dedent("""
            def fib(n):
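
For reference, the pattern the updated tests rely on: the private _generate_tokens_from_c_tokenizer helper now takes a zero-argument readline callable instead of a source string. A minimal sketch of that call pattern, mirroring the tests above (the helper is internal to Lib/tokenize.py, so this is illustrative rather than a supported API):

    from io import StringIO
    from tokenize import _generate_tokens_from_c_tokenizer

    # Any zero-argument callable returning one str line per call works;
    # StringIO(source).readline is what the tests use.
    source = "x = 1\n"
    for token in _generate_tokens_from_c_tokenizer(StringIO(source).readline):
        print(token)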

Parser/tokenizer.c (+54, -15)
@@ -103,6 +103,7 @@ tok_new(void)
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
+    tok->readline = NULL;
     tok->type_comments = 0;
     tok->async_hacks = 0;
     tok->async_def = 0;
@@ -542,13 +543,7 @@ tok_readline_recode(struct tok_state *tok) {
     if (line == NULL) {
         line = PyObject_CallNoArgs(tok->decoding_readline);
         if (line == NULL) {
-            if (!PyErr_ExceptionMatches(PyExc_StopIteration)) {
-                error_ret(tok);
-            } else {
-                PyErr_Clear();
-                tok->inp = tok->cur;
-                tok->done = E_EOF;
-            }
+            error_ret(tok);
             goto error;
         }
     }
@@ -575,7 +570,6 @@ tok_readline_recode(struct tok_state *tok) {
         goto error;
     }
     Py_DECREF(line);
-exit:
     return 1;
 error:
     Py_XDECREF(line);
@@ -924,13 +918,11 @@ _PyTokenizer_FromUTF8Readline(PyObject* readline, int exec_input, int preserve_c
     tok->enc = NULL;
     tok->encoding = new_string("utf-8", 5, tok);
     Py_INCREF(readline);
-    tok->decoding_readline = readline;
+    tok->readline = readline;
     tok->decoding_state = STATE_NORMAL;
     return tok;
 }
 
-
-
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
@@ -1000,6 +992,7 @@ _PyTokenizer_Free(struct tok_state *tok)
     }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
     Py_XDECREF(tok->filename);
     if (tok->fp != NULL && tok->buf != NULL) {
         PyMem_Free(tok->buf);
@@ -1052,6 +1045,47 @@ tok_readline_raw(struct tok_state *tok)
     return 1;
 }
 
+static int
+tok_readline_string(struct tok_state* tok) {
+    PyObject* line = PyObject_CallNoArgs(tok->readline);
+    if (line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);
+        goto error;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);
+    return 1;
+error:
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
@@ -1167,7 +1201,7 @@ tok_underflow_interactive(struct tok_state *tok) {
 }
 
 static int
-tok_underflow_file(struct tok_state *tok) {
+tok_underflow_file(struct tok_state *tok, int use_readline) {
     if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
         tok->cur = tok->inp = tok->buf;
     }
@@ -1188,6 +1222,11 @@ tok_underflow_file(struct tok_state *tok) {
             return 0;
         }
     }
+    else if(use_readline) {
+        if (!tok_readline_string(tok)) {
+            return 0;
+        }
+    }
     else {
         /* We want a 'raw' read. */
         if (!tok_readline_raw(tok)) {
@@ -1269,8 +1308,8 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->decoding_readline != NULL) {
-            rc = tok_underflow_file(tok);
+        if (tok->readline) {
+            rc = tok_underflow_file(tok, 1);
         }
         else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
@@ -1279,7 +1318,7 @@ tok_nextc(struct tok_state *tok)
             rc = tok_underflow_interactive(tok);
         }
         else {
-            rc = tok_underflow_file(tok);
+            rc = tok_underflow_file(tok, 0);
         }
 #if defined(Py_DEBUG)
         if (tok->debug) {
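
The new tok_readline_string defines the EOF contract for this path: the callable is invoked with no arguments via PyObject_CallNoArgs, a raised StopIteration is cleared and treated as end of input, and a call that adds no bytes to the buffer is then reported as end of file by the surrounding underflow logic. A sketch of a non-file source exercising the StopIteration path (again via the private helper from the tests, so treat it as illustrative):

    from functools import partial
    from tokenize import _generate_tokens_from_c_tokenizer

    lines = iter(["a = 1\n", "b = 2\n"])
    # partial(next, lines) is a zero-argument callable; once the iterator
    # is exhausted it raises StopIteration, which the C tokenizer now
    # clears and treats as EOF rather than surfacing an error.
    tokens = list(_generate_tokens_from_c_tokenizer(partial(next, lines)))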

Parser/tokenizer.h (+1, -0)
@@ -109,6 +109,7 @@ struct tok_state {
                                  expression (cf. issue 16806) */
     PyObject *decoding_readline; /* open(...).readline */
     PyObject *decoding_buffer;
+    PyObject *readline;
     const char* enc;        /* Encoding for the current str. */
     char* str;              /* Source string being tokenized (if tokenizing from a string)*/
     char* input;            /* Tokenizer's newline translated copy of the string. */
