
Commit 8ca2957

[3.12] gh-104972: Ensure that line attributes in tokens in the tokenize module are correct (GH-104975) (#104982)

gh-104972: Ensure that line attributes in tokens in the tokenize module are correct (GH-104975)
(cherry picked from commit 3fdb55c)
Co-authored-by: Pablo Galindo Salgado <[email protected]>

1 parent 01af2b0; commit 8ca2957

File tree: 4 files changed (+21, -9)
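In practical terms, the fix makes the ``line`` attribute of every ``tokenize.TokenInfo`` carry the exact physical source line, trailing newline included; the old C code stripped the newline and always decoded from the start of the buffer. A minimal sketch of the new behavior (not part of the commit):

import io
import tokenize

# After this fix, a token's ``line`` keeps its trailing newline, matching
# the updated expectations in the IDLE and tokenize tests below.
toks = list(tokenize.generate_tokens(io.StringIO("if 1:\n").readline))
print(repr(toks[0].line))  # 'if 1:\n' (the '\n' was previously stripped)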

Lib/idlelib/idle_test/test_editor.py (+2, -2)

@@ -201,8 +201,8 @@ def test_searcher(self):
         test_info = (# text, (block, indent))
                      ("", (None, None)),
                      ("[1,", (None, None)),  # TokenError
-                     ("if 1:\n", ('if 1:', None)),
-                     ("if 1:\n 2\n 3\n", ('if 1:', ' 2')),
+                     ("if 1:\n", ('if 1:\n', None)),
+                     ("if 1:\n 2\n 3\n", ('if 1:\n', ' 2\n')),
                      )
         for code, expected_pair in test_info:
             with self.subTest(code=code):

Lib/test/test_tokenize.py (+13, -2)

@@ -1174,7 +1174,7 @@ def readline():

         # skip the initial encoding token and the end tokens
         tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")

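The same expectation holds through the public API; a small sketch using tokenize.tokenize instead of the private _tokenize helper:

import io
import tokenize

# The STRING token's ``line`` now ends with '\n', as the updated
# expected_tokens above require.
code = '"ЉЊЈЁЂ"\n'.encode('utf-8')
string_tok = list(tokenize.tokenize(io.BytesIO(code).readline))[1]
print(string_tok.string)      # '"ЉЊЈЁЂ"'
print(repr(string_tok.line))  # '"ЉЊЈЁЂ"\n'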
@@ -1657,7 +1657,6 @@ def check_roundtrip(self, f):
             code = f.encode('utf-8')
         else:
             code = f.read()
-            f.close()
         readline = iter(code.splitlines(keepends=True)).__next__
         tokens5 = list(tokenize(readline))
         tokens2 = [tok[:2] for tok in tokens5]
@@ -1672,6 +1671,17 @@ def check_roundtrip(self, f):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)

+    def check_line_extraction(self, f):
+        if isinstance(f, str):
+            code = f.encode('utf-8')
+        else:
+            code = f.read()
+        readline = iter(code.splitlines(keepends=True)).__next__
+        for tok in tokenize(readline):
+            if tok.type in {ENCODING, ENDMARKER}:
+                continue
+            self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])
+
     def test_roundtrip(self):
         # There are some standard formatting practices that are easy to get right.

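The new check_line_extraction helper pins down the invariant this commit establishes: for every token except ENCODING and ENDMARKER, the token's text can be sliced back out of its ``line`` using the column offsets. A self-contained sketch of the same check on a hypothetical input:

import io
import tokenize

source = b"if 1:\n    pass\n"  # hypothetical input
readline = iter(source.splitlines(keepends=True)).__next__
for tok in tokenize.tokenize(readline):
    if tok.type in {tokenize.ENCODING, tokenize.ENDMARKER}:
        continue
    # Each token's text is exactly the [start col, end col) slice of its line.
    assert tok.string == tok.line[tok.start[1]:tok.end[1]]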
@@ -1768,6 +1778,7 @@ def test_random_files(self):
             with open(testfile, 'rb') as f:
                 # with self.subTest(file=testfile):
                     self.check_roundtrip(f)
+                    self.check_line_extraction(f)


     def roundtrip(self, code):
Misc/NEWS.d/… (new file: +2, -0)

@@ -0,0 +1,2 @@
+Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in
+the :mod:`tokenize` module is always correct. Patch by Pablo Galindo.

Python/Python-tokenize.c (+4, -5)

@@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }

-    Py_ssize_t size = it->tok->inp - it->tok->buf;
-    assert(it->tok->buf[size-1] == '\n');
-    size -= 1; // Remove the newline character from the end of the line
-    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+    Py_ssize_t size = it->tok->inp - line_start;
+    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
     }
-    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
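The key change is where the reported line begins: for string literals it now starts at multi_line_start (paired with first_lineno as the start line), so a triple-quoted string reports every physical line it spans; all other tokens use line_start. A sketch of the observable result, assuming the 3.12 C tokenizer backs the tokenize module:

import io
import tokenize

src = 's = """first\nsecond"""\n'
toks = list(tokenize.generate_tokens(io.StringIO(src).readline))
string_tok = next(t for t in toks if t.type == tokenize.STRING)
print(repr(string_tok.line))  # 's = """first\nsecond"""\n', both physical lines
print(string_tok.start)       # (1, 4): the string's first line and column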
