Skip to content

Commit ffafa9b

Browse files
gh-96268: Fix loading invalid UTF-8 (GH-96270)
This makes tokenizer.c:valid_utf8 match stringlib/codecs.h:utf8_decode. It also fixes an off-by-one error, introduced in 3.10, in the line number that the tokenizer reports for invalid UTF-8. (cherry picked from commit 8bc356a) Co-authored-by: Michael Droettboom <[email protected]>
1 parent 9fa21d0 commit ffafa9b

File tree

3 files changed

+57
-16
lines changed

3 files changed

+57
-16
lines changed

Lib/test/test_source_encoding.py

+10-3
Original file line number | Diff line number | Diff line change
@@ -248,16 +248,23 @@ def test_invalid_utf8(self):
248248
# test it is to write actual files to disk.
249249

250250
# Each example is put inside a string at the top of the file so
251-
# it's an otherwise valid Python source file.
252-
template = b'"%s"\n'
251+
# it's an otherwise valid Python source file. Put some newlines
252+
# beforehand so we can assert that the error is reported on the
253+
# correct line.
254+
template = b'\n\n\n"%s"\n'
253255

254256
fn = TESTFN
255257
self.addCleanup(unlink, fn)
256258

257259
def check(content):
258260
with open(fn, 'wb') as fp:
259261
fp.write(template % content)
260-
script_helper.assert_python_failure(fn)
262+
rc, stdout, stderr = script_helper.assert_python_failure(fn)
263+
# We want to assert that the python subprocess failed gracefully,
264+
# not via a signal.
265+
self.assertGreaterEqual(rc, 1)
266+
self.assertIn(b"Non-UTF-8 code starting with", stderr)
267+
self.assertIn(b"on line 4", stderr)
261268

262269
# continuation bytes in a sequence of 2, 3, or 4 bytes
263270
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Loading a file with invalid UTF-8 will now report the broken character at
2+
the correct location.

Parser/tokenizer.c

+45-13
Original file line number | Diff line number | Diff line change
@@ -486,25 +486,59 @@ static void fp_ungetc(int c, struct tok_state *tok) {
486486

487487
/* Check whether the characters at s start a valid
488488
UTF-8 sequence. Return the number of bytes forming
489-
the sequence if yes, 0 if not. */
490-
static int valid_utf8(const unsigned char* s)
489+
the sequence if yes, 0 if not. The special cases match
490+
those in stringlib/codecs.h:utf8_decode.
491+
*/
492+
static int
493+
valid_utf8(const unsigned char* s)
491494
{
492495
int expected = 0;
493496
int length;
494-
if (*s < 0x80)
497+
if (*s < 0x80) {
495498
/* single-byte code */
496499
return 1;
497-
if (*s < 0xc0)
498-
/* following byte */
499-
return 0;
500-
if (*s < 0xE0)
500+
}
501+
else if (*s < 0xE0) {
502+
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
503+
if (*s < 0xC2) {
504+
/* invalid sequence
505+
\x80-\xBF -- continuation byte
506+
\xC0-\xC1 -- fake 0000-007F */
507+
return 0;
508+
}
501509
expected = 1;
502-
else if (*s < 0xF0)
510+
}
511+
else if (*s < 0xF0) {
512+
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
513+
if (*s == 0xE0 && *(s + 1) < 0xA0) {
514+
/* invalid sequence
515+
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
516+
return 0;
517+
}
518+
else if (*s == 0xED && *(s + 1) >= 0xA0) {
519+
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
520+
will result in surrogates in range D800-DFFF. Surrogates are
521+
not valid UTF-8 so they are rejected.
522+
See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
523+
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
524+
return 0;
525+
}
503526
expected = 2;
504-
else if (*s < 0xF8)
527+
}
528+
else if (*s < 0xF5) {
529+
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
530+
if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
531+
/* invalid sequence -- one of:
532+
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
533+
\xF4\x90\x80\x80- -- 110000- overflow */
534+
return 0;
535+
}
505536
expected = 3;
506-
else
537+
}
538+
else {
539+
/* invalid start byte */
507540
return 0;
541+
}
508542
length = expected + 1;
509543
for (; expected; expected--)
510544
if (s[expected] < 0x80 || s[expected] >= 0xC0)
@@ -525,14 +559,12 @@ ensure_utf8(char *line, struct tok_state *tok)
525559
}
526560
}
527561
if (badchar) {
528-
/* Need to add 1 to the line number, since this line
529-
has not been counted, yet. */
530562
PyErr_Format(PyExc_SyntaxError,
531563
"Non-UTF-8 code starting with '\\x%.2x' "
532564
"in file %U on line %i, "
533565
"but no encoding declared; "
534566
"see https://peps.python.org/pep-0263/ for details",
535-
badchar, tok->filename, tok->lineno + 1);
567+
badchar, tok->filename, tok->lineno);
536568
return 0;
537569
}
538570
return 1;

0 commit comments

Comments
 (0)