@@ -486,25 +486,59 @@ static void fp_ungetc(int c, struct tok_state *tok) {
486
486
487
487
/* Check whether the characters at s start a valid
488
488
UTF-8 sequence. Return the number of bytes forming
489
- the sequence if yes, 0 if not. */
490
- static int valid_utf8 (const unsigned char * s )
489
+ the sequence if yes, 0 if not. The special cases match
490
+ those in stringlib/codecs.h:utf8_decode.
491
+ */
492
+ static int
493
+ valid_utf8 (const unsigned char * s )
491
494
{
492
495
int expected = 0 ;
493
496
int length ;
494
- if (* s < 0x80 )
497
+ if (* s < 0x80 ) {
495
498
/* single-byte code */
496
499
return 1 ;
497
- if (* s < 0xc0 )
498
- /* following byte */
499
- return 0 ;
500
- if (* s < 0xE0 )
500
+ }
501
+ else if (* s < 0xE0 ) {
502
+ /* \xC2\x80-\xDF\xBF -- 0080-07FF */
503
+ if (* s < 0xC2 ) {
504
+ /* invalid sequence
505
+ \x80-\xBF -- continuation byte
506
+ \xC0-\xC1 -- fake 0000-007F */
507
+ return 0 ;
508
+ }
501
509
expected = 1 ;
502
- else if (* s < 0xF0 )
510
+ }
511
+ else if (* s < 0xF0 ) {
512
+ /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
513
+ if (* s == 0xE0 && * (s + 1 ) < 0xA0 ) {
514
+ /* invalid sequence
515
+ \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
516
+ return 0 ;
517
+ }
518
+ else if (* s == 0xED && * (s + 1 ) >= 0xA0 ) {
519
+ /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
520
+ will result in surrogates in range D800-DFFF. Surrogates are
521
+ not valid UTF-8 so they are rejected.
522
+ See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
523
+ (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
524
+ return 0 ;
525
+ }
503
526
expected = 2 ;
504
- else if (* s < 0xF8 )
527
+ }
528
+ else if (* s < 0xF5 ) {
529
+ /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
530
+ if (* (s + 1 ) < 0x90 ? * s == 0xF0 : * s == 0xF4 ) {
531
+ /* invalid sequence -- one of:
532
+ \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
533
+ \xF4\x90\x80\x80- -- 110000- overflow */
534
+ return 0 ;
535
+ }
505
536
expected = 3 ;
506
- else
537
+ }
538
+ else {
539
+ /* invalid start byte */
507
540
return 0 ;
541
+ }
508
542
length = expected + 1 ;
509
543
for (; expected ; expected -- )
510
544
if (s [expected ] < 0x80 || s [expected ] >= 0xC0 )
@@ -525,14 +559,12 @@ ensure_utf8(char *line, struct tok_state *tok)
525
559
}
526
560
}
527
561
if (badchar ) {
528
- /* Need to add 1 to the line number, since this line
529
- has not been counted, yet. */
530
562
PyErr_Format (PyExc_SyntaxError ,
531
563
"Non-UTF-8 code starting with '\\x%.2x' "
532
564
"in file %U on line %i, "
533
565
"but no encoding declared; "
534
566
"see https://peps.python.org/pep-0263/ for details" ,
535
- badchar , tok -> filename , tok -> lineno + 1 );
567
+ badchar , tok -> filename , tok -> lineno );
536
568
return 0 ;
537
569
}
538
570
return 1 ;
0 commit comments