@@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
898
898
validate : bool ,
899
899
scratch : & mut Vec < u8 > ,
900
900
) -> Result < ( ) > {
901
- let c = match tri ! ( read. decode_hex_escape( ) ) {
902
- n @ 0xDC00 ..=0xDFFF => {
903
- return if validate {
904
- error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape )
905
- } else {
906
- push_wtf8_codepoint ( n as u32 , scratch) ;
907
- Ok ( ( ) )
908
- } ;
909
- }
901
+ let n = tri ! ( read. decode_hex_escape( ) ) ;
910
902
911
- // Non-BMP characters are encoded as a sequence of two hex
912
- // escapes, representing UTF-16 surrogates. If deserializing a
913
- // utf-8 string the surrogates are required to be paired,
914
- // whereas deserializing a byte string accepts lone surrogates.
915
- n1 @ 0xD800 ..=0xDBFF => {
916
- if tri ! ( peek_or_eof( read) ) == b'\\' {
917
- read. discard ( ) ;
918
- } else {
919
- return if validate {
920
- read. discard ( ) ;
921
- error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
922
- } else {
923
- push_wtf8_codepoint ( n1 as u32 , scratch) ;
924
- Ok ( ( ) )
925
- } ;
926
- }
903
+ // Non-BMP characters are encoded as a sequence of two hex
904
+ // escapes, representing UTF-16 surrogates. If deserializing a
905
+ // utf-8 string the surrogates are required to be paired,
906
+ // whereas deserializing a byte string accepts lone surrogates.
907
+ if validate && n >= 0xDC00 && n <= 0xDFFF {
908
+ // XXX: This is actually a trailing surrogate.
909
+ return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
910
+ }
927
911
928
- if tri ! ( peek_or_eof( read) ) == b'u' {
929
- read. discard ( ) ;
930
- } else {
931
- return if validate {
932
- read. discard ( ) ;
933
- error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
934
- } else {
935
- push_wtf8_codepoint ( n1 as u32 , scratch) ;
936
- // The \ prior to this byte started an escape sequence,
937
- // so we need to parse that now. This recursive call
938
- // does not blow the stack on malicious input because
939
- // the escape is not \u, so it will be handled by one
940
- // of the easy nonrecursive cases.
941
- parse_escape ( read, validate, scratch)
942
- } ;
943
- }
912
+ if n < 0xD800 || n > 0xDBFF {
913
+ // Every u16 outside of the surrogate ranges is guaranteed to be a
914
+ // legal char.
915
+ push_wtf8_codepoint ( n as u32 , scratch) ;
916
+ return Ok ( ( ) ) ;
917
+ }
944
918
945
- let n2 = tri ! ( read. decode_hex_escape( ) ) ;
919
+ // n is a leading surrogate, we now expect a trailing surrogate.
920
+ let n1 = n;
946
921
947
- if n2 < 0xDC00 || n2 > 0xDFFF {
948
- return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
949
- }
922
+ if tri ! ( peek_or_eof( read) ) == b'\\' {
923
+ read. discard ( ) ;
924
+ } else {
925
+ return if validate {
926
+ read. discard ( ) ;
927
+ error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
928
+ } else {
929
+ push_wtf8_codepoint ( n1 as u32 , scratch) ;
930
+ Ok ( ( ) )
931
+ } ;
932
+ }
950
933
951
- // This value is in range U+10000..=U+10FFFF, which is always a
952
- // valid codepoint.
953
- ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000
954
- }
934
+ if tri ! ( peek_or_eof( read) ) == b'u' {
935
+ read. discard ( ) ;
936
+ } else {
937
+ return if validate {
938
+ read. discard ( ) ;
939
+ error ( read, ErrorCode :: UnexpectedEndOfHexEscape )
940
+ } else {
941
+ push_wtf8_codepoint ( n1 as u32 , scratch) ;
942
+ // The \ prior to this byte started an escape sequence,
943
+ // so we need to parse that now. This recursive call
944
+ // does not blow the stack on malicious input because
945
+ // the escape is not \u, so it will be handled by one
946
+ // of the easy nonrecursive cases.
947
+ parse_escape ( read, validate, scratch)
948
+ } ;
949
+ }
955
950
956
- // Every u16 outside of the surrogate ranges above is guaranteed
957
- // to be a legal char.
958
- n => n as u32 ,
959
- } ;
951
+ let n2 = tri ! ( read. decode_hex_escape( ) ) ;
952
+
953
+ if n2 < 0xDC00 || n2 > 0xDFFF {
954
+ return error ( read, ErrorCode :: LoneLeadingSurrogateInHexEscape ) ;
955
+ }
960
956
961
- push_wtf8_codepoint ( c, scratch) ;
957
+ // This value is in range U+10000..=U+10FFFF, which is always a
958
+ // valid codepoint.
959
+ let n = ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000 ;
960
+ push_wtf8_codepoint ( n, scratch) ;
962
961
Ok ( ( ) )
963
962
}
964
963
0 commit comments