|
1 | 1 | use crate::error::{Error, ErrorCode, Result};
|
2 | 2 | use alloc::vec::Vec;
|
3 |
| -use core::char; |
4 | 3 | use core::cmp;
|
5 | 4 | use core::mem;
|
6 | 5 | use core::ops::Deref;
|
@@ -877,88 +876,133 @@ fn parse_escape<'de, R: Read<'de>>(
|
877 | 876 | b'n' => scratch.push(b'\n'),
|
878 | 877 | b'r' => scratch.push(b'\r'),
|
879 | 878 | b't' => scratch.push(b'\t'),
|
880 |
| - b'u' => { |
881 |
| - fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) { |
882 |
| - scratch.extend_from_slice(&[ |
883 |
| - (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, |
884 |
| - (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, |
885 |
| - (n & 0b0011_1111) as u8 | 0b1000_0000, |
886 |
| - ]); |
887 |
| - } |
| 879 | + b'u' => return parse_unicode_escape(read, validate, scratch), |
| 880 | + _ => { |
| 881 | + return error(read, ErrorCode::InvalidEscape); |
| 882 | + } |
| 883 | + } |
888 | 884 |
|
889 |
| - let c = match tri!(read.decode_hex_escape()) { |
890 |
| - n @ 0xDC00..=0xDFFF => { |
891 |
| - return if validate { |
892 |
| - error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) |
893 |
| - } else { |
894 |
| - encode_surrogate(scratch, n); |
895 |
| - Ok(()) |
896 |
| - }; |
897 |
| - } |
| 885 | + Ok(()) |
| 886 | +} |
898 | 887 |
|
899 |
| - // Non-BMP characters are encoded as a sequence of two hex |
900 |
| - // escapes, representing UTF-16 surrogates. If deserializing a |
901 |
| - // utf-8 string the surrogates are required to be paired, |
902 |
| - // whereas deserializing a byte string accepts lone surrogates. |
903 |
| - n1 @ 0xD800..=0xDBFF => { |
904 |
| - if tri!(peek_or_eof(read)) == b'\\' { |
905 |
| - read.discard(); |
906 |
| - } else { |
907 |
| - return if validate { |
908 |
| - read.discard(); |
909 |
| - error(read, ErrorCode::UnexpectedEndOfHexEscape) |
910 |
| - } else { |
911 |
| - encode_surrogate(scratch, n1); |
912 |
| - Ok(()) |
913 |
| - }; |
914 |
| - } |
| 888 | +/// Parses a JSON \u escape and appends it into the scratch space. Assumes \u |
| 889 | +/// has just been read. |
| 890 | +#[cold] |
| 891 | +fn parse_unicode_escape<'de, R: Read<'de>>( |
| 892 | + read: &mut R, |
| 893 | + validate: bool, |
| 894 | + scratch: &mut Vec<u8>, |
| 895 | +) -> Result<()> { |
| 896 | + let mut n = tri!(read.decode_hex_escape()); |
| 897 | + |
| 898 | + // Non-BMP characters are encoded as a sequence of two hex |
| 899 | + // escapes, representing UTF-16 surrogates. If deserializing a |
| 900 | + // utf-8 string the surrogates are required to be paired, |
| 901 | + // whereas deserializing a byte string accepts lone surrogates. |
| 902 | + if validate && n >= 0xDC00 && n <= 0xDFFF { |
| 903 | + // XXX: This is actually a trailing surrogate. |
| 904 | + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); |
| 905 | + } |
| 906 | + |
| 907 | + loop { |
| 908 | + if n < 0xD800 || n > 0xDBFF { |
| 909 | + // Every u16 outside of the surrogate ranges is guaranteed to be a |
| 910 | + // legal char. |
| 911 | + push_wtf8_codepoint(n as u32, scratch); |
| 912 | + return Ok(()); |
| 913 | + } |
915 | 914 |
|
916 |
| - if tri!(peek_or_eof(read)) == b'u' { |
917 |
| - read.discard(); |
918 |
| - } else { |
919 |
| - return if validate { |
920 |
| - read.discard(); |
921 |
| - error(read, ErrorCode::UnexpectedEndOfHexEscape) |
922 |
| - } else { |
923 |
| - encode_surrogate(scratch, n1); |
924 |
| - // The \ prior to this byte started an escape sequence, |
925 |
| - // so we need to parse that now. This recursive call |
926 |
| - // does not blow the stack on malicious input because |
927 |
| - // the escape is not \u, so it will be handled by one |
928 |
| - // of the easy nonrecursive cases. |
929 |
| - parse_escape(read, validate, scratch) |
930 |
| - }; |
931 |
| - } |
| 915 | + // n is a leading surrogate, we now expect a trailing surrogate. |
| 916 | + let n1 = n; |
932 | 917 |
|
933 |
| - let n2 = tri!(read.decode_hex_escape()); |
| 918 | + if tri!(peek_or_eof(read)) == b'\\' { |
| 919 | + read.discard(); |
| 920 | + } else { |
| 921 | + return if validate { |
| 922 | + read.discard(); |
| 923 | + error(read, ErrorCode::UnexpectedEndOfHexEscape) |
| 924 | + } else { |
| 925 | + push_wtf8_codepoint(n1 as u32, scratch); |
| 926 | + Ok(()) |
| 927 | + }; |
| 928 | + } |
934 | 929 |
|
935 |
| - if n2 < 0xDC00 || n2 > 0xDFFF { |
936 |
| - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); |
937 |
| - } |
| 930 | + if tri!(peek_or_eof(read)) == b'u' { |
| 931 | + read.discard(); |
| 932 | + } else { |
| 933 | + return if validate { |
| 934 | + read.discard(); |
| 935 | + error(read, ErrorCode::UnexpectedEndOfHexEscape) |
| 936 | + } else { |
| 937 | + push_wtf8_codepoint(n1 as u32, scratch); |
| 938 | + // The \ prior to this byte started an escape sequence, |
| 939 | + // so we need to parse that now. This recursive call |
| 940 | + // does not blow the stack on malicious input because |
| 941 | + // the escape is not \u, so it will be handled by one |
| 942 | + // of the easy nonrecursive cases. |
| 943 | + parse_escape(read, validate, scratch) |
| 944 | + }; |
| 945 | + } |
938 | 946 |
|
939 |
| - let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; |
| 947 | + let n2 = tri!(read.decode_hex_escape()); |
940 | 948 |
|
941 |
| - match char::from_u32(n) { |
942 |
| - Some(c) => c, |
943 |
| - None => { |
944 |
| - return error(read, ErrorCode::InvalidUnicodeCodePoint); |
945 |
| - } |
946 |
| - } |
947 |
| - } |
| 949 | + if n2 < 0xDC00 || n2 > 0xDFFF { |
| 950 | + if validate { |
| 951 | + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); |
| 952 | + } |
| 953 | + push_wtf8_codepoint(n1 as u32, scratch); |
| 954 | + // If n2 is a leading surrogate, we need to restart. |
| 955 | + n = n2; |
| 956 | + continue; |
| 957 | + } |
948 | 958 |
|
949 |
| - // Every u16 outside of the surrogate ranges above is guaranteed |
950 |
| - // to be a legal char. |
951 |
| - n => char::from_u32(n as u32).unwrap(), |
952 |
| - }; |
| 959 | + // This value is in range U+10000..=U+10FFFF, which is always a |
| 960 | + // valid codepoint. |
| 961 | + let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; |
| 962 | + push_wtf8_codepoint(n, scratch); |
| 963 | + return Ok(()); |
| 964 | + } |
| 965 | +} |
953 | 966 |
|
954 |
| - scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); |
955 |
| - } |
956 |
| - _ => { |
957 |
| - return error(read, ErrorCode::InvalidEscape); |
958 |
| - } |
| 967 | +/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient |
| 968 | +/// implementation of String::push. The codepoint may be a surrogate. |
| 969 | +#[inline] |
| 970 | +fn push_wtf8_codepoint(n: u32, scratch: &mut Vec<u8>) { |
| 971 | + if n < 0x80 { |
| 972 | + scratch.push(n as u8); |
| 973 | + return; |
959 | 974 | }
|
960 | 975 |
|
961 |
| - Ok(()) |
| 976 | + scratch.reserve(4); |
| 977 | + |
| 978 | + unsafe { |
| 979 | + let ptr = scratch.as_mut_ptr().add(scratch.len()); |
| 980 | + |
| 981 | + let encoded_len = match n { |
| 982 | + 0..=0x7F => unreachable!(), |
| 983 | + 0x80..=0x7FF => { |
| 984 | + ptr.write((n >> 6 & 0b0001_1111) as u8 | 0b1100_0000); |
| 985 | + 2 |
| 986 | + } |
| 987 | + 0x800..=0xFFFF => { |
| 988 | + ptr.write((n >> 12 & 0b0000_1111) as u8 | 0b1110_0000); |
| 989 | + ptr.add(1).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000); |
| 990 | + 3 |
| 991 | + } |
| 992 | + 0x1_0000..=0x10_FFFF => { |
| 993 | + ptr.write((n >> 18 & 0b0000_0111) as u8 | 0b1111_0000); |
| 994 | + ptr.add(1) |
| 995 | + .write((n >> 12 & 0b0011_1111) as u8 | 0b1000_0000); |
| 996 | + ptr.add(2).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000); |
| 997 | + 4 |
| 998 | + } |
| 999 | + 0x11_0000.. => unreachable!(), |
| 1000 | + }; |
| 1001 | + ptr.add(encoded_len - 1) |
| 1002 | + .write((n & 0b0011_1111) as u8 | 0b1000_0000); |
| 1003 | + |
| 1004 | + scratch.set_len(scratch.len() + encoded_len); |
| 1005 | + } |
962 | 1006 | }
|
963 | 1007 |
|
964 | 1008 | /// Parses a JSON escape sequence and discards the value. Assumes the previous
|
|
0 commit comments