Skip to content

Commit 0f942e5

Browse files
committed
Merge pull request 1175 from iex-rs/faster-backslash-u
3 parents d8921cd + 96ae604 + f50e296 commit 0f942e5

File tree

3 files changed

+170
-81
lines changed

3 files changed

+170
-81
lines changed

src/de.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
15751575
///
15761576
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
15771577
/// when deserializing into Rust UTF-8 string types such as String, and
1578-
/// succeed with non-UTF-8 bytes when deserializing using this method.
1578+
/// succeed with the bytes representing the [WTF-8] encoding of code points
1579+
/// when deserializing using this method.
1580+
///
1581+
/// [WTF-8]: https://simonsapin.github.io/wtf-8
15791582
///
15801583
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
15811584
/// still checked if the hex number represents a valid Unicode code point.

src/read.rs

+116-72
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use crate::error::{Error, ErrorCode, Result};
22
use alloc::vec::Vec;
3-
use core::char;
43
use core::cmp;
54
use core::mem;
65
use core::ops::Deref;
@@ -877,88 +876,133 @@ fn parse_escape<'de, R: Read<'de>>(
877876
b'n' => scratch.push(b'\n'),
878877
b'r' => scratch.push(b'\r'),
879878
b't' => scratch.push(b'\t'),
880-
b'u' => {
881-
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
882-
scratch.extend_from_slice(&[
883-
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
884-
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
885-
(n & 0b0011_1111) as u8 | 0b1000_0000,
886-
]);
887-
}
879+
b'u' => return parse_unicode_escape(read, validate, scratch),
880+
_ => {
881+
return error(read, ErrorCode::InvalidEscape);
882+
}
883+
}
888884

889-
let c = match tri!(read.decode_hex_escape()) {
890-
n @ 0xDC00..=0xDFFF => {
891-
return if validate {
892-
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
893-
} else {
894-
encode_surrogate(scratch, n);
895-
Ok(())
896-
};
897-
}
885+
Ok(())
886+
}
898887

899-
// Non-BMP characters are encoded as a sequence of two hex
900-
// escapes, representing UTF-16 surrogates. If deserializing a
901-
// utf-8 string the surrogates are required to be paired,
902-
// whereas deserializing a byte string accepts lone surrogates.
903-
n1 @ 0xD800..=0xDBFF => {
904-
if tri!(peek_or_eof(read)) == b'\\' {
905-
read.discard();
906-
} else {
907-
return if validate {
908-
read.discard();
909-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
910-
} else {
911-
encode_surrogate(scratch, n1);
912-
Ok(())
913-
};
914-
}
888+
/// Parses a JSON `\u` escape and appends its encoding to the scratch space.
889+
/// Assumes `\u` has just been read.
890+
#[cold]
891+
fn parse_unicode_escape<'de, R: Read<'de>>(
892+
read: &mut R,
893+
validate: bool,
894+
scratch: &mut Vec<u8>,
895+
) -> Result<()> {
896+
let mut n = tri!(read.decode_hex_escape());
897+
898+
// Non-BMP characters are encoded as a sequence of two hex
899+
// escapes, representing UTF-16 surrogates. If deserializing a
900+
// utf-8 string the surrogates are required to be paired,
901+
// whereas deserializing a byte string accepts lone surrogates.
902+
if validate && n >= 0xDC00 && n <= 0xDFFF {
903+
// XXX: n is actually a lone trailing surrogate here; the error code name is a misnomer.
904+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
905+
}
906+
907+
loop {
908+
if n < 0xD800 || n > 0xDBFF {
909+
// n is either a non-surrogate u16, which is always a legal char, or (only
910+
// when not validating) a lone trailing surrogate; both are pushed as WTF-8.
911+
push_wtf8_codepoint(n as u32, scratch);
912+
return Ok(());
913+
}
915914

916-
if tri!(peek_or_eof(read)) == b'u' {
917-
read.discard();
918-
} else {
919-
return if validate {
920-
read.discard();
921-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
922-
} else {
923-
encode_surrogate(scratch, n1);
924-
// The \ prior to this byte started an escape sequence,
925-
// so we need to parse that now. This recursive call
926-
// does not blow the stack on malicious input because
927-
// the escape is not \u, so it will be handled by one
928-
// of the easy nonrecursive cases.
929-
parse_escape(read, validate, scratch)
930-
};
931-
}
915+
// n is a leading surrogate, we now expect a trailing surrogate.
916+
let n1 = n;
932917

933-
let n2 = tri!(read.decode_hex_escape());
918+
if tri!(peek_or_eof(read)) == b'\\' {
919+
read.discard();
920+
} else {
921+
return if validate {
922+
read.discard();
923+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
924+
} else {
925+
push_wtf8_codepoint(n1 as u32, scratch);
926+
Ok(())
927+
};
928+
}
934929

935-
if n2 < 0xDC00 || n2 > 0xDFFF {
936-
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
937-
}
930+
if tri!(peek_or_eof(read)) == b'u' {
931+
read.discard();
932+
} else {
933+
return if validate {
934+
read.discard();
935+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
936+
} else {
937+
push_wtf8_codepoint(n1 as u32, scratch);
938+
// The \ prior to this byte started an escape sequence,
939+
// so we need to parse that now. This recursive call
940+
// does not blow the stack on malicious input because
941+
// the escape is not \u, so it will be handled by one
942+
// of the easy nonrecursive cases.
943+
parse_escape(read, validate, scratch)
944+
};
945+
}
938946

939-
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
947+
let n2 = tri!(read.decode_hex_escape());
940948

941-
match char::from_u32(n) {
942-
Some(c) => c,
943-
None => {
944-
return error(read, ErrorCode::InvalidUnicodeCodePoint);
945-
}
946-
}
947-
}
949+
if n2 < 0xDC00 || n2 > 0xDFFF {
950+
if validate {
951+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
952+
}
953+
push_wtf8_codepoint(n1 as u32, scratch);
954+
// n1 was emitted unpaired; restart with n2, which may itself be a leading surrogate.
955+
n = n2;
956+
continue;
957+
}
948958

949-
// Every u16 outside of the surrogate ranges above is guaranteed
950-
// to be a legal char.
951-
n => char::from_u32(n as u32).unwrap(),
952-
};
959+
// This value is in range U+10000..=U+10FFFF, which is always a
960+
// valid codepoint.
961+
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
962+
push_wtf8_codepoint(n, scratch);
963+
return Ok(());
964+
}
965+
}
953966

954-
scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
955-
}
956-
_ => {
957-
return error(read, ErrorCode::InvalidEscape);
958-
}
967+
/// Appends the WTF-8 encoding of code point `n` to the end of the buffer; a
968+
/// faster String::push that also accepts surrogates. `n` must be <= 0x10FFFF.
969+
#[inline]
970+
fn push_wtf8_codepoint(n: u32, scratch: &mut Vec<u8>) {
971+
if n < 0x80 {
972+
scratch.push(n as u8);
973+
return;
959974
}
960975

961-
Ok(())
976+
scratch.reserve(4);
977+
978+
unsafe {
979+
let ptr = scratch.as_mut_ptr().add(scratch.len());
980+
981+
let encoded_len = match n {
982+
0..=0x7F => unreachable!(),
983+
0x80..=0x7FF => {
984+
ptr.write((n >> 6 & 0b0001_1111) as u8 | 0b1100_0000);
985+
2
986+
}
987+
0x800..=0xFFFF => {
988+
ptr.write((n >> 12 & 0b0000_1111) as u8 | 0b1110_0000);
989+
ptr.add(1).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000);
990+
3
991+
}
992+
0x1_0000..=0x10_FFFF => {
993+
ptr.write((n >> 18 & 0b0000_0111) as u8 | 0b1111_0000);
994+
ptr.add(1)
995+
.write((n >> 12 & 0b0011_1111) as u8 | 0b1000_0000);
996+
ptr.add(2).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000);
997+
4
998+
}
999+
0x11_0000.. => unreachable!(),
1000+
};
1001+
ptr.add(encoded_len - 1)
1002+
.write((n & 0b0011_1111) as u8 | 0b1000_0000);
1003+
1004+
scratch.set_len(scratch.len() + encoded_len);
1005+
}
9621006
}
9631007

9641008
/// Parses a JSON escape sequence and discards the value. Assumes the previous

tests/test.rs

+50-8
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,7 @@ fn test_byte_buf_de() {
17071707
}
17081708

17091709
#[test]
1710-
fn test_byte_buf_de_lone_surrogate() {
1710+
fn test_byte_buf_de_invalid_surrogates() {
17111711
let bytes = ByteBuf::from(vec![237, 160, 188]);
17121712
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
17131713
assert_eq!(v, bytes);
@@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() {
17201720
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
17211721
assert_eq!(v, bytes);
17221722

1723-
let bytes = ByteBuf::from(vec![237, 176, 129]);
1724-
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1725-
assert_eq!(v, bytes);
1726-
17271723
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
17281724
assert!(res.is_err());
17291725

17301726
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
17311727
assert!(res.is_err());
17321728

1733-
let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
1734-
assert!(res.is_err());
1729+
// lone trailing surrogate
1730+
let bytes = ByteBuf::from(vec![237, 176, 129]);
1731+
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1732+
assert_eq!(v, bytes);
1733+
1734+
// leading surrogate followed by other leading surrogate
1735+
let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
1736+
let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
1737+
assert_eq!(v, bytes);
1738+
1739+
// leading surrogate followed by "a" (U+0061) in \u encoding
1740+
let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
1741+
let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
1742+
assert_eq!(v, bytes);
1743+
1744+
// leading surrogate followed by U+0080
1745+
let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
1746+
let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
1747+
assert_eq!(v, bytes);
1748+
1749+
// leading surrogate followed by U+FFFF
1750+
let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
1751+
let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
1752+
assert_eq!(v, bytes);
1753+
}
1754+
1755+
#[test]
1756+
fn test_byte_buf_de_surrogate_pair() {
1757+
// leading surrogate followed by trailing surrogate
1758+
let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
1759+
let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
1760+
assert_eq!(v, bytes);
1761+
1762+
// leading surrogate followed by a surrogate pair
1763+
let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]);
1764+
let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap();
1765+
assert_eq!(v, bytes);
17351766
}
17361767

17371768
#[cfg(feature = "raw_value")]
17381769
#[test]
1739-
fn test_raw_de_lone_surrogate() {
1770+
fn test_raw_de_invalid_surrogates() {
17401771
use serde_json::value::RawValue;
17411772

17421773
assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() {
17461777
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
17471778
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
17481779
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
1780+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
1781+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
1782+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
1783+
}
1784+
1785+
#[cfg(feature = "raw_value")]
1786+
#[test]
1787+
fn test_raw_de_surrogate_pair() {
1788+
use serde_json::value::RawValue;
1789+
1790+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
17491791
}
17501792

17511793
#[test]

0 commit comments

Comments
 (0)