Skip to content

Commit f50e296

Browse files
committed
Deserialize invalid UTF-8 into byte bufs as WTF-8
Previously, #828 added support for deserializing lone leading and trailing surrogates into WTF-8 encoded bytes when deserializing a string as bytes. This commit extends that support to cover the case of a leading surrogate followed by code units that are not trailing surrogates. This allows for deserialization of "\ud83c\ud83c" (two leading surrogates), or "\ud83c\u0061" (a leading surrogate followed by "a"). The docs also now make it clear that we are encoding the invalid code points as WTF-8 when deserializing into bytes. This reference to WTF-8 signals to the user that they can use a WTF-8 parser on the bytes to construct a valid UTF-8 string.
1 parent b899429 commit f50e296

File tree

3 files changed

+80
-20
lines changed

3 files changed

+80
-20
lines changed

src/de.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
15701570
///
15711571
/// The behavior of serde_json is specified to fail on non-UTF-8 strings
15721572
/// when deserializing into Rust UTF-8 string types such as String, and
1573-
/// succeed with non-UTF-8 bytes when deserializing using this method.
1573+
/// succeed with the bytes representing the [WTF-8] encoding of code points
1574+
/// when deserializing using this method.
1575+
///
1576+
/// [WTF-8]: https://simonsapin.github.io/wtf-8
15741577
///
15751578
/// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
15761579
/// still checked if the hex number represents a valid Unicode code point.

src/read.rs

+30-11
Original file line numberDiff line numberDiff line change
@@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>(
861861
b'r' => scratch.push(b'\r'),
862862
b't' => scratch.push(b'\t'),
863863
b'u' => {
864-
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
865-
scratch.extend_from_slice(&[
866-
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
867-
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
868-
(n & 0b0011_1111) as u8 | 0b1000_0000,
869-
]);
864+
fn encode_wtf8(scratch: &mut Vec<u8>, cp: u16) {
865+
match cp {
866+
0x0000..=0x007F => {
867+
scratch.extend_from_slice(&[cp as u8]);
868+
}
869+
0x0080..=0x07FF => {
870+
scratch
871+
.extend_from_slice(&[0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8]);
872+
}
873+
0x0800..=0xFFFF => {
874+
scratch.extend_from_slice(&[
875+
0xE0 | (cp >> 12) as u8,
876+
0x80 | ((cp >> 6) & 0x3F) as u8,
877+
0x80 | (cp & 0x3F) as u8,
878+
]);
879+
}
880+
}
870881
}
871882

872883
let c = match tri!(read.decode_hex_escape()) {
873884
n @ 0xDC00..=0xDFFF => {
874885
return if validate {
886+
// TODO: the error message is wrong, this is a lone
887+
// _trailing_ surrogate
875888
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
876889
} else {
877-
encode_surrogate(scratch, n);
890+
encode_wtf8(scratch, n);
878891
Ok(())
879892
};
880893
}
@@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>(
889902
} else {
890903
return if validate {
891904
read.discard();
892-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
905+
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
893906
} else {
894-
encode_surrogate(scratch, n1);
907+
encode_wtf8(scratch, n1);
895908
Ok(())
896909
};
897910
}
@@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>(
903916
read.discard();
904917
error(read, ErrorCode::UnexpectedEndOfHexEscape)
905918
} else {
906-
encode_surrogate(scratch, n1);
919+
encode_wtf8(scratch, n1);
907920
// The \ prior to this byte started an escape sequence,
908921
// so we need to parse that now. This recursive call
909922
// does not blow the stack on malicious input because
@@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>(
916929
let n2 = tri!(read.decode_hex_escape());
917930

918931
if n2 < 0xDC00 || n2 > 0xDFFF {
919-
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
932+
return if validate {
933+
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
934+
} else {
935+
encode_wtf8(scratch, n1);
936+
encode_wtf8(scratch, n2);
937+
Ok(())
938+
};
920939
}
921940

922941
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;

tests/test.rs

+46-8
Original file line numberDiff line numberDiff line change
@@ -1713,7 +1713,8 @@ fn test_byte_buf_de() {
17131713
}
17141714

17151715
#[test]
1716-
fn test_byte_buf_de_lone_surrogate() {
1716+
fn test_byte_buf_de_invalid_surrogates() {
1717+
// lone leading surrogate
17171718
let bytes = ByteBuf::from(vec![237, 160, 188]);
17181719
let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
17191720
assert_eq!(v, bytes);
@@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() {
17261727
let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
17271728
assert_eq!(v, bytes);
17281729

1729-
let bytes = ByteBuf::from(vec![237, 176, 129]);
1730-
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1731-
assert_eq!(v, bytes);
1732-
17331730
let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
17341731
assert!(res.is_err());
17351732

17361733
let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
17371734
assert!(res.is_err());
17381735

1739-
let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
1740-
assert!(res.is_err());
1736+
// lone trailing surrogate
1737+
let bytes = ByteBuf::from(vec![237, 176, 129]);
1738+
let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
1739+
assert_eq!(v, bytes);
1740+
1741+
// leading surrogate followed by other leading surrogate
1742+
let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
1743+
let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
1744+
assert_eq!(v, bytes);
1745+
1746+
// leading surrogate followed by "a" (U+0061) in \u encoding
1747+
let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
1748+
let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
1749+
assert_eq!(v, bytes);
1750+
1751+
// leading surrogate followed by U+0080
1752+
let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
1753+
let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
1754+
assert_eq!(v, bytes);
1755+
1756+
// leading surrogate followed by U+FFFF
1757+
let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
1758+
let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
1759+
assert_eq!(v, bytes);
1760+
}
1761+
1762+
#[test]
1763+
fn test_byte_buf_de_surrogate_pair() {
1764+
// leading surrogate followed by trailing surrogate
1765+
let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
1766+
let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
1767+
assert_eq!(v, bytes);
17411768
}
17421769

17431770
#[cfg(feature = "raw_value")]
17441771
#[test]
1745-
fn test_raw_de_lone_surrogate() {
1772+
fn test_raw_de_invalid_surrogates() {
17461773
use serde_json::value::RawValue;
17471774

17481775
assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() {
17521779
assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
17531780
assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
17541781
assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
1782+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
1783+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
1784+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
1785+
}
1786+
1787+
#[cfg(feature = "raw_value")]
1788+
#[test]
1789+
fn test_raw_de_surrogate_pair() {
1790+
use serde_json::value::RawValue;
1791+
1792+
assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
17551793
}
17561794

17571795
#[test]

0 commit comments

Comments
 (0)