Deserialize invalid UTF-8 into byte bufs as WTF-8

lucacasonato · lucacasonato · commit f50e29656a52 · 2022-05-18T21:27:50.000+02:00
Previously #828 added support for deserializing lone leading and trailing surrogates into WTF-8 encoded bytes when deserializing a string as bytes. This commit extends this to cover the case of a leading surrogate followed by code units that are not trailing surrogates. This allows for deserialization of "\ud83c\ud83c" (two leading surrogates), or "\ud83c\u0061" (a leading surrogate followed by "a"). The docs also now make it clear that we are serializing the invalid code points as WTF-8. This reference to WTF-8 signals to the user that they can use a WTF-8 parser on the bytes to construct a valid UTF-8 string.
diff --git a/src/de.rs b/src/de.rs
@@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer<R> {
     ///
     /// The behavior of serde_json is specified to fail on non-UTF-8 strings
     /// when deserializing into Rust UTF-8 string types such as String, and
-    /// succeed with non-UTF-8 bytes when deserializing using this method.
+    /// succeed with the bytes representing the [WTF-8] encoding of code points
+    /// when deserializing using this method.
+    /// 
+    /// [WTF-8]: https://simonsapin.github.io/wtf-8
     ///
     /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is
     /// still checked if the hex number represents a valid Unicode code point.
diff --git a/src/read.rs b/src/read.rs
@@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>(
         b'r' => scratch.push(b'\r'),
         b't' => scratch.push(b'\t'),
         b'u' => {
-            fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
-                scratch.extend_from_slice(&[
-                    (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
-                    (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
-                    (n & 0b0011_1111) as u8 | 0b1000_0000,
-                ]);
+            fn encode_wtf8(scratch: &mut Vec<u8>, cp: u16) {
+                match cp {
+                    0x0000..=0x007F => {
+                        scratch.extend_from_slice(&[cp as u8]);
+                    }
+                    0x0080..=0x07FF => {
+                        scratch
+                            .extend_from_slice(&[0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8]);
+                    }
+                    0x0800..=0xFFFF => {
+                        scratch.extend_from_slice(&[
+                            0xE0 | (cp >> 12) as u8,
+                            0x80 | ((cp >> 6) & 0x3F) as u8,
+                            0x80 | (cp & 0x3F) as u8,
+                        ]);
+                    }
+                }
             }
 
             let c = match tri!(read.decode_hex_escape()) {
                 n @ 0xDC00..=0xDFFF => {
                     return if validate {
+                        // TODO: the error message is wrong, this is a lone
+                        // _trailing_ surrogate
                         error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                     } else {
-                        encode_surrogate(scratch, n);
+                        encode_wtf8(scratch, n);
                         Ok(())
                     };
                 }
@@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>(
                     } else {
                         return if validate {
                             read.discard();
-                            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             Ok(())
                         };
                     }
@@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>(
                             read.discard();
                             error(read, ErrorCode::UnexpectedEndOfHexEscape)
                         } else {
-                            encode_surrogate(scratch, n1);
+                            encode_wtf8(scratch, n1);
                             // The \ prior to this byte started an escape sequence,
                             // so we need to parse that now. This recursive call
                             // does not blow the stack on malicious input because
@@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>(
                     let n2 = tri!(read.decode_hex_escape());
 
                     if n2 < 0xDC00 || n2 > 0xDFFF {
-                        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+                        return if validate {
+                            error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
+                        } else {
+                            encode_wtf8(scratch, n1);
+                            encode_wtf8(scratch, n2);
+                            Ok(())
+                        };
                     }
 
                     let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
diff --git a/tests/test.rs b/tests/test.rs
@@ -1713,7 +1713,8 @@ fn test_byte_buf_de() {
 }
 
 #[test]
-fn test_byte_buf_de_lone_surrogate() {
+fn test_byte_buf_de_invalid_surrogates() {
+    // lone leading surrogate
     let bytes = ByteBuf::from(vec![237, 160, 188]);
     let v: ByteBuf = from_str(r#""\ud83c""#).unwrap();
     assert_eq!(v, bytes);
@@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() {
     let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap();
     assert_eq!(v, bytes);
 
-    let bytes = ByteBuf::from(vec![237, 176, 129]);
-    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
-    assert_eq!(v, bytes);
-
     let res = from_str::<ByteBuf>(r#""\ud83c\!""#);
     assert!(res.is_err());
 
     let res = from_str::<ByteBuf>(r#""\ud83c\u""#);
     assert!(res.is_err());
 
-    let res = from_str::<ByteBuf>(r#""\ud83c\ud83c""#);
-    assert!(res.is_err());
+    // lone trailing surrogate
+    let bytes = ByteBuf::from(vec![237, 176, 129]);
+    let v: ByteBuf = from_str(r#""\udc01""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by other leading surrogate
+    let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]);
+    let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by "a" (U+0061) in \u encoding
+    let bytes = ByteBuf::from(vec![237, 160, 188, 97]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+0080
+    let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap();
+    assert_eq!(v, bytes);
+
+    // leading surrogate followed by U+FFFF
+    let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]);
+    let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap();
+    assert_eq!(v, bytes);
+}
+
+#[test]
+fn test_byte_buf_de_surrogate_pair() {
+    // leading surrogate followed by trailing surrogate
+    let bytes = ByteBuf::from(vec![240, 159, 128, 128]);
+    let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap();
+    assert_eq!(v, bytes);
 }
 
 #[cfg(feature = "raw_value")]
 #[test]
-fn test_raw_de_lone_surrogate() {
+fn test_raw_de_invalid_surrogates() {
     use serde_json::value::RawValue;
 
     assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok());
@@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() {
     assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err());
     assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0061""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\u0080""#).is_ok());
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\uffff""#).is_ok());
+}
+
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_surrogate_pair() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\udc00""#).is_ok());
 }
 
 #[test]