Skip to content

Commit 236cc82

Browse files
committed
Simplify unicode escape handling
This does not affect performance.
1 parent 2f28d10 commit 236cc82

File tree

1 file changed

+53
-54
lines changed

1 file changed

+53
-54
lines changed

src/read.rs

+53-54
Original file line numberDiff line numberDiff line change
@@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
898898
validate: bool,
899899
scratch: &mut Vec<u8>,
900900
) -> Result<()> {
901-
let c = match tri!(read.decode_hex_escape()) {
902-
n @ 0xDC00..=0xDFFF => {
903-
return if validate {
904-
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
905-
} else {
906-
push_wtf8_codepoint(n as u32, scratch);
907-
Ok(())
908-
};
909-
}
901+
let n = tri!(read.decode_hex_escape());
910902

911-
// Non-BMP characters are encoded as a sequence of two hex
912-
// escapes, representing UTF-16 surrogates. If deserializing a
913-
// utf-8 string the surrogates are required to be paired,
914-
// whereas deserializing a byte string accepts lone surrogates.
915-
n1 @ 0xD800..=0xDBFF => {
916-
if tri!(peek_or_eof(read)) == b'\\' {
917-
read.discard();
918-
} else {
919-
return if validate {
920-
read.discard();
921-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
922-
} else {
923-
push_wtf8_codepoint(n1 as u32, scratch);
924-
Ok(())
925-
};
926-
}
903+
// Non-BMP characters are encoded as a sequence of two hex
904+
// escapes, representing UTF-16 surrogates. If deserializing a
905+
// utf-8 string the surrogates are required to be paired,
906+
// whereas deserializing a byte string accepts lone surrogates.
907+
if validate && n >= 0xDC00 && n <= 0xDFFF {
908+
// XXX: n is actually a lone *trailing* surrogate here, so the error code name below is misleading.
909+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
910+
}
927911

928-
if tri!(peek_or_eof(read)) == b'u' {
929-
read.discard();
930-
} else {
931-
return if validate {
932-
read.discard();
933-
error(read, ErrorCode::UnexpectedEndOfHexEscape)
934-
} else {
935-
push_wtf8_codepoint(n1 as u32, scratch);
936-
// The \ prior to this byte started an escape sequence,
937-
// so we need to parse that now. This recursive call
938-
// does not blow the stack on malicious input because
939-
// the escape is not \u, so it will be handled by one
940-
// of the easy nonrecursive cases.
941-
parse_escape(read, validate, scratch)
942-
};
943-
}
912+
if n < 0xD800 || n > 0xDBFF {
913+
// Every u16 outside of the surrogate ranges is guaranteed to be a
914+
// legal char.
915+
push_wtf8_codepoint(n as u32, scratch);
916+
return Ok(());
917+
}
944918

945-
let n2 = tri!(read.decode_hex_escape());
919+
// n is a leading surrogate, so we now expect a trailing surrogate.
920+
let n1 = n;
946921

947-
if n2 < 0xDC00 || n2 > 0xDFFF {
948-
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
949-
}
922+
if tri!(peek_or_eof(read)) == b'\\' {
923+
read.discard();
924+
} else {
925+
return if validate {
926+
read.discard();
927+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
928+
} else {
929+
push_wtf8_codepoint(n1 as u32, scratch);
930+
Ok(())
931+
};
932+
}
950933

951-
// This value is in range U+10000..=U+10FFFF, which is always a
952-
// valid codepoint.
953-
(((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000
954-
}
934+
if tri!(peek_or_eof(read)) == b'u' {
935+
read.discard();
936+
} else {
937+
return if validate {
938+
read.discard();
939+
error(read, ErrorCode::UnexpectedEndOfHexEscape)
940+
} else {
941+
push_wtf8_codepoint(n1 as u32, scratch);
942+
// The \ prior to this byte started an escape sequence,
943+
// so we need to parse that now. This recursive call
944+
// does not blow the stack on malicious input because
945+
// the escape is not \u, so it will be handled by one
946+
// of the easy nonrecursive cases.
947+
parse_escape(read, validate, scratch)
948+
};
949+
}
955950

956-
// Every u16 outside of the surrogate ranges above is guaranteed
957-
// to be a legal char.
958-
n => n as u32,
959-
};
951+
let n2 = tri!(read.decode_hex_escape());
952+
953+
if n2 < 0xDC00 || n2 > 0xDFFF {
954+
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
955+
}
960956

961-
push_wtf8_codepoint(c, scratch);
957+
// This value is in range U+10000..=U+10FFFF, which is always a
958+
// valid codepoint.
959+
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
960+
push_wtf8_codepoint(n, scratch);
962961
Ok(())
963962
}
964963

0 commit comments

Comments
 (0)