Skip to content

Commit 2f28d10

Browse files
committed
Use the same UTF-8/WTF-8 implementation for surrogates
This change does not affect performance.
1 parent 0e90b61 commit 2f28d10

File tree

1 file changed

+7
-18
lines changed

1 file changed

+7
-18
lines changed

src/read.rs

+7 -18
Original file line numberDiff line numberDiff line change
@@ -898,20 +898,12 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
898898
validate: bool,
899899
scratch: &mut Vec<u8>,
900900
) -> Result<()> {
901-
fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
902-
scratch.extend_from_slice(&[
903-
(n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
904-
(n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
905-
(n & 0b0011_1111) as u8 | 0b1000_0000,
906-
]);
907-
}
908-
909901
let c = match tri!(read.decode_hex_escape()) {
910902
n @ 0xDC00..=0xDFFF => {
911903
return if validate {
912904
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
913905
} else {
914-
encode_surrogate(scratch, n);
906+
push_wtf8_codepoint(n as u32, scratch);
915907
Ok(())
916908
};
917909
}
@@ -928,7 +920,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
928920
read.discard();
929921
error(read, ErrorCode::UnexpectedEndOfHexEscape)
930922
} else {
931-
encode_surrogate(scratch, n1);
923+
push_wtf8_codepoint(n1 as u32, scratch);
932924
Ok(())
933925
};
934926
}
@@ -940,7 +932,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
940932
read.discard();
941933
error(read, ErrorCode::UnexpectedEndOfHexEscape)
942934
} else {
943-
encode_surrogate(scratch, n1);
935+
push_wtf8_codepoint(n1 as u32, scratch);
944936
// The \ prior to this byte started an escape sequence,
945937
// so we need to parse that now. This recursive call
946938
// does not blow the stack on malicious input because
@@ -966,17 +958,14 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
966958
n => n as u32,
967959
};
968960

969-
// SAFETY: c is always a codepoint.
970-
unsafe {
971-
push_utf8_codepoint(c, scratch);
972-
}
961+
push_wtf8_codepoint(c, scratch);
973962
Ok(())
974963
}
975964

976-
/// Adds a UTF-8 codepoint to the end of the buffer. This is a more efficient
977-
/// implementation of String::push. n must be a valid codepoint.
965+
/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient
966+
/// implementation of String::push. The codepoint may be a surrogate.
978967
#[inline]
979-
unsafe fn push_utf8_codepoint(n: u32, scratch: &mut Vec<u8>) {
968+
fn push_wtf8_codepoint(n: u32, scratch: &mut Vec<u8>) {
980969
if n < 0x80 {
981970
scratch.push(n as u8);
982971
return;

0 commit comments

Comments
 (0)