Skip to content

Commit 0e90b61

Browse files
committed
Format UTF-8 strings manually
This speeds up War and Peace 290 MB/s -> 330 MB/s (+15%).
1 parent a38dbf3 commit 0e90b61

File tree

1 file changed

+49
-11
lines changed

1 file changed

+49
-11
lines changed

src/read.rs

+49-11
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use crate::error::{Error, ErrorCode, Result};
22
use alloc::vec::Vec;
3-
use core::char;
43
use core::cmp;
54
use core::mem;
65
use core::ops::Deref;
@@ -957,25 +956,64 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
957956
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
958957
}
959958

960-
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
961-
962-
match char::from_u32(n) {
963-
Some(c) => c,
964-
None => {
965-
return error(read, ErrorCode::InvalidUnicodeCodePoint);
966-
}
967-
}
959+
// This value is in range U+10000..=U+10FFFF, which is always a
960+
// valid codepoint.
961+
(((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000
968962
}
969963

970964
// Every u16 outside of the surrogate ranges above is guaranteed
971965
// to be a legal char.
972-
n => char::from_u32(n as u32).unwrap(),
966+
n => n as u32,
973967
};
974968

975-
scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
969+
// SAFETY: c is always a codepoint.
970+
unsafe {
971+
push_utf8_codepoint(c, scratch);
972+
}
976973
Ok(())
977974
}
978975

976+
/// Adds a UTF-8 codepoint to the end of the buffer. This is a more efficient
/// implementation of String::push.
///
/// Values below `0x80` are pushed as a single byte. Larger values are encoded
/// as a 2-, 3- or 4-byte sequence written directly into the spare capacity of
/// `scratch`, after which the vector's length is extended over the new bytes.
///
/// # Safety
///
/// `n` must be a valid codepoint. In particular, a surrogate value
/// (`0xD800..=0xDFFF`) is encoded like any other 3-byte value, which leaves
/// bytes in `scratch` that are not valid UTF-8.
///
/// # Panics
///
/// Panics if `n` is greater than `0x10_FFFF`.
#[inline]
unsafe fn push_utf8_codepoint(n: u32, scratch: &mut Vec<u8>) {
    // ASCII fast path: a single byte, no raw-pointer work needed.
    if n < 0x80 {
        scratch.push(n as u8);
        return;
    }

    // The longest encoding produced below is 4 bytes; reserving that much up
    // front guarantees every raw write lands inside the allocation.
    scratch.reserve(4);

    // SAFETY: after `reserve(4)` there are at least 4 bytes of allocated
    // capacity past `scratch.len()`, which is where `ptr` points. Every
    // reachable match arm, together with the trailing write, initializes
    // exactly `encoded_len` (<= 4) bytes of that region before `set_len`
    // extends the length over them.
    unsafe {
        let ptr = scratch.as_mut_ptr().add(scratch.len());

        // Each arm writes the leading byte and any middle continuation bytes.
        // The final continuation byte has the same shape for every length, so
        // it is written once after the match.
        let encoded_len = match n {
            // Handled by the early return above.
            0..=0x7F => unreachable!(),
            0x80..=0x7FF => {
                ptr.write(((n >> 6) & 0b0001_1111) as u8 | 0b1100_0000);
                2
            }
            0x800..=0xFFFF => {
                ptr.write(((n >> 12) & 0b0000_1111) as u8 | 0b1110_0000);
                ptr.add(1)
                    .write(((n >> 6) & 0b0011_1111) as u8 | 0b1000_0000);
                3
            }
            0x1_0000..=0x10_FFFF => {
                ptr.write(((n >> 18) & 0b0000_0111) as u8 | 0b1111_0000);
                ptr.add(1)
                    .write(((n >> 12) & 0b0011_1111) as u8 | 0b1000_0000);
                ptr.add(2)
                    .write(((n >> 6) & 0b0011_1111) as u8 | 0b1000_0000);
                4
            }
            // Beyond the Unicode range; see `# Panics`.
            0x11_0000.. => unreachable!(),
        };
        // Last byte: low six bits of `n` with the continuation marker.
        ptr.add(encoded_len - 1)
            .write((n & 0b0011_1111) as u8 | 0b1000_0000);

        scratch.set_len(scratch.len() + encoded_len);
    }
}
1016+
9791017
/// Parses a JSON escape sequence and discards the value. Assumes the previous
9801018
/// byte read was a backslash.
9811019
fn ignore_escape<'de, R>(read: &mut R) -> Result<()>

0 commit comments

Comments
 (0)