Skip to content

Commit 6270e49

Browse files
authored
Rollup merge of #68232 - Mark-Simulacrum:unicode-tables, r=joshtriplett
Optimize size/speed of Unicode datasets

The overall implementation has the same general idea as the prior approach, which was based on a compressed trie structure, but modified to use less space (and, coincidentally, be an overall performance improvement).

| Sizes | Old | New | New/current |
| -- | -- | -- | -- |
| Alphabetic | 4616 | 2982 | 64.60% |
| Case_Ignorable | 3144 | 2112 | 67.18% |
| Cased | 2376 | 934 | 39.31% |
| Cc | 19 | 43 | 226.32% |
| Grapheme_Extend | 3072 | 1734 | 56.45% |
| Lowercase | 2328 | 985 | 42.31% |
| N | 2648 | 1239 | 46.79% |
| Uppercase | 1978 | 934 | 47.22% |
| White_Space | 241 | 140 | 58.09% |
| | | | |
| Total | 20422 | 11103 | 54.37% |

This table shows the size of the old and new tables in bytes. The most important of these tables is "Grapheme_Extend", as it is present in essentially all Rust programs due to being called from `str`'s Debug impl (`char::escape_debug`). In a representative case given by this [blog post] for the embedded world, this PR shrinks the final binary by 1,604 bytes, from 14,440 to 12,836.

The performance of these new tables, based on the (rough) benchmark of linearly scanning the entire valid set of chars, querying for each `is_*`, is roughly 50% better, though in some cases it is either on par or slightly (3-5%) worse. In practice, I believe the size benefits of this PR are the main concern.

The new implementation has been tested to be equivalent to the current nightly in terms of returned values on the set of valid chars.

A (relatively) high-level explanation of the specific compression scheme used can be found [in the generator].

This is split into three commits -- the first adds the generator which produces the Rust code for the tables, the second adds support code for the lookup, and the third actually swaps the current implementation out for the new one.
[blog post]: https://jamesmunns.com/blog/fmt-unreasonably-expensive/
[in the generator]: https://github.com/Mark-Simulacrum/rust/blob/unicode-tables/src/tools/unicode-table-generator/src/raw_emitter.rs
2 parents ae1e75b + efcda04 commit 6270e49

File tree

15 files changed

+2966
-3202
lines changed

15 files changed

+2966
-3202
lines changed

.gitignore

+1-8
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,7 @@ __pycache__/
3434
# Created by default with `src/ci/docker/run.sh`:
3535
/obj/
3636
/rustllvm/
37-
/src/libcore/unicode/DerivedCoreProperties.txt
38-
/src/libcore/unicode/DerivedNormalizationProps.txt
39-
/src/libcore/unicode/PropList.txt
40-
/src/libcore/unicode/ReadMe.txt
41-
/src/libcore/unicode/Scripts.txt
42-
/src/libcore/unicode/SpecialCasing.txt
43-
/src/libcore/unicode/UnicodeData.txt
44-
/src/libcore/unicode/downloaded
37+
/unicode-downloads
4538
/target/
4639
# Generated by compiletest for incremental:
4740
/tmp/

Cargo.lock

+17
Original file line numberDiff line numberDiff line change
@@ -4953,6 +4953,16 @@ version = "1.10.0"
49534953
source = "registry+https://github.com/rust-lang/crates.io-index"
49544954
checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"
49554955

4956+
[[package]]
4957+
name = "ucd-parse"
4958+
version = "0.1.4"
4959+
source = "registry+https://github.com/rust-lang/crates.io-index"
4960+
checksum = "ca6b52bf4da6512f0f07785a04769222e50d29639e7ecd016b7806fd2de306b4"
4961+
dependencies = [
4962+
"lazy_static 1.3.0",
4963+
"regex",
4964+
]
4965+
49564966
[[package]]
49574967
name = "ucd-trie"
49584968
version = "0.1.1"
@@ -4974,6 +4984,13 @@ dependencies = [
49744984
"version_check 0.1.5",
49754985
]
49764986

4987+
[[package]]
4988+
name = "unicode-bdd"
4989+
version = "0.1.0"
4990+
dependencies = [
4991+
"ucd-parse",
4992+
]
4993+
49774994
[[package]]
49784995
name = "unicode-bidi"
49794996
version = "0.3.4"

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ members = [
2323
"src/tools/rustfmt",
2424
"src/tools/miri",
2525
"src/tools/rustdoc-themes",
26+
"src/tools/unicode-table-generator",
2627
]
2728
exclude = [
2829
"build",

src/libcore/char/methods.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use crate::slice;
44
use crate::str::from_utf8_unchecked_mut;
55
use crate::unicode::printable::is_printable;
6-
use crate::unicode::tables::{conversions, derived_property, general_category, property};
6+
use crate::unicode::{self, conversions};
77

88
use super::*;
99

@@ -552,7 +552,7 @@ impl char {
552552
pub fn is_alphabetic(self) -> bool {
553553
match self {
554554
'a'..='z' | 'A'..='Z' => true,
555-
c => c > '\x7f' && derived_property::Alphabetic(c),
555+
c => c > '\x7f' && unicode::Alphabetic(c),
556556
}
557557
}
558558

@@ -583,7 +583,7 @@ impl char {
583583
pub fn is_lowercase(self) -> bool {
584584
match self {
585585
'a'..='z' => true,
586-
c => c > '\x7f' && derived_property::Lowercase(c),
586+
c => c > '\x7f' && unicode::Lowercase(c),
587587
}
588588
}
589589

@@ -614,7 +614,7 @@ impl char {
614614
pub fn is_uppercase(self) -> bool {
615615
match self {
616616
'A'..='Z' => true,
617-
c => c > '\x7f' && derived_property::Uppercase(c),
617+
c => c > '\x7f' && unicode::Uppercase(c),
618618
}
619619
}
620620

@@ -642,7 +642,7 @@ impl char {
642642
pub fn is_whitespace(self) -> bool {
643643
match self {
644644
' ' | '\x09'..='\x0d' => true,
645-
c => c > '\x7f' && property::White_Space(c),
645+
c => c > '\x7f' && unicode::White_Space(c),
646646
}
647647
}
648648

@@ -693,7 +693,7 @@ impl char {
693693
#[stable(feature = "rust1", since = "1.0.0")]
694694
#[inline]
695695
pub fn is_control(self) -> bool {
696-
general_category::Cc(self)
696+
unicode::Cc(self)
697697
}
698698

699699
/// Returns `true` if this `char` has the `Grapheme_Extend` property.
@@ -707,7 +707,7 @@ impl char {
707707
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
708708
#[inline]
709709
pub(crate) fn is_grapheme_extended(self) -> bool {
710-
derived_property::Grapheme_Extend(self)
710+
unicode::Grapheme_Extend(self)
711711
}
712712

713713
/// Returns `true` if this `char` has one of the general categories for numbers.
@@ -739,7 +739,7 @@ impl char {
739739
pub fn is_numeric(self) -> bool {
740740
match self {
741741
'0'..='9' => true,
742-
c => c > '\x7f' && general_category::N(c),
742+
c => c > '\x7f' && unicode::N(c),
743743
}
744744
}
745745

src/libcore/char/mod.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
3737

3838
// unstable re-exports
3939
#[unstable(feature = "unicode_version", issue = "49726")]
40-
pub use crate::unicode::tables::UNICODE_VERSION;
41-
#[unstable(feature = "unicode_version", issue = "49726")]
4240
pub use crate::unicode::version::UnicodeVersion;
41+
#[unstable(feature = "unicode_version", issue = "49726")]
42+
pub use crate::unicode::UNICODE_VERSION;
4343

4444
use crate::fmt::{self, Write};
4545
use crate::iter::FusedIterator;

src/libcore/unicode/bool_trie.rs

-66
This file was deleted.

src/libcore/unicode/mod.rs

+49-5
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,59 @@
11
#![unstable(feature = "unicode_internals", issue = "none")]
22
#![allow(missing_docs)]
33

4-
mod bool_trie;
54
pub(crate) mod printable;
6-
pub(crate) mod tables;
5+
mod unicode_data;
76
pub(crate) mod version;
87

8+
use version::UnicodeVersion;
9+
10+
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
11+
/// `char` and `str` methods are based on.
12+
#[unstable(feature = "unicode_version", issue = "49726")]
13+
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
14+
major: unicode_data::UNICODE_VERSION.0,
15+
minor: unicode_data::UNICODE_VERSION.1,
16+
micro: unicode_data::UNICODE_VERSION.2,
17+
_priv: (),
18+
};
19+
920
// For use in liballoc, not re-exported in libstd.
1021
pub mod derived_property {
11-
pub use crate::unicode::tables::derived_property::{Case_Ignorable, Cased};
22+
pub use super::{Case_Ignorable, Cased};
1223
}
13-
pub mod conversions {
14-
pub use crate::unicode::tables::conversions::{to_lower, to_upper};
24+
25+
pub use unicode_data::alphabetic::lookup as Alphabetic;
26+
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
27+
pub use unicode_data::cased::lookup as Cased;
28+
pub use unicode_data::cc::lookup as Cc;
29+
pub use unicode_data::conversions;
30+
pub use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
31+
pub use unicode_data::lowercase::lookup as Lowercase;
32+
pub use unicode_data::n::lookup as N;
33+
pub use unicode_data::uppercase::lookup as Uppercase;
34+
pub use unicode_data::white_space::lookup as White_Space;
35+
36+
#[inline(always)]
37+
fn range_search<const N: usize, const N1: usize, const N2: usize>(
38+
needle: u32,
39+
chunk_idx_map: &[u8; N],
40+
(last_chunk_idx, last_chunk_mapping): (u16, u8),
41+
bitset_chunk_idx: &[[u8; 16]; N1],
42+
bitset: &[u64; N2],
43+
) -> bool {
44+
let bucket_idx = (needle / 64) as usize;
45+
let chunk_map_idx = bucket_idx / 16;
46+
let chunk_piece = bucket_idx % 16;
47+
let chunk_idx = if chunk_map_idx >= N {
48+
if chunk_map_idx == last_chunk_idx as usize {
49+
last_chunk_mapping
50+
} else {
51+
return false;
52+
}
53+
} else {
54+
chunk_idx_map[chunk_map_idx]
55+
};
56+
let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
57+
let word = bitset[(idx as usize)];
58+
(word & (1 << (needle % 64) as u64)) != 0
1559
}

0 commit comments

Comments
 (0)