Skip to content

Commit c113247

Browse files
committed
Auto merge of rust-lang#130733 - okaneco:is_ascii, r=scottmcm
Optimize `is_ascii` for `str` and `[u8]` further

Replace the existing optimized function with one that enables auto-vectorization. This is especially beneficial on x86-64 as `pmovmskb` can be emitted with careful structuring of the code. The instruction can detect non-ASCII characters one vector register width at a time instead of the current `usize`-at-a-time check.

The resulting implementation is completely safe.

`case00_libcore` is the current implementation, `case04_while_loop` is this PR.

```
benchmarks:
ascii::is_ascii_slice::long::case00_libcore 22.25/iter +/- 1.09
ascii::is_ascii_slice::long::case04_while_loop 6.78/iter +/- 0.92
ascii::is_ascii_slice::medium::case00_libcore 2.81/iter +/- 0.39
ascii::is_ascii_slice::medium::case04_while_loop 1.56/iter +/- 0.78
ascii::is_ascii_slice::short::case00_libcore 5.55/iter +/- 0.85
ascii::is_ascii_slice::short::case04_while_loop 3.75/iter +/- 0.22
ascii::is_ascii_slice::unaligned_both_long::case00_libcore 26.59/iter +/- 0.66
ascii::is_ascii_slice::unaligned_both_long::case04_while_loop 5.78/iter +/- 0.16
ascii::is_ascii_slice::unaligned_both_medium::case00_libcore 2.97/iter +/- 0.32
ascii::is_ascii_slice::unaligned_both_medium::case04_while_loop 2.41/iter +/- 0.10
ascii::is_ascii_slice::unaligned_head_long::case00_libcore 23.71/iter +/- 0.79
ascii::is_ascii_slice::unaligned_head_long::case04_while_loop 7.83/iter +/- 1.31
ascii::is_ascii_slice::unaligned_head_medium::case00_libcore 3.69/iter +/- 0.54
ascii::is_ascii_slice::unaligned_head_medium::case04_while_loop 7.05/iter +/- 0.32
ascii::is_ascii_slice::unaligned_tail_long::case00_libcore 24.44/iter +/- 1.41
ascii::is_ascii_slice::unaligned_tail_long::case04_while_loop 5.12/iter +/- 0.18
ascii::is_ascii_slice::unaligned_tail_medium::case00_libcore 3.24/iter +/- 0.40
ascii::is_ascii_slice::unaligned_tail_medium::case04_while_loop 2.86/iter +/- 0.14
```

`unaligned_head_medium` is the main regression in the benchmarks. It is a 32-byte string being sliced `bytes[1..]`.

The first commit can be used to run the benchmarks against the current core implementation.

Previous implementation was done in rust-lang#74066

---

Two potential drawbacks of this implementation are that it increases instruction count and may regress other platforms/architectures. The benches here may also be too artificial to glean much insight from.

https://rust.godbolt.org/z/G9znGfY36
2 parents 00bf74d + 1b5c02b commit c113247

File tree

3 files changed

+118
-15
lines changed

3 files changed

+118
-15
lines changed

library/core/benches/ascii/is_ascii.rs

+44-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@ macro_rules! benches {
1010
// Ensure we benchmark cases where the functions are called with strings
1111
// that are not perfectly aligned or have a length which is not a
1212
// multiple of size_of::<usize>() (or both)
13-
benches!(mod unaligned_head MEDIUM[1..] $($name $arg $body)+);
14-
benches!(mod unaligned_tail MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+);
15-
benches!(mod unaligned_both MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+);
13+
benches!(mod unaligned_head_medium MEDIUM[1..] $($name $arg $body)+);
14+
benches!(mod unaligned_tail_medium MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+);
15+
benches!(mod unaligned_both_medium MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+);
16+
benches!(mod unaligned_head_long LONG[1..] $($name $arg $body)+);
17+
benches!(mod unaligned_tail_long LONG[..(LONG.len() - 1)] $($name $arg $body)+);
18+
benches!(mod unaligned_both_long LONG[1..(LONG.len() - 1)] $($name $arg $body)+);
1619
};
1720

1821
(mod $mod_name: ident $input: ident [$range: expr] $($name: ident $arg: ident $body: block)+) => {
@@ -49,6 +52,44 @@ benches! {
4952
fn case03_align_to_unrolled(bytes: &[u8]) {
5053
is_ascii_align_to_unrolled(bytes)
5154
}
55+
56+
fn case04_while_loop(bytes: &[u8]) {
57+
// Process chunks of 32 bytes at a time in the fast path to enable
58+
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
59+
// can be OR'd together and then the resulting vector can be tested for
60+
// non-ASCII bytes.
61+
const CHUNK_SIZE: usize = 32;
62+
63+
let mut i = 0;
64+
65+
while i + CHUNK_SIZE <= bytes.len() {
66+
let chunk_end = i + CHUNK_SIZE;
67+
68+
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
69+
// creates a mask from the most significant bit of each byte.
70+
// ASCII bytes are less than 128 (0x80), so their most significant
71+
// bit is unset.
72+
let mut count = 0;
73+
while i < chunk_end {
74+
count += bytes[i].is_ascii() as u8;
75+
i += 1;
76+
}
77+
78+
// All bytes should be <= 127 so count is equal to chunk size.
79+
if count != CHUNK_SIZE as u8 {
80+
return false;
81+
}
82+
}
83+
84+
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
85+
let mut is_ascii = true;
86+
while i < bytes.len() {
87+
is_ascii &= bytes[i].is_ascii();
88+
i += 1;
89+
}
90+
91+
is_ascii
92+
}
5293
}
5394

5495
// These are separate since it's easier to debug errors if they don't go through

library/core/src/slice/ascii.rs

+58-12
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
use core::ascii::EscapeDefault;
44

55
use crate::fmt::{self, Write};
6+
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
67
use crate::intrinsics::const_eval_select;
7-
use crate::{ascii, iter, mem, ops};
8+
use crate::{ascii, iter, ops};
89

910
#[cfg(not(test))]
1011
impl [u8] {
@@ -328,14 +329,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
328329
}
329330
}
330331

331-
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
332-
/// from `../str/mod.rs`, which does something similar for utf8 validation.
333-
#[inline]
334-
const fn contains_nonascii(v: usize) -> bool {
335-
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
336-
(NONASCII_MASK & v) != 0
337-
}
338-
339332
/// ASCII test *without* the chunk-at-a-time optimizations.
340333
///
341334
/// This is carefully structured to produce nice small code -- it's smaller in
@@ -366,6 +359,7 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
366359
///
367360
/// If any of these loads produces something for which `contains_nonascii`
368361
/// (defined inside the function body below) returns true, then we know the answer is false.
362+
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
369363
#[inline]
370364
#[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
371365
const fn is_ascii(s: &[u8]) -> bool {
@@ -376,7 +370,14 @@ const fn is_ascii(s: &[u8]) -> bool {
376370
if const {
377371
is_ascii_simple(s)
378372
} else {
379-
const USIZE_SIZE: usize = mem::size_of::<usize>();
373+
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
374+
/// from `../str/mod.rs`, which does something similar for utf8 validation.
375+
const fn contains_nonascii(v: usize) -> bool {
376+
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
377+
(NONASCII_MASK & v) != 0
378+
}
379+
380+
const USIZE_SIZE: usize = size_of::<usize>();
380381

381382
let len = s.len();
382383
let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
@@ -386,7 +387,7 @@ const fn is_ascii(s: &[u8]) -> bool {
386387
//
387388
// We also do this for architectures where `size_of::<usize>()` isn't
388389
// sufficient alignment for `usize`, because it's a weird edge case.
389-
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
390+
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
390391
return is_ascii_simple(s);
391392
}
392393

@@ -420,7 +421,7 @@ const fn is_ascii(s: &[u8]) -> bool {
420421
// have alignment information it should have given a `usize::MAX` for
421422
// `align_offset` earlier, sending things through the scalar path instead of
422423
// this one, so this check should pass if it's reachable.
423-
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
424+
debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
424425

425426
// Read subsequent words until the last aligned word, excluding the last
426427
// aligned word by itself to be done in tail check later, to ensure that
@@ -455,3 +456,48 @@ const fn is_ascii(s: &[u8]) -> bool {
455456
}
456457
)
457458
}
459+
460+
/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
461+
/// platforms.
462+
///
463+
/// Other platforms are not likely to benefit from this code structure, so they
464+
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
465+
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
466+
#[inline]
467+
const fn is_ascii(bytes: &[u8]) -> bool {
468+
// Process chunks of 32 bytes at a time in the fast path to enable
469+
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
470+
// can be OR'd together and then the resulting vector can be tested for
471+
// non-ASCII bytes.
472+
const CHUNK_SIZE: usize = 32;
473+
474+
let mut i = 0;
475+
476+
while i + CHUNK_SIZE <= bytes.len() {
477+
let chunk_end = i + CHUNK_SIZE;
478+
479+
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
480+
// creates a mask from the most significant bit of each byte.
481+
// ASCII bytes are less than 128 (0x80), so their most significant
482+
// bit is unset.
483+
let mut count = 0;
484+
while i < chunk_end {
485+
count += bytes[i].is_ascii() as u8;
486+
i += 1;
487+
}
488+
489+
// All bytes should be <= 127 so count is equal to chunk size.
490+
if count != CHUNK_SIZE as u8 {
491+
return false;
492+
}
493+
}
494+
495+
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
496+
let mut is_ascii = true;
497+
while i < bytes.len() {
498+
is_ascii &= bytes[i].is_ascii();
499+
i += 1;
500+
}
501+
502+
is_ascii
503+
}

tests/codegen/slice-is-ascii.rs

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
//@ only-x86_64
2+
//@ compile-flags: -C opt-level=3
3+
#![crate_type = "lib"]
4+
5+
/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
6+
/// Platforms lacking an equivalent instruction use other techniques for
7+
/// optimizing `is_ascii`.
8+
// CHECK-LABEL: @is_ascii_autovectorized
9+
#[no_mangle]
10+
pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
11+
// CHECK: load <32 x i8>
12+
// CHECK-NEXT: icmp slt <32 x i8>
13+
// CHECK-NEXT: bitcast <32 x i1>
14+
// CHECK-NEXT: icmp eq i32
15+
s.is_ascii()
16+
}

0 commit comments

Comments
 (0)