Skip to content

Commit 203c509

Browse files
committed
Add SIMD accelerated multiple pattern search.
This uses the "Teddy" algorithm, as learned from the Hyperscan regular expression library (https://01.org/hyperscan). This support is optional, subject to the following: 1. A nightly compiler. 2. Enabling the `simd-accel` feature. 3. Adding `RUSTFLAGS="-C target-feature=+ssse3"` when compiling.
1 parent 426e131 commit 203c509

File tree

12 files changed

+879
-13
lines changed

12 files changed

+879
-13
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ script:
99
- cargo build --verbose
1010
- cargo build --verbose --manifest-path=regex-debug/Cargo.toml
1111
- if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
12-
travis_wait cargo test --verbose --features pattern;
12+
RUSTFLAGS="-C target-feature=+ssse3" cargo test --verbose --features 'simd-accel pattern';
1313
else
1414
travis_wait cargo test --verbose;
1515
fi

Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ memchr = "0.1.9"
2121
thread_local = "0.2.4"
2222
# For parsing regular expressions.
2323
regex-syntax = { path = "regex-syntax", version = "0.3.1" }
24+
# For accelerating text search.
25+
simd = { version = "0.1.0", optional = true }
2426
# For compiling UTF-8 decoding into automata.
2527
utf8-ranges = "0.1.3"
2628

@@ -35,6 +37,8 @@ rand = "0.3"
3537
[features]
3638
# Enable to use the unstable pattern traits defined in std.
3739
pattern = []
40+
# Enable to use simd acceleration.
41+
simd-accel = ["simd"]
3842

3943
[lib]
4044
# There are no benchmarks in the library code itself

bench/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ libc = "0.2"
1717
onig = { version = "0.4", optional = true }
1818
libpcre-sys = { version = "0.2", optional = true }
1919
memmap = "0.2"
20-
regex = { version = "0.1", path = ".." }
20+
regex = { version = "0.1", path = "..", features = ["simd-accel"] }
2121
regex_macros = { version = "0.1", path = "../regex_macros", optional = true }
2222
regex-syntax = { version = "0.3", path = "../regex-syntax" }
2323
rustc-serialize = "0.3"

bench/compile

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#!/bin/sh
22

3+
# Enable SIMD.
4+
export RUSTFLAGS="-C target-feature=+ssse3"
5+
36
exec cargo build \
47
--release \
58
--features 're-onig re-pcre1 re-pcre2 re-re2 re-rust re-rust-bytes re-tcl' \

bench/run

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then
99
usage
1010
fi
1111

12+
# Enable SIMD.
13+
export RUSTFLAGS="-C target-feature=+ssse3"
14+
1215
which="$1"
1316
shift
1417
case $which in

regex-syntax/src/properties.rs

-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
// except according to those terms.
1010

1111
use quickcheck::{Arbitrary, Gen, Testable, QuickCheck, StdGen};
12-
use rand::Rng;
1312

1413
use {
1514
Expr, ExprBuilder,

src/lib.rs

+7
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@
449449
#![deny(missing_docs)]
450450
#![cfg_attr(test, deny(warnings))]
451451
#![cfg_attr(feature = "pattern", feature(pattern))]
452+
#![cfg_attr(feature = "simd-accel", feature(cfg_target_feature))]
452453
#![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
453454
html_favicon_url = "https://www.rust-lang.org/favicon.ico",
454455
html_root_url = "https://doc.rust-lang.org/regex/")]
@@ -458,6 +459,7 @@ extern crate memchr;
458459
extern crate thread_local;
459460
#[cfg(test)] extern crate quickcheck;
460461
extern crate regex_syntax as syntax;
462+
#[cfg(feature = "simd-accel")] extern crate simd;
461463
extern crate utf8_ranges;
462464

463465
pub use error::Error;
@@ -582,6 +584,11 @@ mod re_plugin;
582584
mod re_set;
583585
mod re_trait;
584586
mod re_unicode;
587+
#[cfg(feature = "simd-accel")]
588+
mod simd_accel;
589+
#[cfg(not(feature = "simd-accel"))]
590+
#[path = "simd_fallback/mod.rs"]
591+
mod simd_accel;
585592
mod sparse;
586593

587594
/// The `internal` module exists to support the `regex!` macro and other

src/literals.rs

+40-10
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use memchr::{memchr, memchr2, memchr3};
1515
use syntax;
1616

1717
use freqs::BYTE_FREQUENCIES;
18+
use simd_accel::teddy128::Teddy;
1819

1920
/// A prefix extracted from a compiled regular expression.
2021
///
@@ -51,6 +52,8 @@ enum Matcher {
5152
Single(SingleSearch),
5253
/// An Aho-Corasick automaton.
5354
AC(FullAcAutomaton<syntax::Lit>),
55+
/// A simd accelerated multiple string matcher.
56+
Teddy128(Teddy),
5457
}
5558

5659
impl LiteralSearcher {
@@ -100,6 +103,7 @@ impl LiteralSearcher {
100103
Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
101104
Single(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
102105
AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
106+
Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)),
103107
}
104108
}
105109

@@ -136,6 +140,9 @@ impl LiteralSearcher {
136140
Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
137141
Matcher::Single(ref s) => LiteralIter::Single(&s.pat),
138142
Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
143+
Matcher::Teddy128(ref ted) => {
144+
LiteralIter::Teddy128(ted.patterns())
145+
}
139146
}
140147
}
141148

@@ -162,6 +169,7 @@ impl LiteralSearcher {
162169
Bytes(ref sset) => sset.dense.len(),
163170
Single(_) => 1,
164171
AC(ref aut) => aut.len(),
172+
Teddy128(ref ted) => ted.len(),
165173
}
166174
}
167175

@@ -173,6 +181,7 @@ impl LiteralSearcher {
173181
Bytes(ref sset) => sset.approximate_size(),
174182
Single(ref single) => single.approximate_size(),
175183
AC(ref aut) => aut.heap_bytes(),
184+
Teddy128(ref ted) => ted.approximate_size(),
176185
}
177186
}
178187
}
@@ -190,23 +199,34 @@ impl Matcher {
190199

191200
fn new(lits: &syntax::Literals, sset: SingleByteSet) -> Self {
192201
if lits.literals().is_empty() {
193-
Matcher::Empty
194-
} else if sset.dense.len() >= 26 {
202+
return Matcher::Empty;
203+
}
204+
if sset.dense.len() >= 26 {
195205
// Avoid trying to match a large number of single bytes.
196206
// This is *very* sensitive to a frequency analysis comparison
197207
// between the bytes in sset and the composition of the haystack.
198208
// No matter the size of sset, if its members all are rare in the
199209
// haystack, then it'd be worth using it. How to tune this... IDK.
200210
// ---AG
201-
Matcher::Empty
202-
} else if sset.complete {
203-
Matcher::Bytes(sset)
204-
} else if lits.literals().len() == 1 {
205-
Matcher::Single(SingleSearch::new(lits.literals()[0].to_vec()))
206-
} else {
207-
let pats = lits.literals().to_owned();
208-
Matcher::AC(AcAutomaton::new(pats).into_full())
211+
return Matcher::Empty;
212+
}
213+
if sset.complete {
214+
return Matcher::Bytes(sset);
215+
}
216+
if lits.literals().len() == 1 {
217+
let lit = lits.literals()[0].to_vec();
218+
return Matcher::Single(SingleSearch::new(lit));
209219
}
220+
// Only try Teddy if Aho-Corasick can't use memchr.
221+
// Also, in its current form, Teddy doesn't scale well to lots of
222+
// literals.
223+
if sset.dense.len() > 1 && lits.literals().len() <= 32 {
224+
if let Some(ted) = Teddy::new(lits) {
225+
return Matcher::Teddy128(ted);
226+
}
227+
}
228+
let pats = lits.literals().to_owned();
229+
Matcher::AC(AcAutomaton::new(pats).into_full())
210230
}
211231
}
212232

@@ -215,6 +235,7 @@ pub enum LiteralIter<'a> {
215235
Bytes(&'a [u8]),
216236
Single(&'a [u8]),
217237
AC(&'a [syntax::Lit]),
238+
Teddy128(&'a [Vec<u8>]),
218239
}
219240

220241
impl<'a> Iterator for LiteralIter<'a> {
@@ -250,6 +271,15 @@ impl<'a> Iterator for LiteralIter<'a> {
250271
Some(&**next)
251272
}
252273
}
274+
LiteralIter::Teddy128(ref mut lits) => {
275+
if lits.is_empty() {
276+
None
277+
} else {
278+
let next = &lits[0];
279+
*lits = &lits[1..];
280+
Some(&**next)
281+
}
282+
}
253283
}
254284
}
255285
}

src/simd_accel/mod.rs

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#[cfg(target_feature = "ssse3")]
2+
pub mod teddy128;
3+
#[cfg(not(target_feature = "ssse3"))]
4+
#[path = "../simd_fallback/teddy128.rs"]
5+
pub mod teddy128;

0 commit comments

Comments
 (0)