@@ -328,6 +328,7 @@ References
328
328
use std:: cmp;
329
329
use std:: ptr;
330
330
331
+ use aho_corasick:: { Automaton , AcAutomaton , FullAcAutomaton } ;
331
332
use simd:: u8x16;
332
333
use simd:: x86:: sse2:: Sse2Bool8ix16 ;
333
334
use simd:: x86:: ssse3:: Ssse3U8x16 ;
@@ -354,6 +355,9 @@ pub struct Match {
354
355
pub struct Teddy {
355
356
/// A list of substrings to match.
356
357
pats : Vec < Vec < u8 > > ,
358
+ /// An Aho-Corasick automaton of the patterns. We use this when we need to
359
+ /// search pieces smaller than the Teddy block size.
360
+ ac : FullAcAutomaton < Vec < u8 > > ,
357
361
/// A set of 8 buckets. Each bucket corresponds to a single member of a
358
362
/// bitset. A bucket contains zero or more substrings. This is useful
359
363
/// when the number of substrings exceeds 8, since our bitsets cannot have
@@ -403,6 +407,7 @@ impl Teddy {
403
407
}
404
408
Some ( Teddy {
405
409
pats : pats. to_vec ( ) ,
410
+ ac : AcAutomaton :: new ( pats. to_vec ( ) ) . into_full ( ) ,
406
411
buckets : buckets,
407
412
masks : masks,
408
413
} )
@@ -570,7 +575,7 @@ impl Teddy {
570
575
571
576
prev0 = res0;
572
577
prev1 = res1;
573
-
578
+
574
579
let bitfield = res. ne ( zero) . move_mask ( ) ;
575
580
if bitfield != 0 {
576
581
let pos = pos. checked_sub ( 2 ) . unwrap ( ) ;
@@ -659,27 +664,13 @@ impl Teddy {
659
664
/// This is used when we don't have enough bytes in the haystack for our
660
665
/// block-based approach.
661
666
fn slow ( & self , haystack : & [ u8 ] , pos : usize ) -> Option < Match > {
662
- // TODO: Use Aho-Corasick, or otherwise adapt the block based approach
663
- // to be capable of using smaller blocks.
664
- let mut m = None ;
665
- for ( pi, p) in self . pats . iter ( ) . enumerate ( ) {
666
- if let Some ( i) = find_slow ( p, & haystack[ pos..] ) {
667
- let candidate = Match {
668
- pat : pi,
669
- start : pos + i,
670
- end : pos + i + p. len ( ) ,
671
- } ;
672
- match m {
673
- None => m = Some ( candidate) ,
674
- Some ( ref mut m) => {
675
- if candidate. start < m. start {
676
- * m = candidate;
677
- }
678
- }
679
- }
667
+ self . ac . find ( & haystack[ pos..] ) . next ( ) . map ( |m| {
668
+ Match {
669
+ pat : m. pati ,
670
+ start : pos + m. start ,
671
+ end : pos + m. end ,
680
672
}
681
- }
682
- m
673
+ } )
683
674
}
684
675
}
685
676
@@ -802,17 +793,3 @@ impl UnsafeLoad for u8x16 {
802
793
x
803
794
}
804
795
}
805
-
806
- /// Slow single-substring search used for naive brute force matching.
807
- #[ cold]
808
- pub fn find_slow ( pattern : & [ u8 ] , haystack : & [ u8 ] ) -> Option < usize > {
809
- if pattern. len ( ) > haystack. len ( ) {
810
- return None ;
811
- }
812
- for i in 0 ..( haystack. len ( ) - pattern. len ( ) + 1 ) {
813
- if pattern == & haystack[ i..i + pattern. len ( ) ] {
814
- return Some ( i) ;
815
- }
816
- }
817
- None
818
- }
0 commit comments