|
8 | 8 | // option. This file may not be copied, modified, or distributed
|
9 | 9 | // except according to those terms.
|
10 | 10 |
|
| 11 | +use std::cmp; |
11 | 12 | use std::mem;
|
12 | 13 |
|
13 | 14 | use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
|
@@ -695,14 +696,33 @@ impl BoyerMooreSearch {
|
695 | 696 | /// I had trouble proving a useful turnover point. Hopefully,
|
696 | 697 | /// we can find one in the future.
|
697 | 698 | fn should_use(pattern: &[u8]) -> bool {
|
698 |
| - const CUTOFF_FREQ: usize = 242; |
699 |
| - |
700 |
| - // all the bytes must be more common than the cutoff. |
701 |
| - pattern.iter().all(|c| freq_rank(*c) >= CUTOFF_FREQ) |
702 |
| - // and the pattern must be long enough to be worthwhile. |
703 |
| - // memchr will be faster on `e` because it is short |
704 |
| - // even though e is quite common. |
705 |
| - && pattern.len() > 7 |
| 699 | + // The minimum pattern length required to use TBM. |
| 700 | + const MIN_LEN: usize = 9; |
| 701 | + // The minimum frequency rank (lower is rarer) that every byte in the |
| 702 | + // pattern must have in order to use TBM. That is, if the pattern |
| 703 | + // contains _any_ byte with a lower rank, then TBM won't be used. |
| 704 | + const MIN_CUTOFF: usize = 150; |
| 705 | + // The maximum frequency rank for any byte. |
| 706 | + const MAX_CUTOFF: usize = 255; |
| 707 | + // The scaling factor used to determine the actual cutoff frequency |
| 708 | + // to use (keeping in mind that the minimum frequency rank is bounded |
| 709 | + // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more |
| 710 | + // likely to be used as the pattern grows longer. That is, longer |
| 711 | + // patterns permit somewhat less frequent bytes than shorter patterns, |
| 712 | + // under the assumption that TBM gets better as the pattern gets |
| 713 | + // longer. |
| 714 | + const LEN_CUTOFF_PROPORTION: usize = 4; |
| 715 | + |
| 716 | + let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION); |
| 717 | + let cutoff = cmp::max( |
| 718 | + MIN_CUTOFF, |
| 719 | + MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank), |
| 720 | + ); |
| 721 | + // The pattern must be long enough to be worthwhile. e.g., memchr will |
| 722 | + // be faster on `e` because it is short even though e is quite common. |
| 723 | + pattern.len() > MIN_LEN |
| 724 | + // all the bytes must be more common than the cutoff. |
| 725 | + && pattern.iter().all(|c| freq_rank(*c) >= cutoff) |
706 | 726 | }
|
707 | 727 |
|
708 | 728 | /// Check to see if there is a match at the given position
|
|
0 commit comments