literals: tweak the TBM heuristic

BurntSushi · BurntSushi · commit 392b3d63a25c · 2017-12-30T14:50:40.000-05:00
This commit tweaks the heuristic employed to determine whether to use TBM
or not. For the most part, the heuristic was tweaked by combining the
actual benchmark results with a bit of hand waving. In particular, the
primary change here is that the frequency rank cutoff is no longer a
constant, but rather, a function of the pattern length. That is, we guess
that TBM will do well with longer patterns, even if they contain somewhat
infrequent bytes. We do put a constant cap on this heuristic. That is,
regardless of the length of the pattern, if a "very rare" byte is found
in the pattern, then we won't use TBM.
diff --git a/src/literals.rs b/src/literals.rs
@@ -8,6 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use std::cmp;
 use std::mem;
 
 use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
@@ -695,14 +696,33 @@ impl BoyerMooreSearch {
     /// I had trouble proving a useful turnover point. Hopefully,
     /// we can find one in the future.
     fn should_use(pattern: &[u8]) -> bool {
-        const CUTOFF_FREQ: usize = 242;
-
-        // all the bytes must be more common than the cutoff.
-        pattern.iter().all(|c| freq_rank(*c) >= CUTOFF_FREQ)
-            // and the pattern must be long enough to be worthwhile.
-            // memchr will be faster on `e` because it is short
-            // even though e is quite common.
-            && pattern.len() > 7
+        // The length a pattern must exceed (strictly) in order to use TBM.
+        const MIN_LEN: usize = 9;
+        // The minimum frequency rank (lower is rarer) that every byte in the
+        // pattern must have in order to use TBM. That is, if the pattern
+        // contains _any_ byte with a lower rank, then TBM won't be used.
+        const MIN_CUTOFF: usize = 150;
+        // The maximum frequency rank for any byte.
+        const MAX_CUTOFF: usize = 255;
+        // The scaling factor used to determine the actual cutoff frequency
+        // to use (keeping in mind that the minimum frequency rank is bounded
+        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
+        // likely to be used as the pattern grows longer. That is, longer
+        // patterns permit somewhat less frequent bytes than shorter patterns,
+        // under the assumption that TBM gets better as the pattern gets
+        // longer.
+        const LEN_CUTOFF_PROPORTION: usize = 4;
+
+        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
+        let cutoff = cmp::max(
+            MIN_CUTOFF,
+            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
+        );
+        // The pattern must be long enough to be worthwhile. e.g., memchr will
+        // be faster on `e` because it is short even though e is quite common.
+        pattern.len() > MIN_LEN
+            // all the bytes must be at least as common as the cutoff.
+            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
     }
 
     /// Check to see if there is a match at the given position