Don't use dfa for anchored strings with captures

Ethan Pailes · Ethan Pailes · commit bc69b03663c2 · 2017-10-27T17:57:44.000-04:00
The DFA can't produce captures, but is still faster than the Pike VM NFA, so the normal approach to finding capture groups is to look for the entire match with the DFA and then run the NFA on the substring of the input that matched. In cases where the regex is anchored, the match always starts at the beginning of the input, so there is never any point in trying the DFA first. The DFA can still be useful for rejecting inputs which are not in the language of the regular expression, but anchored regexes with capture groups are most commonly used in a parsing context, so it seems like a fair trade-off. For a more in-depth discussion see github issue #348.
diff --git a/bench/src/bench.rs b/bench/src/bench.rs
@@ -236,6 +236,41 @@ macro_rules! bench_find {
     }
 }
 
+// USAGE: bench_captures!(name, pattern, groups, haystack);
+//
+// CONTRACT:
+//   Given:
+//     ident, the desired benchmarking function name
+//     pattern : ::Regex, the regular expression to be executed
+//     groups : usize, the number of capture groups, not counting group 0
+//     haystack : String, the string to search
+//   bench_captures will benchmark how fast re.captures() produces
+//   the capture groups in question.
+macro_rules! bench_captures {
+    ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => {
+
+        #[cfg(feature = "re-rust")]
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            use std::sync::Mutex;
+
+            lazy_static! {
+                static ref RE: Mutex<Regex> = Mutex::new($pattern);
+                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
+            };
+            let re = RE.lock().unwrap();
+            let text = TEXT.lock().unwrap();
+            b.bytes = text.len() as u64;
+            b.iter(|| {
+                match re.captures(&text) {
+                    None => assert!(false, "no captures"),
+                    Some(caps) => assert_eq!($count + 1, caps.len()),
+                }
+            });
+        }
+    }
+}
+
 mod ffi;
 mod misc;
 mod regexdna;
diff --git a/bench/src/misc.rs b/bench/src/misc.rs
@@ -191,3 +191,85 @@ macro_rules! reallyhard2 { () => (r"\w+\s+Holmes") }
 
 bench_match!(reallyhard2_1K, reallyhard2!(),
              get_text(TXT_1K, reallyhard2_suffix()));
+
+
+//
+// Benchmarks written to evaluate a short-haystack NFA fallthrough
+// optimization for `read_captures_at` in regex/src/exec.rs. See github
+// issue #348.
+//
+// The procedure used to try to determine the right hardcoded cutoff
+// for the short-haystack optimization in issue #348 is as follows.
+//
+// ```
+// > cd bench
+// > cargo bench --features re-rust short_hay | tee dfa-nfa.res
+// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at
+// > # to just execute the nfa
+// > cargo bench --features re-rust short_hay | tee nfa-only.res
+// > cargo benchcmp dfa-nfa.res nfa-only.res
+// ```
+//
+// The expected result is that short inputs will go faster under
+// the nfa-only mode, but at some turnover point the dfa-nfa mode
+// will start to win again. Unfortunately, that is not what happened.
+// Instead there was no noticeable change in the bench results, so
+// I've opted to just do the more conservative anchor optimization.
+//
+bench_captures!(short_haystack_1x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    String::from("aaaabbbbccccbbbdddd"));
+bench_captures!(short_haystack_2x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(2).collect::<String>(),
+            repeat("dddd").take(2).collect::<String>(),
+            ));
+bench_captures!(short_haystack_3x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(3).collect::<String>(),
+            repeat("dddd").take(3).collect::<String>(),
+            ));
+bench_captures!(short_haystack_4x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(4).collect::<String>(),
+            repeat("dddd").take(4).collect::<String>(),
+            ));
+bench_captures!(short_haystack_10x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(10).collect::<String>(),
+            repeat("dddd").take(10).collect::<String>(),
+            ));
+bench_captures!(short_haystack_100x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(100).collect::<String>(),
+            repeat("dddd").take(100).collect::<String>(),
+            ));
+bench_captures!(short_haystack_1000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(1000).collect::<String>(),
+            repeat("dddd").take(1000).collect::<String>(),
+            ));
+bench_captures!(short_haystack_10000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(10000).collect::<String>(),
+            repeat("dddd").take(10000).collect::<String>(),
+            ));
+bench_captures!(short_haystack_100000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(100000).collect::<String>(),
+            repeat("dddd").take(100000).collect::<String>(),
+            ));
+bench_captures!(short_haystack_1000000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(1000000).collect::<String>(),
+            repeat("dddd").take(1000000).collect::<String>(),
+            ));
diff --git a/src/exec.rs b/src/exec.rs
@@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
                 })
             }
             MatchType::Dfa => {
-                match self.find_dfa_forward(text, start) {
-                    dfa::Result::Match((s, e)) => {
-                        self.captures_nfa_with_match(slots, text, s, e)
+                if self.ro.nfa.is_anchored_start {
+                    self.captures_nfa(slots, text, start)
+                } else {
+                    match self.find_dfa_forward(text, start) {
+                        dfa::Result::Match((s, e)) => {
+                            self.captures_nfa_with_match(slots, text, s, e)
+                        }
+                        dfa::Result::NoMatch(_) => None,
+                        dfa::Result::Quit => self.captures_nfa(slots, text, start),
                     }
-                    dfa::Result::NoMatch(_) => None,
-                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
                 }
             }
             MatchType::DfaAnchoredReverse => {

Original file line number	Diff line number	Diff line change
`@@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {`
`554`	`554`	`})`
`555`	`555`	`}`
`556`	`556`	`MatchType::Dfa => {`
`557`		`- match self.find_dfa_forward(text, start) {`
`558`		`- dfa::Result::Match((s, e)) => {`
`559`		`- self.captures_nfa_with_match(slots, text, s, e)`
	`557`	`+ if self.ro.nfa.is_anchored_start {`
	`558`	`+ self.captures_nfa(slots, text, start)`
	`559`	`+ } else {`
	`560`	`+ match self.find_dfa_forward(text, start) {`
	`561`	`+ dfa::Result::Match((s, e)) => {`
	`562`	`+ self.captures_nfa_with_match(slots, text, s, e)`
	`563`	`+ }`
	`564`	`+ dfa::Result::NoMatch(_) => None,`
	`565`	`+ dfa::Result::Quit => self.captures_nfa(slots, text, start),`
`560`	`566`	`}`
`561`		`- dfa::Result::NoMatch(_) => None,`
`562`		`- dfa::Result::Quit => self.captures_nfa(slots, text, start),`
`563`	`567`	`}`
`564`	`568`	`}`
`565`	`569`	`MatchType::DfaAnchoredReverse => {`