diff --git a/HACKING.md b/HACKING.md
index 9556de6ecc..a106c4b337 100644
--- a/HACKING.md
+++ b/HACKING.md
@@ -270,9 +270,9 @@ N.B. To run tests for the `regex!` macro, use:
 
 The benchmarking in this crate is made up of many micro-benchmarks. Currently,
 there are two primary sets of benchmarks: the benchmarks that were adopted
-at this library's inception (in `benches/src/misc.rs`) and a newer set of
+at this library's inception (in `bench/src/misc.rs`) and a newer set of
 benchmarks meant to test various optimizations. Specifically, the latter set
-contain some analysis and are in `benches/src/sherlock.rs`. Also, the latter
+contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter
 set are all executed on the same lengthy input whereas the former benchmarks
 are executed on strings of varying length.
 
@@ -299,20 +299,20 @@ library benchmarks (especially RE2).
 If you're hacking on one of the matching engines and just want to see
 benchmarks, then all you need to run is:
 
-    $ ./run-bench rust
+    $ ./bench/run rust
 
 If you want to compare your results with older benchmarks, then try:
 
-    $ ./run-bench rust | tee old
+    $ ./bench/run rust | tee old
     $ ... make it faster
-    $ ./run-bench rust | tee new
-    $ cargo-benchcmp old new --improvements
+    $ ./bench/run rust | tee new
+    $ cargo benchcmp old new --improvements
 
 The `cargo-benchcmp` utility is available here:
 https://github.com/BurntSushi/cargo-benchcmp
 
-The `run-bench` utility can run benchmarks for PCRE and Oniguruma too. See
-`./run-bench --help`.
+The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See
+`./bench/run --help`.
 
 ## Dev Docs
 
diff --git a/bench/Cargo.toml b/bench/Cargo.toml
index 8875fed487..2448636887 100644
--- a/bench/Cargo.toml
+++ b/bench/Cargo.toml
@@ -40,8 +40,7 @@ bench = false
 # Doing anything else will probably result in weird "duplicate definition"
 # compiler errors.
 #
-# Tip: use the run-bench script in the root of this repository to run
-# benchmarks.
+# Tip: use the `run` script in this directory (`bench/run`) to run benchmarks.
 [features]
 re-pcre1 = ["libpcre-sys"]
 re-pcre2 = []
diff --git a/bench/src/bench.rs b/bench/src/bench.rs
index a45079edc0..92e780e6b4 100644
--- a/bench/src/bench.rs
+++ b/bench/src/bench.rs
@@ -236,6 +236,41 @@ macro_rules! bench_find {
     }
 }
 
+// USAGE: bench_captures!(name, pattern, groups, haystack);
+//
+// CONTRACT:
+//   Given:
+//     name : ident, the desired benchmarking function name
+//     pattern : ::Regex, the regular expression to be executed
+//     groups : usize, the number of capture groups (caps.len() - 1)
+//     haystack : String, the string to search
+//   bench_captures benchmarks re.captures() on haystack (re-rust only) and
+//   panics unless it matches and caps.len() == groups + 1.
+macro_rules! bench_captures {
+    ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => {
+
+        #[cfg(feature = "re-rust")]
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            use std::sync::Mutex;
+
+            lazy_static! {
+                static ref RE: Mutex<Regex> = Mutex::new($pattern);
+                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
+            };
+            let re = RE.lock().unwrap();
+            let text = TEXT.lock().unwrap();
+            b.bytes = text.len() as u64;
+            b.iter(|| {
+                match re.captures(&text) {
+                    None => assert!(false, "no captures"),
+                    Some(caps) => assert_eq!($count + 1, caps.len()),
+                }
+            });
+        }
+    }
+}
+
 mod ffi;
 mod misc;
 mod regexdna;
diff --git a/bench/src/misc.rs b/bench/src/misc.rs
index 86f93c4878..859b59c259 100644
--- a/bench/src/misc.rs
+++ b/bench/src/misc.rs
@@ -191,3 +191,85 @@ macro_rules! reallyhard2 { () => (r"\w+\s+Holmes") }
 
 bench_match!(reallyhard2_1K, reallyhard2!(),
              get_text(TXT_1K, reallyhard2_suffix()));
+
+
+//
+// Benchmarks for the short-haystack NFA fallthrough optimization that was
+// considered for `read_captures_at` in regex/src/exec.rs. See GitHub issue
+// #348.
+//
+// The procedure used to try to determine the right hardcoded cutoff
+// for the short-haystack optimization in issue #348 was as follows.
+//
+// ```
+// > cd bench
+// > cargo bench --features re-rust short_hay | tee dfa-nfa.res
+// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at
+// > # to just execute the NFA
+// > cargo bench --features re-rust short_hay | tee nfa-only.res
+// > cargo benchcmp dfa-nfa.res nfa-only.res
+// ```
+//
+// The expected result was that short inputs would run faster in
+// the nfa-only mode, but that past some crossover point the dfa-nfa
+// mode would start to win again. Unfortunately, that is not what happened.
+// Instead there was no noticeable change in the bench results, so
+// I've opted to just do the more conservative anchor optimization.
+//
+bench_captures!(short_haystack_1x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    String::from("aaaabbbbccccbbbdddd"));
+bench_captures!(short_haystack_2x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(2).collect::<String>(),
+            repeat("dddd").take(2).collect::<String>(),
+            ));
+bench_captures!(short_haystack_3x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(3).collect::<String>(),
+            repeat("dddd").take(3).collect::<String>(),
+            ));
+bench_captures!(short_haystack_4x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(4).collect::<String>(),
+            repeat("dddd").take(4).collect::<String>(),
+            ));
+bench_captures!(short_haystack_10x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(10).collect::<String>(),
+            repeat("dddd").take(10).collect::<String>(),
+            ));
+bench_captures!(short_haystack_100x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(100).collect::<String>(),
+            repeat("dddd").take(100).collect::<String>(),
+            ));
+bench_captures!(short_haystack_1000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(1000).collect::<String>(),
+            repeat("dddd").take(1000).collect::<String>(),
+            ));
+bench_captures!(short_haystack_10000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(10000).collect::<String>(),
+            repeat("dddd").take(10000).collect::<String>(),
+            ));
+bench_captures!(short_haystack_100000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(100000).collect::<String>(),
+            repeat("dddd").take(100000).collect::<String>(),
+            ));
+bench_captures!(short_haystack_1000000x,
+    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
+    format!("{}bbbbccccbbb{}",
+            repeat("aaaa").take(1000000).collect::<String>(),
+            repeat("dddd").take(1000000).collect::<String>(),
+            ));
diff --git a/src/exec.rs b/src/exec.rs
index 458e47d3b0..d12a725cf0 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
                 })
             }
             MatchType::Dfa => {
-                match self.find_dfa_forward(text, start) {
-                    dfa::Result::Match((s, e)) => {
-                        self.captures_nfa_with_match(slots, text, s, e)
+                if self.ro.nfa.is_anchored_start {
+                    self.captures_nfa(slots, text, start)
+                } else {
+                    match self.find_dfa_forward(text, start) {
+                        dfa::Result::Match((s, e)) => {
+                            self.captures_nfa_with_match(slots, text, s, e)
+                        }
+                        dfa::Result::NoMatch(_) => None,
+                        dfa::Result::Quit => self.captures_nfa(slots, text, start),
                     }
-                    dfa::Result::NoMatch(_) => None,
-                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
                 }
             }
             MatchType::DfaAnchoredReverse => {