Skip to content

Commit 14a8989

Browse files
BurntSushi, SeanRBurton
authored and committed
Reintroduce the reverse suffix literal optimization.
It's too good to pass up. This time, we avoid quadratic behavior with a simple work-around: we limit the amount of reverse searching we do after having found a literal match. If the reverse search ends at the beginning of its search text (whether a match or not), then we stop the reverse suffix optimization and fall back to the standard forward search.

This reverts commit 50d991e.

# Conflicts:
#	src/exec.rs
1 parent 9fc0ac8 commit 14a8989

File tree

6 files changed

+396
-188
lines changed

6 files changed

+396
-188
lines changed

bench/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ fn main() {
7070
.unwrap_or_else(|e| e.exit());
7171

7272
let mmap = Mmap::open_path(&args.arg_file, Protection::Read).unwrap();
73-
let haystack = unsafe { str::from_utf8(mmap.as_slice()).unwrap() };
73+
let haystack = unsafe { str::from_utf8_unchecked(mmap.as_slice()) };
7474

7575
println!("{}", args.count(&haystack));
7676
}

bench/src/misc.rs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,14 @@ bench_match!(long_needle2, r"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbba", {
9292
repeat("b").take(100_000).collect::<String>() + "a"
9393
});
9494

95+
// This benchmark specifically targets the "reverse suffix literal"
96+
// optimization. In particular, it is easy for a naive implementation to
97+
// take quadratic worst case time. This benchmark provides a case for such
98+
// a scenario.
99+
bench_not_match!(reverse_suffix_no_quadratic, r"[r-z].*bcdefghijklmnopq", {
100+
repeat("bcdefghijklmnopq").take(500).collect::<String>()
101+
});
102+
95103
#[cfg(feature = "re-rust")]
96104
#[bench]
97105
fn replace_all(b: &mut Bencher) {

src/dfa.rs

Lines changed: 33 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -214,7 +214,7 @@ pub struct Fsm<'a> {
214214
#[derive(Clone, Debug)]
215215
pub enum Result<T> {
216216
Match(T),
217-
NoMatch,
217+
NoMatch(usize),
218218
Quit,
219219
}
220220

@@ -223,7 +223,28 @@ impl<T> Result<T> {
223223
pub fn is_match(&self) -> bool {
224224
match *self {
225225
Result::Match(_) => true,
226-
Result::NoMatch | Result::Quit => false,
226+
Result::NoMatch(_) | Result::Quit => false,
227+
}
228+
}
229+
230+
/// Maps the given function onto T and returns the result.
231+
///
232+
/// If this isn't a match, then this is a no-op.
233+
pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
234+
match self {
235+
Result::Match(t) => Result::Match(f(t)),
236+
Result::NoMatch(x) => Result::NoMatch(x),
237+
Result::Quit => Result::Quit,
238+
}
239+
}
240+
241+
/// Sets the non-match position.
242+
///
243+
/// If this isn't a non-match, then this is a no-op.
244+
fn set_non_match(self, at: usize) -> Result<T> {
245+
match self {
246+
Result::NoMatch(_) => Result::NoMatch(at),
247+
r => r,
227248
}
228249
}
229250
}
@@ -465,7 +486,7 @@ impl<'a> Fsm<'a> {
465486
state_flags,
466487
) {
467488
None => return Result::Quit,
468-
Some(STATE_DEAD) => return Result::NoMatch,
489+
Some(STATE_DEAD) => return Result::NoMatch(at),
469490
Some(si) => si,
470491
};
471492
debug_assert!(dfa.start != STATE_UNKNOWN);
@@ -498,7 +519,7 @@ impl<'a> Fsm<'a> {
498519
state_flags,
499520
) {
500521
None => return Result::Quit,
501-
Some(STATE_DEAD) => return Result::NoMatch,
522+
Some(STATE_DEAD) => return Result::NoMatch(at),
502523
Some(si) => si,
503524
};
504525
debug_assert!(dfa.start != STATE_UNKNOWN);
@@ -532,7 +553,7 @@ impl<'a> Fsm<'a> {
532553
state_flags,
533554
) {
534555
None => return Result::Quit,
535-
Some(STATE_DEAD) => return Result::NoMatch,
556+
Some(STATE_DEAD) => return Result::NoMatch(at),
536557
Some(si) => si,
537558
};
538559
debug_assert!(dfa.start != STATE_UNKNOWN);
@@ -601,7 +622,7 @@ impl<'a> Fsm<'a> {
601622
// reported as an index to the most recent byte that resulted in a
602623
// transition to a match state and is always stored in capture slot `1`
603624
// when searching forwards. Its maximum value is `text.len()`.
604-
let mut result = Result::NoMatch;
625+
let mut result = Result::NoMatch(self.at);
605626
let (mut prev_si, mut next_si) = (self.start, self.start);
606627
let mut at = self.at;
607628
while at < text.len() {
@@ -690,7 +711,7 @@ impl<'a> Fsm<'a> {
690711
next_si &= !STATE_START;
691712
prev_si = next_si;
692713
at = match self.prefix_at(text, at) {
693-
None => return Result::NoMatch,
714+
None => return Result::NoMatch(text.len()),
694715
Some(i) => i,
695716
};
696717
} else if next_si >= STATE_UNKNOWN {
@@ -711,7 +732,7 @@ impl<'a> Fsm<'a> {
711732
self.at = at;
712733
next_si = match self.next_state(qcur, qnext, prev_si, byte) {
713734
None => return Result::Quit,
714-
Some(STATE_DEAD) => return result,
735+
Some(STATE_DEAD) => return result.set_non_match(at),
715736
Some(si) => si,
716737
};
717738
debug_assert!(next_si != STATE_UNKNOWN);
@@ -735,7 +756,7 @@ impl<'a> Fsm<'a> {
735756
prev_si &= STATE_MAX;
736757
prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
737758
None => return Result::Quit,
738-
Some(STATE_DEAD) => return result,
759+
Some(STATE_DEAD) => return result.set_non_match(text.len()),
739760
Some(si) => si & !STATE_START,
740761
};
741762
debug_assert!(prev_si != STATE_UNKNOWN);
@@ -762,7 +783,7 @@ impl<'a> Fsm<'a> {
762783
// N.B. The code duplication here is regrettable. Efforts to improve
763784
// it without sacrificing performance are welcome. ---AG
764785
debug_assert!(self.prog.is_reverse);
765-
let mut result = Result::NoMatch;
786+
let mut result = Result::NoMatch(self.at);
766787
let (mut prev_si, mut next_si) = (self.start, self.start);
767788
let mut at = self.at;
768789
while at > 0 {
@@ -816,7 +837,7 @@ impl<'a> Fsm<'a> {
816837
self.at = at;
817838
next_si = match self.next_state(qcur, qnext, prev_si, byte) {
818839
None => return Result::Quit,
819-
Some(STATE_DEAD) => return result,
840+
Some(STATE_DEAD) => return result.set_non_match(at),
820841
Some(si) => si,
821842
};
822843
debug_assert!(next_si != STATE_UNKNOWN);
@@ -837,7 +858,7 @@ impl<'a> Fsm<'a> {
837858
// Run the DFA once more on the special EOF sentinel value.
838859
prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
839860
None => return Result::Quit,
840-
Some(STATE_DEAD) => return result,
861+
Some(STATE_DEAD) => return result.set_non_match(0),
841862
Some(si) => si,
842863
};
843864
debug_assert!(prev_si != STATE_UNKNOWN);

0 commit comments

Comments
 (0)