Skip to content

Reintroduce the reverse suffix literal optimization. #228

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 7, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ fn main() {
.unwrap_or_else(|e| e.exit());

let mmap = Mmap::open_path(&args.arg_file, Protection::Read).unwrap();
let haystack = unsafe { str::from_utf8(mmap.as_slice()).unwrap() };
let haystack = unsafe { str::from_utf8_unchecked(mmap.as_slice()) };

println!("{}", args.count(&haystack));
}
Expand Down
8 changes: 8 additions & 0 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ bench_match!(long_needle2, r"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbba", {
repeat("b").take(100_000).collect::<String>() + "a"
});

// This benchmark specifically targets the "reverse suffix literal"
// optimization. The haystack is the suffix literal `bcdefghijklmnopq`
// repeated 500 times, so the literal occurs at many positions, yet the regex
// can never match because no byte in the haystack falls in `[r-z]`. A naive
// implementation that restarts a full reverse scan from every suffix
// occurrence would presumably take quadratic time here; this benchmark
// guards against that worst case.
bench_not_match!(reverse_suffix_no_quadratic, r"[r-z].*bcdefghijklmnopq", {
repeat("bcdefghijklmnopq").take(500).collect::<String>()
});

#[cfg(feature = "re-rust")]
#[bench]
fn replace_all(b: &mut Bencher) {
Expand Down
45 changes: 33 additions & 12 deletions src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ pub struct Fsm<'a> {
#[derive(Clone, Debug)]
pub enum Result<T> {
Match(T),
NoMatch,
NoMatch(usize),
Quit,
}

Expand All @@ -223,7 +223,28 @@ impl<T> Result<T> {
pub fn is_match(&self) -> bool {
match *self {
Result::Match(_) => true,
Result::NoMatch | Result::Quit => false,
Result::NoMatch(_) | Result::Quit => false,
}
}

/// Transforms the payload of a match with `f`, producing a `Result<U>`.
///
/// `f` is called only for `Result::Match`. A `Result::NoMatch` keeps its
/// position unchanged and `Result::Quit` stays `Result::Quit`.
pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
    match self {
        Result::Quit => Result::Quit,
        Result::NoMatch(at) => Result::NoMatch(at),
        Result::Match(value) => {
            let mapped = f(value);
            Result::Match(mapped)
        }
    }
}

/// Replaces the position carried by a non-match with `at`.
///
/// `Result::Match` and `Result::Quit` are returned untouched.
fn set_non_match(self, at: usize) -> Result<T> {
    // The wildcard pattern binds nothing, so `self` is still available to
    // return as-is when this is not a non-match.
    if let Result::NoMatch(_) = self {
        return Result::NoMatch(at);
    }
    self
}
}
Expand Down Expand Up @@ -465,7 +486,7 @@ impl<'a> Fsm<'a> {
state_flags,
) {
None => return Result::Quit,
Some(STATE_DEAD) => return Result::NoMatch,
Some(STATE_DEAD) => return Result::NoMatch(at),
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
Expand Down Expand Up @@ -498,7 +519,7 @@ impl<'a> Fsm<'a> {
state_flags,
) {
None => return Result::Quit,
Some(STATE_DEAD) => return Result::NoMatch,
Some(STATE_DEAD) => return Result::NoMatch(at),
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
Expand Down Expand Up @@ -532,7 +553,7 @@ impl<'a> Fsm<'a> {
state_flags,
) {
None => return Result::Quit,
Some(STATE_DEAD) => return Result::NoMatch,
Some(STATE_DEAD) => return Result::NoMatch(at),
Some(si) => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
Expand Down Expand Up @@ -601,7 +622,7 @@ impl<'a> Fsm<'a> {
// reported as an index to the most recent byte that resulted in a
// transition to a match state and is always stored in capture slot `1`
// when searching forwards. Its maximum value is `text.len()`.
let mut result = Result::NoMatch;
let mut result = Result::NoMatch(self.at);
let (mut prev_si, mut next_si) = (self.start, self.start);
let mut at = self.at;
while at < text.len() {
Expand Down Expand Up @@ -690,7 +711,7 @@ impl<'a> Fsm<'a> {
next_si &= !STATE_START;
prev_si = next_si;
at = match self.prefix_at(text, at) {
None => return Result::NoMatch,
None => return Result::NoMatch(text.len()),
Some(i) => i,
};
} else if next_si >= STATE_UNKNOWN {
Expand All @@ -711,7 +732,7 @@ impl<'a> Fsm<'a> {
self.at = at;
next_si = match self.next_state(qcur, qnext, prev_si, byte) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(STATE_DEAD) => return result.set_non_match(at),
Some(si) => si,
};
debug_assert!(next_si != STATE_UNKNOWN);
Expand All @@ -735,7 +756,7 @@ impl<'a> Fsm<'a> {
prev_si &= STATE_MAX;
prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(STATE_DEAD) => return result.set_non_match(text.len()),
Some(si) => si & !STATE_START,
};
debug_assert!(prev_si != STATE_UNKNOWN);
Expand All @@ -762,7 +783,7 @@ impl<'a> Fsm<'a> {
// N.B. The code duplication here is regrettable. Efforts to improve
// it without sacrificing performance are welcome. ---AG
debug_assert!(self.prog.is_reverse);
let mut result = Result::NoMatch;
let mut result = Result::NoMatch(self.at);
let (mut prev_si, mut next_si) = (self.start, self.start);
let mut at = self.at;
while at > 0 {
Expand Down Expand Up @@ -816,7 +837,7 @@ impl<'a> Fsm<'a> {
self.at = at;
next_si = match self.next_state(qcur, qnext, prev_si, byte) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(STATE_DEAD) => return result.set_non_match(at),
Some(si) => si,
};
debug_assert!(next_si != STATE_UNKNOWN);
Expand All @@ -837,7 +858,7 @@ impl<'a> Fsm<'a> {
// Run the DFA once more on the special EOF sentinel value.
prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
None => return Result::Quit,
Some(STATE_DEAD) => return result,
Some(STATE_DEAD) => return result.set_non_match(0),
Some(si) => si,
};
debug_assert!(prev_si != STATE_UNKNOWN);
Expand Down
Loading