diff --git a/Cargo.toml b/Cargo.toml index 2a3376e39a..134c04e532 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,6 +114,11 @@ name = "backtrack-bytes" path = "tests/test_crates_regex.rs" name = "crates-regex" +# Run the test suite on the onepass engine. +[[test]] +path = "tests/test_onepass.rs" +name = "onepass" + [profile.release] debug = true diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index a7e70ef596..8c2f97ef14 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -309,6 +309,11 @@ impl IntervalSet { } true } + + /// Returns true iff this class is empty. + pub fn is_empty(&self) -> bool { + self.ranges.is_empty() + } } /// An iterator over intervals. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 903e6085be..fb82dd9425 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -797,6 +797,16 @@ impl ClassUnicode { pub fn symmetric_difference(&mut self, other: &ClassUnicode) { self.set.symmetric_difference(&other.set); } + + /// Returns true iff this character class contains no characters. + /// + /// This should never be true for a character class which was + /// constructed by the regex parser, but a notion of character + /// class emptiness can be useful for code that wants to + /// programmatically generate character classes. + pub fn is_empty(&self) -> bool { + self.set.is_empty() + } } /// An iterator over all ranges in a Unicode character class. @@ -998,6 +1008,16 @@ impl ClassBytes { pub fn is_all_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) } + + /// Returns true iff this character class contains no characters. + /// + /// This should never be true for a character class which was + /// constructed by the regex parser, but a notion of character + /// class emptiness can be useful for code that wants to + /// programmatically generate character classes. 
+ pub fn is_empty(&self) -> bool { + self.set.is_empty() + } } /// An iterator over all ranges in a byte character class. diff --git a/src/analysis.rs b/src/analysis.rs new file mode 100644 index 0000000000..8f4f6ceb34 --- /dev/null +++ b/src/analysis.rs @@ -0,0 +1,614 @@ +use syntax::hir::{ + Hir, HirKind, Literal, ClassBytes, ClassBytesRange, + Class, RepetitionRange, RepetitionKind +}; +use utf8_ranges::Utf8Sequences; + +/// True iff the given expression is onepass +/// +/// The general approach here is to find all the places in +/// the given Hir where any sort of branching occurs, +/// and examine the start of each expression at the branch +/// to see if there is an ambiguity. +/// +/// For example, given the regex `a|b`, we would examine +/// both branches of the alternation `a` and `b` and +/// notice that they don't clash, so the regex is onepass. +/// On the other hand the branches of `a|a` do clash, +/// so that regex is not onepass. +/// +/// Alternations are not the only branch points in a regex. +/// We also have to make sure to consider repetitions like +/// `a*a`, which is not onepass because there is no way +/// to tell whether we have to loop back to the repeated +/// expression or continue on by looking at just one byte. +/// `a*b` is onepass because you can figure out what to do. +/// If you see an `a`, go back to the start of the loop, +/// and if you see a `b` continue onward. +/// +/// A third, more subtle case is the case of concatenations +/// of expressions where some of the expressions can +/// accept the empty string. Consider `a(b|)ba`. This +/// regex is not onepass because it is not clear what to +/// do upon seeing the input `ab`. The problem is that `(b|)` +/// and `ba` clash with one other. +/// +/// To get a bit more specific about what it means for two +/// expressions to clash, we introduce the concept of first +/// sets. 
The first set of an expression is the set of +/// bytes which might begin a word in the language of that +/// expression. If the expression can accept the empty string, +/// the first set takes note of that as well. +/// +/// To handle these three cases, we use a visitor to +/// find the alternations, repetitions, and concatenations. +/// Whenever we find one of the above cases, we compute +/// the first set of the various branches involved, +/// then check to see if the first sets intersect. If +/// we ever find a non-empty intersection, the regex +/// is not onepass. +pub fn is_onepass(expr: &Hir) -> bool { + fset_of(expr).is_onepass +} + +/// Compute the first set of a given regular expression. +/// +/// The first set of a regular expression is the set of all bytes +/// which might begin it. This is a less general version of the +/// notion of a regular expression preview (the first set can be +/// thought of as the 1-preview of a regular expression). +/// +/// Note that first sets are byte-oriented because the DFA is +/// byte oriented. This means an expression like /Δ|δ/ is actually not +/// onepass, even though there is clearly no non-determinism inherent +/// to the regex at a unicode code point level (big delta and little +/// delta start with the same byte). +fn fset_of(expr: &Hir) -> FirstSet { + fn singleton(b: u8) -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b, b)); + f + } + + match expr.kind() { + &HirKind::Empty => FirstSet::epsilon(), + &HirKind::Literal(ref lit) => { + match lit { + &Literal::Unicode(c) => singleton(first_byte(c)), + &Literal::Byte(b) => singleton(b), + } + } + &HirKind::Class(ref class) => { + let mut fset = match class { + &Class::Unicode(ref c) => { + // Get all the bytes which might begin this unicode + // class. 
+ let mut cb = FirstSet::empty(); + for cr in c.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + cb.push_bytes( + ClassBytesRange::new(first.start, first.end)); + } + } + cb + } + &Class::Bytes(ref b) => + FirstSet::new(b.iter().map(|x| *x), false), + }; + + fset.is_onepass = class_is_onepass(class); + fset + } + + // When an empty look (Anchor or WordBoundary) is at the start of + // a concatenation, we conservatively assume that the assertion + // will pass, so we just drop it. Then we can only get to this + // point if we are dealing with some sort of naked empty look. + // For now we just do the most conservative thing and say + // that such an emptylook could potentially match on any character. + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => FirstSet::anychar(), + + &HirKind::Repetition(ref rep) => { + let mut fset = fset_of(&rep.hir); + + fset.accepts_empty = match rep.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => fset.accepts_empty, + RepetitionKind::Range(ref range) => { + match range { + &RepetitionRange::Exactly(0) + | &RepetitionRange::AtLeast(0) + | &RepetitionRange::Bounded(0, _) => true, + _ => fset.accepts_empty, + } + } + }; + + fset + }, + &HirKind::Group(ref group) => fset_of(&group.hir), + + // The most involved case. We need to strip leading empty-looks + // as well as take the union of the first sets of the first n+1 + // expressions where n is the number of leading expressions which + // accept the empty string. + &HirKind::Concat(ref es) => { + let mut fset = FirstSet::empty(); + let mut inner_fsets = Vec::with_capacity(es.len()); + for e in es.iter() { + inner_fsets.push(fset_of(e)); + } + for (i, e) in es.iter().enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + // Ignore any leading emptylooks, but any in tail + // position have to be accounted for. 
+ if i == es.len() - 1 { + fset.union(&FirstSet::anychar()); + fset.accepts_empty = false; + } + } + _ => { + fset.union(&inner_fsets[i]); + + if !inner_fsets[i].accepts_empty { + fset.accepts_empty = false; + // We can stop accumulating after we stop seeing + // first sets which contain epsilon. + break; + } + } + } + } + + fset.is_onepass = concat_is_onepass(es, &inner_fsets); + + fset + } + &HirKind::Alternation(ref es) => { + let mut fset = FirstSet::empty(); + let mut inner_fsets = Vec::with_capacity(es.len()); + for (i, e) in es.iter().enumerate() { + inner_fsets.push(fset_of(e)); + fset.union(&inner_fsets[i]); + } + + fset.is_onepass = !FirstSet::fsets_clash_value(&inner_fsets); + + fset + } + } +} + +// Unicode classes are really just big alternatives from the byte +// oriented point of view. +// +// This function translates a unicode class into the +// byte space and checks for intersecting first sets. +// +// Byte classes are always onepass +fn class_is_onepass(cls: &Class) -> bool { + match cls { + &Class::Unicode(ref ucls) => { + let mut seen_char: [bool; 256] = [false; 256]; + + for cr in ucls.iter() { + for br in Utf8Sequences::new(cr.start(), cr.end()) { + let first = br.as_slice()[0]; + for b in first.start..(first.end+1) { + if seen_char[b as usize] { + return false; + } + seen_char[b as usize] = true; + } + } + } + } + _ => {} + } + + true +} + +fn concat_is_onepass(es: &[Hir], inner_fsets: &[FirstSet]) -> bool { + let mut empty_run = vec![]; + + for (i, e) in NestedConcat::new(es).enumerate() { + match e.kind() { + &HirKind::Anchor(_) | &HirKind::WordBoundary(_) => { + if i < es.len() - 1 { + continue; + } + } + _ => {} // FALLTHROUGH + } + + if !inner_fsets[i].is_onepass { + return false; + } + + let is_real_rep = match e.kind() { + &HirKind::Repetition(ref rep) => { + match rep.kind { + RepetitionKind::Range( + RepetitionRange::Exactly(_)) => false, + _ => true, + } + }, + _ => false, + }; + + empty_run.push(&inner_fsets[i]); + if 
!(inner_fsets[i].accepts_empty || is_real_rep) { + if FirstSet::fsets_clash_ref(&empty_run) { + return false; + } + empty_run.clear(); + } + } + + ! FirstSet::fsets_clash_ref(&empty_run) +} + +/// The first byte of a unicode code point. +/// +/// We only ever care about the first byte of a particular character +/// because the onepass DFA is implemented in the byte space not the +/// character space. This means, for example, that a branch between +/// lowercase delta and uppercase delta is actually non-deterministic. +fn first_byte(c: char) -> u8 { + let mut b: [u8; 4] = [0; 4]; + c.encode_utf8(&mut b); + b[0] +} + +/// A representation of all the possible ways a word in the language +/// of a regex could begin. ClassBytes has no way to express the empty +/// string, so we add an extra flag to indicate if a FirstSet includes +/// epsilon. Put in a more theoretical way all firstsets are subsets of +/// SIGMA `union` { epsilon }. +#[derive(Debug, PartialEq, Eq, Clone)] +struct FirstSet { + bytes: ClassBytes, + accepts_empty: bool, + is_onepass: bool, +} + +/// A macro to define the fsets_clash associated functions, +/// parameterized over the type of the inner slice. This lets +/// us avoid allocating an extra vector when we check +/// alternations for onepassness. +macro_rules! def_fsets_clash { + ($fun_name:ident, $slice_inner:ty) => { + /// Check if a list of first sets is incompatible. + fn $fun_name(fsets: &[$slice_inner]) -> bool { + let mut seen_so_far = FirstSet::empty(); + + for fset in fsets.iter() { + let mut snapshot = seen_so_far.clone(); + snapshot.intersect(&fset); + if ! 
snapshot.is_empty() { + return true; + } + + seen_so_far.union(&fset); + } + + false + } + } +} +impl FirstSet { + fn empty() -> Self { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: false, + is_onepass: true, + } + } + + pub fn new(ranges: I, accepts_empty: bool) -> Self + where I: IntoIterator + { + FirstSet { + bytes: ClassBytes::new(ranges), + accepts_empty: accepts_empty, + is_onepass: true, + } + } + + fn anychar() -> FirstSet { + let mut f = FirstSet::empty(); + f.push_bytes(ClassBytesRange::new(b'\0', b'\xFF')); + f + } + + fn epsilon() -> FirstSet { + FirstSet { + bytes: ClassBytes::empty(), + accepts_empty: true, + is_onepass: true, + } + } + + fn push_bytes(&mut self, byte_range: ClassBytesRange) { + self.bytes.push(byte_range); + } + + fn union(&mut self, other: &FirstSet) { + self.bytes.union(&other.bytes); + self.accepts_empty = self.accepts_empty || other.accepts_empty; + } + + fn intersect(&mut self, other: &FirstSet) { + self.bytes.intersect(&other.bytes); + self.accepts_empty = self.accepts_empty && other.accepts_empty; + } + + fn is_empty(&self) -> bool { + self.bytes.is_empty() && !self.accepts_empty + } + + def_fsets_clash!(fsets_clash_ref, &FirstSet); + def_fsets_clash!(fsets_clash_value, FirstSet); +} + +/// An iterator over a concatenation of expressions which +/// drills down into other embedded concatenations. 
+struct NestedConcat<'a>(Vec<(&'a [Hir], usize)>); +impl<'a> NestedConcat<'a> { + fn new(es: &'a [Hir]) -> Self { + NestedConcat(vec![(es, 0)]) + } +} +impl<'a> Iterator for NestedConcat<'a> { + type Item = &'a Hir; + + fn next(&mut self) -> Option<&'a Hir> { + loop { + if self.0.len() == 0 { + return None; + } + + let tip = self.0.len() - 1; + let (es, idx) = self.0[tip]; + + if idx >= es.len() { + self.0.pop(); + continue; + } + + self.0[tip].1 += 1; + + match es[idx].kind() { + &HirKind::Concat(ref es) => { + self.0.push((es, 0)); + continue; + } + _ => return Some(&es[idx]), + } + } + } +} + +#[cfg(test)] +mod tests { + use syntax::Parser; + use syntax::hir::Hir; + use super::*; + + fn is_intersecting_fset(e1: &Hir, e2: &Hir) -> bool { + let mut fset = fset_of(e1); + fset.intersect(&fset_of(e2)); + ! fset.is_empty() + } + + // + // First Set intersection smoke tests + // + + #[test] + fn fset_lit() { + let e1 = Parser::new().parse("a").unwrap(); + let e2 = Parser::new().parse("a").unwrap(); + let e3 = Parser::new().parse("b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class() { + let e1 = Parser::new().parse("[a]").unwrap(); + let e2 = Parser::new().parse("[a]").unwrap(); + let e3 = Parser::new().parse("[b]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_class_n() { + let e1 = Parser::new().parse("[xamn]").unwrap(); + let e2 = Parser::new().parse("[rlwa]").unwrap(); + let e3 = Parser::new().parse("[bcq]").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_alt() { + let e1 = Parser::new().parse("ab|bc|ad").unwrap(); + let e2 = Parser::new().parse("yyyy|am|zz").unwrap(); + let e3 = Parser::new().parse("cc|ww").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn 
fset_group() { + let e1 = Parser::new().parse("(?:ab)").unwrap(); + let e2 = Parser::new().parse("(?:aq)").unwrap(); + let e3 = Parser::new().parse("(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_concat() { + let e1 = Parser::new().parse("aa(?:nb)").unwrap(); + let e2 = Parser::new().parse("aa(?:rq)").unwrap(); + let e3 = Parser::new().parse("bb(?:m)").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\baa").unwrap(); + let e3 = Parser::new().parse(r"\bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_not_word_boundary_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\Baa").unwrap(); + let e3 = Parser::new().parse(r"\Bbb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_not_word_boundary_all() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"\B").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + #[test] + fn fset_start_anchor_dropped() { + let e1 = Parser::new().parse(r"aa").unwrap(); + let e2 = Parser::new().parse(r"^aa").unwrap(); + let e3 = Parser::new().parse(r"^bb").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + assert!(!is_intersecting_fset(&e1, &e3)); + } + + #[test] + fn fset_terminal_emptylook_all_1_() { + let e = Parser::new().parse(r"a*\b").unwrap(); + let mut fset = FirstSet::anychar(); + fset.is_onepass = false; + + assert_eq!(fset, fset_of(&e), 
"\n\n{:?}\n\n", e); + } + + #[test] + fn fset_terminal_emptylook_all_2_() { + let e = Parser::new().parse(r"(a*)\b").unwrap(); + let mut fset = FirstSet::anychar(); + fset.is_onepass = false; + + assert_eq!(fset, fset_of(&e), "\n\n{:?}\n\n", e); + } + + + #[test] + fn fset_empty_alt() { + let e1 = Parser::new().parse(r"(?:a|())b").unwrap(); + let e2 = Parser::new().parse(r"b").unwrap(); + + assert!(is_intersecting_fset(&e1, &e2)); + } + + // + // onepass smoke tests + // + + macro_rules! test_onepass { + ($fun_name:ident, $re_str:expr) => { + #[test] + fn $fun_name() { + let e = Parser::new().parse($re_str).unwrap(); + let fset = fset_of(&e); + assert!(fset.is_onepass, "fset={:?}", fset); + } + } + } + + macro_rules! test_not_onepass { + ($fun_name:ident, $re_str:expr) => { + #[test] + fn $fun_name() { + let e = Parser::new().parse($re_str).unwrap(); + let fset = fset_of(&e); + assert!(!fset.is_onepass, "fset={:?}", fset); + } + } + } + + test_onepass!(onepass_smoke_1_, r"[^x]x(.*)"); + test_not_onepass!(onepass_smoke_2_, r"(.*)x(.*)"); + + test_not_onepass!(onepass_alt_1_, r"a|b|c|a|d"); + test_not_onepass!(onepass_alt_2_, r"a|b|c|((m|a|x)|g)|d"); + test_onepass!(onepass_alt_3_, r"a|b|c|x|d"); + test_onepass!(onepass_alt_4_, r"a|b|c|((m|x)|g)|d"); + + test_not_onepass!(onepass_not_in_rust, r"(\d+)-(\d+)"); + + test_onepass!(onepass_empty_alt_1_, r"(a|())b"); + test_not_onepass!(onepass_empty_alt_2_, r"(a|())a"); + + test_not_onepass!(onepass_rep_1_, r"a*a"); + test_not_onepass!(onepass_rep_2_, r"a+a"); + test_not_onepass!(onepass_rep_3_, r"a{4,8}a"); + test_not_onepass!(onepass_rep_4_, r"a{4,}a"); + test_onepass!(onepass_rep_5_, r"a{4}a"); + test_not_onepass!(onepass_rep_6_, r"a?a"); + + test_onepass!(onepass_rep_7_, r"a*b"); + test_onepass!(onepass_rep_8_, r"a+b"); + test_onepass!(onepass_rep_9_, r"a{4,8}b"); + test_onepass!(onepass_rep_10_, r"a{4,}b"); + test_onepass!(onepass_rep_11_, r"a{4}b"); + test_onepass!(onepass_rep_12_, r"a?b"); + + 
test_not_onepass!(onepass_concat_middle_1_, r"ab?bc"); + test_onepass!(onepass_concat_middle_2_, r"a(?:b|c)dc"); + + test_not_onepass!(onepass_unicode_class_1_, r"\d"); + test_not_onepass!(onepass_unicode_class_2_, r"\s"); + test_not_onepass!(onepass_unicode_class_3_, r"\w"); + test_not_onepass!(onepass_unicode_class_4_, r"inthe\wmiddle"); + + test_not_onepass!(onepass_unicode_clash_1_, r"Δ|δ"); + + test_not_onepass!(onepass_empty_assert_1_, r"a|^a"); + test_onepass!(onepass_empty_assert_2_, r"\ba"); + test_onepass!(onepass_empty_assert_3_, r"^a"); + test_onepass!(onepass_empty_assert_4_, r"a$"); + + test_not_onepass!(onepass_naked_empty_assert_1_, r"\w|a"); +} diff --git a/src/backtrack.rs b/src/backtrack.rs index 6e71e2c2f3..e49d724ed0 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -245,7 +245,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { ip = inst.goto1; } EmptyLook(ref inst) => { - if self.input.is_empty_match(at, inst) { + if self.input.is_empty_match(at, inst.look) { ip = inst.goto; } else { return false; diff --git a/src/exec.rs b/src/exec.rs index 578289aa5c..ef2cc3db41 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -32,6 +32,7 @@ use re_set; use re_trait::{RegularExpression, Slot, Locations}; use re_unicode; use utf8::next_utf8; +use onepass::{OnePass, OnePassCompiler}; /// `Exec` manages the execution of a regular expression. /// @@ -81,6 +82,8 @@ struct ExecReadOnly { /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. dfa_reverse: Program, + /// A compiled onepass DFA. Always byte based. + onepass: Option, /// A set of suffix literals extracted from the regex. /// /// Prefix literals are stored on the `Program`, since they are used inside @@ -177,6 +180,16 @@ impl ExecBuilder { self } + /// Asks the matching engine to use a onepass DFA if possible. + /// + /// This overrides whatever was previously set via the `automatic`, + /// `nfa`, or `bounded_backtracking` methods. 
+ pub fn onepass(mut self) -> Self { + self.match_type = Some(MatchType::OnePassDfa(Box::new(None))); + self + } + + /// Compiles byte based programs for use with the NFA matching engines. /// /// By default, the NFA engines match on Unicode scalar values. They can @@ -286,6 +299,7 @@ impl ExecBuilder { nfa: Program::new(), dfa: Program::new(), dfa_reverse: Program::new(), + onepass: None, suffixes: LiteralSearcher::empty(), match_type: MatchType::Nothing, }); @@ -320,10 +334,14 @@ impl ExecBuilder { dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; let mut ro = ExecReadOnly { - res: self.options.pats, nfa: nfa, dfa: dfa, dfa_reverse: dfa_reverse, + onepass: OnePassCompiler::new( + &parsed.exprs, + &self.options, + self.only_utf8).and_then(|c| c.compile()).ok(), + res: self.options.pats, suffixes: LiteralSearcher::suffixes(suffixes), match_type: MatchType::Nothing, }; @@ -421,6 +439,21 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.shortest_nfa(text, start), } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + let mut slots = vec![None; self.slots_len()]; + if op.exec(&mut slots, text, start) { + slots[0] + } else { + None + } + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), MatchType::Nothing => None, } @@ -469,6 +502,17 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.match_nfa(text, start), } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + let mut slots = vec![None; self.slots_len()]; + op.exec(&mut slots, text, start) + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), MatchType::Nothing => false, } @@ -512,6 +556,21 @@ impl<'c> RegularExpression for ExecNoSync<'c> { } } } + MatchType::OnePassDfa(_) => { + 
debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + let mut slots = vec![None; self.slots_len()]; + if op.exec(&mut slots, text, start) { + slots[0].and_then(|s1| slots[1].map(|s2| (s1, s2))) + } else { + None + } + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => self.find_nfa(ty, text, start), MatchType::Nothing => None, MatchType::DfaMany => { @@ -534,7 +593,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { text: &[u8], start: usize, ) -> Option<(usize, usize)> { - let slots = locs.as_slots(); + let mut slots = locs.as_slots(); for slot in slots.iter_mut() { *slot = None; } @@ -591,6 +650,20 @@ impl<'c> RegularExpression for ExecNoSync<'c> { dfa::Result::Quit => self.captures_nfa(slots, text, start), } } + MatchType::OnePassDfa(_) => { + debug_assert!(self.ro.onepass.is_some()); + + match self.ro.onepass { + Some(ref op) => { + if op.exec(&mut slots, text, start) { + slots[0].and_then(|s1| slots[1].map(|s2| (s1, s2))) + } else { + None + } + } + None => unreachable!(), + } + } MatchType::Nfa(ty) => { self.captures_nfa_type(ty, slots, text, start) } @@ -1012,18 +1085,31 @@ impl<'c> ExecNoSync<'c> { matches: &mut [bool], text: &[u8], start: usize, + ) -> bool { + self.many_matches_at_match_type( + matches, text, start, &self.ro.match_type) + } + + /// Finds which regular expressions match the given text with a + /// specific match type. 
+ fn many_matches_at_match_type( + &self, + matches: &mut [bool], + text: &[u8], + start: usize, + match_type: &MatchType, ) -> bool { use self::MatchType::*; if !self.is_anchor_end_match(text) { return false; } - match self.ro.match_type { - Literal(ty) => { + match match_type { + &Literal(ty) => { debug_assert_eq!(matches.len(), 1); matches[0] = self.find_literals(ty, text, start).is_some(); matches[0] } - Dfa | DfaAnchoredReverse | DfaSuffix | DfaMany => { + &Dfa | &DfaAnchoredReverse | &DfaSuffix | &DfaMany => { match dfa::Fsm::forward_many( &self.ro.dfa, self.cache, @@ -1044,8 +1130,17 @@ impl<'c> ExecNoSync<'c> { } } } - Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), - Nothing => false, + &OnePassDfa(ref fallback) => { + match **fallback { + Some(ref fb) => + self.many_matches_at_match_type( + matches, text, start, fb), + None => unreachable!( + "BUG: we must have a real fallback by now."), + } + } + &Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start), + &Nothing => false, } } @@ -1141,6 +1236,17 @@ impl Clone for Exec { impl ExecReadOnly { fn choose_match_type(&self, hint: Option) -> MatchType { use self::MatchType::*; + // If we have been asked to use the onepass DFA, we still need + // to choose a fallback in the usual way. + if let Some(OnePassDfa(_)) = hint { + let fallback = self.choose_match_type(None); + if self.onepass.is_some() { + return OnePassDfa(Box::new(Some(fallback))); + } else { + return fallback; + } + } + if let Some(Nfa(_)) = hint { return hint.unwrap(); } @@ -1222,11 +1328,14 @@ impl ExecReadOnly { } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Debug)] enum MatchType { /// A single or multiple literal search. This is only used when the regex /// can be decomposed into unambiguous literal search. Literal(MatchLiteralType), + /// A onepass DFA search. If a onepass search is impossible, we just + /// fall back to an automatically chosen search. + OnePassDfa(Box>), /// A normal DFA search. 
Dfa, /// A reverse DFA search starting from the end of a haystack. diff --git a/src/input.rs b/src/input.rs index 56097bd562..158e26e930 100644 --- a/src/input.rs +++ b/src/input.rs @@ -17,7 +17,7 @@ use std::u32; use syntax; use literal::LiteralSearcher; -use prog::InstEmptyLook; +use prog::EmptyLook; use utf8::{decode_utf8, decode_last_utf8}; /// Represents a location in the input. @@ -92,7 +92,7 @@ pub trait Input { /// Return true if the given empty width instruction matches at the /// input position given. - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool; /// Scan the input for a matching prefix. fn prefix_at( @@ -118,8 +118,8 @@ impl<'a, T: Input> Input for &'a T { fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) } - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { - (**self).is_empty_match(at, empty) + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool { + (**self).is_empty_match(at, look) } fn prefix_at( @@ -173,9 +173,9 @@ impl<'t> Input for CharInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool { use prog::EmptyLook::*; - match empty.look { + match look { StartLine => { let c = self.previous_char(at); at.pos() == 0 || c == '\n' @@ -265,9 +265,9 @@ impl<'t> Input for ByteInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } - fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + fn is_empty_match(&self, at: InputAt, look: EmptyLook) -> bool { use prog::EmptyLook::*; - match empty.look { + match look { StartLine => { let c = self.previous_char(at); at.pos() == 0 || c == '\n' diff --git a/src/lib.rs b/src/lib.rs index 9ca156084a..4a901f983b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -668,6 +668,8 @@ mod re_unicode; mod 
sparse; #[cfg(any(regex_runtime_teddy_ssse3, regex_runtime_teddy_avx2))] mod vector; +mod analysis; +mod onepass; /// The `internal` module exists to support suspicious activity, such as /// testing different matching engines and supporting the `regex-debug` CLI diff --git a/src/onepass.rs b/src/onepass.rs new file mode 100644 index 0000000000..c8bbfcae07 --- /dev/null +++ b/src/onepass.rs @@ -0,0 +1,1189 @@ +/*! +A onepass regex tells us that there are no non-deterministic branches +in the regex which means that we can use a DFA to implement capture +groups without resorting to magic too deep! The main advantage of +a onepass DFA are: + +1. The potential exponential blowup from converting an NFA to a DFA + via the powerset construction goes away. The exponential blowup + comes from compound states, which are a result of non-determinism. + This means that we don't need to muck about with dynamic DFA + construction or caching. + +2. There are no compound states so, we can implement captures with + a DFA. The reason that a general DFA can't handle captures is + that you don't know what to do when the DFA reaches a compound + state which includes one capturing state, but also other states. + This means that the DFA is potentially in either a capturing + NFA state or some other NFA state. For a onepass regex there + will never be a compound state for the DFA, so captures can + be implemented right in the DFA. +*/ + +use std::fmt; +use std::collections::{HashMap, HashSet}; + +use prog::{Program, Inst, EmptyLook}; +use literal::LiteralSearcher; +use re_trait::Slot; +use input::{ByteInput, Input}; +use analysis; +use compile::Compiler; +use syntax::hir::Hir; +use re_builder::RegexOptions; +use Error; + +// flip to true for debugging +const TRACE: bool = false; +macro_rules! trace { + ($($tts:tt)*) => { + if TRACE { + println!($($tts)*); + } + } +} + +/// A OnePass DFA. 
+#[derive(Debug, Clone)] +pub struct OnePass { + /// The flattened transition table of all of the different + /// DFA states. + table: Vec, + /// The prefixes. + prefixes: LiteralSearcher, + /// The stride. + num_byte_classes: usize, + /// The byte classes of this regex. + byte_classes: Vec, + /// The starting state. + start_state: StatePtr, + /// True if the regex is anchored at the start. + is_anchored_start: bool, + /// True if the regex is anchored at the end. + is_anchored_end: bool, + /// True if this regex ought to only accept utf8 strings. + only_utf8: bool, +} + +impl fmt::Display for OnePass { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + writeln!(f, "is_anchored_start: {}", self.is_anchored_start)?; + writeln!(f, "is_anchored_end: {}", self.is_anchored_end)?; + writeln!(f, "START: {}\n", st_str(self.start_state))?; + + for i in 0..(self.table.len() / self.num_byte_classes) { + let addr = format!("{:x}", i * self.num_byte_classes); + let trans_start = i * self.num_byte_classes; + let trans_end = (i+1) * self.num_byte_classes; + let trans = &self.table[trans_start..trans_end]; + Self::fmt_line(addr, trans, f)?; + } + + Ok(()) + } +} + +impl OnePass { + /// Execute the one-pass DFA, populating the list of capture slots + /// as you go. + pub fn exec(&self, slots: &mut [Slot], text: &[u8], mut at: usize) -> bool { + trace!("execing on '{:?}'\n{}", text, self); + + if self.is_anchored_start { + at == 0 && self.exec_(text, at, slots) + } else { + // We are forced to just try every starting index. + // This is noticeably more painful than it is for a + // standard DFA because we must clear the capture slots. + // + // To try to cut down on the cost of zeroing the capture + // groups, we implement a very simple FSM that just + // repeatedly tests to see if the very first DFA + // state could make progress. 
+ loop { + trace!("OnePass::exec Trying to match at={} text.len()={}", + at, text.len()); + if self.exec_(text, at, slots) { + return true; + } + + for s in slots.iter_mut() { + *s = None; + } + + at = self.exec_prefix(text, at + 1); + if at > text.len() { + return false; + } + } + } + } + + /// Given the input and a position in the input, return next + /// position where a match will actually make one character + /// of progress. + fn exec_prefix(&self, text: &[u8], mut at: usize) -> usize { + trace!("::exec_prefix at={}", at); + if at < text.len() && !self.prefixes.is_empty() { + at = at + self.prefixes + .find(&text[at..]) + .map(|(s, _)| s) + .unwrap_or(text.len()); + } else { + while at < text.len() { + let byte_class = self.byte_classes[text[at] as usize] as usize; + if self.table[byte_class] != STATE_DEAD { + break; + } + at += 1; + } + } + + trace!("::exec_prefix next-chance={}", at); + + at + } + + /// Execute the one-pass DFA, populating the list of capture slots + /// as you go. + /// + /// Returns true if there is a match and false otherwise. + #[inline] + fn exec_( + &self, + text: &[u8], + mut at: usize, + slots: &mut [Slot] + ) -> bool { + // We re-use the NFA input machinery for empty looks. We are + // really going to work directly on the byte slice though. + let input = ByteInput::new(text, self.only_utf8); + + let mut state_ptr = self.start_state; + let mut last_match: Slot = None; + + // The inner loop of the onepass DFA. + // + // We bend over backwards to make sure that the inner loop + // logically looks like: + // + // while at < text.len(): + // state_ptr = self.transitions[state_ptr + text[at]] + // at += 1 + // + // As usual, this is a horrible lie. The onepass DFA steals + // the byteclass compression trick from the lazy DFA, so there + // is an extra layer of indirection. Any special flags need to + // be handled, so we also need to check the STATE_SPECIAL mask + // at every step. 
Finally, we use a backstop instead of the + // actual text.len() to check when it is time to break out of + // the loop to facilitate loop unrolling, and to avoid an + // extra branch around when it is time to increment at. + // + // Note that the only difference between this loop and + // the drain loop below is where `at` gets incremented + // and loop unrolling. For bugs that are not related to + // either of those things, it is often easier to just comment + // this loop out and work on the drain loop. Once you've come + // up with the fix, you can transfer your work here. + let step_size = 1; + let backstop = text.len().checked_sub(step_size).unwrap_or(0); + while at < backstop { + if state_ptr & STATE_SPECIAL == 0 { + // This is a weird looking place to increment at. + // The reason we do so has to do with the odd + // representation of a DFA that we've chosen. + // Let's dump the simplest possible regex to unpack + // that. + // + // ```text + // > cd regex-debug + // > cargo run -- --onepass compile 'a' + // is_anchored_start: false + // is_anchored_end: false + // START: (0) + // + // 0: 0/D | 1/8 | 2/D | 3/D + // 4: 0/0 | 1/0 | 2/P | 3/P + // 8: 0/(c) | 1/(c) | 2/(c) | 3/(c) + // c: 0/M | 1/M | 2/M | 3/M + // 10: 0/1 | 1/1 | 2/P | 3/P + // ``` + // + // Our initial state is denoted (0) because it's transition + // table lives at self.table[0] and because it is a + // saving state. This means that it does not correspond + // to the consumption of any input, yet its transition + // table is derived from its child states. In this + // case its only child state is 8. When we transition + // to state 8, the assertion that the first byte be + // 97 has already passed. Then we can't just increment + // at after every input consuming state, as you might + // think at first. The assertions associated with a state + // really get checked right before we enter it, so the + // right thing to do is to increment at only when we + // enter an input consuming state. 
+ // + // One might be concerned that this will cause us to + // skip over the very first byte, but we are saved by + // the fact that the first instruction is always a save + // instruction. + at += 1; + + let byte_class = self.byte_class(text, at); + trace!("::exec_ loop-byte st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + + // No need to mask because no flags are set. + state_ptr = self.follow(state_ptr as usize, byte_class); + } else { + // STATE_HALT and STATE_DEAD must always be checked + // first because they have STATE_ACTION and STATE_MATCH + // set, even though those flags don't apply. It would + // probably be better for performance to check them last, + // so it may be worthwhile to try to rejigger the + // representation of StatePtrs. + if state_ptr == STATE_HALT { + trace!("::exec_ loop-halt"); + break; + } else if state_ptr == STATE_DEAD { + trace!("::exec_ loop-dead"); + slots[FULL_MATCH_CAPTURE_END] = last_match; + return last_match.is_some(); + } + + if state_ptr & STATE_ACTION != 0 { + let byte_class = self.byte_class(text, at); + trace!("::exec_ loop-act st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + let match_state = state_ptr & STATE_MATCH != 0; + state_ptr = + self.act(input, at, slots, state_ptr, byte_class); + // only record a match if the action does not cause death + if state_ptr != STATE_DEAD && match_state { + trace!("::exec_ loop-act-match at={}", at); + last_match = Some(at); + } + } else { + debug_assert!(state_ptr & STATE_MATCH != 0); + at += 1; + let byte_class = self.byte_class(text, at); + trace!("::exec_ loop-match st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + last_match = Some(at); + state_ptr = self.follow( + (state_ptr & STATE_MAX) as usize, byte_class); + + } + } + } + + // + // Drain the input after the backstop. + // + + // First, bump the at pointer if we just passed a byte test. 
+ if state_ptr & STATE_ACTION == 0 { + at += 1; + } + + while at < text.len() { + let byte_class = self.byte_class(text, at); + + if state_ptr & STATE_SPECIAL == 0 { + // NOTE: In the main loop we increment `at` and + // recompute the byte class here. It is not + // safe to do so in the drain loop because we + // might fly off the end of the input and + // get an out of bounds error. + trace!("::exec_ drain-byte st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + + // No need to mask because no flags are set. + state_ptr = self.follow(state_ptr as usize, byte_class); + } else { + // STATE_HALT and STATE_DEAD must always be checked + // first because they have STATE_ACTION and STATE_MATCH + // set, even though those flags don't apply. It would + // probably be better for performance to check them last, + // so it may be worthwhile to try to rejigger the + // representation of StatePtrs. + if state_ptr == STATE_HALT { + trace!("::exec_ drain-halt"); + break; + } else if state_ptr == STATE_DEAD { + trace!("::exec_ drain-dead"); + slots[FULL_MATCH_CAPTURE_END] = last_match; + return last_match.is_some(); + } + + if state_ptr & STATE_ACTION != 0 { + trace!("::exec_ drain-act st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + let match_state = state_ptr & STATE_MATCH != 0; + state_ptr = + self.act(input, at, slots, state_ptr, byte_class); + // only record a match if the action does not cause death + if state_ptr != STATE_DEAD && match_state { + trace!("::exec_ drain-act-match at={}", at); + last_match = Some(at); + } + } else { + debug_assert!(state_ptr & STATE_MATCH != 0); + trace!("::exec_ drain-match st={} at={} bc={} byte={}", + st_str(state_ptr), at, byte_class, text[at]); + last_match = Some(at); + state_ptr = self.follow( + (state_ptr & STATE_MAX) as usize, byte_class); + + } + } + + // We incur the cost of this extra branch in the drain + // loop because we need to make sure that we won't fly + // off the end 
of the string. + if state_ptr & STATE_ACTION == 0 { + at += 1; + } + } + + // + // Execute one last step in the magic EOF byte class + // + + // Set the byte class to be EOF + let byte_class = self.num_byte_classes - 1; + trace!("::exec eof st={} at={} bc={}", + st_str(state_ptr), at, byte_class); + + // One EOF step + if state_ptr & STATE_ACTION == 0 && state_ptr != STATE_DEAD { + if state_ptr & STATE_MATCH != 0 { + trace!("::exec_ eof-match st={} at={} last_match={:?}", + st_str(state_ptr), at, last_match); + last_match = Some(at); + } + state_ptr = self.table[ + (state_ptr & STATE_MAX) as usize + byte_class]; + } + + // Finally, drain any actions. + while state_ptr & STATE_ACTION != 0 && state_ptr != STATE_HALT { + trace!("::exec eof act st={}", st_str(state_ptr)); + let match_state = state_ptr & STATE_MATCH != 0; + state_ptr = self.act(input, at, slots, state_ptr, byte_class); + // only record a match if the action does not cause death + if state_ptr != STATE_DEAD && match_state { + trace!("::exec_ eof-act-match at={}", at); + last_match = Some(at); + } + } + + // + // Finally, we can figure out if we actually got a match. + // + + trace!("::exec_ determine-match st={} at={} last_match={:?} slots={:?}", + st_str(state_ptr), at, last_match, slots); + slots[FULL_MATCH_CAPTURE_END] = last_match; + return last_match.is_some(); + } + + #[inline] + fn act( + &self, + input: I, + at: usize, + slots: &mut [Slot], + state_ptr: StatePtr, + byte_class: usize, + ) -> StatePtr { + // We had better have been called with a state that actually + // needs to be acted on. 
+        debug_assert!(state_ptr & STATE_ACTION != 0);
+
+        let state_idx = (state_ptr & STATE_MAX) as usize;
+        let action_type = self.table[state_idx + self.num_byte_classes];
+
+        if action_type == Action::Save as StatePtr {
+            let slot_idx = self.table[state_idx + self.num_byte_classes + 1];
+            trace!("::act saving slot {}", slot_idx);
+            slots[slot_idx as usize] = Some(at);
+
+            self.follow(state_idx, byte_class)
+        } else {
+            let iat = input.at(at);
+            let look = match action_type {
+                x if x == Action::StartLine as StatePtr => EmptyLook::StartLine,
+                x if x == Action::EndLine as StatePtr => EmptyLook::EndLine,
+                x if x == Action::StartText as StatePtr => EmptyLook::StartText,
+                x if x == Action::EndText as StatePtr => EmptyLook::EndText,
+                x if x == Action::WordBoundary as StatePtr =>
+                    EmptyLook::WordBoundary,
+                x if x == Action::NotWordBoundary as StatePtr =>
+                    EmptyLook::NotWordBoundary,
+                x if x == Action::WordBoundaryAscii as StatePtr =>
+                    EmptyLook::WordBoundaryAscii,
+                x if x == Action::NotWordBoundaryAscii as StatePtr =>
+                    EmptyLook::NotWordBoundaryAscii,
+                _ => unreachable!("Bad action flag."),
+            };
+
+            trace!("::act look={:?}", look);
+
+            if input.is_empty_match(iat, look) {
+                self.follow(state_idx, byte_class)
+            } else {
+                STATE_DEAD
+            }
+        }
+
+    }
+
+    #[inline]
+    fn byte_class(&self, text: &[u8], at: usize) -> usize {
+        self.byte_classes[text[at] as usize] as usize
+    }
+
+    #[inline]
+    fn follow(&self, state_idx: usize, byte_class: usize) -> StatePtr {
+        self.table[state_idx + byte_class]
+    }
+
+    fn fmt_line(
+        addr: String,
+        trans: &[StatePtr],
+        f: &mut fmt::Formatter,
+    ) -> Result<(), fmt::Error> {
+        writeln!(f, "{}: {}", addr,
+            trans.iter().enumerate()
+                .map(|(i, x)| format!("{}/{}", i, st_str(*x)))
+                .collect::<Vec<String>>()
+                .join(" | "))?;
+        Ok(())
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+//                                                                      //
+//                            Compiler                                  //
+//                                                                      //
+//////////////////////////////////////////////////////////////////////////
+
+/// Compiler for a
OnePass DFA
+pub struct OnePassCompiler {
+    /// The flattened transition table AKA the baked form of the DFA.
+    table: Vec<StatePtr>,
+
+    num_byte_classes: usize,
+    only_utf8: bool,
+
+    /// The program to be compiled.
+    prog: Program,
+
+    /// A mapping from instruction indices to their transitions
+    transitions: Vec<Option<TransitionTable>>,
+
+    /// A mapping from instruction indices to flags indicating
+    /// if they should have the STATE_MATCH flag set.
+    accepting_states: Vec<bool>,
+}
+
+#[derive(Debug)]
+pub enum OnePassError {
+    /// This program can't be executed as a one-pass regex.
+    HasNondeterminism,
+    /// This program contains a cycle of instructions that consume
+    /// no input. Right now we can't handle that, but this restriction
+    /// may be lifted in the future.
+    ForwardingCycle,
+    /// There are too many instructions to deal with.
+    TooBig,
+    /// An error happened when we tried to compile the regex.
+    CompileError(Error),
+    /// We don't support multiple regex at once.
+    RegexSetUnsupported,
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+impl From<Error> for OnePassError {
+    fn from(e: Error) -> Self {
+        OnePassError::CompileError(e)
+    }
+}
+
+impl OnePassCompiler {
+    /// Create a new OnePassCompiler for a given Hir.
+    /// Collect some metadata from the compiled program.
+    pub fn new(
+        es: &[Hir],
+        options: &RegexOptions,
+        only_utf8: bool,
+    ) -> Result<Self, OnePassError> {
+        if es.len() != 1 {
+            return Err(OnePassError::RegexSetUnsupported);
+        }
+
+        if ! analysis::is_onepass(&es[0]) {
+            return Err(OnePassError::HasNondeterminism);
+        }
+
+        let prog = Compiler::new()
+            .size_limit(options.size_limit)
+            .bytes(true)
+            .only_utf8(only_utf8)
+            .compile(es)?;
+
+        let num_byte_classes = (prog.byte_classes[255] as usize) + 2;
+
+        // We don't allow STATE_MAX to actually be used so that STATE_POISON
+        // remains a valid poison value.
+        let max_table_size = (STATE_MAX - 1) as usize;
+        let mut table_size: usize = 0;
+        for inst in prog.iter() {
+            table_size += num_byte_classes;
+            match inst {
+                &Inst::EmptyLook(_) | &Inst::Save(_) =>
+                    table_size += num_byte_classes,
+                _ => {}
+            }
+            if table_size > max_table_size {
+                return Err(OnePassError::TooBig);
+            }
+        }
+
+        trace!("new compiler for:\n{:?}", prog);
+        Ok(OnePassCompiler {
+            table: vec![],
+            num_byte_classes: num_byte_classes,
+            only_utf8: only_utf8,
+
+            transitions: {
+                let mut x = Vec::new();
+                for _ in 0..prog.len() {
+                    x.push(None);
+                }
+                x
+            },
+            accepting_states: vec![false; prog.len()],
+            prog: prog,
+        })
+    }
+
+    /// Attempt to compile the regex to a OnePass DFA
+    pub fn compile(mut self) -> Result<OnePass, OnePassError> {
+        // A DAG of forwarding relationships indicating when
+        // a state needs to be forwarded to an Action state
+        // once that Action state has been fully constructed.
+        let mut forwards = Forwards::new();
+
+        // Compute the prioritized transition tables for all of the
+        // instructions which get states.
+        let mut state_edge = vec![0];
+        while let Some(i) = state_edge.pop() {
+            self.inst_trans(i, &mut forwards, &mut state_edge)?;
+        }
+
+        // Solve the dependency relationships between all the
+        // forwarding directives that were emitted by inst_trans.
+        for fwd in forwards.into_iter_topo() {
+            self.perform_forward(fwd?);
+        }
+
+        // Now emit the transitions in a form that we can actually
+        // execute.
+        self.bake_transitions();
+
+        Ok(OnePass {
+            table: self.table,
+            prefixes: self.prog.prefixes,
+            num_byte_classes: self.num_byte_classes,
+            byte_classes: self.prog.byte_classes,
+            start_state: 0 | STATE_ACTION,
+            is_anchored_start: self.prog.is_anchored_start,
+            is_anchored_end: self.prog.is_anchored_end,
+            only_utf8: self.only_utf8,
+        })
+    }
+
+    /// Compile the stage 1 transition table for the state corresponding
+    /// to the given instruction.
+    ///
+    /// The result of `inst_trans` will end up in `self.transitions`.
+    ///
+    /// Returns a list of child instructions which must be compiled
+    /// via `inst_trans`.
+    fn inst_trans(
+        &mut self,
+        inst_idx: usize,
+        forwards: &mut Forwards,
+        state_edge: &mut Vec<usize>,
+    ) -> Result<(), OnePassError> {
+        trace!("::inst_trans inst_idx={}", inst_idx);
+
+        if self.transitions[inst_idx].is_some() {
+            // we've already computed the transition table for this state.
+            return Ok(());
+        }
+
+        // Iterate over the children, visiting lower priority
+        // children first.
+        let mut resume = match &self.prog[inst_idx] {
+            &Inst::Save(ref inst) => vec![inst.goto],
+            &Inst::EmptyLook(ref inst) => vec![inst.goto],
+            &Inst::Bytes(ref inst) => vec![inst.goto],
+            &Inst::Split(ref inst) => vec![inst.goto1, inst.goto2],
+            &Inst::Match(_) => return Ok(()), // no kids
+            &Inst::Ranges(_) | &Inst::Char(_) => unreachable!(),
+        };
+
+        let mut trans = TransitionTable(
+            vec![Transition { tgt: TransitionTarget::Die, priority: 0 };
+                 self.num_byte_classes]);
+
+        // Start at priority 1 because everything is higher priority than
+        // the initial list of `TransitionTarget::Die` pointers.
+ let mut priority = 1; + + while let Some(child_idx) = resume.pop() { + match &self.prog[child_idx] { + &Inst::EmptyLook(_) | &Inst::Save(_) => { + forwards.forward(inst_idx, child_idx, priority); + state_edge.push(child_idx); + } + &Inst::Bytes(ref inst) => { + // Weird usize casting shenanigans because a Bytes + // instruction has inclusive ranges, but rust uses + // closed-open ranges. + for byte in (inst.start as usize)..(inst.end as usize + 1) { + let byte = byte as u8; + let bc = self.prog.byte_classes[byte as usize]; + trans.0[bc as usize] = Transition { + tgt: TransitionTarget::BytesInst(child_idx), + priority: priority + }; + } + state_edge.push(child_idx); + } + &Inst::Split(ref inst) => { + resume.push(inst.goto1); + resume.push(inst.goto2); + } + &Inst::Match(_) => { + self.accepting_states[inst_idx] = true; + for t in trans.0.iter_mut() { + // Note that we go from lowest to highest + // priority, so we don't have to worry about + // clobbering higher priority transitions here. + *t = Transition { + tgt: TransitionTarget::Match, + priority: priority + }; + } + } + &Inst::Ranges(_) | &Inst::Char(_) => unreachable!(), + } + priority += 1; + } + + self.transitions[inst_idx] = Some(trans); + + Ok(()) + } + + /// Execute a forwarding job. + /// + /// To make that a little more concrete, consider the program snippet: + /// + /// 0000: Bytes(a, a) + /// 0001: Save(2) + /// 0002: Bytes(b, b) + /// + /// Here the state for `Bytes(a, a)` needs to transition to + /// the state for `Save(2)`, but it does not know when to do + /// so. The right answer is that it should transition to + /// the `Save(2)` state when it sees a `b`, but it is hard + /// to know what children `Save(2)` has from where `Bytes(a, a)` + /// stands. To handle this we just emit a forwarding job + /// that says "when you know enough about the `Save(2)` state, + /// please forward `Bytes(a, a)` to `Save(2)`.". 
We need to use + /// a full DAG for this because there could be multiple forwarding + /// states in a row: + /// + /// 0000: Bytes(a, a) + /// 0001: Save(2) + /// 0002: Save(3) + /// 0003: Bytes(b, b) + /// + /// Here we will end up with two forwarding jobs: + /// + /// 1. Forward from `Bytes(a, a)` to `Save(2)`. + /// 2. Forward from `Save(2)` to `Save(3)`. + /// + /// Which we structure as a dag that looks like: + /// + /// (2) --> (1) + /// + /// The arrow flows in a funny direction because we want the jobs + /// with no dependencies to live at the roots of the DAG so that + /// we can process them first. + fn perform_forward(&mut self, fwd: Forward) { + debug_assert!(fwd.copy_to != fwd.copy_from); + + let tgt = match &self.prog[fwd.copy_from] { + &Inst::EmptyLook(_) | &Inst::Save(_) => + TransitionTarget::ActionInst(fwd.copy_from), + _ => TransitionTarget::BytesInst(fwd.copy_from), + }; + + // Get a pair of mutable references to the two different + // transition tables in borrow checker approved fashion. 
+ let (copy_to_ts, copy_from_ts) = if fwd.copy_to < fwd.copy_from { + let (stub, tail) = self.transitions.split_at_mut(fwd.copy_from); + (&mut stub[fwd.copy_to], &mut tail[0]) + } else { + let (stub, tail) = self.transitions.split_at_mut(fwd.copy_to); + (&mut tail[0], &mut stub[fwd.copy_from]) + }; + let (copy_to_ts, copy_from_ts) = match (copy_to_ts, copy_from_ts) { + (&mut Some(ref mut copy_to_ts), &mut Some(ref copy_from_ts)) => { + (copy_to_ts, copy_from_ts) + } + _ => unreachable!("forwards must be between real nodes."), + }; + + // now shuffle the transitions + for (from_t, to_t) in copy_from_ts.0.iter().zip(copy_to_ts.0.iter_mut()) { + if from_t.tgt == TransitionTarget::Die { + continue; + } + if to_t.priority > fwd.priority { + continue; + } + + // we should never encounter equal priorities + debug_assert!(to_t.priority != fwd.priority); + + *to_t = Transition { + tgt: tgt.clone(), + priority: fwd.priority, + }; + } + + // Finally, if a match instruction is reachable through + // a save fwd (which can never fail), the from state is accepting. + match &self.prog[fwd.copy_from] { + &Inst::Save(_) => { + self.accepting_states[fwd.copy_to] = + self.accepting_states[fwd.copy_from]; + } + _ => {} + } + } + + /// Once all the per-instruction transition tables have been worked + /// out, we can bake them into the single flat transition table we + /// are going to use for the actual DFA. This function creates the + /// baked form, storing it in `self.table`. 
+ fn bake_transitions(&mut self) { + // pre-compute the state indices + let mut state_starts = Vec::with_capacity(self.prog.len()); + let mut off = 0; + for inst_idx in 0..self.prog.len() { + state_starts.push(off); + if self.transitions[inst_idx].is_some() { + off += self.num_byte_classes; + + match &self.prog[inst_idx] { + &Inst::EmptyLook(_) | &Inst::Save(_) => { + off += self.num_byte_classes; + } + _ => {} + } + } + } + + let ptr_of = |c: &OnePassCompiler, i: usize| { + let mut p = state_starts[i] as StatePtr; + if c.accepting_states[i] { + p |= STATE_MATCH; + } + p + }; + + self.table.reserve(state_starts[state_starts.len() - 1] + + self.num_byte_classes); + for inst_idx in 0..self.prog.len() { + let mut trans = Vec::with_capacity(self.num_byte_classes * 2); + + match &self.transitions[inst_idx] { + &None => continue, + &Some(ref ttab) => { + for t in ttab.0.iter() { + trans.push(match t.tgt { + TransitionTarget::Match => STATE_HALT, + + TransitionTarget::Die => STATE_DEAD, + TransitionTarget::BytesInst(i) => ptr_of(self, i), + TransitionTarget::ActionInst(i) => + ptr_of(self, i) | STATE_ACTION, + }); + } + } + } + + self.table.extend(trans); + + // emit all the right window dressing for the action, if + // there is one. 
+            match &self.prog[inst_idx] {
+                &Inst::Save(ref inst) => {
+                    debug_assert!(self.num_byte_classes >= 2);
+
+                    let mut save_args = vec![
+                        Action::Save as StatePtr,
+                        inst.slot as StatePtr];
+                    save_args.extend(vec![STATE_POISON;
+                                          self.num_byte_classes - 2]);
+                    self.table.extend(save_args);
+                }
+                &Inst::EmptyLook(ref inst) => {
+                    let mut el_args = vec![self.empty_look_action(inst.look)];
+                    el_args.extend(vec![STATE_POISON;
+                                        self.num_byte_classes - 1]);
+                    self.table.extend(el_args);
+                }
+                _ => {}
+            }
+        }
+    }
+
+    fn empty_look_action(&self, el: EmptyLook) -> StatePtr {
+        match el {
+            EmptyLook::StartLine => Action::StartLine as StatePtr,
+            EmptyLook::EndLine => Action::EndLine as StatePtr,
+            EmptyLook::StartText => Action::StartText as StatePtr,
+            EmptyLook::EndText => Action::EndText as StatePtr,
+            EmptyLook::WordBoundary => Action::WordBoundary as StatePtr,
+            EmptyLook::NotWordBoundary => Action::NotWordBoundary as StatePtr,
+            EmptyLook::WordBoundaryAscii =>
+                Action::WordBoundaryAscii as StatePtr,
+            EmptyLook::NotWordBoundaryAscii =>
+                Action::NotWordBoundaryAscii as StatePtr,
+        }
+    }
+}
+
+/// A mapping from byte classes to target states annotated
+/// with transition priority. An intermediary representation.
+struct TransitionTable(Vec<Transition>);
+
+#[derive(Debug, Clone)]
+struct Transition {
+    tgt: TransitionTarget,
+    priority: usize,
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+enum TransitionTarget {
+    Die,
+    Match,
+    BytesInst(usize),
+    ActionInst(usize),
+}
+
+
+/// A (hopefully) DAG of forwarding jobs.
+#[derive(Debug, Clone)]
+struct Forwards {
+    jobs: Vec<Forward>,
+
+    // the edges of the DAG
+    e_out: HashMap<usize, Vec<usize>>,
+    e_in: HashMap<usize, Vec<usize>>,
+
+    /// A mapping from instructions to forwarding jobs which
+    /// want to copy to them.
+    inst_copy_tos: HashMap<usize, Vec<usize>>,
+    /// A mapping from instructions to forwarding jobs which
+    /// want to copy from them.
+    inst_copy_froms: HashMap<usize, Vec<usize>>,
+
+    /// We really care about the root set, but it is much easier to
+    /// keep track of its inverse in an online way.
+    not_root_set: HashSet<usize>,
+}
+
+impl Forwards {
+    pub fn new() -> Self {
+        Forwards {
+            jobs: vec![],
+
+            e_out: HashMap::new(),
+            e_in: HashMap::new(),
+
+            inst_copy_tos: HashMap::new(),
+            inst_copy_froms: HashMap::new(),
+
+            not_root_set: HashSet::new(),
+        }
+    }
+
+    /// Forward the state indexed by `forward_from` to the state
+    /// indexed by `forward_to` once we have enough info to do so.
+    pub fn forward(
+        &mut self,
+        forward_from: usize,
+        forward_to: usize,
+        priority: usize
+    ) {
+        trace!("::forward from={} to={}", forward_from, forward_to);
+
+        let fidx = self.jobs.len();
+        self.jobs.push(Forward {
+            copy_to: forward_from,
+            copy_from: forward_to,
+            priority: priority,
+        });
+
+        // Note the endpoints of this forward
+        self.inst_copy_tos.entry(forward_from).or_insert(vec![]).push(fidx);
+        self.inst_copy_froms.entry(forward_to).or_insert(vec![]).push(fidx);
+
+        // For every forwarding job that we depend on completing
+        // before this job, add an edge flowing from the dependency
+        // to this job.
+        match self.inst_copy_tos.get(&forward_to) {
+            Some(dependencies) => {
+                trace!("dependencies = {:?}", dependencies);
+                for dep in dependencies.iter() {
+                    Self::edge(
+                        &mut self.e_out, &mut self.e_in,
+                        &mut self.not_root_set, *dep, fidx);
+                }
+            }
+            None => {}
+        }
+
+        // For every job which depends on this job,
+        // add an edge which flows from this job to the dependant
+        // job.
+        match self.inst_copy_froms.get(&forward_from) {
+            Some(dependants) => {
+                for dep in dependants.iter() {
+                    Self::edge(
+                        &mut self.e_out, &mut self.e_in,
+                        &mut self.not_root_set, fidx, *dep);
+                }
+            }
+            None => {}
+        }
+    }
+
+    // An associated function to please the borrow checker. gross.
+    fn edge(
+        e_out: &mut HashMap<usize, Vec<usize>>,
+        e_in: &mut HashMap<usize, Vec<usize>>,
+        not_root_set: &mut HashSet<usize>,
+        out_node: usize,
+        in_node: usize
+    ) {
+        e_out.entry(out_node).or_insert(vec![]).push(in_node);
+        e_in.entry(in_node).or_insert(vec![]).push(out_node);
+        not_root_set.insert(in_node);
+    }
+
+    pub fn into_iter_topo(self) -> Topo {
+        let mut root_set = vec![];
+        for n in 0..self.jobs.len() {
+            if ! self.not_root_set.contains(&n) {
+                root_set.push(n);
+            }
+        }
+
+        trace!("::into_iter_topo jobs={:?}", self.jobs);
+        trace!("::into_iter_topo e_out={:?}", self.e_out);
+        trace!("::into_iter_topo e_in={:?}", self.e_in);
+        trace!("::into_iter_topo root_set={:?}", root_set);
+
+        Topo {
+            jobs: self.jobs,
+            e_out: self.e_out,
+            e_in: self.e_in,
+            root_set: root_set,
+        }
+    }
+}
+
+
+/// A job asking the state indicated by `copy_to` to be rewritten
+/// to point to the state indicated by `copy_from` whenever the
+/// `copy_from` state could make progress.
+#[derive(Debug, Clone)]
+struct Forward {
+    copy_to: usize,
+    copy_from: usize,
+    priority: usize,
+}
+
+/// An iterator that returns forwarding directives in topological order
+/// using Kahn's Algorithm.
+struct Topo {
+    jobs: Vec<Forward>,
+    e_out: HashMap<usize, Vec<usize>>,
+    e_in: HashMap<usize, Vec<usize>>,
+    root_set: Vec<usize>,
+}
+
+impl Iterator for Topo {
+    type Item = Result<Forward, OnePassError>;
+    fn next(&mut self) -> Option<Result<Forward, OnePassError>> {
+        if let Some(next_job) = self.root_set.pop() {
+            let tgts = self.e_out.get(&next_job).unwrap_or(&vec![]).clone();
+            for tgt in tgts.iter() {
+                self.rm_edge(next_job, *tgt);
+
+                // If tgt has no incoming edges, add it to the root set.
+                if ! self.e_in.get(tgt).is_some() {
+                    self.root_set.push(*tgt);
+                }
+            }
+
+            Some(Ok(self.jobs[next_job].clone()))
+        } else {
+            if self.e_out.len() != 0 || self.e_in.len() != 0 {
+                Some(Err(OnePassError::ForwardingCycle))
+            } else {
+                None
+            }
+        }
+    }
+}
+
+impl Topo {
+    fn rm_edge(&mut self, node_out: usize, node_in: usize) {
+        let mut rm = false;
+        match self.e_out.get_mut(&node_out) {
+            Some(tgts) => {
+                let in_pos = tgts.iter().position(|t| *t == node_in);
+                match in_pos {
+                    Some(p) => { tgts.remove(p); },
+                    None => debug_assert!(false),
+                }
+
+                if tgts.len() == 0 {
+                    rm = true;
+                }
+            }
+            None => debug_assert!(false),
+        }
+        if rm {
+            self.e_out.remove(&node_out);
+        }
+
+        rm = false;
+        match self.e_in.get_mut(&node_in) {
+            Some(tgts) => {
+                let out_pos = tgts.iter().position(|t| *t == node_out);
+                match out_pos {
+                    Some(p) => { tgts.remove(p); },
+                    None => debug_assert!(false),
+                }
+
+                if tgts.len() == 0 {
+                    rm = true;
+                }
+            }
+            None => debug_assert!(false),
+        }
+        if rm {
+            self.e_in.remove(&node_in);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+//                                                                      //
+//                        State Encoding                                //
+//                                                                      //
+// This is mostly stolen from the lazy DFA. STATE_ACTION is a onepass   //
+// thing.
// +// // +////////////////////////////////////////////////////////////////////////// + +type StatePtr = u32; + +fn st_str(st: StatePtr) -> String { + if st == STATE_DEAD { + "D".to_string() + } else if st == STATE_POISON { + "P".to_string() + } else if st == STATE_HALT { + "H".to_string() + } else if st & STATE_ACTION != 0 && st & STATE_MATCH != 0 { + format!("(M{:x})", st & STATE_MAX) + } else if st & STATE_ACTION != 0 { + format!("({:x})", st & STATE_MAX) + } else if st & STATE_MATCH != 0 { + format!("M{:x}", st & STATE_MAX) + } else { + format!("{:x}", st & STATE_MAX) + } +} + +/// The ACTION state means that the DFA needs to take some +/// action that will be specified by the first two StatePtrs +/// in a special transition table entry just below the transition +/// table for the ACTION state. An ACTION might include checking +/// some zero-width assertion about the input, or it might include +/// saving a value to a capture slots. +const STATE_ACTION: StatePtr = 1 << 31; + +/// An action which might need to be taken for a special state. +enum Action { + Save, + StartLine, + EndLine, + StartText, + EndText, + WordBoundary, + NotWordBoundary, + WordBoundaryAscii, + NotWordBoundaryAscii, +} + +/// A match state means that the regex has successfully matched. +const STATE_MATCH: StatePtr = 1 << 30; + +/// POISON is a state pointer that should never be touched. +/// We use it to pad invalid argument slots to ACTION states. +const STATE_POISON: StatePtr = !0; + +/// A dead state means that the state has been computed and it is known that +/// once it is entered, no future match can ever occur. +/// +/// It is not valid to dereference STATE_DEAD. +const STATE_DEAD: StatePtr = STATE_MATCH + 1; + +/// HALT indicates that the machine ought to halt execution. It differs +/// from DEAD only in that an accepting state that transitions to HALT +/// still accepts, while an accepting state which transitions to DEAD +/// does not. 
+const STATE_HALT: StatePtr = STATE_ACTION + 1; + +/// The maximum state pointer. This is useful to mask out the "valid" state +/// pointer from a state with the "start" or "match" bits set. +const STATE_MAX: StatePtr = STATE_MATCH - 1; + +/// STATE_SPECIAL is a bitmask useful for checking if we are dealing +/// with a special case, or if we can keep chugging away at the inner +/// loop. +const STATE_SPECIAL: StatePtr = STATE_MATCH | STATE_ACTION; + +const FULL_MATCH_CAPTURE_END: usize = 1; diff --git a/src/pikevm.rs b/src/pikevm.rs index 80d44717ae..a82bb959fb 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -322,7 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> { nlist.set.insert(ip); match self.prog[ip] { EmptyLook(ref inst) => { - if self.input.is_empty_match(at, inst) { + if self.input.is_empty_match(at, inst.look) { ip = inst.goto; } } diff --git a/tests/onepass_unit.rs b/tests/onepass_unit.rs new file mode 100644 index 0000000000..b69197a251 --- /dev/null +++ b/tests/onepass_unit.rs @@ -0,0 +1,26 @@ + +// +// Just some unit tests that I found it useful to focus on while +// debugging the onepass DFA. Mostly these are simplifications +// of existing tests, so their value is not that huge, but +// why throw out tests that have been useful in the past. +// This is definitely not an appropriate permanent home +// for them. I should ask @burntsushi about where a better place +// for them would be (maybe in misc.rs?). Alternatively I could +// not be lazy and just actually try to grok each of the test +// modules. +// + + +mat!(trailing_repeat, "ab(?:ab)?", "abac", Some((0, 2))); + +// Currently fail to compile because empty branches are not allowed! +// Yay! In the future we might have to worry about this though. 
+// +// mat!(trailing_alt_with_empty_branch, "ab(?:ab|)", "abac", Some((0, 2))); +// mat!(trailing_lazy_alt_with_empty_branch, "ab(?:|ab)", "abab", Some((0, 2))); + +matiter!(match_multi_rep_4, r"(?m)(?:^a)+", "aaa\naaa\naaa", + (0, 1), (4, 5), (8, 9)); + +mat!(startline_a_rep, r"(?m)(?:^a)+", "aaa", Some((0, 1))); diff --git a/tests/test_onepass.rs b/tests/test_onepass.rs new file mode 100644 index 0000000000..5d2d90cbdb --- /dev/null +++ b/tests/test_onepass.rs @@ -0,0 +1,64 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(feature = "pattern", feature(pattern))] + +extern crate rand; +extern crate regex; + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).onepass().build().map(|e| e.into_regex()) + }} +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + +// Even though we don't support regex sets, we should still provide a +// constructor for them that sets the onepass flag in order to make +// sure that we properly fall back to a different impl. +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re).onepass().build().map(|e| e.into_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +mod unicode; +mod word_boundary; +mod word_boundary_unicode; +mod onepass_unit;