From e711f8eeaeb7648640ac895222c56683c6650c76 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:24:51 -0400 Subject: [PATCH 01/33] automata: clean up regression test The name was quite vague, so add a little specificity. --- regex-automata/src/meta/regex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index ce3bae0fa..a06d2bb48 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -3640,8 +3640,8 @@ mod tests { // I found this in the course of building out the benchmark suite for // rebar. #[test] - fn regression() { - env_logger::init(); + fn regression_suffix_literal_count() { + let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); From c12a7dfda7e6c1c035475336945c0c8e6bbf2a34 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:25:31 -0400 Subject: [PATCH 02/33] automata: fix line wrapping Breaking lines in the middle of backticks appears to be bad juju for some Markdown renderers. --- regex-automata/src/util/look.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index aee31b34e..a34ea1d75 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -184,8 +184,8 @@ impl Look { pub struct LookSet { /// The underlying representation this set is exposed to make it possible /// to store it somewhere efficiently. The representation is that - /// of a bitset, where each assertion occupies bit `i` where `i = - /// Look::as_repr()`. + /// of a bitset, where each assertion occupies bit `i` where + /// `i = Look::as_repr()`. /// /// Note that users of this internal representation must permit the full /// range of `u16` values to be represented. For example, even if the From 9c8796af425b5a720d2b8664d67dd499961a70dd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 20:27:11 -0400 Subject: [PATCH 03/33] automata: fix word boundary bug This fixes a bug that can occur when: 1. The regex has a Unicode word boundary. 2. The haystack contains some non-ASCII Unicode scalar value. 3. An inner or suffix literal optimization is in play. Specifically, this provokes a case where a match is detected in one of the meta engine's ad hoc DFA search routines, but before the match reaches its correct endpoint, a quit state is entered. (Because DFAs can't deal with Unicode word boundaries on non-ASCII haystacks.) The correct thing to do is to return a quit error and let the higher level logic divert to a different engine, but it was returning the match that it had found up until that point instead. The match returned is not technically incorrect in the sense that a match does indeed exist, but the offsets it reports may be shorter than what the true match actually is. So... if a quit state is entered, return an error regardless of whether a match has been found. 
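A minimal sketch of the control flow this change enforces (an illustrative stand-in, not the crate's actual engine internals): once a quit state is entered, the ad hoc search must surrender with an error, even while holding a tentative match, so the meta engine can divert to an engine that handles Unicode word boundaries.

    // Hypothetical stand-in for an ad hoc DFA search routine. A DFA compiled
    // from a pattern with a Unicode word boundary quits on any non-ASCII byte.
    fn adhoc_search(haystack: &[u8]) -> Result<Option<usize>, String> {
        let mut mat: Option<usize> = None;
        for (at, &byte) in haystack.iter().enumerate() {
            if !byte.is_ascii() {
                // Quit state. Previously the code did the equivalent of
                // `if mat.is_some() { return Ok(mat); }`, which could report
                // an end offset short of the true leftmost-first match.
                // Now it always gives up with an error.
                return Err(format!("quit at offset {at} on byte 0x{byte:02x}"));
            }
            if byte == b'\n' {
                mat = Some(at + 1);
            }
        }
        Ok(mat)
    }

The observable contract matches the regression test added below: in `.+\b\n` against "β77\n", the `\b` between `7` and `\n` is a Unicode word boundary and `β` is non-ASCII, so the DFA must defer rather than report a short match.

    let re = regex::Regex::new(r".+\b\n").unwrap();
    let m = re.find("β77\n").unwrap();
    assert_eq!((m.start(), m.end()), (0, 5)); // "β77\n" is 5 bytes long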
Fixes #1046 --- CHANGELOG.md | 8 ++++++++ regex-automata/src/meta/limited.rs | 12 ------------ regex-automata/src/meta/stopat.rs | 12 ------------ testdata/regression.toml | 18 ++++++++++++++++++ 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a50b811dd..4a474af1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +TBD +=== + +* [BUG #1046](https://github.com/rust-lang/regex/issues/1046): +Fix a bug that could result in incorrect match spans when using a Unicode word +boundary and searching non-ASCII strings. + + 1.9.6 (2023-09-30) ================== This is a patch release that fixes a panic that can occur when the default diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs index 192a2625e..5653adc9a 100644 --- a/regex-automata/src/meta/limited.rs +++ b/regex-automata/src/meta/limited.rs @@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev( } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -155,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev( } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -209,9 +203,6 @@ fn dfa_eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { @@ -246,9 +237,6 @@ fn hybrid_eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/regex-automata/src/meta/stopat.rs b/regex-automata/src/meta/stopat.rs index e8d716689..c4dcd797a 100644 --- a/regex-automata/src/meta/stopat.rs +++ b/regex-automata/src/meta/stopat.rs @@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd( } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't use a DFA that specialized start states @@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd( } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from @@ -162,9 +156,6 @@ fn dfa_eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -201,9 +192,6 @@ fn hybrid_eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } diff --git a/testdata/regression.toml b/testdata/regression.toml index 03b15d6d5..09b2b1d1c 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -782,3 +782,21 @@ match-kind = "all" search-kind = "overlapping" unicode = true utf8 = true + +# This tests that the PikeVM 
and the meta regex agree on a particular regex. +# This test previously failed when the ad hoc engines inside the meta engine +# did not handle quit states correctly. Namely, the Unicode word boundary here +# combined with a non-ASCII codepoint provokes the quit state. The ad hoc +# engines were previously returning a match even after entering the quit state +# if a match had been previously detected, but this is incorrect. The reason +# is that if a quit state is found, then the search must give up *immediately* +# because it prevents the search from finding the "proper" leftmost-first +# match. If it instead returns a match that has been found, it risks reporting +# an improper match, as it did in this case. +# +# See: https://github.com/rust-lang/regex/issues/1046 +[[test]] +name = "non-prefix-literal-quit-state" +regex = '.+\b\n' +haystack = "β77\n" +matches = [[0, 5]] From e1fae8bed6d7d0de3f6a4a64cf2f2cbc53f5bf58 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Oct 2023 22:45:13 -0400 Subject: [PATCH 04/33] automata/onepass: future proof bit packing This was previously using the raw representation of a `LookSet`, which is fine, but would have errantly overwritten bits unrelated to look-around assertions if they were set in a `LookSet`. This can't happen today because we don't have more than 10 assertions. And the one-pass DFA constructor specifically errors if more assertions exist and are in the pattern. But still, it seems like good form to mask out only the bits we care about. --- regex-automata/src/dfa/onepass.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 44691d0c8..353bb1e17 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2581,10 +2581,11 @@ impl Cache { /// Represents a single transition in a one-pass DFA. /// -/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds -/// to the transition epsilons, which contains the slots that should be saved -/// when this transition is followed and the conditional epsilon transitions -/// that must be satisfied in order to follow this transition. +/// The high 21 bits corresponds to the state ID. The bit following corresponds +/// to the special "match wins" flag. The remaining low 42 bits corresponds to +/// the transition epsilons, which contains the slots that should be saved when +/// this transition is followed and the conditional epsilon transitions that +/// must be satisfied in order to follow this transition. #[derive(Clone, Copy, Eq, PartialEq)] struct Transition(u64); @@ -2741,7 +2742,7 @@ impl PatternEpsilons { fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { PatternEpsilons( (self.0 & PatternEpsilons::PATTERN_ID_MASK) - | u64::from(epsilons.0), + | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK), ) } } @@ -2819,7 +2820,10 @@ impl Epsilons { /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { - Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + Epsilons( + (self.0 & Epsilons::SLOT_MASK) + | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), + ) } } From 355dd3ecc2e927255e4f9dd66464aecbdad8c8cc Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 15:16:21 -0400 Subject: [PATCH 05/33] syntax: make Ast the size of a pointer This puts every Ast value behind a box to conserve space. 
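To make the layout intuition concrete, here is a rough stand-in (the types below are invented for illustration and the sizes are indicative of a typical 64-bit target): an unboxed enum pays for its largest variant in every element of a `Vec`, while a boxed wrapper costs one pointer per element with the payload on the heap.

    use core::mem::size_of;

    // Stand-ins, not the real Ast types.
    enum Unboxed { Small(u32), Big([u64; 4]) }
    struct Boxed(Box<Unboxed>);

    fn main() {
        // The boxed wrapper is a single thin pointer.
        assert_eq!(size_of::<Boxed>(), size_of::<usize>());
        // The unboxed enum carries its largest variant inline (40 vs. 8 here).
        assert!(size_of::<Unboxed>() > size_of::<Boxed>());
    }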
It makes things like Vec&lt;Ast&gt; quite a bit smaller than what they would be otherwise, which is especially beneficial for the representation of concatenations and alternations. This doesn't quite solve the memory usage problems though, since an AstKind is still quite big (over 200 bytes). The next step will be boxing each of the variants of an AstKind which should hopefully resolve the issue. Ref #1090 --- regex-syntax/src/ast/mod.rs | 180 ++++++++++------ regex-syntax/src/ast/parse.rs | 328 +++++++++++++++--------------- regex-syntax/src/ast/print.rs | 34 ++-- regex-syntax/src/ast/visitor.rs | 18 +- regex-syntax/src/hir/translate.rs | 44 ++-- 5 files changed, 332 insertions(+), 272 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e4284fee..6a6b58237 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -429,9 +429,19 @@ pub struct Comment { /// /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the `Ast`. +/// +/// This type boxes the actual kind of the AST element so that an `Ast` value +/// itself has a very small size. This in turn makes things like `Vec<Ast>` use +/// a lot less memory than it might otherwise, which is particularly beneficial +/// for representing long concatenations or alternations. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Ast(pub Box<AstKind>); +/// The kind of an abstract syntax element. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Ast { +pub enum AstKind { /// An empty regex that matches everything. Empty(Span), /// A set of flags, e.g., `(?is)`. @@ -456,26 +466,76 @@ pub enum Ast { } impl Ast { + /// Create an "empty" AST item. + pub fn empty(span: Span) -> Ast { + Ast(Box::new(AstKind::Empty(span))) + } + + /// Create a "flags" AST item. + pub fn flags(e: SetFlags) -> Ast { + Ast(Box::new(AstKind::Flags(e))) + } + + /// Create a "literal" AST item. + pub fn literal(e: Literal) -> Ast { + Ast(Box::new(AstKind::Literal(e))) + } + + /// Create a "dot" AST item. + pub fn dot(span: Span) -> Ast { + Ast(Box::new(AstKind::Dot(span))) + } + + /// Create an "assertion" AST item. + pub fn assertion(e: Assertion) -> Ast { + Ast(Box::new(AstKind::Assertion(e))) + } + + /// Create a "class" AST item. + pub fn class(e: Class) -> Ast { + Ast(Box::new(AstKind::Class(e))) + } + + /// Create a "repetition" AST item. + pub fn repetition(e: Repetition) -> Ast { + Ast(Box::new(AstKind::Repetition(e))) + } + + /// Create a "group" AST item. + pub fn group(e: Group) -> Ast { + Ast(Box::new(AstKind::Group(e))) + } + + /// Create an "alternation" AST item. + pub fn alternation(e: Alternation) -> Ast { + Ast(Box::new(AstKind::Alternation(e))) + } + + /// Create a "concat" AST item. + pub fn concat(e: Concat) -> Ast { + Ast(Box::new(AstKind::Concat(e))) + } + /// Return the span of this abstract syntax tree.
pub fn span(&self) -> &Span { - match *self { - Ast::Empty(ref span) => span, - Ast::Flags(ref x) => &x.span, - Ast::Literal(ref x) => &x.span, - Ast::Dot(ref span) => span, - Ast::Assertion(ref x) => &x.span, - Ast::Class(ref x) => x.span(), - Ast::Repetition(ref x) => &x.span, - Ast::Group(ref x) => &x.span, - Ast::Alternation(ref x) => &x.span, - Ast::Concat(ref x) => &x.span, + match *self.0 { + AstKind::Empty(ref span) => span, + AstKind::Flags(ref x) => &x.span, + AstKind::Literal(ref x) => &x.span, + AstKind::Dot(ref span) => span, + AstKind::Assertion(ref x) => &x.span, + AstKind::Class(ref x) => x.span(), + AstKind::Repetition(ref x) => &x.span, + AstKind::Group(ref x) => &x.span, + AstKind::Alternation(ref x) => &x.span, + AstKind::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { - match *self { - Ast::Empty(_) => true, + match *self.0 { + AstKind::Empty(_) => true, _ => false, } } @@ -483,17 +543,17 @@ impl Ast { /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. fn has_subexprs(&self) -> bool { - match *self { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) => false, - Ast::Class(_) - | Ast::Repetition(_) - | Ast::Group(_) - | Ast::Alternation(_) - | Ast::Concat(_) => true, + match *self.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) => false, + AstKind::Class(_) + | AstKind::Repetition(_) + | AstKind::Group(_) + | AstKind::Alternation(_) + | AstKind::Concat(_) => true, } } } @@ -526,14 +586,14 @@ pub struct Alternation { impl Alternation { /// Return this alternation as an AST. /// - /// If this alternation contains zero ASTs, then Ast::Empty is - /// returned. If this alternation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Alternation(self), + _ => Ast::alternation(self), } } } @@ -551,14 +611,14 @@ pub struct Concat { impl Concat { /// Return this concatenation as an AST. /// - /// If this concatenation contains zero ASTs, then Ast::Empty is - /// returned. If this concatenation contains exactly 1 AST, then the - /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + /// If this alternation contains zero ASTs, then `Ast::empty` is returned. + /// If this alternation contains exactly 1 AST, then the corresponding AST + /// is returned. Otherwise, `Ast::concat` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { - 0 => Ast::Empty(self.span), + 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), - _ => Ast::Concat(self), + _ => Ast::concat(self), } } } @@ -1544,43 +1604,43 @@ impl Drop for Ast { fn drop(&mut self) { use core::mem; - match *self { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) + match *self.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) // Classes are recursive, so they get their own Drop impl. 
- | Ast::Class(_) => return, - Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, - Ast::Group(ref x) if !x.ast.has_subexprs() => return, - Ast::Alternation(ref x) if x.asts.is_empty() => return, - Ast::Concat(ref x) if x.asts.is_empty() => return, + | AstKind::Class(_) => return, + AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, + AstKind::Group(ref x) if !x.ast.has_subexprs() => return, + AstKind::Alternation(ref x) if x.asts.is_empty() => return, + AstKind::Concat(ref x) if x.asts.is_empty() => return, _ => {} } let empty_span = || Span::splat(Position::new(0, 0, 0)); - let empty_ast = || Ast::Empty(empty_span()); + let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { - match ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) + match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) // Classes are recursive, so they get their own Drop impl. - | Ast::Class(_) => {} - Ast::Repetition(ref mut x) => { + | AstKind::Class(_) => {} + AstKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - Ast::Group(ref mut x) => { + AstKind::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - Ast::Alternation(ref mut x) => { + AstKind::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } - Ast::Concat(ref mut x) => { + AstKind::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } @@ -1663,9 +1723,9 @@ mod tests { let run = || { let span = || Span::splat(Position::new(0, 0, 0)); - let mut ast = Ast::Empty(span()); + let mut ast = Ast::empty(span()); for i in 0..200 { - ast = Ast::Group(Group { + ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 47ea2586b..b3f04bfdc 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -16,7 +16,7 @@ use alloc::{ }; use crate::{ - ast::{self, Ast, Position, Span}, + ast::{self, Ast, AstKind, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; @@ -53,11 +53,11 @@ impl Primitive { /// Convert this primitive into a proper AST. 
fn into_ast(self) -> Ast { match self { - Primitive::Literal(lit) => Ast::Literal(lit), - Primitive::Assertion(assert) => Ast::Assertion(assert), - Primitive::Dot(span) => Ast::Dot(span), - Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), - Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + Primitive::Literal(lit) => Ast::literal(lit), + Primitive::Assertion(assert) => Ast::assertion(assert), + Primitive::Dot(span) => Ast::dot(span), + Primitive::Perl(cls) => Ast::class(ast::Class::Perl(cls)), + Primitive::Unicode(cls) => Ast::class(ast::Class::Unicode(cls)), } } @@ -691,7 +691,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(v); } - concat.asts.push(Ast::Flags(set)); + concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { @@ -764,7 +764,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { group.ast = Box::new(group_concat.into_ast()); } } - prior_concat.asts.push(Ast::Group(group)); + prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } @@ -783,7 +783,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); - Ok(Ast::Alternation(alt)) + Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( @@ -976,7 +976,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; - concat.asts.push(Ast::Class(class)); + concat.asts.push(Ast::class(class)); } '?' => { concat = self.parse_uncounted_repetition( @@ -1044,8 +1044,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { ) } }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { + match *ast.0 { + AstKind::Empty(_) | AstKind::Flags(_) => { return Err( self.error(self.span(), ast::ErrorKind::RepetitionMissing) ) @@ -1057,7 +1057,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { greedy = false; self.bump(); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: Span::new(op_start, self.pos()), @@ -1096,8 +1096,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { ) } }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { + match *ast.0 { + AstKind::Empty(_) | AstKind::Flags(_) => { return Err( self.error(self.span(), ast::ErrorKind::RepetitionMissing) ) @@ -1159,7 +1159,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) ); } - concat.asts.push(Ast::Repetition(ast::Repetition { + concat.asts.push(Ast::repetition(ast::Repetition { span: ast.span().with_end(self.pos()), op: ast::RepetitionOp { span: op_span, @@ -1212,7 +1212,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } else if self.bump_if("?") { if self.is_eof() { @@ -1241,7 +1241,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } else { @@ -1249,7 +1249,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(Either::Right(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::Empty(self.span())), + ast: Box::new(Ast::empty(self.span())), })) } } @@ -2183,43 +2183,43 @@ impl<'p, 's, P: Borrow> ast::Visitor for 
NestLimiter<'p, 's, P> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - let span = match *ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + let span = match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) + | AstKind::Class(ast::Class::Unicode(_)) + | AstKind::Class(ast::Class::Perl(_)) => { // These are all base cases, so we don't increment depth. return Ok(()); } - Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, - Ast::Repetition(ref x) => &x.span, - Ast::Group(ref x) => &x.span, - Ast::Alternation(ref x) => &x.span, - Ast::Concat(ref x) => &x.span, + AstKind::Class(ast::Class::Bracketed(ref x)) => &x.span, + AstKind::Repetition(ref x) => &x.span, + AstKind::Group(ref x) => &x.span, + AstKind::Alternation(ref x) => &x.span, + AstKind::Concat(ref x) => &x.span, }; self.increment_depth(span) } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Empty(_) - | Ast::Flags(_) - | Ast::Literal(_) - | Ast::Dot(_) - | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => { + match *ast.0 { + AstKind::Empty(_) + | AstKind::Flags(_) + | AstKind::Literal(_) + | AstKind::Dot(_) + | AstKind::Assertion(_) + | AstKind::Class(ast::Class::Unicode(_)) + | AstKind::Class(ast::Class::Perl(_)) => { // These are all base cases, so we don't decrement depth. Ok(()) } - Ast::Class(ast::Class::Bracketed(_)) - | Ast::Repetition(_) - | Ast::Group(_) - | Ast::Alternation(_) - | Ast::Concat(_) => { + AstKind::Class(ast::Class::Bracketed(_)) + | AstKind::Repetition(_) + | AstKind::Group(_) + | AstKind::Alternation(_) + | AstKind::Concat(_) => { self.decrement_depth(); Ok(()) } @@ -2426,12 +2426,12 @@ mod tests { /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, @@ -2445,17 +2445,17 @@ mod tests { /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec) -> Ast { - Ast::Concat(ast::Concat { span, asts }) + Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range, asts: Vec) -> Ast { - Ast::Alternation(ast::Alternation { span: span(range), asts }) + Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range, index: u32, ast: Ast) -> Ast { - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), @@ -2488,7 +2488,7 @@ mod tests { }, ); } - Ast::Flags(ast::SetFlags { + Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), @@ -2502,7 +2502,7 @@ mod tests { // A nest limit of 0 still allows some types of regexes. 
assert_eq!( parser_nest_limit("", 0).parse(), - Ok(Ast::Empty(span(0..0))) + Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); @@ -2516,7 +2516,7 @@ mod tests { ); assert_eq!( parser_nest_limit("a+", 1).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2542,14 +2542,14 @@ mod tests { ); assert_eq!( parser_nest_limit("a+*", 2).parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, - ast: Box::new(Ast::Repetition(ast::Repetition { + ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2606,7 +2606,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2776,7 +2776,7 @@ bar vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), @@ -2803,7 +2803,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -2825,7 +2825,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), @@ -2840,7 +2840,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Group(ast::Group { + Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), @@ -2858,7 +2858,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X @@ -2877,7 +2877,7 @@ bar span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', @@ -2895,9 +2895,9 @@ bar Ok(concat_with( span_range(pat, 0..3), vec![ - Ast::Dot(span_range(pat, 0..1)), + Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), - Ast::Dot(span_range(pat, 2..3)), + Ast::dot(span_range(pat, 2..3)), ] )) ); @@ -2933,7 +2933,7 @@ bar fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2945,7 +2945,7 @@ bar ); assert_eq!( parser(r"a+").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2958,7 +2958,7 @@ bar assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + 
Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2970,7 +2970,7 @@ bar ); assert_eq!( parser(r"a??").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -2982,7 +2982,7 @@ bar ); assert_eq!( parser(r"a?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -2997,7 +2997,7 @@ bar Ok(concat( 0..3, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), @@ -3015,7 +3015,7 @@ bar Ok(concat( 0..4, vec![ - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), @@ -3034,7 +3034,7 @@ bar 0..3, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3048,7 +3048,7 @@ bar ); assert_eq!( parser(r"(ab)?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), @@ -3067,8 +3067,8 @@ bar Ok(alt( 0..3, vec![ - Ast::Empty(span(0..0)), - Ast::Repetition(ast::Repetition { + Ast::empty(span(0..0)), + Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), @@ -3157,7 +3157,7 @@ bar fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), @@ -3171,7 +3171,7 @@ bar ); assert_eq!( parser(r"a{5,}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3185,7 +3185,7 @@ bar ); assert_eq!( parser(r"a{5,9}").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3199,7 +3199,7 @@ bar ); assert_eq!( parser(r"a{5}?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), @@ -3217,7 +3217,7 @@ bar 0..5, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3237,7 +3237,7 @@ bar 0..6, vec![ lit('a', 0), - Ast::Repetition(ast::Repetition { + Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), @@ -3255,7 +3255,7 @@ bar assert_eq!( parser(r"a{ 5 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), @@ -3269,7 +3269,7 @@ bar ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), @@ -3283,7 +3283,7 @@ bar ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::Repetition(ast::Repetition { + Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), @@ -3414,7 +3414,7 @@ bar fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), 
lit('b', 2)], })) @@ -3424,7 +3424,7 @@ bar Ok(group( 0..5, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) @@ -3433,14 +3433,14 @@ bar assert_eq!( parser(r"a|b|c").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), - Ok(Ast::Alternation(ast::Alternation { + Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), @@ -3454,7 +3454,7 @@ bar Ok(group( 0..10, 1, - Ast::Alternation(ast::Alternation { + Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), @@ -3503,7 +3503,7 @@ bar parser(r"|").parse(), Ok(alt( 0..1, - vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( @@ -3511,19 +3511,19 @@ bar Ok(alt( 0..2, vec![ - Ast::Empty(span(0..0)), - Ast::Empty(span(1..1)), - Ast::Empty(span(2..2)), + Ast::empty(span(0..0)), + Ast::empty(span(1..1)), + Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( @@ -3533,7 +3533,7 @@ bar 1, alt( 1..2, - vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); @@ -3542,7 +3542,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( @@ -3550,7 +3550,7 @@ bar Ok(group( 0..4, 1, - alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); @@ -3606,7 +3606,7 @@ bar fn parse_group() { assert_eq!( parser("(?i)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..4), flags: ast::Flags { span: span(2..3), @@ -3621,7 +3621,7 @@ bar ); assert_eq!( parser("(?iU)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..5), flags: ast::Flags { span: span(2..4), @@ -3644,7 +3644,7 @@ bar ); assert_eq!( parser("(?i-U)").parse(), - Ok(Ast::Flags(ast::SetFlags { + Ok(Ast::flags(ast::SetFlags { span: span(0..6), flags: ast::Flags { span: span(2..5), @@ -3672,15 +3672,15 @@ bar assert_eq!( parser("()").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..2), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Empty(span(1..1))), + ast: Box::new(Ast::empty(span(1..1))), })) ); assert_eq!( parser("(a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..3), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit('a', 1)), @@ -3688,20 +3688,20 @@ bar ); assert_eq!( parser("(())").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..4), kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::Group(ast::Group { + ast: Box::new(Ast::group(ast::Group { span: span(1..3), kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::Empty(span(2..2))), + ast: Box::new(Ast::empty(span(2..2))), })), })) ); assert_eq!( parser("(?:a)").parse(), - Ok(Ast::Group(ast::Group { + 
Ok(Ast::group(ast::Group { span: span(0..5), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..2), @@ -3713,7 +3713,7 @@ bar assert_eq!( parser("(?i:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..6), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..3), @@ -3729,7 +3729,7 @@ bar ); assert_eq!( parser("(?i-U:a)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span(2..5), @@ -3818,7 +3818,7 @@ bar fn parse_capture_name() { assert_eq!( parser("(?z)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, @@ -3833,7 +3833,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3848,7 +3848,7 @@ bar ); assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3864,7 +3864,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3880,7 +3880,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3896,7 +3896,7 @@ bar assert_eq!( parser("(?Pz)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, @@ -3912,7 +3912,7 @@ bar assert_eq!( parser("(?P)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), @@ -3928,7 +3928,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), @@ -3936,7 +3936,7 @@ bar ); assert_eq!( parser("(?P<名字>)").parse(), - Ok(Ast::Group(ast::Group { + Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), @@ -3952,7 +3952,7 @@ bar index: 1, } }, - ast: Box::new(Ast::Empty(Span::new( + ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), @@ -4494,15 +4494,15 @@ bar ); assert_eq!( parser_octal(r"\778").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', @@ -4512,15 +4512,15 @@ bar ); assert_eq!( parser_octal(r"\7777").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', @@ -4965,7 +4965,7 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), 
false))), @@ -4973,7 +4973,7 @@ bar ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4985,7 +4985,7 @@ bar ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4997,7 +4997,7 @@ bar ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5009,7 +5009,7 @@ bar ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5022,7 +5022,7 @@ bar assert_eq!( parser("[a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), @@ -5030,7 +5030,7 @@ bar ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5048,7 +5048,7 @@ bar ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5067,7 +5067,7 @@ bar ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5078,7 +5078,7 @@ bar ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5089,7 +5089,7 @@ bar ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( @@ -5100,7 +5100,7 @@ bar ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5112,7 +5112,7 @@ bar ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5124,7 +5124,7 @@ bar ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5144,7 +5144,7 @@ bar assert_eq!( parser("[a-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), @@ -5152,7 +5152,7 @@ bar ); assert_eq!( 
parser("[a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5166,7 +5166,7 @@ bar ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5188,7 +5188,7 @@ bar ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5210,7 +5210,7 @@ bar ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5226,7 +5226,7 @@ bar ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5242,7 +5242,7 @@ bar ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5258,7 +5258,7 @@ bar ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5274,7 +5274,7 @@ bar ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( @@ -5292,7 +5292,7 @@ bar let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5313,7 +5313,7 @@ bar assert_eq!( parser(r"[]]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), @@ -5321,7 +5321,7 @@ bar ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5342,7 +5342,7 @@ bar Ok(concat( 0..5, vec![ - Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class(ast::Class::Bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5353,7 +5353,7 @@ bar } )), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', @@ -5914,15 +5914,15 @@ bar assert_eq!( parser(r"\pNz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class(ast::Class::Unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, 
c: 'z', @@ -5932,15 +5932,15 @@ bar ); assert_eq!( parser(r"\p{Greek}z").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class(ast::Class::Unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', @@ -6017,7 +6017,7 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class(ast::Class::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, @@ -6025,15 +6025,15 @@ bar ); assert_eq!( parser(r"\dz").parse(), - Ok(Ast::Concat(ast::Concat { + Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::Class(ast::Class::Perl(ast::ClassPerl { + Ast::class(ast::Class::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, })), - Ast::Literal(ast::Literal { + Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 86a87e143..daf6776f2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -7,7 +7,7 @@ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, - Ast, + Ast, AstKind, }; /// A builder for constructing a printer. @@ -78,9 +78,9 @@ impl Visitor for Writer { } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { - match *ast { - Ast::Group(ref x) => self.fmt_group_pre(x), - Ast::Class(ast::Class::Bracketed(ref x)) => { + match *ast.0 { + AstKind::Group(ref x) => self.fmt_group_pre(x), + AstKind::Class(ast::Class::Bracketed(ref x)) => { self.fmt_class_bracketed_pre(x) } _ => Ok(()), @@ -90,21 +90,21 @@ impl Visitor for Writer { fn visit_post(&mut self, ast: &Ast) -> fmt::Result { use crate::ast::Class; - match *ast { - Ast::Empty(_) => Ok(()), - Ast::Flags(ref x) => self.fmt_set_flags(x), - Ast::Literal(ref x) => self.fmt_literal(x), - Ast::Dot(_) => self.wtr.write_str("."), - Ast::Assertion(ref x) => self.fmt_assertion(x), - Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - Ast::Class(Class::Bracketed(ref x)) => { + match *ast.0 { + AstKind::Empty(_) => Ok(()), + AstKind::Flags(ref x) => self.fmt_set_flags(x), + AstKind::Literal(ref x) => self.fmt_literal(x), + AstKind::Dot(_) => self.wtr.write_str("."), + AstKind::Assertion(ref x) => self.fmt_assertion(x), + AstKind::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + AstKind::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + AstKind::Class(Class::Bracketed(ref x)) => { self.fmt_class_bracketed_post(x) } - Ast::Repetition(ref x) => self.fmt_repetition(x), - Ast::Group(ref x) => self.fmt_group_post(x), - Ast::Alternation(_) => Ok(()), - Ast::Concat(_) => Ok(()), + AstKind::Repetition(ref x) => self.fmt_repetition(x), + AstKind::Group(ref x) => self.fmt_group_post(x), + AstKind::Alternation(_) => Ok(()), + AstKind::Concat(_) => Ok(()), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 03d12a14d..05fdac89c 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,6 +1,6 @@ use alloc::{vec, vec::Vec}; -use crate::ast::{self, Ast}; +use crate::ast::{self, Ast, AstKind}; /// A trait for visiting an abstract syntax tree (AST) in depth 
first order. /// @@ -263,19 +263,19 @@ impl<'a> HeapVisitor<'a> { ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { - Ok(match *ast { - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ok(match *ast.0 { + AstKind::Class(ast::Class::Bracketed(ref x)) => { self.visit_class(x, visitor)?; None } - Ast::Repetition(ref x) => Some(Frame::Repetition(x)), - Ast::Group(ref x) => Some(Frame::Group(x)), - Ast::Concat(ref x) if x.asts.is_empty() => None, - Ast::Concat(ref x) => { + AstKind::Repetition(ref x) => Some(Frame::Repetition(x)), + AstKind::Group(ref x) => Some(Frame::Group(x)), + AstKind::Concat(ref x) if x.asts.is_empty() => None, + AstKind::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) } - Ast::Alternation(ref x) if x.asts.is_empty() => None, - Ast::Alternation(ref x) => Some(Frame::Alternation { + AstKind::Alternation(ref x) if x.asts.is_empty() => None, + AstKind::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 5430b51b2..743218df4 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -7,7 +7,7 @@ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ - ast::{self, Ast, Span, Visitor}, + ast::{self, Ast, AstKind, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, @@ -336,8 +336,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Class(ast::Class::Bracketed(_)) => { + match *ast.0 { + AstKind::Class(ast::Class::Bracketed(_)) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -346,20 +346,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } - Ast::Repetition(_) => self.push(HirFrame::Repetition), - Ast::Group(ref x) => { + AstKind::Repetition(_) => self.push(HirFrame::Repetition), + AstKind::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} - Ast::Concat(_) => { + AstKind::Concat(ref x) if x.asts.is_empty() => {} + AstKind::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + AstKind::Alternation(ref x) if x.asts.is_empty() => {} + AstKind::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } @@ -369,11 +369,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast { - Ast::Empty(_) => { + match *ast.0 { + AstKind::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } - Ast::Flags(ref x) => { + AstKind::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in @@ -386,7 +386,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - Ast::Literal(ref x) => { + AstKind::Literal(ref x) => { match self.ast_literal_to_scalar(x)? 
{ Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { @@ -402,13 +402,13 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } // self.push(HirFrame::Expr(self.hir_literal(x)?)); } - Ast::Dot(span) => { + AstKind::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); } - Ast::Assertion(ref x) => { + AstKind::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - Ast::Class(ast::Class::Perl(ref x)) => { + AstKind::Class(ast::Class::Perl(ref x)) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +419,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - Ast::Class(ast::Class::Unicode(ref x)) => { + AstKind::Class(ast::Class::Unicode(ref x)) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - Ast::Class(ast::Class::Bracketed(ref ast)) => { + AstKind::Class(ast::Class::Bracketed(ref ast)) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -444,18 +444,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(expr)); } } - Ast::Repetition(ref x) => { + AstKind::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } - Ast::Group(ref x) => { + AstKind::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - Ast::Concat(_) => { + AstKind::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { @@ -465,7 +465,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } - Ast::Alternation(_) => { + AstKind::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); From 8b0b0b0e48ffee628bf54729d6dde85e56a7b834 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 16:01:43 -0400 Subject: [PATCH 06/33] syntax: box each AstKind variant This does reduce memory, but not as much as it is reduced if we don't box the Ast. --- regex-syntax/src/ast/mod.rs | 149 ++++++++++++++++++----------- regex-syntax/src/ast/parse.rs | 152 +++++++++++++++--------------- regex-syntax/src/ast/print.rs | 14 +-- regex-syntax/src/ast/visitor.rs | 2 +- regex-syntax/src/hir/translate.rs | 38 ++++---- 5 files changed, 192 insertions(+), 163 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6a6b58237..c346abcb6 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -443,77 +443,92 @@ pub struct Ast(pub Box<AstKind>); #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum AstKind { /// An empty regex that matches everything. - Empty(Span), + Empty(Box<Span>), /// A set of flags, e.g., `(?is)`. - Flags(SetFlags), + Flags(Box<SetFlags>), /// A single character literal, which includes escape sequences. - Literal(Literal), + Literal(Box<Literal>), /// The "any character" class. - Dot(Span), + Dot(Box<Span>), /// A single zero-width assertion. - Assertion(Assertion), + Assertion(Box<Assertion>), - /// A single character class. This includes all forms of character classes - /// except for `.`.
e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. - Class(Class), + Assertion(Box), + /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. + ClassUnicode(Box), + /// A single perl character class, e.g., `\d` or `\W`. + ClassPerl(Box), + /// A single bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + ClassBracketed(Box), /// A repetition operator applied to an arbitrary regular expression. - Repetition(Repetition), + Repetition(Box), /// A grouped regular expression. - Group(Group), + Group(Box), /// An alternation of regular expressions. - Alternation(Alternation), + Alternation(Box), /// A concatenation of regular expressions. - Concat(Concat), + Concat(Box), } impl Ast { /// Create an "empty" AST item. pub fn empty(span: Span) -> Ast { - Ast(Box::new(AstKind::Empty(span))) + Ast(Box::new(AstKind::Empty(Box::new(span)))) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { - Ast(Box::new(AstKind::Flags(e))) + Ast(Box::new(AstKind::Flags(Box::new(e)))) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { - Ast(Box::new(AstKind::Literal(e))) + Ast(Box::new(AstKind::Literal(Box::new(e)))) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { - Ast(Box::new(AstKind::Dot(span))) + Ast(Box::new(AstKind::Dot(Box::new(span)))) } /// Create a "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { - Ast(Box::new(AstKind::Assertion(e))) + Ast(Box::new(AstKind::Assertion(Box::new(e)))) + } + + /// Create a "Unicode class" AST item. + pub fn class_unicode(e: ClassUnicode) -> Ast { + Ast(Box::new(AstKind::ClassUnicode(Box::new(e)))) + } + + /// Create a "Perl class" AST item. + pub fn class_perl(e: ClassPerl) -> Ast { + Ast(Box::new(AstKind::ClassPerl(Box::new(e)))) } - /// Create a "class" AST item. - pub fn class(e: Class) -> Ast { - Ast(Box::new(AstKind::Class(e))) + /// Create a "bracketed class" AST item. + pub fn class_bracketed(e: ClassBracketed) -> Ast { + Ast(Box::new(AstKind::ClassBracketed(Box::new(e)))) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { - Ast(Box::new(AstKind::Repetition(e))) + Ast(Box::new(AstKind::Repetition(Box::new(e)))) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { - Ast(Box::new(AstKind::Group(e))) + Ast(Box::new(AstKind::Group(Box::new(e)))) } /// Create a "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { - Ast(Box::new(AstKind::Alternation(e))) + Ast(Box::new(AstKind::Alternation(Box::new(e)))) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { - Ast(Box::new(AstKind::Concat(e))) + Ast(Box::new(AstKind::Concat(Box::new(e)))) } /// Return the span of this abstract syntax tree. 
@@ -524,7 +539,9 @@ impl Ast { AstKind::Literal(ref x) => &x.span, AstKind::Dot(ref span) => span, AstKind::Assertion(ref x) => &x.span, - AstKind::Class(ref x) => x.span(), + AstKind::ClassUnicode(ref x) => &x.span, + AstKind::ClassPerl(ref x) => &x.span, + AstKind::ClassBracketed(ref x) => &x.span, AstKind::Repetition(ref x) => &x.span, AstKind::Group(ref x) => &x.span, AstKind::Alternation(ref x) => &x.span, @@ -548,8 +565,10 @@ impl Ast { | AstKind::Flags(_) | AstKind::Literal(_) | AstKind::Dot(_) - | AstKind::Assertion(_) => false, - AstKind::Class(_) + | AstKind::Assertion(_) + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) => false, + AstKind::ClassBracketed(_) | AstKind::Repetition(_) | AstKind::Group(_) | AstKind::Alternation(_) @@ -735,31 +754,6 @@ impl HexLiteralKind { } } -/// A single character class expression. -#[derive(Clone, Debug, Eq, PartialEq)] -#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -pub enum Class { - /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. - Unicode(ClassUnicode), - /// A perl character class, e.g., `\d` or `\W`. - Perl(ClassPerl), - /// A bracketed character class set, which may contain zero or more - /// character ranges and/or zero or more nested classes. e.g., - /// `[a-zA-Z\pL]`. - Bracketed(ClassBracketed), -} - -impl Class { - /// Return the span of this character class. - pub fn span(&self) -> &Span { - match *self { - Class::Perl(ref x) => &x.span, - Class::Unicode(ref x) => &x.span, - Class::Bracketed(ref x) => &x.span, - } - } -} - /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] @@ -1610,8 +1604,10 @@ impl Drop for Ast { | AstKind::Literal(_) | AstKind::Dot(_) | AstKind::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | AstKind::Class(_) => return, + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) + // Bracketed classes are recursive, they get their own Drop impl. + | AstKind::ClassBracketed(_) => return, AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, AstKind::Group(ref x) if !x.ast.has_subexprs() => return, AstKind::Alternation(ref x) if x.asts.is_empty() => return, @@ -1629,8 +1625,11 @@ impl Drop for Ast { | AstKind::Literal(_) | AstKind::Dot(_) | AstKind::Assertion(_) - // Classes are recursive, so they get their own Drop impl. - | AstKind::Class(_) => {} + | AstKind::ClassUnicode(_) + | AstKind::ClassPerl(_) + // Bracketed classes are recursive, so they get their own Drop + // impl. + | AstKind::ClassBracketed(_) => {} AstKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } @@ -1754,4 +1753,42 @@ mod tests { .join() .unwrap(); } + + // This tests that our `Ast` has a reasonable size. This isn't a hard rule + // and it can be increased if given a good enough reason. But this test + // exists because the size of `Ast` was at one point over 200 bytes on a + // 64-bit target. Wow. 
+    #[test]
+    fn ast_size() {
+        std::dbg!(core::mem::size_of::<Ast>());
+        std::dbg!(core::mem::size_of::<AstKind>());
+        std::dbg!(core::mem::size_of::<SetFlags>());
+        std::dbg!(core::mem::size_of::<Literal>());
+        std::dbg!(core::mem::size_of::<Assertion>());
+        std::dbg!(core::mem::size_of::<ClassUnicode>());
+        std::dbg!(core::mem::size_of::<ClassPerl>());
+        std::dbg!(core::mem::size_of::<ClassBracketed>());
+        std::dbg!(core::mem::size_of::<Repetition>());
+        std::dbg!(core::mem::size_of::<Group>());
+        std::dbg!(core::mem::size_of::<Alternation>());
+        std::dbg!(core::mem::size_of::<Concat>());
+
+        let max = core::mem::size_of::<usize>();
+        let size = core::mem::size_of::<Ast>();
+        assert!(
+            size <= max,
+            "Ast size of {} bytes is bigger than suggested max {}",
+            size,
+            max
+        );
+
+        let max = 2 * core::mem::size_of::<usize>();
+        let size = core::mem::size_of::<AstKind>();
+        assert!(
+            size <= max,
+            "AstKind size of {} bytes is bigger than suggested max {}",
+            size,
+            max
+        );
+    }
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index b3f04bfdc..a87be0e02 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -56,8 +56,8 @@ impl Primitive {
             Primitive::Literal(lit) => Ast::literal(lit),
             Primitive::Assertion(assert) => Ast::assertion(assert),
             Primitive::Dot(span) => Ast::dot(span),
-            Primitive::Perl(cls) => Ast::class(ast::Class::Perl(cls)),
-            Primitive::Unicode(cls) => Ast::class(ast::Class::Unicode(cls)),
+            Primitive::Perl(cls) => Ast::class_perl(cls),
+            Primitive::Unicode(cls) => Ast::class_unicode(cls),
         }
     }
 
@@ -850,7 +850,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
     fn pop_class(
         &self,
         nested_union: ast::ClassSetUnion,
-    ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
+    ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> {
         assert_eq!(self.char(), ']');
 
         let item = ast::ClassSet::Item(nested_union.into_item());
@@ -882,7 +882,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
         set.span.end = self.pos();
         set.kind = prevset;
         if stack.is_empty() {
-            Ok(Either::Right(ast::Class::Bracketed(set)))
+            Ok(Either::Right(set))
         } else {
             union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
             Ok(Either::Left(union))
@@ -976,7 +976,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
             '|' => concat = self.push_alternate(concat)?,
             '[' => {
                 let class = self.parse_set_class()?;
-                concat.asts.push(Ast::class(class));
+                concat.asts.push(Ast::class_bracketed(class));
             }
             '?' => {
                 concat = self.parse_uncounted_repetition(
@@ -1743,7 +1743,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
     /// is successful, then the parser is advanced to the position immediately
     /// following the closing `]`.
     #[inline(never)]
-    fn parse_set_class(&self) -> Result<ast::Class> {
+    fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
         assert_eq!(self.char(), '[');
 
         let mut union =
@@ -2189,12 +2189,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
             | AstKind::Literal(_)
             | AstKind::Dot(_)
             | AstKind::Assertion(_)
-            | AstKind::Class(ast::Class::Unicode(_))
-            | AstKind::Class(ast::Class::Perl(_)) => {
+            | AstKind::ClassUnicode(_)
+            | AstKind::ClassPerl(_) => {
                 // These are all base cases, so we don't increment depth.
                 return Ok(());
             }
-            AstKind::Class(ast::Class::Bracketed(ref x)) => &x.span,
+            AstKind::ClassBracketed(ref x) => &x.span,
             AstKind::Repetition(ref x) => &x.span,
             AstKind::Group(ref x) => &x.span,
             AstKind::Alternation(ref x) => &x.span,
@@ -2210,12 +2210,12 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
             | AstKind::Literal(_)
             | AstKind::Dot(_)
             | AstKind::Assertion(_)
-            | AstKind::Class(ast::Class::Unicode(_))
-            | AstKind::Class(ast::Class::Perl(_)) => {
+            | AstKind::ClassUnicode(_)
+            | AstKind::ClassPerl(_) => {
                 // These are all base cases, so we don't decrement depth.
Ok(()) } - AstKind::Class(ast::Class::Bracketed(_)) + AstKind::ClassBracketed(_) | AstKind::Repetition(_) | AstKind::Group(_) | AstKind::Alternation(_) @@ -2606,7 +2606,7 @@ mod tests { ); assert_eq!( parser_nest_limit("[a]", 1).parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( @@ -2616,7 +2616,7 @@ mod tests { c: 'a', } )), - }))) + })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), @@ -4965,15 +4965,15 @@ bar assert_eq!( parser("[[:alnum:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), - }))) + })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { @@ -4981,11 +4981,11 @@ bar negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), - }))) + })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( @@ -4993,11 +4993,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( @@ -5005,11 +5005,11 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( @@ -5017,20 +5017,20 @@ bar itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), - }))) + })) ); assert_eq!( parser("[a]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), - }))) + })) ); assert_eq!( parser(r"[a\]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5044,11 +5044,11 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[a\-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5063,44 +5063,44 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[ab]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), - }))) + })) ); assert_eq!( parser("[a-]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), - 
}))) + })) ); assert_eq!( parser("[-a]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), - }))) + })) ); assert_eq!( parser(r"[\pL]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { @@ -5108,11 +5108,11 @@ bar negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), - }))) + })) ); assert_eq!( parser(r"[\w]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { @@ -5120,11 +5120,11 @@ bar kind: ast::ClassPerlKind::Word, negated: false, })), - }))) + })) ); assert_eq!( parser(r"[a\wz]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( @@ -5139,20 +5139,20 @@ bar lit(span(4..5), 'z'), ] ), - }))) + })) ); assert_eq!( parser("[a-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), - }))) + })) ); assert_eq!( parser("[a-cx-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( @@ -5162,11 +5162,11 @@ bar range(span(4..7), 'x', 'z'), ] ), - }))) + })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5184,11 +5184,11 @@ bar ] ), ), - }))) + })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( @@ -5206,11 +5206,11 @@ bar negated: false, })), ), - }))) + })) ); assert_eq!( parser(r"[a--b--c]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( @@ -5222,11 +5222,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( @@ -5238,11 +5238,11 @@ bar ), itemset(lit(span(7..8), 'c')), ), - }))) + })) ); assert_eq!( parser(r"[\^&&^]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5254,11 +5254,11 @@ bar })), itemset(lit(span(5..6), '^')), ), - }))) + })) ); assert_eq!( parser(r"[\&&&&]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( @@ -5270,11 +5270,11 @@ bar })), itemset(lit(span(5..6), '&')), ), - }))) + })) ); assert_eq!( parser(r"[&&&&]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: 
span(0..6), negated: false, kind: intersection( @@ -5286,13 +5286,13 @@ bar ), itemset(empty(span(5..5))), ), - }))) + })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { @@ -5308,20 +5308,20 @@ bar c: '⛄', }, })), - }))) + })) ); assert_eq!( parser(r"[]]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), - }))) + })) ); assert_eq!( parser(r"[]\[]").parse(), - Ok(Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( @@ -5335,14 +5335,14 @@ bar }), ] ), - }))) + })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ - Ast::class(ast::Class::Bracketed(ast::ClassBracketed { + Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( @@ -5352,7 +5352,7 @@ bar c: '[', } )), - })), + }), Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, @@ -5917,11 +5917,11 @@ bar Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ - Ast::class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), - })), + }), Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, @@ -5935,11 +5935,11 @@ bar Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ - Ast::class(ast::Class::Unicode(ast::ClassUnicode { + Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), - })), + }), Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, @@ -6017,22 +6017,22 @@ bar assert_eq!( parser(r"\d").parse(), - Ok(Ast::class(ast::Class::Perl(ast::ClassPerl { + Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - }))) + })) ); assert_eq!( parser(r"\dz").parse(), Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ - Ast::class(ast::Class::Perl(ast::ClassPerl { + Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, - })), + }), Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index daf6776f2..10ee56c2c 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,27 +80,21 @@ impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast.0 { AstKind::Group(ref x) => self.fmt_group_pre(x), - AstKind::Class(ast::Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_pre(x) - } + AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - use crate::ast::Class; - match *ast.0 { AstKind::Empty(_) => Ok(()), AstKind::Flags(ref x) => self.fmt_set_flags(x), AstKind::Literal(ref x) => self.fmt_literal(x), AstKind::Dot(_) => self.wtr.write_str("."), AstKind::Assertion(ref x) => self.fmt_assertion(x), - AstKind::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), - AstKind::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), - 
AstKind::Class(Class::Bracketed(ref x)) => { - self.fmt_class_bracketed_post(x) - } + AstKind::ClassPerl(ref x) => self.fmt_class_perl(x), + AstKind::ClassUnicode(ref x) => self.fmt_class_unicode(x), + AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), AstKind::Repetition(ref x) => self.fmt_repetition(x), AstKind::Group(ref x) => self.fmt_group_post(x), AstKind::Alternation(_) => Ok(()), diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 05fdac89c..2bd4b1956 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -264,7 +264,7 @@ impl<'a> HeapVisitor<'a> { visitor: &mut V, ) -> Result>, V::Err> { Ok(match *ast.0 { - AstKind::Class(ast::Class::Bracketed(ref x)) => { + AstKind::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 743218df4..ab3aa93d7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -337,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast.0 { - AstKind::Class(ast::Class::Bracketed(_)) => { + AstKind::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -386,29 +386,27 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Literal(ref x) => { - match self.ast_literal_to_scalar(x)? { - Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err(self - .error(x.span, ErrorKind::UnicodeNotAllowed)); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } + AstKind::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err( + self.error(x.span, ErrorKind::UnicodeNotAllowed) + ); + } + match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), } } - // self.push(HirFrame::Expr(self.hir_literal(x)?)); - } - AstKind::Dot(span) => { - self.push(HirFrame::Expr(self.hir_dot(span)?)); + }, + AstKind::Dot(ref span) => { + self.push(HirFrame::Expr(self.hir_dot(**span)?)); } AstKind::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - AstKind::Class(ast::Class::Perl(ref x)) => { + AstKind::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -419,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - AstKind::Class(ast::Class::Unicode(ref x)) => { + AstKind::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - AstKind::Class(ast::Class::Bracketed(ref ast)) => { + AstKind::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( From db214e59c3a4e480f6664e3f6f4381e0a0f636fe Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 3 Oct 2023 16:09:12 -0400 Subject: [PATCH 07/33] syntax: unbox Ast and remove AstKind The AstKind experiment proved unfruitful. 
I think the issue here is that the savings on Vec<Ast> didn't prove to
be enough to offset the extra heap allocation that resulted from the
indirection.

This seems to be a sweet spot. It would be nice to get Ast down below
16 bytes, but it's not clear how to do that (without much larger
changes that I don't feel inclined to pursue).

Fixes #1090
---
 fuzz/fuzz_targets/ast_roundtrip.rs |  21 ++--
 regex-cli/cmd/generate/fowler.rs   |   4 +-
 regex-syntax/src/ast/mod.rs        | 168 ++++++++++++-----------------
 regex-syntax/src/ast/parse.rs      |  62 +++++------
 regex-syntax/src/ast/print.rs      |  34 +++---
 regex-syntax/src/ast/visitor.rs    |  18 ++--
 regex-syntax/src/hir/translate.rs  |  44 ++++----
 7 files changed, 161 insertions(+), 190 deletions(-)

diff --git a/fuzz/fuzz_targets/ast_roundtrip.rs b/fuzz/fuzz_targets/ast_roundtrip.rs
index 040b59d63..c35ac962e 100644
--- a/fuzz/fuzz_targets/ast_roundtrip.rs
+++ b/fuzz/fuzz_targets/ast_roundtrip.rs
@@ -3,7 +3,7 @@
 use {
     libfuzzer_sys::{fuzz_target, Corpus},
     regex_syntax::ast::{
-        parse::Parser, visit, Ast, Flag, Group, GroupKind, SetFlags, Visitor,
+        parse::Parser, visit, Ast, Flag, Flags, GroupKind, Visitor,
     },
 };
 
@@ -32,16 +32,17 @@ impl Visitor for VerboseVisitor {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<Self::Output, Self::Err> {
+        let reject_flags = |flags: &Flags| {
+            flags.flag_state(Flag::IgnoreWhitespace).unwrap_or(false)
+        };
         match ast {
-            Ast::Flags(SetFlags { flags, .. })
-            | Ast::Group(Group {
-                kind: GroupKind::NonCapturing(flags), ..
-            }) if flags
-                .flag_state(Flag::IgnoreWhitespace)
-                .unwrap_or(false) =>
-            {
-                Err(())
-            }
+            Ast::Flags(x) if reject_flags(&x.flags) => return Err(()),
+            Ast::Group(x) => match x.kind {
+                GroupKind::NonCapturing(ref flags) if reject_flags(flags) => {
+                    return Err(())
+                }
+                _ => Ok(()),
+            },
             _ => Ok(()),
         }
     }
diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs
index c0ab1b361..c287f6f52 100644
--- a/regex-cli/cmd/generate/fowler.rs
+++ b/regex-cli/cmd/generate/fowler.rs
@@ -404,7 +404,9 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize {
         | Ast::Literal(_)
         | Ast::Dot(_)
         | Ast::Assertion(_)
-        | Ast::Class(_) => 0,
+        | Ast::ClassUnicode(_)
+        | Ast::ClassPerl(_)
+        | Ast::ClassBracketed(_) => 0,
         Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast),
         Ast::Group(ref group) => {
             let this = if group.is_capturing() { 1 } else { 0 };
diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index c346abcb6..9e0f92606 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -429,19 +429,9 @@ pub struct Comment {
 ///
 /// This type defines its own destructor that uses constant stack space and
 /// heap space proportional to the size of the `Ast`.
-///
-/// This type boxes the actual kind of the AST element so that an `Ast` value
-/// itself has a very small size. This in turn makes things like `Vec<Ast>` use
-/// a lot less memory than it might otherwise, which is particularly beneficial
-/// for representing long concatenations or alternations.
-#[derive(Clone, Debug, Eq, PartialEq)]
-#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub struct Ast(pub Box<AstKind>);
-
-/// The kind of an abstract syntax element.
 #[derive(Clone, Debug, Eq, PartialEq)]
 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub enum AstKind {
+pub enum Ast {
     /// An empty regex that matches everything.
     Empty(Box<Span>),
     /// A set of flags, e.g., `(?is)`.
     Flags(Box<SetFlags>),
     /// A single character literal, which includes escape sequences.
     Literal(Box<Literal>),
     /// The "any character" class.
     Dot(Box<Span>),
     /// A single zero-width assertion.
     Assertion(Box<Assertion>),
     /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`.
     ClassUnicode(Box<ClassUnicode>),
     /// A single perl character class, e.g., `\d` or `\W`.
     ClassPerl(Box<ClassPerl>),
     /// A single bracketed character class set, which may contain zero or more
     /// character ranges and/or zero or more nested classes. e.g.,
     /// `[a-zA-Z\pL]`.
     ClassBracketed(Box<ClassBracketed>),
     /// A repetition operator applied to an arbitrary regular expression.
     Repetition(Box<Repetition>),
     /// A grouped regular expression.
     Group(Box<Group>),
     /// An alternation of regular expressions.
     Alternation(Box<Alternation>),
     /// A concatenation of regular expressions.
     Concat(Box<Concat>),
 }
 
 impl Ast {
     /// Create an "empty" AST item.
pub fn empty(span: Span) -> Ast { - Ast(Box::new(AstKind::Empty(Box::new(span)))) + Ast::Empty(Box::new(span)) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { - Ast(Box::new(AstKind::Flags(Box::new(e)))) + Ast::Flags(Box::new(e)) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { - Ast(Box::new(AstKind::Literal(Box::new(e)))) + Ast::Literal(Box::new(e)) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { - Ast(Box::new(AstKind::Dot(Box::new(span)))) + Ast::Dot(Box::new(span)) } /// Create a "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { - Ast(Box::new(AstKind::Assertion(Box::new(e)))) + Ast::Assertion(Box::new(e)) } /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { - Ast(Box::new(AstKind::ClassUnicode(Box::new(e)))) + Ast::ClassUnicode(Box::new(e)) } /// Create a "Perl class" AST item. pub fn class_perl(e: ClassPerl) -> Ast { - Ast(Box::new(AstKind::ClassPerl(Box::new(e)))) + Ast::ClassPerl(Box::new(e)) } /// Create a "bracketed class" AST item. pub fn class_bracketed(e: ClassBracketed) -> Ast { - Ast(Box::new(AstKind::ClassBracketed(Box::new(e)))) + Ast::ClassBracketed(Box::new(e)) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { - Ast(Box::new(AstKind::Repetition(Box::new(e)))) + Ast::Repetition(Box::new(e)) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { - Ast(Box::new(AstKind::Group(Box::new(e)))) + Ast::Group(Box::new(e)) } /// Create a "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { - Ast(Box::new(AstKind::Alternation(Box::new(e)))) + Ast::Alternation(Box::new(e)) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { - Ast(Box::new(AstKind::Concat(Box::new(e)))) + Ast::Concat(Box::new(e)) } /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { - match *self.0 { - AstKind::Empty(ref span) => span, - AstKind::Flags(ref x) => &x.span, - AstKind::Literal(ref x) => &x.span, - AstKind::Dot(ref span) => span, - AstKind::Assertion(ref x) => &x.span, - AstKind::ClassUnicode(ref x) => &x.span, - AstKind::ClassPerl(ref x) => &x.span, - AstKind::ClassBracketed(ref x) => &x.span, - AstKind::Repetition(ref x) => &x.span, - AstKind::Group(ref x) => &x.span, - AstKind::Alternation(ref x) => &x.span, - AstKind::Concat(ref x) => &x.span, + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::ClassUnicode(ref x) => &x.span, + Ast::ClassPerl(ref x) => &x.span, + Ast::ClassBracketed(ref x) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { - match *self.0 { - AstKind::Empty(_) => true, + match *self { + Ast::Empty(_) => true, _ => false, } } @@ -560,19 +550,19 @@ impl Ast { /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. 
fn has_subexprs(&self) -> bool { - match *self.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) => false, - AstKind::ClassBracketed(_) - | AstKind::Repetition(_) - | AstKind::Group(_) - | AstKind::Alternation(_) - | AstKind::Concat(_) => true, + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => false, + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, } } } @@ -1598,20 +1588,20 @@ impl Drop for Ast { fn drop(&mut self) { use core::mem; - match *self.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) // Bracketed classes are recursive, they get their own Drop impl. - | AstKind::ClassBracketed(_) => return, - AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return, - AstKind::Group(ref x) if !x.ast.has_subexprs() => return, - AstKind::Alternation(ref x) if x.asts.is_empty() => return, - AstKind::Concat(ref x) if x.asts.is_empty() => return, + | Ast::ClassBracketed(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} } @@ -1619,27 +1609,27 @@ impl Drop for Ast { let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { - match *ast.0 { - AstKind::Empty(_) - | AstKind::Flags(_) - | AstKind::Literal(_) - | AstKind::Dot(_) - | AstKind::Assertion(_) - | AstKind::ClassUnicode(_) - | AstKind::ClassPerl(_) + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) // Bracketed classes are recursive, so they get their own Drop // impl. - | AstKind::ClassBracketed(_) => {} - AstKind::Repetition(ref mut x) => { + | Ast::ClassBracketed(_) => {} + Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - AstKind::Group(ref mut x) => { + Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } - AstKind::Alternation(ref mut x) => { + Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } - AstKind::Concat(ref mut x) => { + Ast::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } @@ -1760,20 +1750,7 @@ mod tests { // 64-bit target. Wow. 
     #[test]
     fn ast_size() {
-        std::dbg!(core::mem::size_of::<Ast>());
-        std::dbg!(core::mem::size_of::<AstKind>());
-        std::dbg!(core::mem::size_of::<SetFlags>());
-        std::dbg!(core::mem::size_of::<Literal>());
-        std::dbg!(core::mem::size_of::<Assertion>());
-        std::dbg!(core::mem::size_of::<ClassUnicode>());
-        std::dbg!(core::mem::size_of::<ClassPerl>());
-        std::dbg!(core::mem::size_of::<ClassBracketed>());
-        std::dbg!(core::mem::size_of::<Repetition>());
-        std::dbg!(core::mem::size_of::<Group>());
-        std::dbg!(core::mem::size_of::<Alternation>());
-        std::dbg!(core::mem::size_of::<Concat>());
-
-        let max = core::mem::size_of::<usize>();
+        let max = 2 * core::mem::size_of::<usize>();
         let size = core::mem::size_of::<Ast>();
         assert!(
             size <= max,
             "Ast size of {} bytes is bigger than suggested max {}",
             size,
             max
         );
-
-        let max = 2 * core::mem::size_of::<usize>();
-        let size = core::mem::size_of::<AstKind>();
-        assert!(
-            size <= max,
-            "AstKind size of {} bytes is bigger than suggested max {}",
-            size,
-            max
-        );
     }
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index a87be0e02..f7bae7759 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -16,7 +16,7 @@ use alloc::{
 };
 
 use crate::{
-    ast::{self, Ast, AstKind, Position, Span},
+    ast::{self, Ast, Position, Span},
     either::Either,
     is_escapeable_character, is_meta_character,
 };
@@ -1044,8 +1044,8 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 )
             }
         };
-        match *ast.0 {
-            AstKind::Empty(_) | AstKind::Flags(_) => {
+        match ast {
+            Ast::Empty(_) | Ast::Flags(_) => {
                 return Err(
                     self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                 )
@@ -1096,8 +1096,8 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 )
             }
         };
-        match *ast.0 {
-            AstKind::Empty(_) | AstKind::Flags(_) => {
+        match ast {
+            Ast::Empty(_) | Ast::Flags(_) => {
                 return Err(
                     self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                 )
@@ -2183,43 +2183,43 @@ impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
     }
 
     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
-        let span = match *ast.0 {
-            AstKind::Empty(_)
-            | AstKind::Flags(_)
-            | AstKind::Literal(_)
-            | AstKind::Dot(_)
-            | AstKind::Assertion(_)
-            | AstKind::ClassUnicode(_)
-            | AstKind::ClassPerl(_) => {
+        let span = match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {
                 // These are all base cases, so we don't increment depth.
                 return Ok(());
             }
-            AstKind::ClassBracketed(ref x) => &x.span,
-            AstKind::Repetition(ref x) => &x.span,
-            AstKind::Group(ref x) => &x.span,
-            AstKind::Alternation(ref x) => &x.span,
-            AstKind::Concat(ref x) => &x.span,
+            Ast::ClassBracketed(ref x) => &x.span,
+            Ast::Repetition(ref x) => &x.span,
+            Ast::Group(ref x) => &x.span,
+            Ast::Alternation(ref x) => &x.span,
+            Ast::Concat(ref x) => &x.span,
         };
         self.increment_depth(span)
     }
 
     fn visit_post(&mut self, ast: &Ast) -> Result<()> {
-        match *ast.0 {
-            AstKind::Empty(_)
-            | AstKind::Flags(_)
-            | AstKind::Literal(_)
-            | AstKind::Dot(_)
-            | AstKind::Assertion(_)
-            | AstKind::ClassUnicode(_)
-            | AstKind::ClassPerl(_) => {
+        match *ast {
+            Ast::Empty(_)
+            | Ast::Flags(_)
+            | Ast::Literal(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {
                 // These are all base cases, so we don't decrement depth.
Ok(()) } - AstKind::ClassBracketed(_) - | AstKind::Repetition(_) - | AstKind::Group(_) - | AstKind::Alternation(_) - | AstKind::Concat(_) => { + Ast::ClassBracketed(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => { self.decrement_depth(); Ok(()) } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 10ee56c2c..7dedf7f48 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -7,7 +7,7 @@ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, - Ast, AstKind, + Ast, }; /// A builder for constructing a printer. @@ -78,27 +78,27 @@ impl Visitor for Writer { } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { - match *ast.0 { - AstKind::Group(ref x) => self.fmt_group_pre(x), - AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { - match *ast.0 { - AstKind::Empty(_) => Ok(()), - AstKind::Flags(ref x) => self.fmt_set_flags(x), - AstKind::Literal(ref x) => self.fmt_literal(x), - AstKind::Dot(_) => self.wtr.write_str("."), - AstKind::Assertion(ref x) => self.fmt_assertion(x), - AstKind::ClassPerl(ref x) => self.fmt_class_perl(x), - AstKind::ClassUnicode(ref x) => self.fmt_class_unicode(x), - AstKind::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), - AstKind::Repetition(ref x) => self.fmt_repetition(x), - AstKind::Group(ref x) => self.fmt_group_post(x), - AstKind::Alternation(_) => Ok(()), - AstKind::Concat(_) => Ok(()), + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::ClassPerl(ref x) => self.fmt_class_perl(x), + Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), + Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), } } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 2bd4b1956..c1bb24d97 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,6 +1,6 @@ use alloc::{vec, vec::Vec}; -use crate::ast::{self, Ast, AstKind}; +use crate::ast::{self, Ast}; /// A trait for visiting an abstract syntax tree (AST) in depth first order. /// @@ -263,19 +263,19 @@ impl<'a> HeapVisitor<'a> { ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { - Ok(match *ast.0 { - AstKind::ClassBracketed(ref x) => { + Ok(match *ast { + Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } - AstKind::Repetition(ref x) => Some(Frame::Repetition(x)), - AstKind::Group(ref x) => Some(Frame::Group(x)), - AstKind::Concat(ref x) if x.asts.is_empty() => None, - AstKind::Concat(ref x) => { + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] 
}) } - AstKind::Alternation(ref x) if x.asts.is_empty() => None, - AstKind::Alternation(ref x) => Some(Frame::Alternation { + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index ab3aa93d7..56d261aa1 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -7,7 +7,7 @@ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ - ast::{self, Ast, AstKind, Span, Visitor}, + ast::{self, Ast, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, @@ -336,8 +336,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { - match *ast.0 { - AstKind::ClassBracketed(_) => { + match *ast { + Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); @@ -346,20 +346,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } - AstKind::Repetition(_) => self.push(HirFrame::Repetition), - AstKind::Group(ref x) => { + Ast::Repetition(_) => self.push(HirFrame::Repetition), + Ast::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - AstKind::Concat(ref x) if x.asts.is_empty() => {} - AstKind::Concat(_) => { + Ast::Concat(ref x) if x.asts.is_empty() => {} + Ast::Concat(_) => { self.push(HirFrame::Concat); } - AstKind::Alternation(ref x) if x.asts.is_empty() => {} - AstKind::Alternation(_) => { + Ast::Alternation(ref x) if x.asts.is_empty() => {} + Ast::Alternation(_) => { self.push(HirFrame::Alternation); self.push(HirFrame::AlternationBranch); } @@ -369,11 +369,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } fn visit_post(&mut self, ast: &Ast) -> Result<()> { - match *ast.0 { - AstKind::Empty(_) => { + match *ast { + Ast::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Flags(ref x) => { + Ast::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in @@ -386,7 +386,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } - AstKind::Literal(ref x) => match self.ast_literal_to_scalar(x)? { + Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? 
{ Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => { if !self.flags().unicode() && ch.len_utf8() > 1 { @@ -400,13 +400,13 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } } }, - AstKind::Dot(ref span) => { + Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); } - AstKind::Assertion(ref x) => { + Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } - AstKind::ClassPerl(ref x) => { + Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); @@ -417,11 +417,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::class(hcls))); } } - AstKind::ClassUnicode(ref x) => { + Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } - AstKind::ClassBracketed(ref ast) => { + Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( @@ -442,18 +442,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(expr)); } } - AstKind::Repetition(ref x) => { + Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } - AstKind::Group(ref x) => { + Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } - AstKind::Concat(_) => { + Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { @@ -463,7 +463,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } - AstKind::Alternation(_) => { + Ast::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); From ad2cfd67db697a5c05525006fe4641843dee69c6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:34:42 -0400 Subject: [PATCH 08/33] syntax: remove guarantees in the HIR related to 'u' flag Basically, we never should have guaranteed that a particular HIR would (or wouldn't) be used if the 'u' flag was present (or absent). Such a guarantee generally results in too little flexibility, particularly when it comes to HIR's smart constructors. We could probably uphold that guarantee, but it's somewhat gnarly to do and would require rejiggering some of the HIR types. For example, we would probably need a literal that is an enum of `&str` or `&[u8]` that correctly preserves the Unicode flag. This in turn comes with a bigger complexity cost in various rewriting rules. In general, it's much simpler to require the caller to be prepared for any kind of HIR regardless of what the flags are. I feel somewhat justified in this position due to the fact that part of the point of the HIR is to erase all of the regex flags so that callers no longer need to worry about them. That is, the erasure is the point that provides a simplification for everyone downstream. 
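As a concrete illustration of the new contract (this sketch is not part
of the patch and only uses the public `regex-syntax` API), code that
consumes an HIR class should be prepared for either variant, no matter
what flags the pattern used:

    use regex_syntax::{
        hir::{Class, HirKind},
        Parser,
    };

    // Hypothetical helper: counts the ranges in a top-level class.
    fn count_class_ranges(pattern: &str) -> Option<usize> {
        let hir = Parser::new().parse(pattern).ok()?;
        match *hir.kind() {
            // Neither `(?u:...)` nor `(?-u:...)` guarantees which of
            // these two variants the translator produces.
            HirKind::Class(Class::Unicode(ref cls)) => {
                Some(cls.ranges().len())
            }
            HirKind::Class(Class::Bytes(ref cls)) => {
                Some(cls.ranges().len())
            }
            _ => None,
        }
    }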
Closes #1088
---
 CHANGELOG.md                |  3 +++
 regex-syntax/src/hir/mod.rs | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a474af1b..5b88d9e80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@ TBD
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
 Fix a bug that could result in incorrect match spans when using a Unicode word
 boundary and searching non-ASCII strings.
+* [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088):
+Remove guarantees in the API that connect the `u` flag with a specific HIR
+representation.
 
 
 1.9.6 (2023-09-30)
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 6c1d2745e..f8a3d4a9e 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -797,13 +797,18 @@ impl core::fmt::Debug for Literal {
 /// The high-level intermediate representation of a character class.
 ///
 /// A character class corresponds to a set of characters. A character is either
-/// defined by a Unicode scalar value or a byte. Unicode characters are used
-/// by default, while bytes are used when Unicode mode (via the `u` flag) is
-/// disabled.
+/// defined by a Unicode scalar value or a byte.
 ///
 /// A character class, regardless of its character type, is represented by a
 /// sequence of non-overlapping non-adjacent ranges of characters.
 ///
+/// There are no guarantees about which class variant is used. Generally
+/// speaking, the Unicode variant is used whenever a class needs to contain
+/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
+/// when Unicode mode is disabled. For example, at the time of writing, the
+/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
+/// `[a\u00A0]` due to optimizations.
+///
 /// Note that `Bytes` variant may be produced even when it exclusively matches
 /// valid UTF-8. This is because a `Bytes` variant represents an intention by
 /// the author of the regular expression to disable Unicode mode, which in turn
@@ -1326,8 +1331,9 @@ impl ClassUnicodeRange {
     }
 }
 
-/// A set of characters represented by arbitrary bytes (where one byte
-/// corresponds to one character).
+/// A set of characters represented by arbitrary bytes.
+///
+/// Each byte corresponds to one character.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct ClassBytes {
     set: IntervalSet<ClassBytesRange>,
 }

From f0147f8a317b0a7b752ac9fc3985de9be6f9c745 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Fri, 6 Oct 2023 11:39:14 -0400
Subject: [PATCH 09/33] automata: rejigger DFA start state computation

It turns out that requiring callers to provide an `Input` (and thus a
`&[u8]` haystack) in all cases is a bit onerous. Namely, part of the
point of `regex-automata` was to expose enough guts to make it
tractable to write a streaming regex engine. A streaming regex engine,
especially one that does a byte-at-a-time loop, is somewhat
antithetical to having a haystack in a single `&[u8]` slice. This made
computing start states possible but very awkward and quite unclear in
terms of what the implementation would actually do with the haystack.

This commit fixes that by exposing a lower level `start_state` method
on both of the DFAs that can be called without materializing an
`Input`. Instead, callers must create a new `start::Config` value which
provides all of the information necessary for the DFA to compute the
correct start state.

This in turn also exposes the `crate::util::start` module.
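To sketch what that enables (this example is not part of the patch, and
it elides the dead/quit state handling that a production search loop
needs), a caller can now drive a DFA one byte at a time without ever
holding the haystack in memory:

    use regex_automata::{
        dfa::{dense, Automaton},
        util::start,
        Anchored,
    };

    fn matches_stream<I: Iterator<Item = u8>>(
        dfa: &dense::DFA<Vec<u32>>,
        bytes: I,
    ) -> bool {
        // Describe the starting position directly: an unanchored
        // search with no look-behind byte, i.e., the beginning of a
        // stream. No `&[u8]` haystack is required.
        let config = start::Config::new().anchored(Anchored::No);
        let mut sid = dfa.start_state(&config).unwrap();
        for byte in bytes {
            sid = dfa.next_state(sid, byte);
            if dfa.is_match_state(sid) {
                return true;
            }
        }
        // Matches are delayed by one transition, so take the special
        // end-of-input transition to see a match at the very end.
        sid = dfa.next_eoi_state(sid);
        dfa.is_match_state(sid)
    }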
This is ultimately a breaking change because it adds a new required method to the `Automaton` trait. It also makes `start_state_forward` and `start_state_reverse` optional. It isn't really expected for callers to implement the `Automaton` trait themselves (and perhaps I will seal it so we can do such changes in the future without it being breaking), but still, this is technically breaking. Callers using `start_state_forward` or `start_state_reverse` with either DFA remain unchanged and unaffected. Closes #1031 --- CHANGELOG.md | 7 + regex-automata/src/dfa/automaton.rs | 188 ++++++++++++++++++--- regex-automata/src/dfa/dense.rs | 95 +++++------ regex-automata/src/dfa/mod.rs | 2 +- regex-automata/src/dfa/sparse.rs | 59 +++---- regex-automata/src/hybrid/dfa.rs | 179 +++++++++++--------- regex-automata/src/hybrid/error.rs | 115 ++++++++++++- regex-automata/src/hybrid/mod.rs | 2 +- regex-automata/src/util/mod.rs | 2 +- regex-automata/src/util/start.rs | 243 ++++++++++++++++++++++++---- 10 files changed, 662 insertions(+), 230 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b88d9e80..265f5cd48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ TBD === +New features: + +* [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031): +DFAs now have a `start_state` method that doesn't use an `Input`. + +Bug fixes: + * [BUG #1046](https://github.com/rust-lang/regex/issues/1046): Fix a bug that could result in incorrect match spans when using a Unicode word boundary and searching non-ASCII strings. diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index 7e2be9a15..cd597947e 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -7,6 +7,7 @@ use crate::{ prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, + start, }, }; @@ -226,8 +227,8 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this DFA for the given starting + /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -235,12 +236,41 @@ pub unsafe trait Automaton { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. 
+    /// Although, as a convenience, if you have an [`Input`], then it may
+    /// be more succinct to use [`Automaton::start_state_forward`] or
+    /// [`Automaton::start_state_reverse`]. Note, for example, that the
+    /// convenience routines return a [`MatchError`] on failure whereas this
+    /// routine returns a [`StartError`].
+    ///
+    /// # Errors
+    ///
+    /// This may return a [`StartError`] if the search needs to give up when
+    /// determining the start state (for example, if it sees a "quit" byte).
+    /// This can also return an error if the given configuration contains an
+    /// unsupported [`Anchored`] configuration.
+    fn start_state(
+        &self,
+        config: &start::Config,
+    ) -> Result<StateID, StartError>;
+
+    /// Return the ID of the start state for this DFA when executing a forward
+    /// search.
+    ///
+    /// This is a convenience routine for calling [`Automaton::start_state`]
+    /// that converts the given [`Input`] to a [start
+    /// configuration](start::Config). Additionally, if an error occurs, it is
+    /// converted from a [`StartError`] to a [`MatchError`] using the offset
+    /// information in the given [`Input`].
     ///
     /// # Errors
     ///
@@ -251,23 +281,30 @@ pub unsafe trait Automaton {
     fn start_state_forward(
         &self,
         input: &Input<'_>,
-    ) -> Result<StateID, MatchError>;
+    ) -> Result<StateID, MatchError> {
+        let config = start::Config::from_input_forward(input);
+        self.start_state(&config).map_err(|err| match err {
+            StartError::Quit { byte } => {
+                let offset = input
+                    .start()
+                    .checked_sub(1)
+                    .expect("no quit in start without look-behind");
+                MatchError::quit(byte, offset)
+            }
+            StartError::UnsupportedAnchored { mode } => {
+                MatchError::unsupported_anchored(mode)
+            }
+        })
+    }
 
-    /// Return the ID of the start state for this lazy DFA when executing a
-    /// reverse search.
+    /// Return the ID of the start state for this DFA when executing a reverse
+    /// search.
     ///
-    /// Unlike typical DFA implementations, the start state for DFAs in this
-    /// crate is dependent on a few different factors:
-    ///
-    /// * The [`Anchored`] mode of the search. Unanchored, anchored and
-    /// anchored searches for a specific [`PatternID`] all use different start
-    /// states.
-    /// * The position at which the search begins, via [`Input::start`]. This
-    /// and the byte immediately preceding the start of the search (if one
-    /// exists) influence which look-behind assertions are true at the start
-    /// of the search. This in turn influences which start state is selected.
-    /// * Whether the search is a forward or reverse search. This routine can
-    /// only be used for reverse searches.
+    /// This is a convenience routine for calling [`Automaton::start_state`]
+    /// that converts the given [`Input`] to a [start
+    /// configuration](start::Config). Additionally, if an error occurs, it is
+    /// converted from a [`StartError`] to a [`MatchError`] using the offset
+    /// information in the given [`Input`].
     ///
     /// # Errors
     ///
@@ -278,7 +315,18 @@ fn start_state_reverse(
         &self,
         input: &Input<'_>,
-    ) -> Result<StateID, MatchError>;
+    ) -> Result<StateID, MatchError> {
+        let config = start::Config::from_input_reverse(input);
+        self.start_state(&config).map_err(|err| match err {
+            StartError::Quit { byte } => {
+                let offset = input.end();
+                MatchError::quit(byte, offset)
+            }
+            StartError::UnsupportedAnchored { mode } => {
+                MatchError::unsupported_anchored(mode)
+            }
+        })
+    }
 
     /// If this DFA has a universal starting state for the given anchor mode
     /// and the DFA supports universal starting states, then this returns that
@@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
         (**self).next_eoi_state(current)
     }
 
+    #[inline]
+    fn start_state(
+        &self,
+        config: &start::Config,
+    ) -> Result<StateID, StartError> {
+        (**self).start_state(config)
+    }
+
     #[inline]
     fn start_state_forward(
         &self,
@@ -2015,6 +2071,90 @@ impl OverlappingState {
     }
 }
 
+/// An error that can occur when computing the start state for a search.
+///
+/// Computing a start state can fail for a few reasons, either based on
+/// incorrect configuration or even based on whether the look-behind byte
+/// triggers a quit state. Typically one does not need to handle this error
+/// if you're using [`Automaton::start_state_forward`] (or its reverse
+/// counterpart), as that routine automatically converts `StartError` to a
+/// [`MatchError`] for you.
+///
+/// This error may be returned by the [`Automaton::start_state`] routine.
+///
+/// This error implements the `std::error::Error` trait when the `std` feature
+/// is enabled.
+///
+/// This error is marked as non-exhaustive. New variants may be added in a
+/// semver compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum StartError {
+    /// An error that occurs when a starting configuration's look-behind byte
+    /// is in this DFA's quit set.
+    Quit {
+        /// The quit byte that was found.
+        byte: u8,
+    },
+    /// An error that occurs when the caller requests an anchored mode that
+    /// isn't supported by the DFA.
+    UnsupportedAnchored {
+        /// The anchored mode given that is unsupported.
+        mode: Anchored,
+    },
+}
+
+impl StartError {
+    pub(crate) fn quit(byte: u8) -> StartError {
+        StartError::Quit { byte }
+    }
+
+    pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
+        StartError::UnsupportedAnchored { mode }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StartError {}
+
+impl core::fmt::Display for StartError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match *self {
+            StartError::Quit { byte } => write!(
+                f,
+                "error computing start state because the look-behind byte \
+                 {:?} triggered a quit state",
+                crate::util::escape::DebugByte(byte),
+            ),
+            StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
+                write!(
+                    f,
+                    "error computing start state because \
+                     anchored searches are not supported or enabled"
+                )
+            }
+            StartError::UnsupportedAnchored { mode: Anchored::No } => {
+                write!(
+                    f,
+                    "error computing start state because \
+                     unanchored searches are not supported or enabled"
+                )
+            }
+            StartError::UnsupportedAnchored {
+                mode: Anchored::Pattern(pid),
+            } => {
+                write!(
+                    f,
+                    "error computing start state because \
+                     anchored searches for a specific pattern ({}) \
+                     are not supported or enabled",
+                    pid.as_usize(),
+                )
+            }
+        }
+    }
+}
+
 /// Runs the given overlapping `search` function (forwards or backwards) until
 /// a match is found whose offset does not split a codepoint.
///
diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs
index 6da865f97..7af38b546 100644
--- a/regex-automata/src/dfa/dense.rs
+++ b/regex-automata/src/dfa/dense.rs
@@ -30,7 +30,7 @@ use crate::{
 use crate::{
 dfa::{
 accel::Accels,
- automaton::{fmt_state_indicator, Automaton},
+ automaton::{fmt_state_indicator, Automaton, StartError},
 special::Special,
 start::StartKind,
 DEAD,
@@ -40,8 +40,8 @@ use crate::{
 int::{Pointer, Usize},
 prefilter::Prefilter,
 primitives::{PatternID, StateID},
- search::{Anchored, Input, MatchError},
- start::{Start, StartByteMap},
+ search::Anchored,
+ start::{self, Start, StartByteMap},
 wire::{self, DeserializeError, Endian, SerializeError},
 },
};
@@ -2885,31 +2885,33 @@ impl OwnedDFA {
 fn set_universal_starts(&mut self) {
 assert_eq!(6, Start::len(), "expected 6 start configurations");

- let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
+ let start_id = |dfa: &mut OwnedDFA,
+ anchored: Anchored,
+ start: Start| {
 // This is OK because we only call 'start' under conditions
 // in which we know it will succeed.
- dfa.st.start(inp, start).expect("valid Input configuration")
+ dfa.st.start(anchored, start).expect("valid start configuration")
 };
 if self.start_kind().has_unanchored() {
- let inp = Input::new("").anchored(Anchored::No);
- let sid = start_id(self, &inp, Start::NonWordByte);
- if sid == start_id(self, &inp, Start::WordByte)
- && sid == start_id(self, &inp, Start::Text)
- && sid == start_id(self, &inp, Start::LineLF)
- && sid == start_id(self, &inp, Start::LineCR)
- && sid == start_id(self, &inp, Start::CustomLineTerminator)
+ let anchor = Anchored::No;
+ let sid = start_id(self, anchor, Start::NonWordByte);
+ if sid == start_id(self, anchor, Start::WordByte)
+ && sid == start_id(self, anchor, Start::Text)
+ && sid == start_id(self, anchor, Start::LineLF)
+ && sid == start_id(self, anchor, Start::LineCR)
+ && sid == start_id(self, anchor, Start::CustomLineTerminator)
 {
 self.st.universal_start_unanchored = Some(sid);
 }
 }
 if self.start_kind().has_anchored() {
- let inp = Input::new("").anchored(Anchored::Yes);
- let sid = start_id(self, &inp, Start::NonWordByte);
- if sid == start_id(self, &inp, Start::WordByte)
- && sid == start_id(self, &inp, Start::Text)
- && sid == start_id(self, &inp, Start::LineLF)
- && sid == start_id(self, &inp, Start::LineCR)
- && sid == start_id(self, &inp, Start::CustomLineTerminator)
+ let anchor = Anchored::Yes;
+ let sid = start_id(self, anchor, Start::NonWordByte);
+ if sid == start_id(self, anchor, Start::WordByte)
+ && sid == start_id(self, anchor, Start::Text)
+ && sid == start_id(self, anchor, Start::LineLF)
+ && sid == start_id(self, anchor, Start::LineCR)
+ && sid == start_id(self, anchor, Start::CustomLineTerminator)
 {
 self.st.universal_start_anchored = Some(sid);
 }
@@ -3216,35 +3218,21 @@ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
 }

 #[cfg_attr(feature = "perf-inline", inline(always))]
- fn start_state_forward(
+ fn start_state(
 &self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
- }
- }
- let start = self.st.start_map.fwd(&input);
- self.st.start(input, start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn start_state_reverse(
- &self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ config: &start::Config,
+ ) -> Result<StateID, StartError> {
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.st.start_map.get(byte)
 }
- }
- let start = self.st.start_map.rev(&input);
- self.st.start(input, start)
+ };
+ self.st.start(anchored, start)
 }

 #[cfg_attr(feature = "perf-inline", inline(always))]
@@ -4180,28 +4168,27 @@ impl<T: AsRef<[u32]>> StartTable<T> {
 #[cfg_attr(feature = "perf-inline", inline(always))]
 fn start(
 &self,
- input: &Input<'_>,
+ anchored: Anchored,
 start: Start,
- ) -> Result<StateID, MatchError> {
+ ) -> Result<StateID, StartError> {
 let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
 Anchored::No => {
 if !self.kind.has_unanchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 start_index
 }
 Anchored::Yes => {
 if !self.kind.has_anchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 self.stride + start_index
 }
 Anchored::Pattern(pid) => {
 let len = match self.pattern_len {
 None => {
- return Err(MatchError::unsupported_anchored(mode))
+ return Err(StartError::unsupported_anchored(anchored))
 }
 Some(len) => len,
 };
@@ -5086,6 +5073,8 @@ impl core::fmt::Display for BuildError {

#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
+ use crate::{Input, MatchError};
+
 use super::*;

 #[test]
diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs
index 4bb870435..fd58cac23 100644
--- a/regex-automata/src/dfa/mod.rs
+++ b/regex-automata/src/dfa/mod.rs
@@ -320,7 +320,7 @@ dramatically.
#[cfg(feature = "dfa-search")]
pub use crate::dfa::{
- automaton::{Automaton, OverlappingState},
+ automaton::{Automaton, OverlappingState, StartError},
 start::StartKind,
};

diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs
index 5d8ec2340..a5ccf9add 100644
--- a/regex-automata/src/dfa/sparse.rs
+++ b/regex-automata/src/dfa/sparse.rs
@@ -52,7 +52,7 @@ use alloc::{vec, vec::Vec};

use crate::dfa::dense::{self, BuildError};
use crate::{
 dfa::{
- automaton::{fmt_state_indicator, Automaton},
+ automaton::{fmt_state_indicator, Automaton, StartError},
 dense::Flags,
 special::Special,
 StartKind, DEAD,
@@ -63,8 +63,8 @@ use crate::{
 int::{Pointer, Usize, U16, U32},
 prefilter::Prefilter,
 primitives::{PatternID, StateID},
- search::{Anchored, Input, MatchError},
- start::{Start, StartByteMap},
+ search::Anchored,
+ start::{self, Start, StartByteMap},
 wire::{self, DeserializeError, Endian, SerializeError},
 },
};
@@ -1207,35 +1207,21 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
 }

 #[inline]
- fn start_state_forward(
+ fn start_state(
 &self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
- }
- }
- let start = self.st.start_map.fwd(&input);
- self.st.start(input, start)
- }
-
- #[inline]
- fn start_state_reverse(
- &self,
- input: &Input<'_>,
- ) -> Result<StateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ config: &start::Config,
+ ) -> Result<StateID, StartError> {
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.st.start_map.get(byte)
 }
- }
- let start = self.st.start_map.rev(&input);
- self.st.start(input, start)
+ };
+ self.st.start(anchored, start)
 }

 #[inline]
@@ -2145,28 +2131,27 @@ impl<T: AsRef<[u8]>> StartTable<T> {
 /// panics.
 fn start(
 &self,
- input: &Input<'_>,
+ anchored: Anchored,
 start: Start,
- ) -> Result<StateID, MatchError> {
+ ) -> Result<StateID, StartError> {
 let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
 Anchored::No => {
 if !self.kind.has_unanchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 start_index
 }
 Anchored::Yes => {
 if !self.kind.has_anchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 self.stride + start_index
 }
 Anchored::Pattern(pid) => {
 let len = match self.pattern_len {
 None => {
- return Err(MatchError::unsupported_anchored(mode))
+ return Err(StartError::unsupported_anchored(anchored))
 }
 Some(len) => len,
 };
diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs
index 67261c1a3..102cfb6fe 100644
--- a/regex-automata/src/hybrid/dfa.rs
+++ b/regex-automata/src/hybrid/dfa.rs
@@ -13,7 +13,7 @@ use alloc::vec::Vec;

 use crate::{
 hybrid::{
- error::{BuildError, CacheError},
+ error::{BuildError, CacheError, StartError},
 id::{LazyStateID, LazyStateIDError},
 search,
 },
@@ -28,7 +28,7 @@ use crate::{
 Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
 },
 sparse_set::SparseSets,
- start::{Start, StartByteMap},
+ start::{self, Start, StartByteMap},
 },
};

@@ -1518,8 +1518,8 @@ impl DFA {
 Lazy::new(self, cache).cache_next_state(current, unit)
 }

- /// Return the ID of the start state for this lazy DFA when executing a
- /// forward search.
+ /// Return the ID of the start state for this lazy DFA for the given
+ /// starting configuration.
 ///
 /// Unlike typical DFA implementations, the start state for DFAs in this
 /// crate is dependent on a few different factors:
 ///
 /// * The [`Anchored`] mode of the search. Unanchored, anchored and
 /// anchored searches for a specific [`PatternID`] all use different start
 /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for forward searches.
+ /// * Whether a "look-behind" byte exists. For example, the `^` anchor
+ /// matches if and only if there is no look-behind byte.
+ /// * The specific value of that look-behind byte. For example, a `(?m:^)`
+ /// assertion only matches when there is either no look-behind byte, or
+ /// when the look-behind byte is a line terminator.
+ ///
+ /// The [starting configuration](start::Config) provides the above
+ /// information.
+ ///
+ /// This routine can be used for either forward or reverse searches.
+ /// That said, as a convenience, if you have an [`Input`], then it
+ /// may be more succinct to use [`DFA::start_state_forward`] or
+ /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
+ /// routines return a [`MatchError`] on failure, whereas this routine
+ /// returns a [`StartError`].
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`StartError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte
+ /// or if the cache has become inefficient). This can also return an
+ /// error if the given configuration contains an unsupported [`Anchored`]
+ /// configuration.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn start_state(
+ &self,
+ cache: &mut Cache,
+ config: &start::Config,
+ ) -> Result<LazyStateID, StartError> {
+ let lazy = LazyRef::new(self, cache);
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.start_map.get(byte)
+ }
+ };
+ let start_id = lazy.get_cached_start_id(anchored, start)?;
+ if !start_id.is_unknown() {
+ return Ok(start_id);
+ }
+ Lazy::new(self, cache).cache_start_group(anchored, start)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// forward search.
+ ///
+ /// This is a convenience routine for calling [`DFA::start_state`] that
+ /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`]
+ /// to a [`MatchError`] using the offset information in the given
+ /// [`Input`].
 ///
 /// # Errors
 ///
- /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
- /// needs to give up when determining the start state (for example, if
- /// it sees a "quit" byte or if the cache has been cleared too many
- /// times). This can also return an error if the given `Input` contains an
- /// unsupported [`Anchored`] configuration.
+ /// This may return a [`MatchError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte or
+ /// if the cache has become inefficient). This can also return an error if
+ /// the given `Input` contains an unsupported [`Anchored`] configuration.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 pub fn start_state_forward(
 &self,
 cache: &mut Cache,
 input: &Input<'_>,
 ) -> Result<LazyStateID, MatchError> {
- if !self.quitset.is_empty() && input.start() > 0 {
- let offset = input.start() - 1;
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ let config = start::Config::from_input_forward(input);
+ self.start_state(cache, &config).map_err(|err| match err {
+ StartError::Cache { .. } => MatchError::gave_up(input.start()),
+ StartError::Quit { byte } => {
+ let offset = input
+ .start()
+ .checked_sub(1)
+ .expect("no quit in start without look-behind");
+ MatchError::quit(byte, offset)
 }
- }
- let start_type = self.start_map.fwd(input);
- let start = LazyRef::new(self, cache)
- .get_cached_start_id(input, start_type)?;
- if !start.is_unknown() {
- return Ok(start);
- }
- Lazy::new(self, cache).cache_start_group(input, start_type)
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
 }

 /// Return the ID of the start state for this lazy DFA when executing a
 /// reverse search.
 ///
- /// Unlike typical DFA implementations, the start state for DFAs in this
- /// crate is dependent on a few different factors:
- ///
- /// * The [`Anchored`] mode of the search. Unanchored, anchored and
- /// anchored searches for a specific [`PatternID`] all use different start
- /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for reverse searches.
+ /// This is a convenience routine for calling [`DFA::start_state`] that
+ /// converts the given [`Input`] to a [start configuration](start::Config).
+ /// Additionally, if an error occurs, it is converted from a [`StartError`]
+ /// to a [`MatchError`] using the offset information in the given
+ /// [`Input`].
 ///
 /// # Errors
 ///
- /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
- /// needs to give up when determining the start state (for example, if
- /// it sees a "quit" byte or if the cache has been cleared too many
- /// times). This can also return an error if the given `Input` contains an
- /// unsupported [`Anchored`] configuration.
+ /// This may return a [`MatchError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte or
+ /// if the cache has become inefficient). This can also return an error if
+ /// the given `Input` contains an unsupported [`Anchored`] configuration.
 #[cfg_attr(feature = "perf-inline", inline(always))]
 pub fn start_state_reverse(
 &self,
 cache: &mut Cache,
 input: &Input<'_>,
 ) -> Result<LazyStateID, MatchError> {
- if !self.quitset.is_empty() && input.end() < input.haystack().len() {
- let offset = input.end();
- let byte = input.haystack()[offset];
- if self.quitset.contains(byte) {
- return Err(MatchError::quit(byte, offset));
+ let config = start::Config::from_input_reverse(input);
+ self.start_state(cache, &config).map_err(|err| match err {
+ StartError::Cache { .. } => MatchError::gave_up(input.end()),
+ StartError::Quit { byte } => {
+ let offset = input.end();
+ MatchError::quit(byte, offset)
 }
- }
- let start_type = self.start_map.rev(input);
- let start = LazyRef::new(self, cache)
- .get_cached_start_id(input, start_type)?;
- if !start.is_unknown() {
- return Ok(start);
- }
- Lazy::new(self, cache).cache_start_group(input, start_type)
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
 }

 /// Returns the total number of patterns that match in this state.
@@ -2122,16 +2159,15 @@ impl<'i, 'c> Lazy<'i, 'c> {
 #[inline(never)]
 fn cache_start_group(
 &mut self,
- input: &Input<'_>,
+ anchored: Anchored,
 start: Start,
- ) -> Result<LazyStateID, MatchError> {
- let mode = input.get_anchored();
- let nfa_start_id = match mode {
+ ) -> Result<LazyStateID, StartError> {
+ let nfa_start_id = match anchored {
 Anchored::No => self.dfa.get_nfa().start_unanchored(),
 Anchored::Yes => self.dfa.get_nfa().start_anchored(),
 Anchored::Pattern(pid) => {
 if !self.dfa.get_config().get_starts_for_each_pattern() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 match self.dfa.get_nfa().start_pattern(pid) {
 None => return Ok(self.as_ref().dead_id()),
@@ -2142,8 +2178,8 @@

 let id = self
 .cache_start_one(nfa_start_id, start)
- .map_err(|_| MatchError::gave_up(input.start()))?;
- self.set_start_state(input, start, id);
+ .map_err(StartError::cache)?;
+ self.set_start_state(anchored, start, id);
 Ok(id)
 }

@@ -2574,13 +2610,13 @@ impl<'i, 'c> Lazy<'i, 'c> {
 /// 'starts_for_each_pattern' is not enabled.
 fn set_start_state(
 &mut self,
- input: &Input<'_>,
+ anchored: Anchored,
 start: Start,
 id: LazyStateID,
 ) {
 assert!(self.as_ref().is_valid(id));

 let start_index = start.as_usize();
- let index = match input.get_anchored() {
+ let index = match anchored {
 Anchored::No => start_index,
 Anchored::Yes => Start::len() + start_index,
 Anchored::Pattern(pid) => {
@@ -2642,17 +2678,16 @@ impl<'i, 'c> LazyRef<'i, 'c> {
 #[cfg_attr(feature = "perf-inline", inline(always))]
 fn get_cached_start_id(
 &self,
- input: &Input<'_>,
+ anchored: Anchored,
 start: Start,
- ) -> Result<LazyStateID, MatchError> {
+ ) -> Result<LazyStateID, StartError> {
 let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
 Anchored::No => start_index,
 Anchored::Yes => Start::len() + start_index,
 Anchored::Pattern(pid) => {
 if !self.dfa.get_config().get_starts_for_each_pattern() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 if pid.as_usize() >= self.dfa.pattern_len() {
 return Ok(self.dead_id());
diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs
index 604daf3c3..d134e7ec9 100644
--- a/regex-automata/src/hybrid/error.rs
+++ b/regex-automata/src/hybrid/error.rs
@@ -1,4 +1,4 @@
-use crate::{hybrid::id::LazyStateIDError, nfa};
+use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored};

 /// An error that occurs when initial construction of a lazy DFA fails.
 ///
@@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError {
 }
}

+/// An error that can occur when computing the start state for a search.
+///
+/// Computing a start state can fail for a few reasons, either
+/// based on incorrect configuration or even based on whether
+/// the look-behind byte triggers a quit state. You typically
+/// do not need to handle this error if you're using
+/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward)
+/// (or its reverse counterpart), as that routine automatically converts
+/// `StartError` to a [`MatchError`](crate::MatchError) for you.
+///
+/// This error may be returned by the
+/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine.
+///
+/// This error implements the `std::error::Error` trait when the `std` feature
+/// is enabled.
+///
+/// This error is marked as non-exhaustive. New variants may be added in a
+/// semver compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug)]
+pub enum StartError {
+ /// An error that occurs when cache efficiency has dropped below the
+ /// configured heuristic thresholds.
+ Cache {
+ /// The underlying cache error that occurred.
+ err: CacheError,
+ },
+ /// An error that occurs when a starting configuration's look-behind byte
+ /// is in this DFA's quit set.
+ Quit {
+ /// The quit byte that was found.
+ byte: u8,
+ },
+ /// An error that occurs when the caller requests an anchored mode that
+ /// isn't supported by the DFA.
+ UnsupportedAnchored {
+ /// The unsupported anchored mode that was given.
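+ ///
+ /// For example, [`Anchored::Pattern`] is only supported by the lazy
+ /// DFA when it was built with `starts_for_each_pattern` enabled.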
+ mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/regex-automata/src/hybrid/mod.rs b/regex-automata/src/hybrid/mod.rs index 44e67e129..2feb839d1 100644 --- a/regex-automata/src/hybrid/mod.rs +++ b/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index bb739df1d..b3eef64e6 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -40,6 +40,7 @@ pub mod look; pub mod pool; pub mod prefilter; pub mod primitives; +pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; @@ -52,6 +53,5 @@ pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; -pub(crate) mod start; pub(crate) mod unicode_data; pub(crate) mod utf8; diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs index 4e360d083..f2d1922c9 100644 --- a/regex-automata/src/util/start.rs +++ b/regex-automata/src/util/start.rs @@ -1,17 +1,195 @@ /*! -Provides some helpers for dealing with start state configurations in DFAs. - -[`Start`] represents the possible starting configurations, while -[`StartByteMap`] represents a way to retrieve the `Start` configuration for a -given position in a haystack. +Provides helpers for dealing with start state configurations in DFAs. 
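+
+For example, a start [`Config`] can be built up explicitly. This is a minimal
+sketch using only the constructor and accessor methods defined below:
+
+```
+use regex_automata::{util::start, Anchored};
+
+let config = start::Config::new()
+    .anchored(Anchored::Yes)
+    .look_behind(Some(b'\n'));
+assert_eq!(Anchored::Yes, config.get_anchored());
+assert_eq!(Some(b'\n'), config.get_look_behind());
+```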
*/

use crate::util::{
 look::LookMatcher,
- search::Input,
+ search::{Anchored, Input},
 wire::{self, DeserializeError, SerializeError},
};

+/// The configuration used to determine a DFA's start state for a search.
+///
+/// A DFA has a single starting state in the typical textbook description. That
+/// is, it corresponds to the set of all starting states for the NFA from which
+/// it was built, along with their epsilon closures. In this crate, however,
+/// DFAs have many possible start states due to a few factors:
+///
+/// * DFAs support the ability to run either anchored or unanchored searches.
+/// Each type of search needs its own start state. For example, an unanchored
+/// search requires starting at a state corresponding to a regex with a
+/// `(?s-u:.)*?` prefix, which will match through anything.
+/// * DFAs also optionally support starting an anchored search for any one
+/// specific pattern. Each such pattern requires its own start state.
+/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
+/// the DFA will need to inspect a single byte immediately before the start of
+/// the search to choose the correct start state.
+///
+/// Indeed, this configuration precisely encapsulates all of the above factors.
+/// The [`Config::anchored`] method sets which kind of anchored search to
+/// perform, while the [`Config::look_behind`] method provides a way to set
+/// the byte that occurs immediately before the start of the search.
+///
+/// Generally speaking, this type is only useful when you want to run searches
+/// without using an [`Input`](crate::Input). In particular, an `Input` wants a
+/// haystack slice, but callers may not have a contiguous sequence of bytes as
+/// a haystack in all cases. This type provides a lower level of control such
+/// that callers can provide their own anchored configuration and look-behind
+/// byte explicitly.
+///
+/// # Example
+///
+/// This shows basic usage that permits running a search with a DFA without
+/// using the `Input` abstraction.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::start,
+/// Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// let config = start::Config::new().anchored(Anchored::Yes);
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter() {
+/// state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// assert!(dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This example shows how to correctly run a search that doesn't begin at
+/// the start of a haystack. Notice how we set the look-behind byte, and as
+/// a result, the `\b` assertion does not match.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::start,
+/// Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// let config = start::Config::new()
+/// .anchored(Anchored::Yes)
+/// .look_behind(Some(b'q'));
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter().skip(1) {
+/// state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// // No match!
+/// assert!(!dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// If we had instead not set a look-behind byte, then the DFA would assume
+/// that it was starting at the beginning of the haystack, and thus `\b` should
+/// match. This in turn would result in erroneously reporting a match:
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::start,
+/// Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// // Whoops, forgot the look-behind byte...
+/// let config = start::Config::new().anchored(Anchored::Yes);
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter().skip(1) {
+/// state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// // And now we get a match unexpectedly.
+/// assert!(dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+ look_behind: Option<u8>,
+ anchored: Anchored,
+}
+
+impl Config {
+ /// Create a new default start configuration.
+ ///
+ /// The default is an unanchored search that starts at the beginning of the
+ /// haystack.
+ pub fn new() -> Config {
+ Config { anchored: Anchored::No, look_behind: None }
+ }
+
+ /// A convenience routine for building a start configuration from an
+ /// [`Input`] for a forward search.
+ ///
+ /// This automatically sets the look-behind byte to the byte immediately
+ /// preceding the start of the search. If the start of the search is at
+ /// offset `0`, then no look-behind byte is set.
+ pub fn from_input_forward(input: &Input<'_>) -> Config {
+ let look_behind = input
+ .start()
+ .checked_sub(1)
+ .and_then(|i| input.haystack().get(i).copied());
+ Config { look_behind, anchored: input.get_anchored() }
+ }
+
+ /// A convenience routine for building a start configuration from an
+ /// [`Input`] for a reverse search.
+ ///
+ /// This automatically sets the look-behind byte to the byte immediately
+ /// following the end of the search. If the end of the search is at
+ /// offset `haystack.len()`, then no look-behind byte is set.
+ pub fn from_input_reverse(input: &Input<'_>) -> Config {
+ let look_behind = input.haystack().get(input.end()).copied();
+ Config { look_behind, anchored: input.get_anchored() }
+ }
+
+ /// Set the look-behind byte at the start of a search.
+ ///
+ /// Unless the search is intended to logically start at the beginning of a
+ /// haystack, this should _always_ be set to the byte immediately preceding
+ /// the start of the search. If no look-behind byte is set, then the start
+ /// configuration will assume it is at the beginning of the haystack. For
+ /// example, the anchor `^` will match.
+ ///
+ /// The default is that no look-behind byte is set.
+ pub fn look_behind(mut self, byte: Option<u8>) -> Config {
+ self.look_behind = byte;
+ self
+ }
+
+ /// Set the anchored mode of a search.
+ ///
+ /// The default is an unanchored search.
+ pub fn anchored(mut self, mode: Anchored) -> Config {
+ self.anchored = mode;
+ self
+ }
+
+ /// Return the look-behind byte in this configuration, if one exists.
+ pub fn get_look_behind(&self) -> Option<u8> {
+ self.look_behind
+ }
+
+ /// Return the anchored mode in this configuration.
+ pub fn get_anchored(&self) -> Anchored {
+ self.anchored
+ }
+}
+
 /// A map from every possible byte value to its corresponding starting
 /// configuration.
 ///
@@ -71,30 +249,11 @@ impl StartByteMap {
 StartByteMap { map }
 }

- /// Return the forward starting configuration for the given `input`.
- #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); From ac51c5cf3bac69e226f29db9c3a5f8720763678a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 11:52:10 -0400 Subject: [PATCH 10/33] automata: fix doc links --- regex-automata/src/dfa/dense.rs | 8 ++-- regex-automata/src/dfa/regex.rs | 2 +- regex-automata/src/dfa/sparse.rs | 75 +++++++++++++----------------- regex-automata/src/hybrid/dfa.rs | 16 +++---- regex-automata/src/hybrid/regex.rs | 2 +- regex-automata/src/util/start.rs | 6 +-- 6 files changed, 50 insertions(+), 59 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 7af38b546..25dcac989 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -66,8 +66,9 @@ const VERSION: u32 = 2; /// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if -/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by -/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. 
+/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is +/// not by default) and an [`Anchored::Pattern`] mode is requested via +/// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { @@ -113,8 +114,7 @@ impl Config { /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. /// - /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for - /// an example. + /// See [`Automaton::accelerator`] for an example. /// /// This is enabled by default. pub fn accelerate(mut self, yes: bool) -> Config { diff --git a/regex-automata/src/dfa/regex.rs b/regex-automata/src/dfa/regex.rs index f39c1c055..5e7e6e38a 100644 --- a/regex-automata/src/dfa/regex.rs +++ b/regex-automata/src/dfa/regex.rs @@ -853,7 +853,7 @@ impl Builder { } /// Set the dense DFA compilation configuration for this builder using - /// [`dense::Config`](dense::Config). + /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index a5ccf9add..7862d48a2 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). -Unlike the [`dense`](super::dense) module, this module does not contain a -builder or configuration specific for sparse DFAs. Instead, the intended -way to build a sparse DFA is either by using a default configuration with -its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the -construction of a dense DFA with [`dense::Builder`](super::dense::Builder) -and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For -example, this configures a sparse DFA to do an overlapping search: +Unlike the [`dense`] module, this module does not contain a builder or +configuration specific for sparse DFAs. Instead, the intended way to build a +sparse DFA is either by using a default configuration with its constructor +[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a +dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. +For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ @@ -74,18 +73,17 @@ const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// -/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses -/// a more space efficient representation for its transitions. Consequently, -/// sparse DFAs may use much less memory than dense DFAs, but this comes at a -/// price. In particular, reading the more space efficient transitions takes -/// more work, and consequently, searching using a sparse DFA is typically -/// slower than a dense DFA. +/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient +/// representation for its transitions. Consequently, sparse DFAs may use much +/// less memory than dense DFAs, but this comes at a price. In particular, +/// reading the more space efficient transitions takes more work, and +/// consequently, searching using a sparse DFA is typically slower than a dense +/// DFA. /// /// A sparse DFA can be built using the default configuration via the -/// [`DFA::new`] constructor. 
Otherwise, one can configure various aspects
-/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
-/// and then convert a dense DFA to a sparse DFA using
-/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
+/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a
+/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse
+/// DFA using [`dense::DFA::to_sparse`].
 ///
 /// In general, a sparse DFA supports all the same search operations as a dense
 /// DFA.
@@ -140,11 +138,9 @@ impl DFA<Vec<u8>> {
 /// Parse the given regular expression using a default configuration and
 /// return the corresponding sparse DFA.
 ///
- /// If you want a non-default configuration, then use
- /// the [`dense::Builder`](crate::dfa::dense::Builder)
- /// to set your own configuration, and then call
- /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
- /// a sparse DFA.
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`] to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`] to create a sparse DFA.
 ///
 /// # Example
 ///
@@ -167,11 +163,9 @@ impl DFA<Vec<u8>> {
 /// Parse the given regular expressions using a default configuration and
 /// return the corresponding multi-DFA.
 ///
- /// If you want a non-default configuration, then use
- /// the [`dense::Builder`](crate::dfa::dense::Builder)
- /// to set your own configuration, and then call
- /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
- /// a sparse DFA.
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`] to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`] to create a sparse DFA.
 ///
 /// # Example
 ///
@@ -511,10 +505,9 @@ impl<T: AsRef<[u8]>> DFA<T> {
 /// * [`DFA::from_bytes`]
 /// * [`DFA::from_bytes_unchecked`]
 ///
- /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
- /// serialization methods, this does not add any initial padding to the
- /// returned bytes. Padding isn't required for sparse DFAs since they have
- /// no alignment requirements.
+ /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
+ /// not add any initial padding to the returned bytes. Padding isn't
+ /// required for sparse DFAs since they have no alignment requirements.
 ///
 /// # Example
 ///
@@ -553,10 +546,9 @@ impl<T: AsRef<[u8]>> DFA<T> {
 /// * [`DFA::from_bytes`]
 /// * [`DFA::from_bytes_unchecked`]
 ///
- /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
- /// serialization methods, this does not add any initial padding to the
- /// returned bytes. Padding isn't required for sparse DFAs since they have
- /// no alignment requirements.
+ /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
+ /// not add any initial padding to the returned bytes. Padding isn't
+ /// required for sparse DFAs since they have no alignment requirements.
 ///
 /// # Example
 ///
@@ -595,10 +587,9 @@ impl<T: AsRef<[u8]>> DFA<T> {
 /// * [`DFA::from_bytes`]
 /// * [`DFA::from_bytes_unchecked`]
 ///
- /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
- /// serialization methods, this does not add any initial padding to the
- /// returned bytes. Padding isn't required for sparse DFAs since they have
- /// no alignment requirements.
+ /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
+ /// not add any initial padding to the returned bytes. Padding isn't
+ /// required for sparse DFAs since they have no alignment requirements.
 ///
 /// Generally speaking, native endian format should only be used when
 /// you know that the target you're compiling the DFA for matches the
@@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> {
 ///
 /// If any of the above are not true, then an error will be returned.
 ///
- /// Note that unlike deserializing a
- /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
- /// no alignment requirements. That is, an alignment of `1` is valid.
+ /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
+ /// DFA has no alignment requirements. That is, an alignment of `1` is
+ /// valid.
 ///
 /// # Panics
 ///
diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs
index 102cfb6fe..9466e1e76 100644
--- a/regex-automata/src/hybrid/dfa.rs
+++ b/regex-automata/src/hybrid/dfa.rs
@@ -3213,12 +3213,12 @@ impl Config {
 /// be quit bytes _only_ when a Unicode word boundary is present in the
 /// pattern.
 ///
- /// When enabling this option, callers _must_ be prepared to handle
- /// a [`MatchError`](crate::MatchError) error during search.
- /// When using a [`Regex`](crate::hybrid::regex::Regex), this
- /// corresponds to using the `try_` suite of methods. Alternatively,
- /// if callers can guarantee that their input is ASCII only, then a
- /// [`MatchError::quit`] error will never be returned while searching.
+ /// When enabling this option, callers _must_ be prepared to
+ /// handle a [`MatchError`] error during search. When using a
+ /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods. Alternatively, if callers can guarantee that
+ /// their input is ASCII only, then a [`MatchError::quit`] error will never
+ /// be returned while searching.
 ///
 /// This is disabled by default.
 ///
@@ -3304,8 +3304,8 @@ impl Config {
 /// (The advantage being that non-ASCII quit bytes will only be added if a
 /// Unicode word boundary is in the pattern.)
 ///
- /// When enabling this option, callers _must_ be prepared to handle a
- /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// When enabling this option, callers _must_ be prepared to
+ /// handle a [`MatchError`] error during search. When using a
 /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
 /// `try_` suite of methods.
 ///
diff --git a/regex-automata/src/hybrid/regex.rs b/regex-automata/src/hybrid/regex.rs
index 75667daf9..b3b1fe317 100644
--- a/regex-automata/src/hybrid/regex.rs
+++ b/regex-automata/src/hybrid/regex.rs
@@ -878,7 +878,7 @@ impl Builder {
 }

 /// Set the lazy DFA compilation configuration for this builder using
- /// [`dfa::Config`](dfa::Config).
+ /// [`dfa::Config`].
 ///
 /// This permits setting things like whether Unicode word boundaries should
 /// be heuristically supported or how the cache behaves.
diff --git a/regex-automata/src/util/start.rs b/regex-automata/src/util/start.rs
index f2d1922c9..27153780e 100644
--- a/regex-automata/src/util/start.rs
+++ b/regex-automata/src/util/start.rs
@@ -31,9 +31,9 @@ use crate::util::{
 /// the byte that occurs immediately before the start of the search.
 ///
 /// Generally speaking, this type is only useful when you want to run searches
-/// without using an [`Input`](crate::Input). In particular, an `Input` wants a
-/// haystack slice, but callers may not have a contiguous sequence of bytes as
-/// a haystack in all cases. This type provides a lower level of control such
+/// without using an [`Input`].
In particular, an `Input` wants a haystack +/// slice, but callers may not have a contiguous sequence of bytes as a +/// haystack in all cases. This type provides a lower level of control such /// that callers can provide their own anchored configuration and look-behind /// byte explicitly. /// From 2e67b6ffa05e4fbfe856f076cbaa21f184f9676e Mon Sep 17 00:00:00 2001 From: Leachim <32847549+Licheam@users.noreply.github.com> Date: Sun, 23 Jul 2023 21:33:41 +0800 Subject: [PATCH 11/33] automata: fix one outdated regex-cli test command Ref #1053 --- regex-automata/src/dfa/dense.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 25dcac989..28b525eb7 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1233,8 +1233,8 @@ impl Builder { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match dense --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quitset.is_empty() { set.add_set(&quitset); } From 5e9204fd7e08d6ab1b7878873dea76009645e2eb Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 12:38:50 -0400 Subject: [PATCH 12/33] automata: fix more out-dated regex-cli commands That should cover all of them. Closes #1053 --- regex-automata/src/dfa/accel.rs | 13 +++++++------ regex-automata/src/dfa/automaton.rs | 2 +- regex-automata/src/dfa/dense.rs | 5 +++-- regex-automata/src/hybrid/dfa.rs | 10 ++++++---- regex-automata/src/hybrid/search.rs | 10 +++++----- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/map.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 6 ++---- regex-automata/src/nfa/thompson/range_trie.rs | 2 +- regex-automata/src/util/look.rs | 4 +++- 10 files changed, 30 insertions(+), 26 deletions(-) diff --git a/regex-automata/src/dfa/accel.rs b/regex-automata/src/dfa/accel.rs index 5ea2423dd..c0ba18ea8 100644 --- a/regex-automata/src/dfa/accel.rs +++ b/regex-automata/src/dfa/accel.rs @@ -6,15 +6,16 @@ // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its // DFA with regex-cli: // -// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC -// dense::DFA( +// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table // D 000000: // Q 000001: // *000002: -// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 -// >000004: \x00-` => 3, a => 4, b-\xFF => 3 -// 000005: \x00-\xFF => 2, EOI => 2 -// ) +// A 000003: \x00-` => 3, a => 8, b-\xFF => 3 +// A 000004: \x00-` => 4, a => 7, b-\xFF => 4 +// 000005: \x00-` => 4, b-\xFF => 4 +// 000006: \x00-` => 3, a => 6, b-\xFF => 3 +// 000007: \x00-\xFF => 2, EOI => 2 +// 000008: \x00-\xFF => 2, EOI => 2 // // In particular, state 3 is accelerated (shown via the 'A' indicator) since // the only way to leave that state once entered is to see an 'a' byte. If diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index cd597947e..fcfcf2997 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -1132,7 +1132,7 @@ pub unsafe trait Automaton { /// // implementation defined. /// // /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'. - /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`. + /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`. 
/// let id = StateID::new(3 * dfa.stride()).unwrap(); /// let accelerator = dfa.accelerator(id); /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated. diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 28b525eb7..c9fe3b381 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -1228,8 +1228,9 @@ impl Builder { } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other - // bytes. Otherwise, a non-quit byte may end up in the same class - // as a quit byte, and thus cause the DFA stop when it shouldn't. + // bytes. Otherwise, a non-quit byte may end up in the same + // class as a quit byte, and thus cause the DFA to stop when it + // shouldn't. // // Test case: // diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 9466e1e76..bd9179b19 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -2103,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -3830,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index f23283685..1f4a505db 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 065e9ef27..a188017d8 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1466,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. 
Here's an example to try: // - // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes diff --git a/regex-automata/src/nfa/thompson/map.rs b/regex-automata/src/nfa/thompson/map.rs index c36ce5386..c92d4c0b8 100644 --- a/regex-automata/src/nfa/thompson/map.rs +++ b/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 2108fa338..1f57f8ebd 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f..75c9b796b 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index a34ea1d75..81b4eb718 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -1024,7 +1024,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError { // There are perhaps other choices as well. Why did I stop at these 4? Because // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA // approach eventually, as the benefits of the DFA approach are somewhat -// compelling. The 'boundary-words-holmes' benchmark tests this: +// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that +// the commands below no longer work. If necessary, we should re-capitulate +// the benchmark from whole cloth in rebar.) 
//
// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
//
From 82d7153570e25c8604d74e3d3eeecaded157af93 Mon Sep 17 00:00:00 2001
From: Leachim <32847549+Licheam@users.noreply.github.com>
Date: Fri, 21 Jul 2023 20:32:37 +0800
Subject: [PATCH 13/33] syntax: optimize most of the IntervalSet routines

This reduces or eliminates allocation when combining Unicode classes
and should make some things faster. It's unlikely for these
optimizations to matter much in practice, but they are likely to help
in niche or pathological cases where there are a lot of ops in a class.

Closes #1051
---
 regex-syntax/src/hir/interval.rs | 282 ++++++++++++++++++++-----------
 1 file changed, 185 insertions(+), 97 deletions(-)

diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs
index e063390a8..e3051bf31 100644
--- a/regex-syntax/src/hir/interval.rs
+++ b/regex-syntax/src/hir/interval.rs
@@ -19,7 +19,7 @@ use crate::unicode;
 //
 // Some of the implementation complexity here is a result of me wanting to
 // preserve the sequential representation without using additional memory.
-// In many cases, we do use linear extra memory, but it is at most 2x and it
+// In some cases, we do use linear extra memory, but it is at most 2x and it
 // is amortized. If we relaxed the memory requirements, this implementation
 // could become much simpler. The extra memory is honestly probably OK, but
 // character classes (especially of the Unicode variety) can become quite
@@ -81,14 +81,45 @@ impl<I: Interval> IntervalSet<I> {

 /// Add a new interval to this set.
 pub fn push(&mut self, interval: I) {
- // TODO: This could be faster. e.g., Push the interval such that
- // it preserves canonicalization.
- self.ranges.push(interval);
- self.canonicalize();
 // We don't know whether the new interval added here is considered
 // case folded, so we conservatively assume that the entire set is
 // no longer case folded if it was previously.
 self.folded = false;
+
+ if self.ranges.is_empty() {
+ self.ranges.push(interval);
+ return;
+ }
+
+ // Find the first range that is not greater than the new interval.
+ // This is the first range that could possibly be unioned with the
+ // new interval.
+ let mut drain_end = self.ranges.len();
+ while drain_end > 0
+ && self.ranges[drain_end - 1].lower() > interval.upper()
+ && !self.ranges[drain_end - 1].is_contiguous(&interval)
+ {
+ drain_end -= 1;
+ }
+
+ // Try to union the new interval with old intervals backwards.
+ if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval)
+ {
+ self.ranges[drain_end - 1] =
+ self.ranges[drain_end - 1].union(&interval).unwrap();
+ for i in (0..drain_end - 1).rev() {
+ if let Some(union) =
+ self.ranges[drain_end - 1].union(&self.ranges[i])
+ {
+ self.ranges[drain_end - 1] = union;
+ } else {
+ self.ranges.drain(i + 1..drain_end - 1);
+ break;
+ }
+ }
+ } else {
+ self.ranges.insert(drain_end, interval);
+ }
 }

 /// Return an iterator over all intervals in this set.
@@ -192,34 +223,13 @@ impl<I: Interval> IntervalSet<I> {
 // Folks seem to suggest interval or segment trees, but I'd like to
 // avoid the overhead (both runtime and conceptual) of that.
 //
- // The following is basically my Shitty First Draft. Therefore, in
- // order to grok it, you probably need to read each line carefully.
- // Simplifications are most welcome!
- //
 // Remember, we can assume the canonical format invariant here, which
 // says that all ranges are sorted, not overlapping and not adjacent in
 // each class.
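+ // Strategy: `a` sweeps over our own ranges while `b` trails along
+ // `other.ranges`. Pieces of the result are appended past `drain_end`
+ // and the original prefix is drained at the end, so the subtraction
+ // reuses this set's own buffer instead of allocating a scratch set.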
let drain_end = self.ranges.len(); - let (mut a, mut b) = (0, 0); - 'LOOP: while a < drain_end && b < other.ranges.len() { - // Basically, the easy cases are when neither range overlaps with - // each other. If the `b` range is less than our current `a` - // range, then we can skip it and move on. - if other.ranges[b].upper() < self.ranges[a].lower() { - b += 1; - continue; - } - // ... similarly for the `a` range. If it's less than the smallest - // `b` range, then we can add it as-is. - if self.ranges[a].upper() < other.ranges[b].lower() { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; - continue; - } - // Otherwise, we have overlapping ranges. - assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + let mut b = 0; + for a in 0..drain_end { // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could @@ -231,47 +241,34 @@ impl IntervalSet { // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. - let mut range = self.ranges[a]; + self.ranges.push(self.ranges[a]); + // Only when `b` is not above `a`, `b` might apply to current + // `a` range. while b < other.ranges.len() - && !range.is_intersection_empty(&other.ranges[b]) + && other.ranges[b].lower() <= self.ranges[a].upper() { - let old_range = range; - range = match range.difference(&other.ranges[b]) { - (None, None) => { - // We lost the entire range, so move on to the next - // without adding this one. - a += 1; - continue 'LOOP; + match self.ranges.pop().unwrap().difference(&other.ranges[b]) { + (Some(range1), None) | (None, Some(range1)) => { + self.ranges.push(range1); } - (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); - range2 + self.ranges.push(range2); } - }; - // It's possible that the `b` range has more to contribute - // here. In particular, if it is greater than the original - // range, then it might impact the next `a` range *and* it - // has impacted the current `a` range as much as possible, - // so we can quit. We don't bump `b` so that the next `a` - // range can apply it. - if other.ranges[b].upper() > old_range.upper() { - break; + (None, None) => {} } - // Otherwise, the next `b` range might apply to the current + // The next `b` range might apply to the current // `a` range. b += 1; } - self.ranges.push(range); - a += 1; - } - while a < drain_end { - let range = self.ranges[a]; - self.ranges.push(range); - a += 1; + // It's possible that the last `b` range has more to + // contribute to the next `a`. We don't bump the last + // `b` so that the next `a` range can apply it. + b = b.saturating_sub(1); } + self.ranges.drain(..drain_end); - self.folded = self.folded && other.folded; + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Compute the symmetric difference of the two sets, in place. @@ -282,11 +279,83 @@ impl IntervalSet { /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { - // TODO(burntsushi): Fix this so that it amortizes allocation. 
- let mut intersection = self.clone(); - intersection.intersect(other); - self.union(other); - self.difference(&intersection); + if self.ranges.is_empty() { + self.ranges.extend(&other.ranges); + self.folded = other.folded; + return; + } + if other.ranges.is_empty() { + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the symmetric difference to the end of this range, and then drain + // it before we're done. + let drain_end = self.ranges.len(); + let mut b = 0; + let mut b_range = Some(other.ranges[b]); + for a in 0..drain_end { + self.ranges.push(self.ranges[a]); + while b_range + .map_or(false, |r| r.lower() <= self.ranges[a].upper()) + { + let (range1, range2) = match self + .ranges + .pop() + .unwrap() + .symmetric_difference(&b_range.as_ref().unwrap()) + { + (Some(range1), None) | (None, Some(range1)) => { + (Some(range1), None) + } + (Some(range1), Some(range2)) => { + (Some(range1), Some(range2)) + } + (None, None) => (None, None), + }; + if let Some(range) = range1 { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + } + if let Some(range) = range2 { + self.ranges.push(range); + } + + b_range = if self.ranges.len() > drain_end + && self.ranges.last().unwrap().upper() + > self.ranges[a].upper() + { + Some(*self.ranges.last().unwrap()) + } else { + b += 1; + other.ranges.get(b).cloned() + }; + } + } + while let Some(range) = b_range { + if self.ranges.len() > drain_end + && self.ranges.last().unwrap().is_contiguous(&range) + { + self.ranges + .last_mut() + .map(|last| *last = last.union(&range).unwrap()); + } else { + self.ranges.push(range); + } + b += 1; + b_range = other.ranges.get(b).cloned(); + } + + self.ranges.drain(..drain_end); + self.folded = self.ranges.is_empty() || (self.folded && other.folded); } /// Negate this interval set. @@ -302,28 +371,44 @@ impl IntervalSet { return; } - // There should be a way to do this in-place with constant memory, - // but I couldn't figure out a simple way to do it. So just append - // the negation to the end of this range, and then drain it before - // we're done. - let drain_end = self.ranges.len(); - // We do checked arithmetic below because of the canonical ordering // invariant. 
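        // In particular, canonical ranges are sorted, non-overlapping and
        // non-adjacent, so the `increment()` and `decrement()` calls below
        // cannot overflow except at the bound extremes, and those cases are
        // guarded explicitly.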
 if self.ranges[0].lower() > I::Bound::min_value() {
-            let upper = self.ranges[0].lower().decrement();
-            self.ranges.push(I::create(I::Bound::min_value(), upper));
-        }
-        for i in 1..drain_end {
-            let lower = self.ranges[i - 1].upper().increment();
-            let upper = self.ranges[i].lower().decrement();
-            self.ranges.push(I::create(lower, upper));
-        }
-        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
-            let lower = self.ranges[drain_end - 1].upper().increment();
-            self.ranges.push(I::create(lower, I::Bound::max_value()));
+            let mut pre_upper = self.ranges[0].upper();
+            self.ranges[0] = I::create(
+                I::Bound::min_value(),
+                self.ranges[0].lower().decrement(),
+            );
+            for i in 1..self.ranges.len() {
+                let lower = pre_upper.increment();
+                pre_upper = self.ranges[i].upper();
+                self.ranges[i] =
+                    I::create(lower, self.ranges[i].lower().decrement());
+            }
+            if pre_upper < I::Bound::max_value() {
+                self.ranges.push(I::create(
+                    pre_upper.increment(),
+                    I::Bound::max_value(),
+                ));
+            }
+        } else {
+            for i in 1..self.ranges.len() {
+                self.ranges[i - 1] = I::create(
+                    self.ranges[i - 1].upper().increment(),
+                    self.ranges[i].lower().decrement(),
+                );
+            }
+            if self.ranges.last().unwrap().upper() < I::Bound::max_value() {
+                self.ranges.last_mut().map(|range| {
+                    *range = I::create(
+                        range.upper().increment(),
+                        I::Bound::max_value(),
+                    )
+                });
+            } else {
+                self.ranges.pop();
+            }
         }
-        self.ranges.drain(..drain_end);
 
         // We don't need to update whether this set is folded or not, because
         // it is conservatively preserved through negation. Namely, if a set
         // is not folded, then it is possible that its negation is folded, for
@@ -337,6 +422,7 @@ impl IntervalSet {
         // of case folded characters. Negating it in turn means that all
         // equivalence classes in the set are negated, and any equivalence
         // class that was previously not in the set is now entirely in the set.
+        self.folded = self.ranges.is_empty() || self.folded;
     }
 
     /// Converts this set into a canonical ordering.
@@ -347,24 +433,20 @@ impl IntervalSet {
         self.ranges.sort();
         assert!(!self.ranges.is_empty());
 
-        // Is there a way to do this in-place with constant memory? I couldn't
-        // figure out a way to do it. So just append the canonicalization to
-        // the end of this range, and then drain it before we're done.
-        let drain_end = self.ranges.len();
-        for oldi in 0..drain_end {
-            // If we've added at least one new range, then check if we can
-            // merge this range in the previously added range.
-            if self.ranges.len() > drain_end {
-                let (last, rest) = self.ranges.split_last_mut().unwrap();
-                if let Some(union) = last.union(&rest[oldi]) {
-                    *last = union;
-                    continue;
-                }
+        // We maintain the canonicalization results in-place at `0..=newi`.
+        // `newi` keeps track of the index of the last canonicalized range.
+        let mut newi = 0;
+        for oldi in 1..self.ranges.len() {
+            // The last new range gets merged with the current old range when
+            // unionable. If not, we update `newi` and store it as a new range.
+            if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) {
+                self.ranges[newi] = union;
+            } else {
+                newi += 1;
+                self.ranges[newi] = self.ranges[oldi];
             }
-            let range = self.ranges[oldi];
-            self.ranges.push(range);
         }
-        self.ranges.drain(..drain_end);
+        self.ranges.truncate(newi + 1);
     }
 
     /// Returns true if and only if this class is in a canonical ordering.
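The in-place compaction in `canonicalize` above is easier to see outside the
`Interval` trait. Here is a minimal standalone sketch of the same
`newi`/`oldi` technique over plain `(u32, u32)` inclusive ranges; the type,
function name and values are stand-ins for illustration, not part of this
crate:

    /// Sorts and merges overlapping or adjacent inclusive ranges in place,
    /// mirroring the `newi`/`oldi` compaction in `canonicalize`.
    fn canonicalize_ranges(ranges: &mut Vec<(u32, u32)>) {
        if ranges.is_empty() {
            return;
        }
        ranges.sort();
        let mut newi = 0;
        for oldi in 1..ranges.len() {
            let (lo, hi) = ranges[oldi];
            // Union with the range at `newi` when overlapping or adjacent,
            // otherwise advance `newi` and start a new canonical range.
            if lo <= ranges[newi].1.saturating_add(1) {
                ranges[newi].1 = ranges[newi].1.max(hi);
            } else {
                newi += 1;
                ranges[newi] = (lo, hi);
            }
        }
        ranges.truncate(newi + 1);
    }

    fn main() {
        let mut rs = vec![(5, 7), (0, 2), (3, 4), (10, 12)];
        canonicalize_ranges(&mut rs);
        assert_eq!(rs, vec![(0, 7), (10, 12)]);
    }

The invariant is that `0..=newi` always holds canonical ranges, so no scratch
space is ever needed.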
@@ -486,7 +568,13 @@ pub trait Interval: other: &Self, ) -> (Option, Option) { let union = match self.union(other) { - None => return (Some(self.clone()), Some(other.clone())), + None => { + return if self.upper() < other.lower() { + (Some(self.clone()), Some(other.clone())) + } else { + (Some(other.clone()), Some(self.clone())) + } + } Some(union) => union, }; let intersection = match self.intersect(other) { From a5aa23372ed2261b18ab21a69ab0bef58daeab33 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 09:19:57 -0400 Subject: [PATCH 14/33] syntax and automata: bump LookSet representation from u16 to u32 This is in preparation for adding 8 new word boundary look-around assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}, along with Unicode and ASCII-only variants of each. Ref #469 --- regex-automata/src/dfa/dense.rs | 8 ++-- regex-automata/src/dfa/onepass.rs | 2 +- regex-automata/src/util/determinize/state.rs | 39 ++++++++++---------- regex-automata/src/util/look.rs | 26 +++++++------ regex-automata/tests/hybrid/api.rs | 4 +- regex-syntax/src/hir/mod.rs | 26 +++++++------ 6 files changed, 55 insertions(+), 50 deletions(-) diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index c9fe3b381..902f4b273 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -882,20 +882,20 @@ impl Config { /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 600KB isn't enough! + /// // 700KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(600_000)) + /// .determinize_size_limit(Some(700_000)) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 700KB probably is! + /// // ... but 800KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(700_000)) + /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 353bb1e17..e62bbd383 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -2815,7 +2815,7 @@ impl Epsilons { /// Return the set of look-around assertions in these epsilon transitions. fn looks(self) -> LookSet { - LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. diff --git a/regex-automata/src/util/determinize/state.rs b/regex-automata/src/util/determinize/state.rs index e64123587..effa6f44d 100644 --- a/regex-automata/src/util/determinize/state.rs +++ b/regex-automata/src/util/determinize/state.rs @@ -197,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -348,16 +348,17 @@ impl StateBuilderNFA { /// generated by a transition over a "word" byte. (Callers may not always set /// this. For example, if the NFA has no word boundary assertion, then needing /// to track whether a state came from a word byte or not is superfluous and -/// wasteful.) +/// wasteful.) 
Bit 3 is set to 1 if the state was generated by a transition +/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is +/// enabled. /// -/// Byte 1 corresponds to the look-behind assertions that were satisfied by -/// the transition that created this state. This generally only includes the -/// StartLF and Start assertions. (Look-ahead assertions are not tracked as -/// part of states. Instead, these are applied by re-computing the epsilon -/// closure of a state when computing the transition function. See `next` in -/// the parent module.) +/// Bytes 1..5 correspond to the look-behind assertions that were satisfied +/// by the transition that created this state. (Look-ahead assertions are not +/// tracked as part of states. Instead, these are applied by re-computing the +/// epsilon closure of a state when computing the transition function. See +/// `next` in the parent module.) /// -/// Byte 2 corresponds to the set of look-around assertions (including both +/// Bytes 5..9 correspond to the set of look-around assertions (including both /// look-behind and look-ahead) that appear somewhere in this state's set of /// NFA state IDs. This is used to determine whether this state's epsilon /// closure should be re-computed when computing the transition function. @@ -366,7 +367,7 @@ impl StateBuilderNFA { /// function, we should only re-compute the epsilon closure if those new /// assertions are relevant to this particular state. /// -/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer /// corresponding to the number of patterns encoded in this state. If the state /// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is /// PatternID::ZERO, then no integer is encoded at this position. Instead, byte @@ -452,7 +453,7 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::read_repr(&self.0[3..]) + LookSet::read_repr(&self.0[5..]) } /// Returns the total number of match pattern IDs in this state. @@ -476,7 +477,7 @@ impl<'a> Repr<'a> { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 9 + index * PatternID::SIZE; + let offset = 13 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. wire::read_pattern_id_unchecked(&self.0[offset..]).0 @@ -507,7 +508,7 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[9..self.pattern_offset_end()]; + let mut pids = &self.0[13..self.pattern_offset_end()]; while !pids.is_empty() { let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; @@ -539,11 +540,11 @@ impl<'a> Repr<'a> { fn pattern_offset_end(&self) -> usize { let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 5; + return 9; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -557,7 +558,7 @@ impl<'a> Repr<'a> { } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. 
- usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() } } @@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> { /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { - set(self.look_need()).write_repr(&mut self.0[3..]); + set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least @@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 9; + let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - wire::NE::write_u32(count32, &mut self.0[5..9]); + wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index 81b4eb718..f87b963ad 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -125,17 +125,17 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0001 => Some(Look::Start), 0b00_0000_0010 => Some(Look::End), @@ -191,7 +191,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -379,29 +379,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } /// Checks that all assertions in this set can be matched. 
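To make the widened wire format concrete, here is a sketch of the round trip
with a bare `u32` standing in for `LookSet::bits`; the buffer and the `5..9`
offset merely mirror the determinizer state layout described earlier and are
illustrative, not an API:

    fn main() {
        // Two assertions set: bit 0 (Start) and bit 17 (WordEndHalfUnicode).
        let bits: u32 = (1 << 0) | (1 << 17);
        // A stand-in for a determinize state's byte representation, where
        // the `look_need` set lives at bytes 5..9.
        let mut state = vec![0u8; 13];
        // write_repr: a native-endian u32, now 4 bytes instead of 2.
        state[5..9].copy_from_slice(&bits.to_ne_bytes());
        // read_repr: the reverse.
        let read = u32::from_ne_bytes(state[5..9].try_into().unwrap());
        assert_eq!(bits, read);
    }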
@@ -456,9 +458,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } diff --git a/regex-automata/tests/hybrid/api.rs b/regex-automata/tests/hybrid/api.rs index e82d808e3..4b04c4f8f 100644 --- a/regex-automata/tests/hybrid/api.rs +++ b/regex-automata/tests/hybrid/api.rs @@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); - let err = MatchError::gave_up(25); + let err = MatchError::gave_up(24); // Notice that we make the same amount of progress in each search! That's // because the cache is reused and already has states to handle the first // N bytes. @@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. cache.reset(&dfa); - let err = MatchError::gave_up(27); + let err = MatchError::gave_up(26); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index f8a3d4a9e..361ca41af 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1664,17 +1664,17 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] - pub const fn from_repr(repr: u16) -> Option { + pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0001 => Some(Look::Start), 0b00_0000_0010 => Some(Look::End), @@ -2600,7 +2600,7 @@ pub struct LookSet { /// range of `u16` values to be represented. For example, even if the /// current implementation only makes use of the 10 least significant bits, /// it may use more bits in a future semver compatible release. - pub bits: u16, + pub bits: u32, } impl LookSet { @@ -2788,29 +2788,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. 
#[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } } @@ -2843,9 +2845,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } From 8f77e22594a8c58a1130a3e04d82ac4de2a60675 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Oct 2023 21:58:38 -0400 Subject: [PATCH 15/33] syntax/ast: add support for additional word boundary assertions This adds AST support for the following new assertions: \b{start}, \b{end}, \b{start-half}, \b{end-half}, \< and \>. The last two, \< and \>, are aliases for \b{start} and \b{end}. The parsing for this is a little suspect since there's a little ambiguity between, e.g., \b{5} and \b{start}, but we handle it by allowing the parser to look for one of the new special assertions, and then back-up if it fails to find one so that it can try to parse a counted repetition. Ref #469 --- regex-syntax/src/ast/mod.rs | 47 +++++++ regex-syntax/src/ast/parse.rs | 226 ++++++++++++++++++++++++++++-- regex-syntax/src/ast/print.rs | 6 + regex-syntax/src/hir/translate.rs | 14 ++ regex-syntax/src/lib.rs | 3 + 5 files changed, 281 insertions(+), 15 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e0f92606..6a77ee134 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -162,6 +162,18 @@ pub enum ErrorKind { /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. RepetitionMissing, + /// The special word boundary syntax, `\b{something}`, was used, but + /// either EOF without `}` was seen, or an invalid character in the + /// braces was seen. + SpecialWordBoundaryUnclosed, + /// The special word boundary syntax, `\b{something}`, was used, but + /// `something` was not recognized as a valid word boundary kind. + SpecialWordBoundaryUnrecognized, + /// The syntax `\b{` was observed, but afterwards the end of the pattern + /// was observed without being able to tell whether it was meant to be a + /// bounded repetition on the `\b` or the beginning of a special word + /// boundary assertion. + SpecialWordOrRepetitionUnexpectedEof, /// The Unicode class is not valid. This typically occurs when a `\p` is /// followed by something other than a `{`. 
UnicodeClassInvalid, @@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind { RepetitionMissing => { write!(f, "repetition operator missing expression") } + SpecialWordBoundaryUnclosed => { + write!( + f, + "special word boundary assertion is either \ + unclosed or contains an invalid character", + ) + } + SpecialWordBoundaryUnrecognized => { + write!( + f, + "unrecognized special word boundary assertion, \ + valid choices are: start, end, start-half \ + or end-half", + ) + } + SpecialWordOrRepetitionUnexpectedEof => { + write!( + f, + "found either the beginning of a special word \ + boundary or a bounded repetition on a \\b with \ + an opening brace, but no closing brace", + ) + } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } @@ -1293,6 +1328,18 @@ pub enum AssertionKind { WordBoundary, /// `\B` NotWordBoundary, + /// `\b{start}` + WordBoundaryStart, + /// `\b{end}` + WordBoundaryEnd, + /// `\<` (alias for `\b{start}`) + WordBoundaryStartAngle, + /// `\>` (alias for `\b{end}`) + WordBoundaryEndAngle, + /// `\b{start-half}` + WordBoundaryStartHalf, + /// `\b{end-half}` + WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index f7bae7759..593b14fbc 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1528,18 +1528,115 @@ impl<'s, P: Borrow> ParserI<'s, P> { span, kind: ast::AssertionKind::EndText, })), - 'b' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundary, - })), + 'b' => { + let mut wb = ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + }; + // After a \b, we "try" to parse things like \b{start} for + // special word boundary assertions. + if !self.is_eof() && self.char() == '{' { + if let Some(kind) = + self.maybe_parse_special_word_boundary(start)? + { + wb.kind = kind; + wb.span.end = self.pos(); + } + } + Ok(Primitive::Assertion(wb)) + } 'B' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::NotWordBoundary, })), + '<' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryStartAngle, + })), + '>' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundaryEndAngle, + })), _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), } } + /// Attempt to parse a specialty word boundary. That is, `\b{start}`, + /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. + /// + /// This is similar to `maybe_parse_ascii_class` in that, in most cases, + /// if it fails it will just return `None` with no error. This is done + /// because `\b{5}` is a valid expression and we want to let that be parsed + /// by the existing counted repetition parsing code. (I thought about just + /// invoking the counted repetition code from here, but it seemed a little + /// ham-fisted.) + /// + /// Unlike `maybe_parse_ascii_class` though, this can return an error. + /// Namely, if we definitely know it isn't a counted repetition, then we + /// return an error specific to the specialty word boundaries. + /// + /// This assumes the parser is positioned at a `{` immediately following + /// a `\b`. When `None` is returned, the parser is returned to the position + /// at which it started: pointing at a `{`. + /// + /// The position given should correspond to the start of the `\b`. 
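+    ///
+    /// For example, when parsing `\b{start}`, `wb_start` points at the `\`
+    /// of the `\b` while the parser itself is positioned at the `{`.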
+ fn maybe_parse_special_word_boundary( + &self, + wb_start: Position, + ) -> Result> { + assert_eq!(self.char(), '{'); + + let is_valid_char = |c| match c { + 'A'..='Z' | 'a'..='z' | '-' => true, + _ => false, + }; + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(wb_start, self.pos()), + ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + )); + } + let start_contents = self.pos(); + // This is one of the critical bits: if the first non-whitespace + // character isn't in [-A-Za-z] (i.e., this can't be a special word + // boundary), then we bail and let the counted repetition parser deal + // with this. + if !is_valid_char(self.char()) { + self.parser().pos.set(start); + return Ok(None); + } + + // Now collect up our chars until we see a '}'. + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + while !self.is_eof() && is_valid_char(self.char()) { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::SpecialWordBoundaryUnclosed, + )); + } + let end = self.pos(); + self.bump(); + let kind = match scratch.as_str() { + "start" => ast::AssertionKind::WordBoundaryStart, + "end" => ast::AssertionKind::WordBoundaryEnd, + "start-half" => ast::AssertionKind::WordBoundaryStartHalf, + "end-half" => ast::AssertionKind::WordBoundaryEndHalf, + _ => { + return Err(self.error( + Span::new(start_contents, end), + ast::ErrorKind::SpecialWordBoundaryUnrecognized, + )) + } + }; + Ok(Some(kind)) + } + /// Parse an octal representation of a Unicode codepoint up to 3 digits /// long. This expects the parser to be positioned at the first octal /// digit and advances the parser to the first character immediately @@ -1967,9 +2064,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { // because parsing cannot fail with any interesting error. For example, // in order to use an ASCII character class, it must be enclosed in // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character characters have the syntax `[:NAME:]` - // which can only appear within character brackets." This means that - // things like `[[:lower:]A]` are legal constructs. + // of it as "ASCII character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. 
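+        // (For example, `[[:alpha:][:digit:]]` is two ASCII classes unioned
+        // inside a single bracketed class.)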
// // However, if one types an incorrect ASCII character class, e.g., // `[[:loower:]]`, then we treat that as a normal nested character @@ -3295,6 +3392,23 @@ bar ast: Box::new(lit('a', 0)), })) ); + assert_eq!( + parser(r"\b{5,9}").parse(), + Ok(Ast::repetition(ast::Repetition { + span: span(0..7), + op: ast::RepetitionOp { + span: span(2..7), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(Ast::assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })), + })) + ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), @@ -4381,6 +4495,48 @@ bar kind: ast::AssertionKind::WordBoundary, })) ); + assert_eq!( + parser(r"\b{start}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..9), + kind: ast::AssertionKind::WordBoundaryStart, + })) + ); + assert_eq!( + parser(r"\b{end}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..7), + kind: ast::AssertionKind::WordBoundaryEnd, + })) + ); + assert_eq!( + parser(r"\b{start-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..14), + kind: ast::AssertionKind::WordBoundaryStartHalf, + })) + ); + assert_eq!( + parser(r"\b{end-half}").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..12), + kind: ast::AssertionKind::WordBoundaryEndHalf, + })) + ); + assert_eq!( + parser(r"\<").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryStartAngle, + })) + ); + assert_eq!( + parser(r"\>").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundaryEndAngle, + })) + ); assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { @@ -4418,20 +4574,60 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); - // But also, < and > are banned, so that we may evolve them into - // start/end word boundary assertions. (Not sure if we will...) + + // Starting a special word boundary without any non-whitespace chars + // after the brace makes it ambiguous whether the user meant to write + // a counted repetition (probably not?) or an actual special word + // boundary assertion. assert_eq!( - parser(r"\<").parse_escape().unwrap_err(), + parser(r"\b{").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..3), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( - parser(r"\>").parse_escape().unwrap_err(), + parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, + span: span(0..4), + kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, + } + ); + // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, + // and thus causes the parser to treat it as a counted repetition. + assert_eq!( + parser(r"\b{ ").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + // In this case, we got some valid chars that makes it look like the + // user is writing one of the special word boundary assertions, but + // we forget to close the brace. 
+ assert_eq!( + parser(r"\b{foo").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // We get the same error as above, except it is provoked by seeing a + // char that we know is invalid before seeing a closing brace. + assert_eq!( + parser(r"\b{foo!}").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, + } + ); + // And this one occurs when, syntactically, everything looks okay, but + // we don't use a valid spelling of a word boundary assertion. + assert_eq!( + parser(r"\b{foo}").parse_escape().unwrap_err(), + TestError { + span: span(3..6), + kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 7dedf7f48..1ceb3c7fa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -261,6 +261,12 @@ impl Writer { EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), + WordBoundaryStart => self.wtr.write_str(r"\b{start}"), + WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), + WordBoundaryStartAngle => self.wtr.write_str(r"\<"), + WordBoundaryEndAngle => self.wtr.write_str(r"\>"), + WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), + WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 56d261aa1..4ae279f92 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -962,6 +962,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } else { hir::Look::WordAsciiNegate }), + ast::AssertionKind::WordBoundaryStart + | ast::AssertionKind::WordBoundaryStartAngle => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryEnd + | ast::AssertionKind::WordBoundaryEndAngle => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryStartHalf => { + Hir::look(if unicode { todo!() } else { todo!() }) + } + ast::AssertionKind::WordBoundaryEndHalf => { + Hir::look(if unicode { todo!() } else { todo!() }) + } }) } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index a552099c6..38c8d88d4 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -334,6 +334,9 @@ pub fn is_escapeable_character(c: char) -> bool { // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. + // + // OK, now we support \< and \>, and we need to retain them as *not* + // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } From 37faa6e43aa0b4f9379148036868bd6c862cb4aa Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 09:59:51 -0400 Subject: [PATCH 16/33] syntax/hir: add new special word boundaries to HIR This builds on the previous commit to bring word boundary support to the HIR, and updates AST->HIR translation to produce them from the corresponding AST elements. 
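For example, with this change the translator maps the new syntax straight to
the new `Look` variants. An illustrative sketch against the public API,
assuming a build that includes this series:

    use regex_syntax::hir::{HirKind, Look};

    fn main() {
        let hir = regex_syntax::parse(r"\b{start}").unwrap();
        assert!(matches!(hir.kind(), HirKind::Look(Look::WordStartUnicode)));

        // Disabling Unicode selects the ASCII-only variant instead.
        let hir = regex_syntax::parse(r"(?-u)\b{start-half}").unwrap();
        assert!(matches!(hir.kind(), HirKind::Look(Look::WordStartHalfAscii)));
    }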
Ref #469 --- regex-syntax/src/hir/mod.rs | 95 ++++++++++++++++++++++++++----- regex-syntax/src/hir/print.rs | 24 ++++++++ regex-syntax/src/hir/translate.rs | 26 +++++++-- 3 files changed, 126 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 361ca41af..ce38ead7b 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1635,6 +1635,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. 
+ WordEndHalfUnicode = 1 << 17, } impl Look { @@ -1656,6 +1692,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -1676,16 +1720,24 @@ impl Look { #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -1710,6 +1762,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -2703,13 +2763,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. 
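As a quick illustration of the widened set and the new predicates, a usage
sketch of the public API (not code from this patch):

    use regex_syntax::hir::{Look, LookSet};

    fn main() {
        let set = LookSet::empty()
            .insert(Look::WordStartUnicode)
            .insert(Look::WordEndHalfAscii);
        // The new start/end and half variants count as word assertions too.
        assert!(set.contains_word_unicode());
        assert!(set.contains_word_ascii());
        // Iteration yields one assertion per set bit, lowest bit first.
        assert_eq!(2, set.iter().count());
    }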
@@ -3769,7 +3838,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -3787,6 +3856,6 @@ mod tests { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index aa737a092..dfa6d4032 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -202,6 +202,30 @@ impl Visitor for Writer { hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } + hir::Look::WordStartAscii => { + self.wtr.write_str(r"(?-u:\b{start})")?; + } + hir::Look::WordEndAscii => { + self.wtr.write_str(r"(?-u:\b{end})")?; + } + hir::Look::WordStartUnicode => { + self.wtr.write_str(r"\b{start}")?; + } + hir::Look::WordEndUnicode => { + self.wtr.write_str(r"\b{end}")?; + } + hir::Look::WordStartHalfAscii => { + self.wtr.write_str(r"(?-u:\b{start-half})")?; + } + hir::Look::WordEndHalfAscii => { + self.wtr.write_str(r"(?-u:\b{end-half})")?; + } + hir::Look::WordStartHalfUnicode => { + self.wtr.write_str(r"\b{start-half}")?; + } + hir::Look::WordEndHalfUnicode => { + self.wtr.write_str(r"\b{end-half}")?; + } }, HirKind::Capture(hir::Capture { ref name, .. }) => { self.wtr.write_str("(")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 4ae279f92..55ca074fa 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -964,18 +964,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { }), ast::AssertionKind::WordBoundaryStart | ast::AssertionKind::WordBoundaryStartAngle => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordStartUnicode + } else { + hir::Look::WordStartAscii + }) } ast::AssertionKind::WordBoundaryEnd | ast::AssertionKind::WordBoundaryEndAngle => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordEndUnicode + } else { + hir::Look::WordEndAscii + }) } ast::AssertionKind::WordBoundaryStartHalf => { - Hir::look(if unicode { todo!() } else { todo!() }) - } - ast::AssertionKind::WordBoundaryEndHalf => { - Hir::look(if unicode { todo!() } else { todo!() }) + Hir::look(if unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }) } + ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), }) } From 97f02051c3f78bf9874ff9bf8d0bf29c45782c17 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Oct 2023 18:04:56 -0400 Subject: [PATCH 17/33] automata: add special word boundaries to regex-automata In this commit, all of the regex engines now support the new special word boundary assertions: \b{start}, \b{end}, \b{start-half} and \b{end-half}. Of course, when they are Unicode-aware, the DFAs will quit upon seeing a non-ASCII character, just like for the \b and \B assertions. For now, we don't add support to the one-pass DFA, since it would either make it use more memory or reduce the number of capture groups it supports. I think these assertions will be rare enough that it isn't worth adding support yet. This is a breaking change because it adds new variants to the `Look` enum. 
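From a user's perspective, the behavior is easiest to see through the meta
engine. An illustrative sketch against the public API (the pattern and
haystack are made up):

    use regex_automata::meta::Regex;

    fn main() {
        // \b{start} only matches at the start of a word, so the "cat"
        // inside "concatenate" is skipped.
        let re = Regex::new(r"\b{start}cat").unwrap();
        let hay = "concatenate a cat";
        let found: Vec<&str> =
            re.find_iter(hay).map(|m| &hay[m.range()]).collect();
        assert_eq!(vec!["cat"], found);
    }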
--- regex-automata/src/nfa/thompson/compiler.rs | 8 + regex-automata/src/util/determinize/mod.rs | 60 +- regex-automata/src/util/look.rs | 898 ++++++++++++++++++-- regex-automata/tests/dfa/suite.rs | 6 +- regex-automata/tests/lib.rs | 1 + testdata/word-boundary-special.toml | 653 ++++++++++++++ tests/lib.rs | 1 + 7 files changed, 1563 insertions(+), 64 deletions(-) create mode 100644 testdata/word-boundary-special.toml diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index a188017d8..2d2172957 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1557,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index 30a82afb8..d320fabc3 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -145,9 +145,10 @@ pub(crate) fn next( } Some(_) => {} None => { - look_have = look_have.insert(Look::End); - look_have = look_have.insert(Look::EndLF); - look_have = look_have.insert(Look::EndCRLF); + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { @@ -160,11 +161,26 @@ pub(crate) fn next( look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { - look_have = look_have.insert(Look::WordUnicodeNegate); - look_have = look_have.insert(Look::WordAsciiNegate); + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); } else { - look_have = look_have.insert(Look::WordUnicode); - look_have = look_have.insert(Look::WordAscii); + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // assertions that exist in this state (that is, just because we added @@ -220,6 +236,14 @@ pub(crate) fn next( { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. 
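+        // (A start-half assertion constrains only the look-behind at a
+        // position, so nothing about the upcoming byte matters here.)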
+ if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } @@ -564,7 +588,12 @@ pub(crate) fn set_lookbehind_from_start( let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); match *start { - Start::NonWordByte => {} + Start::NonWordByte => { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } Start::WordByte => { builder.set_is_from_word(); } @@ -573,6 +602,8 @@ pub(crate) fn set_lookbehind_from_start( have.insert(Look::Start) .insert(Look::StartLF) .insert(Look::StartCRLF) + .insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) }); } Start::LineLF => { @@ -585,6 +616,10 @@ pub(crate) fn set_lookbehind_from_start( if lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } Start::LineCR => { if rev { @@ -595,6 +630,10 @@ pub(crate) fn set_lookbehind_from_start( if lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } Start::CustomLineTerminator => { builder.set_look_have(|have| have.insert(Look::StartLF)); @@ -604,6 +643,11 @@ pub(crate) fn set_lookbehind_from_start( // state as having come from a word byte. if utf8::is_word_byte(lineterm) { builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); } } } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index f87b963ad..ddf8fb129 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -96,6 +96,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. 
That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -117,6 +153,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -137,16 +181,24 @@ impl Look { #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { - 0b00_0000_0001 => Some(Look::Start), - 0b00_0000_0010 => Some(Look::End), - 0b00_0000_0100 => Some(Look::StartLF), - 0b00_0000_1000 => Some(Look::EndLF), - 0b00_0001_0000 => Some(Look::StartCRLF), - 0b00_0010_0000 => Some(Look::EndCRLF), - 0b00_0100_0000 => Some(Look::WordAscii), - 0b00_1000_0000 => Some(Look::WordAsciiNegate), - 0b01_0000_0000 => Some(Look::WordUnicode), - 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0000_0000_0001 => Some(Look::Start), + 0b00_0000_0000_0000_0010 => Some(Look::End), + 0b00_0000_0000_0000_0100 => Some(Look::StartLF), + 0b00_0000_0000_0000_1000 => Some(Look::EndLF), + 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), + 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), + 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), + 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), + 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), + 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), + 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), + 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), + 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), + 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), + 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), + 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), + 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), + 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } @@ -171,6 +223,14 @@ impl Look { Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', + Look::WordStartAscii => '<', + Look::WordEndAscii => '>', + Look::WordStartUnicode => '〈', + Look::WordEndUnicode => '〉', + Look::WordStartHalfAscii => '◁', + Look::WordEndHalfAscii => '▷', + Look::WordStartHalfUnicode => '◀', + Look::WordEndHalfUnicode => '▶', } } } @@ -294,13 +354,22 @@ impl LookSet { pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) + || self.contains(Look::WordStartUnicode) + || self.contains(Look::WordEndUnicode) + || self.contains(Look::WordStartHalfUnicode) + || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set 
contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -568,6 +637,23 @@ impl LookMatcher { } /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, @@ -588,6 +674,26 @@ impl LookMatcher { Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } } } @@ -682,6 +788,46 @@ impl LookMatcher { return false; } } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } true } @@ -705,7 +851,15 @@ impl LookMatcher { Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode - | Look::WordUnicodeNegate => { + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | 
Look::WordEndUnicode
+            | Look::WordStartHalfAscii
+            | Look::WordEndHalfAscii
+            | Look::WordStartHalfUnicode
+            | Look::WordEndHalfUnicode => {
                 // We need to mark all ranges of bytes whose pairs result in
                 // evaluating \b differently. This isn't technically correct
                 // for Unicode word boundaries, but DFAs can't handle those
@@ -933,6 +1087,177 @@
         };
         Ok(word_before == word_after)
     }
+
+    /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
+    /// position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_before && word_after
+    }
+
+    /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
+    /// position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        word_before && !word_after
+    }
+
+    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(!word_before && word_after)
+    }
+
+    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(word_before && !word_after)
+    }
+
+    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_half_ascii(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        !word_before
+    }
+
+    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`.
Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_after
+    }
+
+    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_start_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the right side must be in \w.
+        let word_before = at > 0
+            && match utf8::decode_last(&haystack[..at]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
+            };
+        Ok(!word_before)
+    }
+
+    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_end_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the left side must be in \w.
+        let word_after = at < haystack.len()
+            && match utf8::decode(&haystack[at..]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
+            };
+        Ok(!word_after)
+    }
 }
 
 impl Default for LookMatcher {
@@ -1660,50 +1985,478 @@ mod tests {
     }
 
     #[test]
-    fn look_set() {
-        let mut f = LookSet::default();
-        assert!(!f.contains(Look::Start));
-        assert!(!f.contains(Look::End));
-        assert!(!f.contains(Look::StartLF));
-        assert!(!f.contains(Look::EndLF));
-        assert!(!f.contains(Look::WordUnicode));
-        assert!(!f.contains(Look::WordUnicodeNegate));
-        assert!(!f.contains(Look::WordAscii));
-        assert!(!f.contains(Look::WordAsciiNegate));
+    fn look_matches_word_start_ascii() {
+        let look = Look::WordStartAscii;
 
-        f = f.insert(Look::Start);
-        assert!(f.contains(Look::Start));
-        f = f.remove(Look::Start);
-        assert!(!f.contains(Look::Start));
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
 
-        f = f.insert(Look::End);
-        assert!(f.contains(Look::End));
-        f = f.remove(Look::End);
-        assert!(!f.contains(Look::End));
+        // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); - f = f.insert(Look::StartLF); - assert!(f.contains(Look::StartLF)); - f = f.remove(Look::StartLF); - assert!(!f.contains(Look::StartLF)); + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); - f = f.insert(Look::EndLF); - assert!(f.contains(Look::EndLF)); - f = f.remove(Look::EndLF); - assert!(!f.contains(Look::EndLF)); + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); - f = f.insert(Look::StartCRLF); - assert!(f.contains(Look::StartCRLF)); - f = f.remove(Look::StartCRLF); - assert!(!f.contains(Look::StartCRLF)); + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); - f = f.insert(Look::EndCRLF); - assert!(f.contains(Look::EndCRLF)); - f = f.remove(Look::EndCRLF); - assert!(!f.contains(Look::EndCRLF)); + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); - f = f.insert(Look::WordUnicode); - assert!(f.contains(Look::WordUnicode)); - f = f.remove(Look::WordUnicode); + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+        assert!(!testlook!(look, "𝛃b", 4));
+        assert!(!testlook!(look, "b𝛃", 1));
+        assert!(!testlook!(look, "𝛃 ", 5));
+        assert!(!testlook!(look, " 𝛃 ", 0));
+        assert!(!testlook!(look, " 𝛃 ", 6));
+        assert!(!testlook!(look, "𝛃", 1));
+        assert!(!testlook!(look, "𝛃", 2));
+        assert!(!testlook!(look, "𝛃", 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(!testlook!(look, "𝛃𐆀", 1));
+        assert!(!testlook!(look, "𝛃𐆀", 2));
+        assert!(!testlook!(look, "𝛃𐆀", 3));
+        assert!(!testlook!(look, "𝛃𐆀", 5));
+        assert!(!testlook!(look, "𝛃𐆀", 6));
+        assert!(!testlook!(look, "𝛃𐆀", 7));
+        assert!(!testlook!(look, "𝛃𐆀", 8));
+    }
+
+    #[test]
+    fn look_matches_word_start_half_ascii() {
+        let look = Look::WordStartHalfAscii;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(testlook!(look, "a", 0));
+        assert!(!testlook!(look, "a", 1));
+        assert!(!testlook!(look, "a ", 1));
+        assert!(testlook!(look, " a ", 1));
+        assert!(!testlook!(look, " a ", 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. This half
+        // assertion only requires that the position is not preceded by an
+        // ASCII word byte, so all of these match.
+        assert!(testlook!(look, "𝛃", 0));
+        assert!(testlook!(look, "𝛃", 4));
+        assert!(testlook!(look, "𝛃 ", 4));
+        assert!(testlook!(look, " 𝛃 ", 1));
+        assert!(testlook!(look, " 𝛃 ", 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. These match
+        // too, since neither position is preceded by an ASCII word byte.
+        assert!(testlook!(look, "𝛃𐆀", 0));
+        assert!(testlook!(look, "𝛃𐆀", 4));
+
+        // Non word boundaries for ASCII.
+        assert!(testlook!(look, "", 0));
+        assert!(!testlook!(look, "ab", 1));
+        assert!(testlook!(look, "a ", 2));
+        assert!(testlook!(look, " a ", 0));
+        assert!(testlook!(look, " a ", 3));
+
+        // Non word boundaries with a non-ASCII codepoint.
+        assert!(testlook!(look, "𝛃b", 4));
+        assert!(!testlook!(look, "b𝛃", 1));
+        assert!(testlook!(look, "𝛃 ", 5));
+        assert!(testlook!(look, " 𝛃 ", 0));
+        assert!(testlook!(look, " 𝛃 ", 6));
+        assert!(testlook!(look, "𝛃", 1));
+        assert!(testlook!(look, "𝛃", 2));
+        assert!(testlook!(look, "𝛃", 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(testlook!(look, "𝛃𐆀", 1));
+        assert!(testlook!(look, "𝛃𐆀", 2));
+        assert!(testlook!(look, "𝛃𐆀", 3));
+        assert!(testlook!(look, "𝛃𐆀", 5));
+        assert!(testlook!(look, "𝛃𐆀", 6));
+        assert!(testlook!(look, "𝛃𐆀", 7));
+        assert!(testlook!(look, "𝛃𐆀", 8));
+    }
+
+    #[test]
+    fn look_matches_word_end_half_ascii() {
+        let look = Look::WordEndHalfAscii;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(!testlook!(look, "a", 0));
+        assert!(testlook!(look, "a", 1));
+        assert!(testlook!(look, "a ", 1));
+        assert!(!testlook!(look, " a ", 1));
+        assert!(testlook!(look, " a ", 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. This half
+        // assertion only requires that the position is not followed by an
+        // ASCII word byte, so all of these match.
+        assert!(testlook!(look, "𝛃", 0));
+        assert!(testlook!(look, "𝛃", 4));
+        assert!(testlook!(look, "𝛃 ", 4));
+        assert!(testlook!(look, " 𝛃 ", 1));
+        assert!(testlook!(look, " 𝛃 ", 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. These match
+        // too, since neither position is followed by an ASCII word byte.
+        assert!(testlook!(look, "𝛃𐆀", 0));
+        assert!(testlook!(look, "𝛃𐆀", 4));
+
+        // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. 
+ assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); @@ -1720,6 +2473,46 @@ mod tests { assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + 
assert!(f.contains(Look::WordStartHalfUnicode)); + f = f.remove(Look::WordStartHalfUnicode); + assert!(!f.contains(Look::WordStartHalfUnicode)); + + f = f.insert(Look::WordEndHalfUnicode); + assert!(f.contains(Look::WordEndHalfUnicode)); + f = f.remove(Look::WordEndHalfUnicode); + assert!(!f.contains(Look::WordEndHalfUnicode)); } #[test] @@ -1728,7 +2521,7 @@ mod tests { assert_eq!(0, set.iter().count()); let set = LookSet::full(); - assert_eq!(10, set.iter().count()); + assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); @@ -1739,6 +2532,9 @@ mod tests { let set = LookSet::empty().insert(Look::WordAsciiNegate); assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordEndHalfUnicode); + assert_eq!(1, set.iter().count()); } #[test] @@ -1747,6 +2543,6 @@ mod tests { let res = alloc::format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = alloc::format!("{:?}", LookSet::full()); - assert_eq!("Az^$rRbB𝛃𝚩", res); + assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index f3445e02a..8ed6dd007 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -9,7 +9,6 @@ use { util::{prefilter::Prefilter, syntax}, Anchored, Input, PatternSet, }, - regex_syntax::hir, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, @@ -285,10 +284,7 @@ fn compiler( // That is, Unicode word boundaries when searching non-ASCII text. if !test.haystack().is_ascii() { for hir in hirs.iter() { - let looks = hir.properties().look_set(); - if looks.contains(hir::Look::WordUnicode) - || looks.contains(hir::Look::WordUnicodeNegate) - { + if hir.properties().look_set().contains_word_unicode() { return Ok(CompiledRegex::skip()); } } diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs index 1465e51eb..67c979aa8 100644 --- a/regex-automata/tests/lib.rs +++ b/regex-automata/tests/lib.rs @@ -61,6 +61,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml new file mode 100644 index 000000000..c1689f5cc --- /dev/null +++ b/testdata/word-boundary-special.toml @@ -0,0 +1,653 @@ +# These tests are for the "special" word boundary assertions. That is, +# \b{start}, \b{end}, \b{start-half}, \b{end-half}. These are specialty +# assertions for more niche use cases, but hitting those cases without these +# assertions is difficult. For example, \b{start-half} and \b{end-half} are +# used to implement the -w/--word-regexp flag in a grep program. 
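+#
+# As a concrete sketch of that last use case (illustrative only; the pattern
+# P below is a placeholder, not one of the tests in this file), a grep-like
+# -w/--word-regexp flag can wrap the user's pattern as:
+#
+#   \b{start-half}(?:P)\b{end-half}
+#
+# Wrapping with plain \b instead would prevent patterns that begin or end with
+# a non-word character (for example, 'foo-') from ever matching, since \b
+# demands a word character on one side of each position it matches at.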
+ +# Tests for (?-u:\b{start}) + +[[test]] +name = "word-start-ascii-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-start-ascii-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-ascii-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-start-ascii-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[4, 4]] +unicode = false + +[[test]] +name = "word-start-ascii-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = false + +# Tests for (?-u:\b{end}) + +[[test]] +name = "word-end-ascii-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = false + +[[test]] +name = "word-end-ascii-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-ascii-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [] +unicode = false + +[[test]] +name = "word-end-ascii-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = false + +[[test]] +name = "word-end-ascii-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[1, 1]] +unicode = false + +# Tests for \b{start} + +[[test]] +name = "word-start-unicode-010" +regex = '\b{start}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-020" +regex = '\b{start}' +haystack = "a " +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-030" +regex = '\b{start}' +haystack = " a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-040" +regex = '\b{start}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-050" +regex = '\b{start}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060" +regex = '\b{start}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-060-bounds" +regex = '\b{start}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-unicode-070" +regex = '\b{start}' +haystack = " 𝛃 " +matches = 
[[1, 1]] +unicode = true + +[[test]] +name = "word-start-unicode-080" +regex = '\b{start}' +haystack = "𝛃𐆀" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-090" +regex = '\b{start}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-unicode-110" +regex = '\b{start}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end} + +[[test]] +name = "word-end-unicode-010" +regex = '\b{end}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-020" +regex = '\b{end}' +haystack = "a " +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-unicode-030" +regex = '\b{end}' +haystack = " a " +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-040" +regex = '\b{end}' +haystack = "" +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-050" +regex = '\b{end}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] +name = "word-end-unicode-060" +regex = '\b{end}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-060-bounds" +regex = '\b{end}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-unicode-070" +regex = '\b{end}' +haystack = " 𝛃 " +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-080" +regex = '\b{end}' +haystack = "𝛃𐆀" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-unicode-090" +regex = '\b{end}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-unicode-110" +regex = '\b{end}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true + +# Tests for (?-u:\b{start-half}) + +[[test]] +name = "word-start-half-ascii-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = false + +[[test]] +name = "word-start-half-ascii-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = false + +[[test]] +name = "word-start-half-ascii-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-060-noutf8" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +[[test]] +name = "word-start-half-ascii-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-start-half-ascii-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-start-half-ascii-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-start-half-ascii-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-start-half-ascii-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0], [5, 5]] +unicode = false + +# Tests for (?-u:\b{end-half}) + +[[test]] +name = "word-end-half-ascii-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode 
= false + +[[test]] +name = "word-end-half-ascii-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = false + +[[test]] +name = "word-end-half-ascii-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = false + +[[test]] +name = "word-end-half-ascii-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[0, 0], [4, 4]] +unicode = false + +[[test]] +name = "word-end-half-ascii-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = false + +[[test]] +name = "word-end-half-ascii-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [5, 5], [6, 6]] +unicode = false + +[[test]] +name = "word-end-half-ascii-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [4, 4], [8, 8]] +unicode = false + +[[test]] +name = "word-end-half-ascii-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[0, 0], [5, 5]] +unicode = false + +[[test]] +name = "word-end-half-ascii-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[1, 1], [5, 5]] +unicode = false + +# Tests for \b{start-half} + +[[test]] +name = "word-start-half-unicode-010" +regex = '\b{start-half}' +haystack = "a" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-020" +regex = '\b{start-half}' +haystack = "a " +matches = [[0, 0], [2, 2]] +unicode = true + +[[test]] +name = "word-start-half-unicode-030" +regex = '\b{start-half}' +haystack = " a " +matches = [[0, 0], [1, 1], [3, 3]] +unicode = true + +[[test]] +name = "word-start-half-unicode-040" +regex = '\b{start-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-050" +regex = '\b{start-half}' +haystack = "ab" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060" +regex = '\b{start-half}' +haystack = "𝛃" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-060-bounds" +regex = '\b{start-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-start-half-unicode-070" +regex = '\b{start-half}' +haystack = " 𝛃 " +matches = [[0, 0], [1, 1], [6, 6]] +unicode = true + +[[test]] +name = "word-start-half-unicode-080" +regex = '\b{start-half}' +haystack = "𝛃𐆀" +matches = [[0, 0], [8, 8]] +unicode = true + +[[test]] +name = "word-start-half-unicode-090" +regex = '\b{start-half}' +haystack = "𝛃b" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-start-half-unicode-110" +regex = '\b{start-half}' +haystack = "b𝛃" +matches = [[0, 0]] +unicode = true + +# Tests for \b{end-half} + +[[test]] +name = "word-end-half-unicode-010" +regex = '\b{end-half}' +haystack = "a" +matches = [[1, 1]] +unicode = true + +[[test]] +name = "word-end-half-unicode-020" +regex = '\b{end-half}' +haystack = "a " +matches = [[1, 1], [2, 2]] +unicode = true + +[[test]] +name = "word-end-half-unicode-030" +regex = '\b{end-half}' +haystack = " a " +matches = [[0, 0], [2, 2], [3, 3]] +unicode = true + +[[test]] +name = "word-end-half-unicode-040" +regex = '\b{end-half}' +haystack = "" +matches = [[0, 0]] +unicode = true + +[[test]] +name = "word-end-half-unicode-050" +regex = '\b{end-half}' +haystack = "ab" +matches = [[2, 2]] +unicode = true + +[[test]] 
+name = "word-end-half-unicode-060" +regex = '\b{end-half}' +haystack = "𝛃" +matches = [[4, 4]] +unicode = true + +[[test]] +name = "word-end-half-unicode-060-bounds" +regex = '\b{end-half}' +haystack = "𝛃" +bounds = [2, 3] +matches = [] +unicode = true + +[[test]] +name = "word-end-half-unicode-070" +regex = '\b{end-half}' +haystack = " 𝛃 " +matches = [[0, 0], [5, 5], [6, 6]] +unicode = true + +[[test]] +name = "word-end-half-unicode-080" +regex = '\b{end-half}' +haystack = "𝛃𐆀" +matches = [[4, 4], [8, 8]] +unicode = true + +[[test]] +name = "word-end-half-unicode-090" +regex = '\b{end-half}' +haystack = "𝛃b" +matches = [[5, 5]] +unicode = true + +[[test]] +name = "word-end-half-unicode-110" +regex = '\b{end-half}' +haystack = "b𝛃" +matches = [[5, 5]] +unicode = true diff --git a/tests/lib.rs b/tests/lib.rs index badd57455..b3f69423d 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -49,6 +49,7 @@ fn suite() -> anyhow::Result { load!("unicode"); load!("utf8"); load!("word-boundary"); + load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); From 915a1543e7f1d7171818aca95c82d6c11a532e7f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 09:29:27 -0400 Subject: [PATCH 18/33] doc: explain the new word boundary assertions Closes #469 --- CHANGELOG.md | 7 ++++++ src/lib.rs | 70 ++++++++++++++++++++++++++++++---------------------- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 265f5cd48..7f90e45a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,16 @@ TBD New features: +* [FEATURE #469](https://github.com/rust-lang/regex/issues/469): +Add support for `\<` and `\>` word boundary assertions. * [FEATURE(regex-automata) #1031](https://github.com/rust-lang/regex/pull/1031): DFAs now have a `start_state` method that doesn't use an `Input`. +Performance improvements: + +* [PERF #1051](https://github.com/rust-lang/regex/pull/1051): +Unicode character class operations have been optimized in `regex-syntax`. + Bug fixes: * [BUG #1046](https://github.com/rust-lang/regex/issues/1046): diff --git a/src/lib.rs b/src/lib.rs index 1e191b692..6dbd3c202 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -543,8 +543,10 @@ scalar value, even when it is encoded using multiple bytes. When Unicode mode is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases. * The character classes `\w`, `\d` and `\s` are all Unicode-aware by default. Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions. -* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. To -get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. +* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. +To get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. This also +applies to the special word boundary assertions. (That is, `\b{start}`, +`\b{end}`, `\b{start-half}`, `\b{end-half}`.) * `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only recognize `\n` (assuming CRLF mode is not enabled) and not any of the other forms of line terminators defined by Unicode. @@ -723,12 +725,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
-\B    not a Unicode word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B              not a Unicode word boundary
+\b{start}, \<   a Unicode start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}, \>     a Unicode end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of a Unicode start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of a Unicode end-of-word boundary (\W|\z on the right)
 
The empty regex is valid and matches the empty string. For example, the @@ -856,28 +862,32 @@ Note that this includes all possible escape sequences, even ones that are documented elsewhere.
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\123        octal character code, up to three digits (when enabled)
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\p{Letter}  Unicode character class
-\P{Letter}  negated Unicode character class
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\123            octal character code, up to three digits (when enabled)
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\p{Letter}      Unicode character class
+\P{Letter}      negated Unicode character class
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
 
### Perl character classes (Unicode friendly) From bd36c6feae8d59b1448c8f64ede7df028fb323d8 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 10:20:24 -0400 Subject: [PATCH 19/33] lite: add special word boundaries to regex-lite This was substantially easier. Coupling, private abstractions and slow code are so much easier to deal with. Ref #469 --- regex-lite/src/hir/mod.rs | 42 +++++++++++++++++ regex-lite/src/hir/parse.rs | 89 +++++++++++++++++++++++++++++++++++-- regex-lite/src/lib.rs | 58 +++++++++++++----------- regex-lite/tests/lib.rs | 1 + 4 files changed, 162 insertions(+), 28 deletions(-) diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index f73a5420a..3d61ce8c9 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -592,6 +592,24 @@ pub(crate) enum Look { Word = 1 << 6, /// Match an ASCII-only negation of a word boundary. WordNegate = 1 << 7, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStart = 1 << 8, + /// Match the end of an ASCII-only word boundary. That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEnd = 1 << 9, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalf = 1 << 10, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalf = 1 << 11, } impl Look { @@ -631,6 +649,30 @@ impl Look { at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before == word_after } + WordStart => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + WordEnd => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before && !word_after + } + WordStartHalf => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + !word_before + } + WordEndHalf => { + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_after + } } } } diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index cc3c21fe6..33bb97a7d 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = "character class difference is not supported"; const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = "character class symmetric difference is not supported"; +const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = + "special word boundary assertion is unclosed or has an invalid character"; +const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = + "special word boundary assertion is unrecognized"; +const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = + "found start of special word boundary or repetition without an end"; /// A regular expression parser. 
///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
             'v' => special('\x0B'),
             'A' => Ok(Hir::look(hir::Look::Start)),
             'z' => Ok(Hir::look(hir::Look::End)),
-            'b' => Ok(Hir::look(hir::Look::Word)),
+            'b' => {
+                let mut hir = Hir::look(hir::Look::Word);
+                if !self.is_done() && self.char() == '{' {
+                    if let Some(special) =
+                        self.maybe_parse_special_word_boundary()?
+                    {
+                        hir = special;
+                    }
+                }
+                Ok(hir)
+            }
             'B' => Ok(Hir::look(hir::Look::WordNegate)),
+            '<' => Ok(Hir::look(hir::Look::WordStart)),
+            '>' => Ok(Hir::look(hir::Look::WordEnd)),
             _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
         }
     }
 
+    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
+    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
+    ///
+    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
+    /// if it fails it will just return `None` with no error. This is done
+    /// because `\b{5}` is a valid expression and we want to let that be parsed
+    /// by the existing counted repetition parsing code. (I thought about just
+    /// invoking the counted repetition code from here, but it seemed a little
+    /// ham-fisted.)
+    ///
+    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
+    /// Namely, if we definitely know it isn't a counted repetition, then we
+    /// return an error specific to the specialty word boundaries.
+    ///
+    /// This assumes the parser is positioned at a `{` immediately following
+    /// a `\b`. When `None` is returned, the parser is returned to the position
+    /// at which it started: pointing at a `{`.
+    ///
+    /// The position given should correspond to the start of the `\b`.
+    fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
+        assert_eq!(self.char(), '{');
+
+        let is_valid_char = |c| match c {
+            'A'..='Z' | 'a'..='z' | '-' => true,
+            _ => false,
+        };
+        let start = self.pos();
+        if !self.bump_and_bump_space() {
+            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
+        }
+        // This is one of the critical bits: if the first non-whitespace
+        // character isn't in [-A-Za-z] (i.e., this can't be a special word
+        // boundary), then we bail and let the counted repetition parser deal
+        // with this.
+        if !is_valid_char(self.char()) {
+            self.pos.set(start);
+            self.char.set(Some('{'));
+            return Ok(None);
+        }
+
+        // Now collect up our chars until we see a '}'.
+        let mut scratch = String::new();
+        while !self.is_done() && is_valid_char(self.char()) {
+            scratch.push(self.char());
+            self.bump_and_bump_space();
+        }
+        if self.is_done() || self.char() != '}' {
+            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
+        }
+        self.bump();
+        let kind = match scratch.as_str() {
+            "start" => hir::Look::WordStart,
+            "end" => hir::Look::WordEnd,
+            "start-half" => hir::Look::WordStartHalf,
+            "end-half" => hir::Look::WordEndHalf,
+            _ => {
+                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
+            }
+        };
+        Ok(Some(Hir::look(kind)))
+    }
+
     /// Parse a hex representation of a Unicode codepoint. This handles both
     /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
     /// be positioned at the `x`, `u` or `U` prefix.
The parser is advanced to @@ -1948,8 +2028,6 @@ bar assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); - assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); @@ -1983,6 +2061,11 @@ bar assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}")); + assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{")); + assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ ")); } #[test] diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs index 8008b9e59..68d54824f 100644 --- a/regex-lite/src/lib.rs +++ b/regex-lite/src/lib.rs @@ -466,12 +466,16 @@ x{n}? exactly n x ### Empty matches
-^     the beginning of a haystack (or start-of-line with multi-line mode)
-$     the end of a haystack (or end-of-line with multi-line mode)
-\A    only the beginning of a haystack (even with multi-line mode enabled)
-\z    only the end of a haystack (even with multi-line mode enabled)
-\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
-\B    not an ASCII word boundary
+^               the beginning of a haystack (or start-of-line with multi-line mode)
+$               the end of a haystack (or end-of-line with multi-line mode)
+\A              only the beginning of a haystack (even with multi-line mode enabled)
+\z              only the end of a haystack (even with multi-line mode enabled)
+\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B              not an ASCII word boundary
+\b{start}       an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
+\b{end}         an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
+\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
+\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
 
 The empty regex is valid and matches the empty string. For example, the
@@ -581,25 +585,29 @@
 Note that this includes all possible escape sequences, even ones that are
 documented elsewhere.
 
-\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
-\a          bell (\x07)
-\f          form feed (\x0C)
-\t          horizontal tab
-\n          new line
-\r          carriage return
-\v          vertical tab (\x0B)
-\A          matches at the beginning of a haystack
-\z          matches at the end of a haystack
-\b          word boundary assertion
-\B          negated word boundary assertion
-\x7F        hex character code (exactly two digits)
-\x{10FFFF}  any hex character code corresponding to a Unicode code point
-\u007F      hex character code (exactly four digits)
-\u{7F}      any hex character code corresponding to a Unicode code point
-\U0000007F  hex character code (exactly eight digits)
-\U{7F}      any hex character code corresponding to a Unicode code point
-\d, \s, \w  Perl character class
-\D, \S, \W  negated Perl character class
+\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a              bell (\x07)
+\f              form feed (\x0C)
+\t              horizontal tab
+\n              new line
+\r              carriage return
+\v              vertical tab (\x0B)
+\A              matches at the beginning of a haystack
+\z              matches at the end of a haystack
+\b              word boundary assertion
+\B              negated word boundary assertion
+\b{start}, \<   start-of-word boundary assertion
+\b{end}, \>     end-of-word boundary assertion
+\b{start-half}  half of a start-of-word boundary assertion
+\b{end-half}    half of an end-of-word boundary assertion
+\x7F            hex character code (exactly two digits)
+\x{10FFFF}      any hex character code corresponding to a Unicode code point
+\u007F          hex character code (exactly four digits)
+\u{7F}          any hex character code corresponding to a Unicode code point
+\U0000007F      hex character code (exactly eight digits)
+\U{7F}          any hex character code corresponding to a Unicode code point
+\d, \s, \w      Perl character class
+\D, \S, \W      negated Perl character class
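One subtlety behind the `\b{start}, \<` rows above: after seeing `\b{`, the parser cannot yet know whether it is looking at a special word boundary or a counted repetition, since `\b{5}` remains valid. A small sketch of the observable behavior; `\b{quux}` is just a stand-in for any unrecognized name, mirroring the `\b{foo}` error tests earlier in this patch:

```
use regex_lite::Regex;

// Letters and hyphens after `\b{` select the special word boundaries.
assert!(Regex::new(r"\b{start}").is_ok());
// Digits fall through to the counted repetition parser: \b, twice.
assert!(Regex::new(r"\b{2}").is_ok());
// A word-shaped name that isn't recognized is a hard error.
assert!(Regex::new(r"\b{quux}").is_err());
```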
 
### Perl character classes (ASCII only)
diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs
index 757b39441..89635f2d7 100644
--- a/regex-lite/tests/lib.rs
+++ b/regex-lite/tests/lib.rs
@@ -38,6 +38,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");

From 048b6f8f06f9919e3163594973a2b76a024af4ed Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 14:55:20 -0400
Subject: [PATCH 20/33] doc: remove HACKING document

It is almost completely wrong now. Instead of rewriting it---which
would be a huge endeavor---we just point folks toward my blog on regex
internals.

Closes #1058
---
 HACKING.md | 341 -----------------------------------------------------
 README.md  |  15 +++
 2 files changed, 15 insertions(+), 341 deletions(-)
 delete mode 100644 HACKING.md

diff --git a/HACKING.md b/HACKING.md
deleted file mode 100644
index 34af5b517..000000000
--- a/HACKING.md
+++ /dev/null
@@ -1,341 +0,0 @@
-Your friendly guide to hacking and navigating the regex library.
-
-This guide assumes familiarity with Rust and Cargo, and at least a perusal of
-the user facing documentation for this crate.
-
-If you're looking for background on the implementation in this library, then
-you can do no better than Russ Cox's article series on implementing regular
-expressions using finite automata: https://swtch.com/~rsc/regexp/
-
-
-## Architecture overview
-
-As you probably already know, this library executes regular expressions using
-finite automata. In particular, a design goal is to make searching linear
-with respect to both the regular expression and the text being searched.
-Meeting that design goal on its own is not so hard and can be done with an
-implementation of the Pike VM (similar to Thompson's construction, but supports
-capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html
---- This library contains such an implementation in src/pikevm.rs.
-
-Making it fast is harder. One of the key problems with the Pike VM is that it
-can be in more than one state at any point in time, and must shuffle capture
-positions between them. The Pike VM also spends a lot of time following the
-same epsilon transitions over and over again. We can employ one trick to
-speed up the Pike VM: extract one or more literal prefixes from the regular
-expression and execute specialized code to quickly find matches of those
-prefixes in the search text. The Pike VM can then be avoided for most of the
-search, and instead only executed when a prefix is found. The code to find
-prefixes is in the regex-syntax crate (in this repository). The code to search
-for literals is in src/literals.rs. When more than one literal prefix is found,
-we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one
-literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and
-Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this
-library also uses elementary frequency analysis to choose the right byte to run
-`memchr` with.
-
-Of course, detecting prefix literals can only take us so far. Not all regular
-expressions have literal prefixes. To remedy this, we try another approach
-to executing the Pike VM: backtracking, whose implementation can be found in
-src/backtrack.rs. One reason why backtracking can be faster is that it avoids
-excessive shuffling of capture groups.
Of course, backtracking is susceptible -to exponential runtimes, so we keep track of every state we've visited to make -sure we never visit it again. This guarantees linear time execution, but we -pay for it with the memory required to track visited states. Because of the -memory requirement, we only use this engine on small search strings *and* small -regular expressions. - -Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. -It is distinct from the Pike VM in that the DFA is explicitly represented in -memory and is only ever in one state at a time. It is said to be "lazy" because -the DFA is computed as text is searched, where each byte in the search text -results in at most one new DFA state. It is made fast by caching states. DFAs -are susceptible to exponential state blow up (where the worst case is computing -a new state for every input byte, regardless of what's in the state cache). To -avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache -is full, it is wiped and state computation starts over again. If the cache is -wiped too frequently, then the DFA gives up and searching falls back to one of -the aforementioned algorithms. - -All of the above matching engines expose precisely the same matching semantics. -This is indeed tested. (See the section below about testing.) - -The following sub-sections describe the rest of the library and how each of the -matching engines are actually used. - -### Parsing - -Regular expressions are parsed using the regex-syntax crate, which is -maintained in this repository. The regex-syntax crate defines an abstract -syntax and provides very detailed error messages when a parse error is -encountered. Parsing is done in a separate crate so that others may benefit -from its existence, and because it is relatively divorced from the rest of the -regex library. - -The regex-syntax crate also provides sophisticated support for extracting -prefix and suffix literals from regular expressions. - -### Compilation - -The compiler is in src/compile.rs. The input to the compiler is some abstract -syntax for a regular expression and the output is a sequence of opcodes that -matching engines use to execute a search. (One can think of matching engines as -mini virtual machines.) The sequence of opcodes is a particular encoding of a -non-deterministic finite automaton. In particular, the opcodes explicitly rely -on epsilon transitions. - -Consider a simple regular expression like `a|b`. Its compiled form looks like -this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' (goto: 4) - 003 'b' - 004 Save(1) - 005 Match - -The first column is the instruction pointer and the second column is the -instruction. Save instructions indicate that the current position in the input -should be stored in a captured location. Split instructions represent a binary -branch in the program (i.e., epsilon transitions). The instructions `'a'` and -`'b'` indicate that the literal bytes `'a'` or `'b'` should match. - -In older versions of this library, the compilation looked like this: - - 000 Save(0) - 001 Split(2, 3) - 002 'a' - 003 Jump(5) - 004 'b' - 005 Save(1) - 006 Match - -In particular, empty instructions that merely served to move execution from one -point in the program to another were removed. Instead, every instruction has a -`goto` pointer embedded into it. This resulted in a small performance boost for -the Pike VM, because it was one fewer epsilon transition that it had to follow. 
- -There exist more instructions and they are defined and documented in -src/prog.rs. - -Compilation has several knobs and a few unfortunately complicated invariants. -Namely, the output of compilation can be one of two types of programs: a -program that executes on Unicode scalar values or a program that executes -on raw bytes. In the former case, the matching engine is responsible for -performing UTF-8 decoding and executing instructions using Unicode codepoints. -In the latter case, the program handles UTF-8 decoding implicitly, so that the -matching engine can execute on raw bytes. All matching engines can execute -either Unicode or byte based programs except for the lazy DFA, which requires -byte based programs. In general, both representations were kept because (1) the -lazy DFA requires byte based programs so that states can be encoded in a memory -efficient manner and (2) the Pike VM benefits greatly from inlining Unicode -character classes into fewer instructions as it results in fewer epsilon -transitions. - -N.B. UTF-8 decoding is built into the compiled program by making use of the -utf8-ranges crate. The compiler in this library factors out common suffixes to -reduce the size of huge character classes (e.g., `\pL`). - -A regrettable consequence of this split in instruction sets is we generally -need to compile two programs; one for NFA execution and one for the lazy DFA. - -In fact, it is worse than that: the lazy DFA is not capable of finding the -starting location of a match in a single scan, and must instead execute a -backwards search after finding the end location. To execute a backwards search, -we must have compiled the regular expression *in reverse*. - -This means that every compilation of a regular expression generally results in -three distinct programs. It would be possible to lazily compile the Unicode -program, since it is never needed if (1) the regular expression uses no word -boundary assertions and (2) the caller never asks for sub-capture locations. - -### Execution - -At the time of writing, there are four matching engines in this library: - -1. The Pike VM (supports captures). -2. Bounded backtracking (supports captures). -3. Literal substring or multi-substring search. -4. Lazy DFA (no support for Unicode word boundary assertions). - -Only the first two matching engines are capable of executing every regular -expression program. They also happen to be the slowest, which means we need -some logic that (1) knows various facts about the regular expression and (2) -knows what the caller wants. Using this information, we can determine which -engine (or engines) to use. - -The logic for choosing which engine to execute is in src/exec.rs and is -documented on the Exec type. Exec values contain regular expression Programs -(defined in src/prog.rs), which contain all the necessary tidbits for actually -executing a regular expression on search text. - -For the most part, the execution logic is straight-forward and follows the -limitations of each engine described above pretty faithfully. The hairiest -part of src/exec.rs by far is the execution of the lazy DFA, since it requires -a forwards and backwards search, and then falls back to either the Pike VM or -backtracking if the caller requested capture locations. - -The Exec type also contains mutable scratch space for each type of matching -engine. This scratch space is used during search (for example, for the lazy -DFA, it contains compiled states that are reused on subsequent searches). 
- -### Programs - -A regular expression program is essentially a sequence of opcodes produced by -the compiler plus various facts about the regular expression (such as whether -it is anchored, its capture names, etc.). - -### The regex! macro - -The `regex!` macro no longer exists. It was developed in a bygone era as a -compiler plugin during the infancy of the regex crate. Back then, then only -matching engine in the crate was the Pike VM. The `regex!` macro was, itself, -also a Pike VM. The only advantages it offered over the dynamic Pike VM that -was built at runtime were the following: - - 1. Syntax checking was done at compile time. Your Rust program wouldn't - compile if your regex didn't compile. - 2. Reduction of overhead that was proportional to the size of the regex. - For the most part, this overhead consisted of heap allocation, which - was nearly eliminated in the compiler plugin. - -The main takeaway here is that the compiler plugin was a marginally faster -version of a slow regex engine. As the regex crate evolved, it grew other regex -engines (DFA, bounded backtracker) and sophisticated literal optimizations. -The regex macro didn't keep pace, and it therefore became (dramatically) slower -than the dynamic engines. The only reason left to use it was for the compile -time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint -tool) has a lint that checks your regular expression validity, which mostly -replaces that use case. - -Additionally, the regex compiler plugin stopped receiving maintenance. Nobody -complained. At that point, it seemed prudent to just remove it. - -Will a compiler plugin be brought back? The future is murky, but there is -definitely an opportunity there to build something that is faster than the -dynamic engines in some cases. But it will be challenging! As of now, there -are no plans to work on this. - - -## Testing - -A key aspect of any mature regex library is its test suite. A subset of the -tests in this library come from Glenn Fowler's AT&T test suite (its online -presence seems gone at the time of writing). The source of the test suite is -located in src/testdata. The scripts/regex-match-tests.py takes the test suite -in src/testdata and generates tests/matches.rs. - -There are also many other manually crafted tests and regression tests in -tests/tests.rs. Some of these tests were taken from RE2. - -The biggest source of complexity in the tests is related to answering this -question: how can we reuse the tests to check all of our matching engines? One -approach would have been to encode every test into some kind of format (like -the AT&T test suite) and code generate tests for each matching engine. The -approach we use in this library is to create a Cargo.toml entry point for each -matching engine we want to test. The entry points are: - -* `tests/test_default.rs` - tests `Regex::new` -* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` -* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex. -* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *arbitrary* byte based programs. -* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA - algorithm on every regex and use *UTF-8* byte based programs. -* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use - backtracking on every regex. 
-* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *arbitrary* byte based programs. -* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use - backtracking on every regex and use *UTF-8* byte based programs. -* `tests/test_crates_regex.rs` - tests to make sure that all of the - backends behave in the same way against a number of quickcheck - generated random inputs. These tests need to be enabled through - the `RUST_REGEX_RANDOM_TEST` environment variable (see - below). - -The lazy DFA and pure literal engines are absent from this list because -they cannot be used on every regular expression. Instead, we rely on -`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. - -Since the tests are repeated several times, and because `cargo test` runs all -entry points, it can take a while to compile everything. To reduce compile -times slightly, try using `cargo test --test default`, which will only use the -`tests/test_default.rs` entry point. - -The random testing takes quite a while, so it is not enabled by default. -In order to run the random testing you can set the -`RUST_REGEX_RANDOM_TEST` environment variable to anything before -invoking `cargo test`. Note that this variable is inspected at compile -time, so if the tests don't seem to be running, you may need to run -`cargo clean`. - -## Benchmarking - -The benchmarking in this crate is made up of many micro-benchmarks. Currently, -there are two primary sets of benchmarks: the benchmarks that were adopted -at this library's inception (in `bench/src/misc.rs`) and a newer set of -benchmarks meant to test various optimizations. Specifically, the latter set -contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter -set are all executed on the same lengthy input whereas the former benchmarks -are executed on strings of varying length. - -There is also a smattering of benchmarks for parsing and compilation. - -Benchmarks are in a separate crate so that its dependencies can be managed -separately from the main regex crate. - -Benchmarking follows a similarly wonky setup as tests. There are multiple entry -points: - -* `bench_rust.rs` - benchmarks `Regex::new` -* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` -* `bench_pcre.rs` - benchmarks PCRE -* `bench_onig.rs` - benchmarks Oniguruma - -The PCRE and Oniguruma benchmarks exist as a comparison point to a mature -regular expression library. In general, this regex library compares favorably -(there are even a few benchmarks that PCRE simply runs too slowly on or -outright can't execute at all). I would love to add other regular expression -library benchmarks (especially RE2). - -If you're hacking on one of the matching engines and just want to see -benchmarks, then all you need to run is: - - $ (cd bench && ./run rust) - -If you want to compare your results with older benchmarks, then try: - - $ (cd bench && ./run rust | tee old) - $ ... make it faster - $ (cd bench && ./run rust | tee new) - $ cargo benchcmp old new --improvements - -The `cargo-benchcmp` utility is available here: -https://github.com/BurntSushi/cargo-benchcmp - -The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See -`./bench/bench --help`. - -## Dev Docs - -When digging your teeth into the codebase for the first time, the -crate documentation can be a great resource. 
By default `rustdoc` -will strip out all documentation of private crate members in an -effort to help consumers of the crate focus on the *interface* -without having to concern themselves with the *implementation*. -Normally this is a great thing, but if you want to start hacking -on regex internals it is not what you want. Many of the private members -of this crate are well documented with rustdoc style comments, and -it would be a shame to miss out on the opportunity that presents. -You can generate the private docs with: - -``` -$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments -``` - -Then just point your browser at `target/doc/regex/index.html`. - -See https://github.com/rust-lang/rust/issues/15347 for more info -about generating developer docs for internal use. diff --git a/README.md b/README.md index 7454c166d..a23a266d3 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,21 @@ $ rebar cmp results.csv See the `rebar` documentation for more details on how it works and how to compare results with other regex engines. + +### Hacking + +The `regex` crate is, for the most part, a pretty thin wrapper around the +[`meta::Regex`](https://docs.rs/regex-automata/latest/regex_automata/meta/struct.Regex.html) +from the +[`regex-automata` crate](https://docs.rs/regex-automata/latest/regex_automata/). +Therefore, if you're looking to work on the internals of this crate, you'll +likely either want to look in `regex-syntax` (for parsing) or `regex-automata` +(for construction of finite automata and the search routines). + +My [blog on regex internals](https://blog.burntsushi.net/regex-internals/) +goes into more depth. + + ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. From 2d7b35574b6b92f8116fcd923010f1b415a6a2cf Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 14:57:53 -0400 Subject: [PATCH 21/33] changelog: add note about decreasing memory usage Ref #1090 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f90e45a8..a813c4fdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ Performance improvements: * [PERF #1051](https://github.com/rust-lang/regex/pull/1051): Unicode character class operations have been optimized in `regex-syntax`. +* [PERF #1090](https://github.com/rust-lang/regex/issues/1090): +Make patterns containing lots of literal characters use less memory. Bug fixes: From f68f59a8172c927282313b5cf60462ccedb91625 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 15:05:16 -0400 Subject: [PATCH 22/33] test: disable some tests on non-64-bit Some doc tests make 64-bit assumptions and fail on 32-bit. I'd be open to perhaps refactoring the tests somehow to make them work on both, but I literally have no easy way to run doc tests in a 32-bit environment. Without being able to actually run them myself, I don't feel comfortable doing anything other than squashing the tests in that case. Closes #1041 --- regex-lite/src/string.rs | 1 + src/builders.rs | 4 ++++ src/regex/bytes.rs | 1 + src/regex/string.rs | 1 + 4 files changed, 7 insertions(+) diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 1c6eb4ab9..af0a5b629 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -2063,6 +2063,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { /// This example shows how to create and use `CaptureLocations` in a search. 
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex_lite::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
diff --git a/src/builders.rs b/src/builders.rs
index 46c4824c5..c111a96c0 100644
--- a/src/builders.rs
+++ b/src/builders.rs
@@ -679,6 +679,7 @@ pub(crate) mod string {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::RegexBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -1246,6 +1247,7 @@ pub(crate) mod string {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::RegexSetBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -1856,6 +1858,7 @@ pub(crate) mod bytes {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::bytes::RegexBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
@@ -2428,6 +2431,7 @@ pub(crate) mod bytes {
     /// # Example
     ///
     /// ```
+    /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
     /// use regex::bytes::RegexSetBuilder;
     ///
     /// // It may surprise you how big some seemingly small patterns can
diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs
index cc53482cb..c742b095a 100644
--- a/src/regex/bytes.rs
+++ b/src/regex/bytes.rs
@@ -2025,6 +2025,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
 /// This example shows how to create and use `CaptureLocations` in a search.
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex::bytes::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
diff --git a/src/regex/string.rs b/src/regex/string.rs
index d5908ae0d..177a2af34 100644
--- a/src/regex/string.rs
+++ b/src/regex/string.rs
@@ -2028,6 +2028,7 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
 /// This example shows how to create and use `CaptureLocations` in a search.
 ///
 /// ```
+/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
 /// use regex::Regex;
 ///
 /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();

From a85c72ed608176de45cbbace57abde666ed5058b Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 15:23:17 -0400
Subject: [PATCH 23/33] syntax: fix panics that occur with non-sensical Ast
 values

These panics I do not believe can occur from an actual pattern, since
the parser will either never produce such things or will return an
error. But still, the Ast->Hir translator shouldn't panic in such
cases.

Actually, the non-sensical Ast values are somewhat sensible, and they
don't map to invalid regexes. These panics were likely the result of
the regex crate not supporting empty patterns or "fail" patterns
particularly well in the past. But now that we do, we can just let the
Asts through and generate the Hir you'd expect.

Fixes #1047
---
 CHANGELOG.md                      |  3 ++
 regex-syntax/src/hir/translate.rs | 59 ++++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a813c4fdb..2c0d193a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,9 @@ Bug fixes:
 
 * [BUG #1046](https://github.com/rust-lang/regex/issues/1046):
 Fix a bug that could result in incorrect match spans when using a Unicode word
 boundary and searching non-ASCII strings.
+* [BUG(regex-syntax) #1047](https://github.com/rust-lang/regex/issues/1047): +Fix panics that can occur in `Ast->Hir` translation (not reachable from `regex` +crate). * [BUG(regex-syntax) #1088](https://github.com/rust-lang/regex/issues/1088): Remove guarantees in the API that connect the `u` flag with a specific HIR representation. diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 55ca074fa..2b500cc2f 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -354,14 +354,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } - Ast::Concat(ref x) if x.asts.is_empty() => {} Ast::Concat(_) => { self.push(HirFrame::Concat); } - Ast::Alternation(ref x) if x.asts.is_empty() => {} - Ast::Alternation(_) => { + Ast::Alternation(ref x) => { self.push(HirFrame::Alternation); - self.push(HirFrame::AlternationBranch); + if !x.asts.is_empty() { + self.push(HirFrame::AlternationBranch); + } } _ => {} } @@ -3652,4 +3652,55 @@ mod tests { ]), ); } + + #[test] + fn regression_alt_empty_concat() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); + } + + #[test] + fn regression_empty_alt() { + use crate::ast::{self, Ast}; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); + } + + #[test] + fn regression_singleton_alt() { + use crate::{ + ast::{self, Ast}, + hir::Dot, + }; + + let span = Span::splat(Position::new(0, 0, 0)); + let ast = Ast::concat(ast::Concat { + span, + asts: vec![Ast::alternation(ast::Alternation { + span, + asts: vec![Ast::dot(span)], + })], + }); + + let mut t = Translator::new(); + assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); + } } From 1de1a37d271fdba9d0e0a7d1693d8956aebd1d99 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 16:06:14 -0400 Subject: [PATCH 24/33] changelog: start filling out the 1.10 release --- CHANGELOG.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c0d193a1..b51142218 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,30 @@ -TBD -=== +1.10.0 (2023-10-09) +=================== +This is a new minor release of `regex` that adds support for start and end +word boundary assertions. That is, `\<` and `\>`. The minimum supported Rust +version has also been raised to 1.65, which was released about one year ago. + +The new word boundary assertions are: + +* `\<` or `\b{start}`: a Unicode start-of-word boundary (`\W|\A` on the left, +`\w` on the right). +* `\>` or `\b{end}`: a Unicode end-of-word boundary (`\w` on the left, `\W|\z` +on the right)). +* `\b{start-half}`: half of a Unicode start-of-word boundary (`\W|\A` on the +left). +* `\b{end-half}`: half of a Unicode end-of-word boundary (`\W|\z` on the +right). + +The `\<` and `\>` are GNU extensions to POSIX regexes. They have been added +to the `regex` crate because they enjoy somewhat broad support in other regex +engines as well (for example, vim). 
+The `\b{start}` and `\b{end}` assertions
+are aliases for `\<` and `\>`, respectively.
+
+The `\b{start-half}` and `\b{end-half}` assertions are not found in any
+other regex engine (although regex engines with general look-around support
+can certainly express them). They were added principally to support the
+implementation of word matching in grep programs, where one generally wants to
+be a bit more flexible in what is considered a word boundary.
 
 New features:
 
@@ -27,6 +52,29 @@ crate).
 Remove guarantees in the API that connect the `u` flag with a specific HIR
 representation.
 
+`regex-automata` breaking change release:
+
+This release includes a `regex-automata 0.4.0` breaking change release, which
+was necessary in order to support the new word boundary assertions. For
+example, the `Look` enum has new variants and the `LookSet` type now uses `u32`
+instead of `u16` to represent a bitset of look-around assertions. These are
+overall very minor changes, and most users of `regex-automata` should be able
+to move to `0.4` from `0.3` without any changes at all.
+
+`regex-syntax` breaking change release:
+
+This release also includes a `regex-syntax 0.8.0` breaking change release,
+which, like `regex-automata`, was necessary in order to support the new word
+boundary assertions. This release also includes some changes to the `Ast`
+type to reduce heap usage in some cases. If you are using the `Ast` type
+directly, your code may require some minor modifications. Otherwise, users of
+`regex-syntax 0.7` should be able to migrate to `0.8` without any code changes.
+
+`regex-lite` release:
+
+The `regex-lite 0.1.1` release contains support for the new word boundary
+assertions. There are no breaking changes.
+
 
 1.9.6 (2023-09-30)
 ==================
 This is a patch release that fixes a panic that can occur when the default

From 0cc1b4d32f1a4d0897f7db1081aa0a553f252573 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sun, 8 Oct 2023 21:31:16 -0400
Subject: [PATCH 25/33] automata: fix subtle DFA performance bug

This commit fixes a subtle *performance* bug in the start state
computation. The issue here is rather tricky, but it boils down to the
fact that the way the look-behind assertions are computed in the start
state is not quite precisely equivalent to how they're computed during
normal state generation. Namely, in normal state generation, we only
compute look-behind assertions if the NFA actually has one (or one
similar to it) in its graph somewhere. If it doesn't, then there's no
point in saving whether the assertion is satisfied or not.

Logically speaking, this doesn't matter too much, because if the
look-around assertions don't match up with how they're computed in the
start state, a new state will simply be created. Not a huge deal, but
wasteful.

The real problem is that the new state will no longer be considered a
start state. It will just be like any other normal state. We rely on
being able to detect start states at search time to know when to
trigger the prefilter. So if we re-generate start states as non-start
states, then we may end up not triggering the prefilter. That's bad.

rebar actually caught this bug via the
`imported/sherlock/line-boundary-sherlock-holmes` benchmark, which
recorded a 20x slowdown due to the prefilter not running. Owch! This
specifically was caused by the start states unconditionally attaching
half-starting word boundary assertions whenever they were satisfied,
whereas normal state generation only does this when there is actually
a half-starting word boundary assertion in the NFA.
So this led to re-generating start states needlessly. Interestingly, the start state computation was unconditionally attaching all different types of look-behind assertions, and thus in theory, this problem already existed under different circumstances. My hypothesis is that it wasn't "as bad" because it was mostly limited to line terminators. But the half-starting word boundary assertion is much more broadly applicable. We remedy this not only for the half-starting word boundary assertion, but for all others as well. I also did manual mutation testing in this start state computation and found a few branches not covered by tests. We add those tests here. Thanks rebar! --- regex-automata/src/util/determinize/mod.rs | 102 +++++++++++++-------- testdata/line-terminator.toml | 12 +++ testdata/word-boundary-special.toml | 34 +++++++ 3 files changed, 111 insertions(+), 37 deletions(-) diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index d320fabc3..ba32991d0 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -587,67 +587,95 @@ pub(crate) fn set_lookbehind_from_start( ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); match *start { Start::NonWordByte => { - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::WordByte => { - builder.set_is_from_word(); + if lookset.contains_word() { + builder.set_is_from_word(); + } } Start::Text => { - builder.set_look_have(|have| { - have.insert(Look::Start) - .insert(Look::StartLF) - .insert(Look::StartCRLF) - .insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineLF => { if rev { - builder.set_is_half_crlf(); - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } } else { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } } - if lineterm == b'\n' { + if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineCR => { - if rev { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); - } else { - builder.set_is_half_crlf(); + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } } - if lineterm == b'\r' { + if 
lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::CustomLineTerminator => { - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } // This is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. - if utf8::is_word_byte(lineterm) { - builder.set_is_from_word(); - } else { - builder.set_look_have(|have| { - have.insert(Look::WordStartHalfAscii) - .insert(Look::WordStartHalfUnicode) - }); + if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } } } diff --git a/testdata/line-terminator.toml b/testdata/line-terminator.toml index 4de72de31..a398dafa2 100644 --- a/testdata/line-terminator.toml +++ b/testdata/line-terminator.toml @@ -38,6 +38,18 @@ unescape = true line-terminator = '\xFF' utf8 = false +# This tests a tricky case where the line terminator is set to \r. This ensures +# that the StartLF look-behind assertion is tracked when computing the start +# state. +[[test]] +name = "carriage" +regex = '(?m)^[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '\r' + # This tests that we can set the line terminator to a byte corresponding to a # word character, and things work as expected. [[test]] diff --git a/testdata/word-boundary-special.toml b/testdata/word-boundary-special.toml index c1689f5cc..2b5a2a0ac 100644 --- a/testdata/word-boundary-special.toml +++ b/testdata/word-boundary-special.toml @@ -651,3 +651,37 @@ regex = '\b{end-half}' haystack = "b𝛃" matches = [[5, 5]] unicode = true + +# Specialty tests. + +# Since \r is special cased in the start state computation (to deal with CRLF +# mode), this test ensures that the correct start state is computed when the +# pattern starts with a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-carriage" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\rabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Since \n is also special cased in the start state computation, this test +# ensures that the correct start state is computed when the pattern starts with +# a half word boundary assertion. +[[test]] +name = "word-start-half-ascii-linefeed" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC\nabc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true + +# Like the carriage return test above, but with a custom line terminator. +[[test]] +name = "word-start-half-ascii-customlineterm" +regex = '\b{start-half}[a-z]+' +haystack = 'ABC!abc' +matches = [[4, 7]] +bounds = [4, 7] +unescape = true +line-terminator = '!' 
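For a user-visible view of what these new tests pin down, here is a short sketch (assuming the `regex` crate from this release): a pattern that begins with a half word boundary must still match after a bare `\r`, which exercises exactly the start state computation fixed above.

```
use regex::Regex;

// Mirrors the "word-start-half-ascii-carriage" test above: the match
// must begin at the 'a' after the carriage return, at offsets [4, 7].
let re = Regex::new(r"\b{start-half}[a-z]+").unwrap();
let m = re.find("ABC\rabc").unwrap();
assert_eq!((4, 7), (m.start(), m.end()));
```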
From 1a50eaa726d24bf3916cc0495889883e4c0f385a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Oct 2023 22:15:37 -0400 Subject: [PATCH 26/33] msrv: bump to Rust 1.65 This MSRV bump is mostly motivated by "good sense," and in particular, Rust 1.65 means we can use 'let ... else'. We don't actually start peppering the code with 'let ... else' just yet, but we fix a few outstanding small issues and update our Rust version everywhere. Also, Rust 1.65 is about a year old at time of writing. Let's keep the trains moving. --- .github/workflows/ci.yml | 2 +- Cargo.toml | 2 +- README.md | 2 +- regex-automata/Cargo.toml | 1 + regex-automata/src/util/lazy.rs | 6 +---- regex-automata/src/util/look.rs | 3 +-- regex-automata/src/util/pool.rs | 43 +++++++++++++++++++++++++++++---- regex-cli/Cargo.toml | 1 + regex-lite/Cargo.toml | 2 +- regex-lite/README.md | 2 +- regex-syntax/Cargo.toml | 2 +- regex-syntax/src/hir/literal.rs | 21 ++++++---------- regex-syntax/src/lib.rs | 12 --------- 13 files changed, 56 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08cc60d9a..2813a1676 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,7 +141,7 @@ jobs: - name: Install Rust uses: dtolnay/rust-toolchain@master with: - toolchain: 1.60.0 + toolchain: 1.65.0 # The memchr 2.6 release purportedly bumped its MSRV to Rust 1.60, but it # turned out that on aarch64, it was using something that wasn't stabilized # until Rust 1.61[1]. (This was an oversight on my part. I had previously diff --git a/Cargo.toml b/Cargo.toml index 46664f669..6f94dc4ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" [workspace] members = [ diff --git a/README.md b/README.md index a23a266d3..f1e4c404a 100644 --- a/README.md +++ b/README.md @@ -307,7 +307,7 @@ goes into more depth. ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if regex 1.0 requires Rust diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 7d47140b0..2d08cec75 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -11,6 +11,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] edition = "2021" autoexamples = false +rust-version = "1.65" [lib] bench = false diff --git a/regex-automata/src/util/lazy.rs b/regex-automata/src/util/lazy.rs index de27a2a6e..0d0b4fb2a 100644 --- a/regex-automata/src/util/lazy.rs +++ b/regex-automata/src/util/lazy.rs @@ -384,11 +384,7 @@ mod lazy { // SAFETY: state is DONE if and only if data has been fully // initialized. At which point, it is safe to drop. unsafe { - // MSRV(1.60): Use assume_init_drop. The below is how - // assume_init_drop is implemented. 
- core::ptr::drop_in_place( - (*self.data.as_ptr()).as_mut_ptr(), - ) + self.data.get_mut().assume_init_drop(); } } } diff --git a/regex-automata/src/util/look.rs b/regex-automata/src/util/look.rs index ddf8fb129..73e51c0f6 100644 --- a/regex-automata/src/util/look.rs +++ b/regex-automata/src/util/look.rs @@ -1651,8 +1651,7 @@ mod is_word_char { fn is_word_character(c: char) -> bool { use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; - // MSRV(1.59): Use 'u8::try_from(c)' instead. - if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) { + if u8::try_from(c).map_or(false, utf8::is_word_byte) { return true; } PERL_WORD diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index 95afa4a0d..d90d4ecff 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -455,11 +455,44 @@ mod inner { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub(super) fn new(create: F) -> Pool { - // MSRV(1.63): Mark this function as 'const'. I've arranged the - // code such that it should "just work." Then mark the public - // 'Pool::new' method as 'const' too. (The alloc-only Pool::new - // is already 'const', so that should "just work" too.) The only - // thing we're waiting for is Mutex::new to be const. + // FIXME: Now that we require 1.65+, Mutex::new is available as + // const... So we can almost mark this function as const. But of + // course, we're creating a Vec of stacks below (we didn't when I + // originally wrote this code). It seems like the best way to work + // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead + // of a `Vec`. I refrained from making this change at time + // of writing (2023/10/08) because I was making a lot of other + // changes at the same time and wanted to do this more carefully. + // Namely, because of the cache line optimization, that `[Stack; + // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if + // at all) that would be. + // + // Another choice would be to lazily allocate the stacks, but... + // I'm not so sure about that. Seems like a fair bit of complexity? + // + // Maybe there's a simple solution I'm missing. + // + // ... OK, I tried to fix this. First, I did it by putting `stacks` + // in an `UnsafeCell` and using a `Once` to lazily initialize it. + // I benchmarked it and everything looked okay. I then made this + // function `const` and thought I was just about done. But the + // public pool type wraps its inner pool in a `Box` to keep its + // size down. Blech. + // + // So then I thought that I could push the box down into this + // type (and leave the non-std version unboxed) and use the same + // `UnsafeCell` technique to lazily initialize it. This has the + // downside of the `Once` now needing to get hit in the owner fast + // path, but maybe that's OK? However, I then realized that we can + // only lazily initialize `stacks`, `owner` and `owner_val`. The + // `create` function needs to be put somewhere outside of the box. + // So now the pool is a `Box`, `Once` and a function. Now we're + // starting to defeat the point of boxing in the first place. So I + // backed out that change too. + // + // Back to square one. I maybe we just don't make a pool's + // constructor const and live with it. It's probably not a huge + // deal. 
let mut stacks = Vec::with_capacity(MAX_POOL_STACKS); for _ in 0..stacks.capacity() { stacks.push(CacheLine(Mutex::new(vec![]))); diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index f9dec0024..b5de2b5e7 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -12,6 +12,7 @@ license = "MIT OR Apache-2.0" categories = ["text-processing"] autotests = false edition = "2021" +rust-version = "1.65" [[bin]] name = "regex-cli" diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 1dc144b31..21330fd4e 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -10,7 +10,7 @@ A lightweight regex engine that optimizes for binary size and compilation time. """ workspace = ".." edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" autotests = false # Features are documented in the "Crate features" section of the crate docs: diff --git a/regex-lite/README.md b/regex-lite/README.md index 34c749b21..758fac6ae 100644 --- a/regex-lite/README.md +++ b/regex-lite/README.md @@ -78,7 +78,7 @@ year: 2014, month: 10, day: 14 ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.60.0`. +This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in semver compatible updates. diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index aaceeee7f..e5e541302 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -8,7 +8,7 @@ documentation = "https://docs.rs/regex-syntax" description = "A regular expression parser." workspace = ".." edition = "2021" -rust-version = "1.60.0" +rust-version = "1.65" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index afcd506e0..a5a3737f6 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2235,24 +2235,19 @@ impl PreferenceTrie { /// after them and because any removed literals are guaranteed to never /// match. fn minimize(literals: &mut Vec, keep_exact: bool) { - use core::cell::RefCell; - - // MSRV(1.61): Use retain_mut here to avoid interior mutability. - let trie = RefCell::new(PreferenceTrie { + let mut trie = PreferenceTrie { states: vec![], matches: vec![], next_literal_index: 1, - }); + }; let mut make_inexact = vec![]; - literals.retain(|lit| { - match trie.borrow_mut().insert(lit.as_bytes()) { - Ok(_) => true, - Err(i) => { - if !keep_exact { - make_inexact.push(i.checked_sub(1).unwrap()); - } - false + literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { + Ok(_) => true, + Err(i) => { + if !keep_exact { + make_inexact.push(i.checked_sub(1).unwrap()); } + false } }); for i in make_inexact { diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 38c8d88d4..20f25db71 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -168,18 +168,6 @@ The following features are available: #![forbid(unsafe_code)] #![deny(missing_docs, rustdoc::broken_intra_doc_links)] #![warn(missing_debug_implementations)] -// MSRV(1.62): Allow unused warnings. Needed for the 'allow' below, -// since the warning is no longer triggered in newer Rust releases. -// Once the 'allow(mutable_borrow_reservation_conflict)' can be -// removed, we can remove the 'allow(renamed_and_removed_lints)' too. 
-#![allow(renamed_and_removed_lints)] -// MSRV(1.62): This gets triggered on Rust <1.62, and since our MSRV -// is Rust 1.60 at the time of writing, a warning is displayed. But -// the lang team decided the code pattern flagged by this warning is -// OK, so the warning is innocuous. We can remove this explicit allow -// once we get to a Rust release where the warning is no longer -// triggered. I believe that's Rust 1.62. -#![allow(mutable_borrow_reservation_conflict)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #[cfg(any(test, feature = "std"))] From 9e503cdf83d6e447d7d514a18423aaba88779239 Mon Sep 17 00:00:00 2001 From: Addison Crump Date: Sat, 15 Jul 2023 16:00:21 +0200 Subject: [PATCH 27/33] fuzz: institute sane limits for arbitrary-based fuzzers Closes #1043 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61570 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62436 --- fuzz/ast-fuzzers.options | 2 ++ fuzz/oss-fuzz-build.sh | 5 ++++- ...e-minimized-ast_fuzz_match-5990349284442112 | Bin 0 -> 169710 bytes ...e-minimized-ast_fuzz_match-6114393576046592 | Bin 0 -> 51466 bytes ...mized-ast_fuzz_match_bytes-4820641084473344 | Bin 0 -> 47681 bytes 5 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 fuzz/ast-fuzzers.options create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match_bytes-4820641084473344 diff --git a/fuzz/ast-fuzzers.options b/fuzz/ast-fuzzers.options new file mode 100644 index 000000000..678d526b1 --- /dev/null +++ b/fuzz/ast-fuzzers.options @@ -0,0 +1,2 @@ +[libfuzzer] +max_len = 65536 diff --git a/fuzz/oss-fuzz-build.sh b/fuzz/oss-fuzz-build.sh index f96474739..81f619dcb 100755 --- a/fuzz/oss-fuzz-build.sh +++ b/fuzz/oss-fuzz-build.sh @@ -14,5 +14,8 @@ targets=( ast_fuzz_match_bytes ) for target in "${targets[@]}"; do - cp fuzz/target/x86_64-unknown-linux-gnu/release/$target $OUT/ + cp "fuzz/target/x86_64-unknown-linux-gnu/release/${target}" "${OUT}/" + if [[ "$target" == ast_* ]]; then + cp fuzz/ast-fuzzers.options "${OUT}/${target}.options" + fi done diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-5990349284442112 new file mode 100644 index 0000000000000000000000000000000000000000..8de974975d4227f6da75038a92caa08dc6feac23 GIT binary patch literal 169710 zcmeI5F^=Rm7KRJh#20YS8C(koK0yPUj2C+e!*FE4FyO?Q53n875h|J!IGAH7oowR? 
zeE_4EeAFZPC_VK^WYx~Pzk(sY$C5bG&yT-IRdxCF_U-fE-abA)o*v))=ed48{^Q^A zzsE~@{e3Q1i*Vd({Z{3;2IE=xpX!r;va6?ho%grb9?zwIo$BOPv7AY0+`ocp{jJpt z(_J?k)Sa}9m-a#*s9=|i+w$H%sQz^gLmAKI?l1ZG-*3PE`R%v=(mkF(emp%sKm2sc zuXi6FPt{eFaJ83I&2XV*&&8#KUlvCN1HXtM_+2im<4Sq4<4S9tGP(`!r`G0_xSyp5 z{Nk_~@)BVk?6eyEz;6g_%SOPjo`YJn}wDekxA=wOW~sy648xSz!k{0_plJPiDB zzcjd(Y}}y+{J;gCF=ECv5Xb@MG?m2G=Gg zw|W!&zz_VI(Z$juArZDLJ(&At7$jin(YVF&RP@v}hEswc_#G!~^GNVx?w1DFCMLIf z^A+Q_kW{n?wW&MSw$oZAKYOrH@_2;^?A(R5n!B;Zkjdb2-kKx95BE!hs{!LaDfodO z_%&mV2Z0HRu)XFeXq00UJekZSJLU;Z7u+Y6lhsL{Q!@86UCJ-6jYAXsRxD+QXIL;7 zS?nl&(a*6pt;XIgH{dtS->|n8dsEsb4QvU1;McE!Fcq4yY}MIRCnW~Ie(~oPL{z=Y zIwd#Xf?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp z)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7 zANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCu zf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&i zmaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%f zf?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchvAWVg3EL(Lp)k%rL zuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4o>t?21-%%7ANchv zAWVg3EL(Lp)k%rLuV4JR1rb$k&M9%ff?kZk5B&NS5T-&imaRIQ>ZHWr*DwCuf{5z4 zo>t?21-%%7ANchvAWVg3EL(Lp)k%rLuV4JR1rb$k&M80D{q~=f{Ob^)WbYSnz!4(} z&Nj zYc5wWUDe#abnwess$k$3aeV&x@$~ro@Y5;3-hFsHWwGi2{Ax8; zoz-!ryI5P-TBiiRD2#)hR)e3V2mIo&8S)Zgl{2RVKkyp@+p-bxd$w%V*;FSb#{HHY z9jp;i9oN%p+^?V)Bk(&2+ww5*!~Nu$&JvB=)&L&h2?OzY_WfVr2Y%q!j5|K>NJxb3 zHAg|C9Gl?DWG2}$L2bI=KB=6nt}dLCMThB979Hcz1V6`8qA-)Fv)Jsgl*O^OBK9L6 zw>hq-)i1l>!^4N)^>upq&;hx=Ff@0i&4nGNHt$eQH`u`&gb*5KmYQ>_var& zq_1Vo`M0y#Bj)GxU2foVwt|$3<&-aLXu+JbRnql-9RBg@ph`Fi?{fX}@7wdJPAP^{ zlh5huh3T%F4eCx>#*1YLJJ@M8_$e-_1iv_JhP*^j<;*F;5B!F}wrm9a>PdjbjGOP0 zm%%mQ2YzYL4F>Dengc)Z1HWdhu<}hvgzYs)L8Ba-;K^hr*|AD$y5K&koUF6t&}x<* zrb~HrKMqasb1WqaGl@Ej%??Xh9BV6LKl0p9ZO$oiKMM``4YNNH*1=Az!S7Y#S6%cl z-1amCn%tNZgL|kMwlDaBANV!HiH9-?iLkxqC}@;p6Yx;R$T26Yvv*a~YT3~D41QTq z6^x%RiOUpzzJ$LaY|Wx0mre{Ml^b#A&^ z&CS=)$G2gI(U*q4a^{ra2Y%y-GcDnM+g!RLIj6+^ zEHvOZ%>G1J2Rp3>Kkyp{oNWmFxcR2RWx2Rh27cfNe$8Ox!An9SY_B;A8s*poJa{p3 z%*pBuUDdQ&HuUjj`=X}`#+U8mQiLzt^JV+iEIM)-X1c_6N>;wbjAFEE>$;1zb***E z*l|4fbFkBD@U!%QUx(k>Lq%BS%qhVS{46!YGVpVjlTnyS)LCqBKk(~UK(GqUEbz(| zQ;tjke*NOlEr_npO;@YA`5O9IdJLm44SnUzDZvl?#t~;)!u?+F&G$NwssKbaz6>5A z;S;DVjta&{0P#k}M*uv7ZOx)1=T20+aQ2)OOiezgECV}$vxkbJ&iq@enTJ~p!Ot?( zk%8a;#{D|FOb?Z=3yB>YbvD&Wi5WgBVRtE_s!cm3?p@G}5%_gurwxD~?w1Ca5^>iO z{J;MS-pEM;-5t%&`|SC`f1oD%o5(171C`x9Xu?6eyEz;76EwjuE2=9>nW<>F2m_<C#y4bRnuzO(8u=`i=HYN-&c%F5x%d;XTGgjbmTJ3 zbcySftbB_Z#c0*mbr);vTI-at<9P1pV5imKXXydI4!^U9im=L=Q-UA(S!#x5;O8tS zqcD@Gv)JH%;McE!U=^BK;FT+;9GL+8`o*7H5M7;{u2ysNHT1Fc7)DiLkxqC}@;p6FiyBBs*3~O&8oJm6LVmd|J)i z&vYqEk8x;%pJORem`T)GY<5`6;#gY|`;iab)aINL_p{J|-!S_VVIAzW8vMX-7;v^B z@Z;v22AAdHP8s-tANVzcjR!9YiLkxqC}@;p6Y${0$T26YGjvtcYT3}om+gz5Di~k3 zk4q80Y|poyTeIlMWtiy_*C|=~7Bh;`s;%oT*4DMwDPza++|R*ItHICG1AZNTXAc!& zl{2RVKk&2E49mdJSx!b_CQ)ax!TrFmUje}?G_$}fS4=rF0r>TcKer&dIyYUd=H_eY zdu>Y(d-3uBi1pG%2ZvKKW-K;XHkRG5fG`!Bv24}ZR3{|{zkc!O7DQCX^|TuIE9k`t z{5~`M9v(jYuCLR>AMd_&UvIwp^Q5ouVi58#QpML&R9E)R`F#HM=U;yK{`_Nz^tG%x z|8_Qe#Qc0dS!P1Uhi({&g#sg~hFf1K!s-pnDZ%friJuy4AgH>)mM6Y_l*LiO`1(h5 z3cmiql(sdCj+{GD?ZVk}QZP07oH7yW0L~sNiaHY@t!5%*F$6!$P)7!SxL+Dv9S|Kq z6#T#s{F?E@$~Peqwyb=a`(+p;VD8tr#qm`1)HQ}vf*<%DCv5Xb@MG?m2G=Ggw|W!& zzz_VI(Z$juArZDLJ(&At7$jin(YVF&RP=O~99j*2;CGy`%_G5&xnCMwo0#0{P4ELh z@M}gFOOJ#^*s}Cs?w4VZfTc&{7ROW3Q`ZCL8eE&0-0ICS`2BzU z0iX3S$)c?M+MILO(`xR1mK-Ku!`x_${n+lAv(;YGl~lNZ>Y(;=SbAo0R51S3EV?0o zYQ~?MwPw+g%P7;ktW$FH&0ZwIs;%oT*4DMwDPzYm_j9n*YVfo4fM18-*+WHG<;*F; 
z5Bw}O!!q!zmp~RXerb3ZW?nM_&Jsmg_%U1#b$@4ERMAmu^;)o!*M;W#{Dcb;5W?vL|Em_DZvl?h5=_A z0zYoPX>eIC?v#Na_<>(D*m&@gkOYCcNnnk1HsQ(Mn+*KQD?Ei{kECf@vdm=xSm$yewIM++X2}2uHg52-A|w81VXCu z)$+o_uPlxV#>208XX4=(Pa#^f=*YPf)h?VpCk0cJ&nXk34&dyeqNq-|osxOD#Sr`~ zLme6T;eKgwbwG6dQ1Amk@N32oE8m1f*s}6v?w4VZfVp4e7ROW3(^+z8HTZ$wal$r_ z1V83}X>e^~a;rDN5B$Kd8C@(r5)xs{(u28QhCu?B9*tWZPeo5%V>l)Ff!}e$Hje~9 z=6-2#ZDMk(H^C45z^@ryEIkquVaw8kxnG7s0+t?)TO3bCPiM)Y)!+wy#|hg!68xC^ zrNOm{$*tZTgWqTQ|C4=5vN4yQ`mWio-kx<97tX8Ny**2&y;&R;j6ZXU^AP^bg+Fs? z&7vchQKs`-r)1ijy-0#pTi0Ezt!u4Q#*X8+UL5SS8vHCh;Md`I_D~U4Ide+z`)lH- z)5<_Zb%BXN8}J+Ej3TUqomPV%_zeTjHUxh4BEe$DuM%2TZjgZ=_@zO21Grvp z68yjq{F-sXC)5dvu;s}ZOOFhL1gv};w>X}Pp1Q6P960r1W+~RmDdg>a(DPP9#v;2eX{{n8r{}2EG literal 0 HcmV?d00001 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_fuzz_match-6114393576046592 new file mode 100644 index 0000000000000000000000000000000000000000..a34eeaf2c0d2ba02b5f35ae85b06cfec4f1089b0 GIT binary patch literal 51466 zcmeIzv2Gi60LSrbDi(x?A|#eFAdnbH!Eqc1q*^DsWbKp=nM~z~LNJv8DH0M=pCE5w z1`ln=NGv=;o}v;9D?Gt*=S#x*n{Px!1ysuKKREt+ei}o6-qvq3B&eq}PMt{5i zdv#vdZZsF#FBawEhs9!k-WQAan-})?o5$|?>ftwEKc0N|=<{w|{`4m&Cr3v|H|u^} zpI?rT$Fur;G<#9KRkznZKD^V&H0QVWOXsnus`7qyx2h&`ewxqA?+@qm`3KJ?v)Oq3 z$LSmE&w2OD$?Q@0O0NEVttjqn4n|v_)^91_C~g(&r>FTAH*OVgo`0*`opZN-TXXjH zw69(9aX1_nwbsGG!A5_p*PN9%{`K-;`Qn9#d;j39{q1`$d$7t2SzmAC^6~7CKpQXpGqAjEH@CFYV^`X{{0DuRSM;M0 zg9H$pK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y* zW{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ? zkW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd z3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P5*Y*W{^w;El2>d86;Cd3lczV2FX;=f&>toK{6GzAOXZ?kW2+FNC2@JBvU~P z5*Y*W{^w;El2>d86;Cd3lczVUU#HZlk3iK z&we-9fLY_p0A!?e||C<)DNGn f=FS>-p3BoI6|S`Rdhg-iyUo>x&9asVEJiu`sPrYNauaQXt@)zBjvj`?=lw z?)f{pb-|dnAAYX*q}F;lx%~f83^Ia z)560l(1J&Yhr_+0%gh~{p>v}^BMQ&$%X zk&slGBeiHw4~1@ZB9s$m#6FfYrwYlV2%mn2YT*8X}+z0e%pI0Y;Rq1qGrgMqBwW(B}d^0A>MK zsWESolIzFjo=G%P^N5OTnQU1kkwalwj1MrzB0rpced0`WXl(U#%$!3CNiY~KToI`-2OE%aI_D#P^|#8!x_cJ#qJh&hpXxSZ;@%CoH{wE*VXLm zkRn|#2d^u}k;wB%pxa#!xEOE77l1u7)IPo^gn!Gls7$de#b`z};Dl`PE9e&krfJ+^ z2PkERjIPIIg;iJ^=7U`9Uc7j5^_{Cup2Pv($&*43^@7Dub7>@(Wl*ZUQeHLDj1e9? 
zboTC!*_Y}L%z5#%+((A*8QA`5{ku!fTz$)~Cm*@%qwgGfs&M1_W{2yVy$|1Z{Jx>9 zcYosU{9xg8e_1}a^X&I7ugl+F=>OGse{;vVdme49e|Kf~GoSX%-TA@nm#*5oa`j5j zsc8!~{o(xA&OLF9yX}p7qj6)0+qH$#j~;n4p~Os~N6#k2ip%$;3IrppmcW}B z8uUTrh=}7^fVQnI4L-D}06uG1-UKE3pSyCq%sovb;ZRptC=G%Ot#nq%i zfL7>=&*U)qlh7h`7z}ixLF?Hwy$35IdniRnXeO6XdK@GJ{&Df}uAZc_mXM27VEc{> zHgXZj_R;aqNd^PIOp3CaKv8Cr0xucB$ym+gkfyzsm6nU;RgTJX(Si$HJRCR@H01Lk%vOf{8NR#t+HzmicdW@_pQ4g5n7YP&VsfL|%_=P?B(7rnj6QZ(GM(cRYJ zZgBbBjWV}9tEo7ORXk>d_k~#!Z(7)iW*H%uDb4HIcfbfsbq?C8POOy@P^d5`=sflU z8HFX8ARS<7uOKv+(4YY{)PL~72OeY)kPP;0`%!zl$KU8{Z*Ok%w6wQ-+dA6drLi&A z!*AlDlp0OO=V6S>IuORJw7$vMSFe_2?Y$ECFpi(zMMg-Z;WY*)Btv-~mn@;2>f1n6 z5I+e7n7H8bxP7f156IGi`~|5ZTU1wn_~G?9^j{VVJ&po_f(=YcqAvw(7UE-NQbKP6 z{jFzPH>5n}E?V@HLFUR~?KL$>?3}qhoD=IJdXAuSf5b zQF7*yr}nDsXq0~QXe45D8h=b4Y)+Y@;}l4q0t!#lvq}A%#7bk_l&E%_nuKs91@qjc zZsUY?$}`*$;YN8}rc8~?(yOV2HQb{SUZ2p2PiQsDtm?Z)dR0q-$;#3P&bGzKk7o4C zKz`JTj|WUk7fwZfEE_!j6pE7?B5)Z}ks>FI3<+`p6A#QP8*`gLGUVX#plxC#@wQ_7 zZSZ(XdrWiSs;Pw)}cP)lrsQp^3Ehg}UaFC`{jT=TsXYX2?8_*`- zWWQROkR8VxE}b>0RO4v4G>L{vt*wjLet!0VH3c9%E(IrJ?iI*ZB^nl9CG)TbAg~V&3t(_S<@H>;o03R3*x~d+^pFYt4{h#-UD0{+5fdc_xT`rg zv)NKGrGRFpNhQtPNo^!-QHt#V%7$d5$!VNTL+Qw1itJ^TLS5<`nKIb+l#LQt6o9~9 zwrHJBny8au^L;i)X_V~&9-UDdV@GE5{7Dv2Q{^SAZrMX#Q&=c3|=b7*K{R@m&tViF3TMjrQ^6OyC+^$*&js%z6hoqM&V~p)G<0VRDR|pdM0Qr&zz{O6_xhD zl)N6Sc(L|mtrBKx4-9$fM~MO((@F=<_`aA{#0eW|>XmrHeyY*R_y!n|jaFJ&NeA_Y zg+iMzNnMV-8{sf}bB4pOu<89ZUNY-uzkh?D`u!tx06a&e;P6}%wI#4Tr`Cp%)@pY2dIUfystDp2iY8 z>E*=gF{V{xvVA!_zYc4={OiQk-y_kRQ_7&}h&Lw7?2B;$I@tgc6p6t_FUnYw=_VFM*gCgRvZ^+b zN}7@39tQZA;T;K-=(xw8xhSUWuQg7eW^af9^8-2L3QP63S(vIRVm%vB)v4NcH_jy8zQI+7a+RT7UDd0K zaHNBy90t&a;%bAOKpJ0cQ}@EPhAMhqWD{yiqI$v)8NoPhr#x0m>8diWH55DY@dla& z>KH5Jb#%^F+r4_n@fj(=wT2og!rwUs?4f|d(qs6i#zA7G8V9f{@v>lQ62h*UYYk-= zP$Yfp1lJl0b&}saAy1Xd5@Z3_8X9{iL{Cpc6W1D=nHb5nhVmP)qaAh@1_Psjk{_5% zYPaMhopHB&xWu)Fq7qH6H8dNHE6Ql4tL&!gfegLUO|DVQysGiZ{&Xk%jr-SA)lxBENf%I&)1);Vt~C_YH6hRfcmVlfjCS!b5X|LumzN3EOG<;N zQ{e!GV{JRc3JSLxGgvHKq$_+uw;MM2CwqmjZO6K})=;h)Jf@MS?7;hYMl#;VV~+r& zh`aHF_wn}j@m8u)xzOD=lVM$!_E)$|Qpq0uos>4j?^HzA6t6W=4FZFV%#<(H<{Tn%U_ ztDD{i-2^A;Ng;=NVLHukah0VeGP6b(45JoQz#zS7!m{aOZ=0*ZEjwDa8&nV}Uy4Wt z-I}H`OE?#Z;B-3Az)u5^e?sc&;O;tmY)M{#dV^mf-pQClkw;Q*@5p^-@znH!vWw>ridtQ*M0kN zxDXkc`l!wE-Qj9!EOBja2!dc-QBh9C{5^%Wu1%h;P2w>zPo(4+Dk$8B4+-Y%DRi-K ze}m|(6RUhGHngFPsZBxeD Date: Mon, 9 Oct 2023 13:31:11 -0400 Subject: [PATCH 28/33] automata: remove 'is_quit_state' debug assertions It's not feasible for us to check such things when deserializing a DFA, so we just have no real choice but to remove the assert and let the search proceed with incorrect results. I had previously wrote these as real asserts and then swapped them to debug_asserts, but of course, the fuzzer still trips over them. 
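For context, the shape of the failing scenario is roughly the sketch
below. It approximates what the fuzz_regex_automata_deserialize_sparse_dfa
target exercises; the `check` function and the haystack are hypothetical
stand-ins, not the real harness:

    use regex_automata::{
        dfa::{sparse::DFA, Automaton},
        Input,
    };

    fn check(data: &[u8]) {
        // `from_bytes` validates everything it feasibly can, but it
        // cannot cheaply rule out every semantically nonsensical DFA.
        let Ok((dfa, _)) = DFA::from_bytes(data) else { return };
        // With the asserts removed, a pathological-but-accepted DFA may
        // return an incorrect result here, but it must not panic or
        // violate memory safety.
        let _ = dfa.try_search_fwd(&Input::new("haystack"));
    }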
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60652 --- ...tomata_deserialize_sparse_dfa-5415338693754880 | Bin 0 -> 992 bytes regex-automata/src/dfa/search.rs | 10 ---------- 2 files changed, 10 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-5415338693754880 new file mode 100644 index 0000000000000000000000000000000000000000..cac835c53eda6ae94833f300257aec639d5d6b3c GIT binary patch literal 992 zcmb_Y!3~2j40Hfh;@J&O?7)K!I#&1S7L~dge>ewBDFzw=CpmWR>`Uaw8&{ztfLuht z9OEkMlRm@GPMN61D$F&Cn6+>ZrBLm@n~)%A$SW)_>nMgNphd}`#b1aGREYySj+v-< z*XI>_Ifh~0ogs77L89<;!BdWzdZxm)BoN!fKgw;ivK4+3tmiE)FJRP+7h?(Y-O4>9 COL@rv literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index 8c012a594..5a82261f9 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -176,7 +176,6 @@ fn find_fwd_imp( // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -297,7 +296,6 @@ fn find_rev_imp( } else if dfa.is_dead_state(sid) { return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -600,9 +596,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) @@ -631,9 +624,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } From 62ce81265ebd7be54b65165481b88aba68616459 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 13:50:42 -0400 Subject: [PATCH 29/33] automata: fix invalid accelerators It's possible for DFA deserialization to result in an otherwise valid DFA, but one that records accelerated DFA states without any actual accelerator. We remedy that by checking for it at deserialization time. 
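To make the failure mode concrete, here is a minimal sketch in the
spirit of the fuzz_regex_automata_deserialize_dense_dfa target; the
`check` function and the haystack are hypothetical stand-ins for the
real harness:

    use regex_automata::{
        dfa::{dense::DFA, Automaton},
        Input,
    };

    fn check(data: &[u8]) {
        // Before this change, `from_bytes` could accept a DFA whose
        // accelerated states pointed at a missing or empty accelerator,
        // and the search below could then misbehave. Such DFAs are now
        // rejected at deserialization time.
        let Ok((dfa, _)) = DFA::from_bytes(data) else { return };
        let _ = dfa.try_search_fwd(&Input::new("haystack"));
    }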
Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60739 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61255 fixup --- ...ata_deserialize_dense_dfa-5883983265923072 | Bin 0 -> 2734 bytes ...ata_deserialize_dense_dfa-6363062083649536 | Bin 0 -> 2735 bytes regex-automata/src/dfa/dense.rs | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-6363062083649536 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_dense_dfa-5883983265923072 new file mode 100644 index 0000000000000000000000000000000000000000..233fcbc950a61bc614dc0e0a7418724fa0c36c56 GIT binary patch literal 2734 zcmZQ%VEF(4KNFB(WMp6fqESE~1R4m) DFA<&'a [u32]> { dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. + for state in dfa.states() { + // If the state is an accel state, then it must have a non-empty + // accelerator. + if dfa.is_accel_state(state.id()) { + let index = dfa.accelerator_index(state.id()); + if index >= dfa.accels.len() { + return Err(DeserializeError::generic( + "found DFA state with invalid accelerator index", + )); + } + let needles = dfa.accels.needles(index); + if !(1 <= needles.len() && needles.len() <= 3) { + return Err(DeserializeError::generic( + "accelerator needles has invalid length", + )); + } + } + } Ok((dfa, nread)) } From e378b4dbf7f127a3bb118ed9b1c56c409453dc21 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 13:54:25 -0400 Subject: [PATCH 30/33] lite: reduce size limit to avoid timeouts Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60779 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61434 --- fuzz/fuzz_targets/fuzz_regex_lite_match.rs | 7 +++++-- ...ized-fuzz_regex_lite_match-5690981331369984 | Bin 0 -> 133532 bytes ...ized-fuzz_regex_lite_match-5888324890656768 | Bin 0 -> 233677 bytes 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs index 579078c71..155fa6d8d 100644 --- a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -57,8 +57,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .dot_matches_new_line(case.dot_matches_new_line) .swap_greed(case.swap_greed) .ignore_whitespace(case.ignore_whitespace) - .size_limit(1<<20) - .build() else { return Corpus::Reject }; + .size_limit(1 << 16) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5690981331369984 new file mode 100644 index 0000000000000000000000000000000000000000..d892bc31c496d70f689adcd583a32f42896f5fb5 GIT binary patch literal 133532 zcmeHQ&8j5HbsnLO+@|;1*$jx0rgcTTJv~3*VGxZPc{@U55YlXfAHX1kp_#|<6Zi!T 
z-dHf&NPE43Hrn_#TyaiDeDR$;aUwFSvTok0yo=weC!VMfg{6gWy9s_U%3(U{Y6CC_ak>g{ zNT7vG6vjcUo^TE&0f0XlJ%Ng4^qhb`a7s~8?vy?|<5c;kG=aBMRX=vqFlN`+Y2*vK zS*?^sb1B;|A0SteW#}7z>NS-X@`Da?VH7k~oo*c0@iVPXWE%-63X)LIx&=txPhCL5 zrY$nBvXMzF33Hv65JA5QD;vOBEt`SoY%uA;3`V3XWTJ2^#OetLo~KMKY$Slz5mEsT zA!P7e&QbKnhI$Hs`v5wn_fzGY5(K_NBMrS!$xlx>@Vu_7fkDJ$M!pWkfx1CAtAUqB zeb~O?r(RQ;K1hy?`nfO)s)d>8t}$QzfHNK_`Khk3MFI+gBowZ0f%4x99uh88^+!A* z5yePP)(urXKj^>0t6Il5SqT_b+0Bide1h+r3KSZuLJo`(t0(M3RZ!|bNRA4mv~79I z=FQue(_N!=x0r_RtK9Nk7`fvX78d(A98<%G$Clpjj7;0EIOkLFy%?oM-tVxu61Nnk(Rs%1M`mlY&LpA+pbBOUx3B-&pX#g`uq$*^ha8ShZg$*(h z>gNJ@FzO+PBu2gt!RCk*S|lI={k)b?%2l%2i%j_I(hJi%nxZ=W1E-D}R2dz|7)e8v zA!M{lsU}a8`Jn$wM^1$-`-W|IVIQ0b5Aj5WDC{=RBtunC0dU`BC$j7;MKl%*Vy_e$ zs=ZJNa!=TYDq2&JL2>{Pqrf19%*fZFI8Zm}W;O8As1MsW{M2hIErjIAsGkd?pjw@7 z9M|zPT|tWk6a`7BXWarM@Q|=+i_EKRWD-llT&E>O&~L)Z25?qWJMf$hCOw$Z(es3q zbcIe74u)7g;lTB}PO|us3vg!!&VdX0>dX*)Lzu5 zwcC8UmyfhP{bl_13H)G>Cu<|YMNap>oML0MRLgd%_gk<^%8qo3x?_wUe~TXA8M!O%x>7saOvqC4Q{V|*GsPC zZWkA^X)$P?p)N`Y`eBjU5@EOqPIVTn7)yM!tRP|1CH5{IpnHzst^Y4icg6ji%V#c$ zK~}C6YyjbG*7-3!se7|{Dj}oJ^B@0lN&P&-+QajY*`u8;p6rM_i$V7vd@?^gf{Y(3 z_}<9ZiP9rhA_2`GNvLPtBE7&v!W>?Cl2+LW#6_yp5-Ky{U~ho5JTVAt);G8QNDyh$ z+){yrL=?_>&a|Pb&0q@nCOfsy&Qh^}+Bc=K1Z7Q#NwpX1B4Hn@f>P)~a#X;iv=hph zj(k@4aZejfSRU>+4M~6u)x}sRo~RIorDy!(ZRtjqXPJVWjOy+;Io`B_*)RK?R+V7F z`Au=GMy9OT&52Ic1T`M{I)3V>4~nEZX0S`B)Tc&f68&4DH%RX$+33!_z+~V6-UIMZ!5Rj_3YC5&$s@ z3_{3^d>x7d`GRg%122vGuzkbs@-}zIiSkW}Rz}APFi5IGCJF~dEMGYJvCY%Z1@K_h zLk>xdd>w+#5h=7tKmwvq%}nP2>Q&hsbYzZrVuC0J-DKTR)$;?wmQmF?zR60!sH#3f zsS4EDhk;5M}WHcQzhmIyt z4GvEbG9zDyVDkY}p+y3EJg+6DBfN1fyS3PjU%@U4RgyEc0T;kzF&D5Ul=-O!P$ev^ z>MJx4B#2anOq7Ca z&YDwLIPko#s)0eoV@AGCBSh$CH4~8t96$Mxva-_>0$QWSBB1L5NoehLyLP##Y>;j4 zB9ZD$bCsEJIO@btV(L0c2A*?QvqD2v$bm6p^@Ib@>lz85bu^`FI1{VTb0PY`$w^V} zl-^I3Z%ShK6_6lO6*5r@t~qN?Vd21YKRE!1QD6{4X5{NMLWFKsGZCr41LR~>mCQF> z2^dw?M|;o^4>Duc&e)G8SZheNj{kGrVpM`(k0?zc-& zsn79!5b~+bv^jlXql{DLk1ql+`B=CH>_xovdNkFJo9SGx=<`cM2|aV|nw}&wsAYRY zFt@0CT1Cpbepv_0GSHJA%FvmG>|wYrKZ^LeNIY*5x{Vr2j(F86NP%_Uc{$F|gA;uS znUSyKUlNI`UKd3IdSyfs>QuMNPFp}BVRQXfUchCQhO&?YbHwTi`^gcl>GE*s$_Y>r z8fe)GcxlwPPmvVmNw4g~kHVJhAMY2RGHfzarAKl!*qx&0m!h^J3do=ao=^tbWA%o{99HZ!s z4fXT@hcc?>q7R@FFsiDLPzGah6=0fB2$?7x6tQ~37q4ny5b>Ci=onga*LOfcBOkDm z)xb+5E#&bi7rFQg3UCM^gFg-jjZf!N z&aPh7piNL#GxBvP4%8mHSq;22>NlfrxHFxzGoE<9DS_xK06nP+nJ63-v3y~fJ}@t% zelCoHYGEe2Ys|O~!RCk*S|p$-NJ2g979fF#gpEJqi3(AS^km&o)zbqU%Bbob-()3V zR8=3LRQdKz1qv`tszN3T2Su!&aM09KkU??)u-tjtU~oLjU7ysUo09vwNjwBOyfJ@# zAZC5TY@vC6MUM}}oZdCa&$n;!TdA+~fW&^Q!l>pL-lzTkNt$IR-51`1X(@92p+aDAk29%dGUPFL2^V1>(sy~&~ zDVD2Bp`j||z!-kg63~^7B-E*HmCYDQ*tA9FRW>q- zB~<_zv7blpSMxsy-n0Rn)q)*(&IUO$b$#pKqOBL+_LhnFO9{s}N0rO9D-O=TN9LLvOwx;sJld>3?vIyRF8%oEtJnB-C^0fr#Qi^RniBE^ zqZxu6zBaJ(*ty{>k|(~^=ez8=_`dpSr~(&*JmhVQdLIYLvT9y&aDKTzdiB{ypMCtv ztIu9fUlpJeYdD7U5v*Kbo=U?ZL9%Y0HhX+i1t@Q8VGWnMtMtBiXK1T~-OJks51`1Xx*Kme%69L- zdy=5}H{qkZ_M#gJC<>BLf4T)opdew>7MWMs$Rw78xlW!$FbCeW0i4I<4CDey4vKJm z+H9AfHj^q&eUCcEqdZNLu1fCUM?r(b%OchW$8pI~XZ@Myw6zGKHoi_Y2Sqm$&`C`a z>QA@IM%@#}B>X%1Yc%-t@~#3aB9OU1qHrdNH5icKhCdli1$pqv?9<3pyRD40aEfDZ^?t|U45WU;EuP{LS;V6(ngpe8eIur*&gl<*?FOB-JeZx<^rrw&+&jqrHQO+y3FZkZb*YPtg z`U)))kbo|~#7nmbgue5iAFX9FphY>i*ZgaQH*a6j!wdJf9gmxOZt{jMq@Lg%V%Bf< z`1Q01tuLUbIW!^mb#V5=sA==2pd(w_1ukeK|DzxQMe&z{6~DS=^(qp1PhWw ztXZbenKG)Onn;#*^D4jWv%lPulcGH7mHi+{uzXXqtUv)HQWY{$7zeR>!gW31Pe%RZ z7zNcr8Qr7kxDLgE5Mdx=2QQ8KuzkZrH9fwj=#*hp0Ha)sRDDqWObcJFL;~7KwZzm~ zu#ZtKcH_S|qv@M+k^UhI7~N4m7KNJo^onbSR;p1NgX4CpniJh>P~-z6Ux(_8e9hIs zOQTJX13Rv%Ktpn5)X#-cP_0fkj_Wi^h%!Y2ih?B6vu*(rcu3f^1@T0MC`Nj+Zm8<% 
z0S;v}WjV?ECMyA>s`?0}TGM?~fdWjEs*s7oK@qDb?9WsMrT&BDs6g_*{gnp4Sr75#G3#-CFF%uV5F2D#@AJfD2%< zmxynp99ChL+F?F3J1JAjuS)rjSf+;k=6m`N$Lc`N7>Vcq1*t7-lM1?3u zda`b)>gfRvWmI(zo2(dCC8IZVwU?Qv{~n*5hyYO-GqHSOac3M&(1TEog6!rQnJjvI zkLK$TY>r5wMFJ9#^LfAJuGErjJ?sQRGay?};U+VByw)|96)lRELB~*^Ks1f`WKUEA z=^6ict*epcS*9Q-qk1jiifsEUOxZ8{>@N;}Q_Q;0m_;*I>#=WYmKBpQ*(KFEg}pBl z&Us0Qjpu%HjDqY220igWhvGnpu##PYmqvZqzTu&owqNV2)H5#5sqZz2AAyppkcq
WzFI zKlRi3OsZp6+n}ajfAO2|eDB4(*FXN`n|B|7^Ww#eXYXE~#vV=ix;+V2e>3&AXm$WI z&$6NSPJh2lo0spo`rn@f6(J0KHyfDSKIP7oA*XV;9V?521+DwR>@^Cq>xb$|g68Wq z8dO?SbRz*>?MOoX=~mg}<(L~HikTvlbwgFpK{|btwwKgQvAsNZh~1uNh-pGGrxeF? z)||q^K2*_~f(())VzSxd#f5D1cmpYCV)_4}mp}IP2GS7Z@O_oz4W#9bvZcoP6@8>P zkiOh9D`U1LcW?4owyKKtVC zdS7@uRG)3HWG@`-4g5l7IHQPpd*!3>Hr$HRj<}x>J0B6+(f#+CEt`fbo-!zLdbzJV)cZ5s0vE`2gy+ZlhRHoV>-9@>%&u85I&U&IfOti4R~pE zuPrP2Lvm!)&xKJ?t&4PMszsxOC{rY$C`dxl>J}go0|}e9$h^u%Cb1;UwN|nBD~8{M zl?~v0>?2ruRsD*8i=GZ3VKQ3RxW-8LDbeXN=X8_I-udw^irQBt+g%j3E;Lrshptcd z70nRj@UrkpT~bXqY*SD6UY84kUfCYkQ9Hk)1IfAN3A~Ftg=hE5(FW^NIF!3vTgO+< z`s3Bw(r2gB966}ujh_3ok29}c<3_L;d4hdcf}anbU^GLJgA;=xeec-0;Vf$RpPq3) z!FB9nkcaBEMZJ%Uo%wpPJ>~pzfAs3Jk3ReOlUJX;rVEH+=;tF?d6<4G4Tl7kqv&vQ zuO|qU!K3kutx^|>YY1|<)Lo_buX%8M_3@_zv*`(z@nVqAZ<>JwdnJD=%T+KnQ|_i4 z(|q&0f8*7^XsHy^_DT46^4IuG(DQrv zym5VWrSzEu6HkqXi#r*TwfDvYQTWnQaRl~CL2*e14QLonwQ20kDx?qFUYI< znO#8UX-}P2IdTd$PT6KsGZDy1bCi?SwBfcv;5i%QMpOUCDb7IRxDx#-;G68M5`L9@ zQ&Kx$0c}O9LM93aMXa81;JKe10K_OT2q81_btn#m2;Hm(UK;gb`-Y!-O^+qsTnd=B zpsGs~5yePP3$vkW?>(ReWmI*JZ?Y0Fs;Ym}QHzpn{w zUenpQTw9qx+F*TVC3kOrlKxWb|Mt`L5X!!#erxOa%2~hRyGiQ{=xGj3h`Joh;T8Q)|I1MzvV?OAZzZ z`;jR`^&hkv0JcY%FpwdHUVjIu#0nXhQ`qDp;lT6M8CTB7D)gLyK5%kUlqbEij{?fb zPnB;fP=HlZ6*5s62eEp>K2!yz{ufp?FsSwzta`#!s18OH2ee=%y80dO1l2bs zd|#oVD&)Wzv3kOR=PAe_IRIF07H%*&e*1HMQitv%@9V+HAAbMmZ@>NHU;bKdb9)cy z!du_RTMO$OW(&=sCJZaO^7-YBe3sCDxyTkZ@@-;)$7ta2BF9>2$Y{yY1ggQ|2|{M% z>kw@3EFcollZRSDDb>*(AZoE2zk*#9s${5h=A<%Ha=A)K@bUsr2Gc*7!Dvw!i-dis z8m0OlI=uo^J3?gO5JFDkanjtl;ZHe}R1(Ex5m~qWw9{n{&5=9OL?i-V2F>WFqcfee zGoE<9DM8>X06nP+nJ63-v3y}EB+ScbrzK3WDKv_UL!RZ3jPIa39nLB4Kc1~c8*$F8sAXIk?q+eknYP7>-*x2T8i zr!F92(-y=VbZ#E__tQyEK5w^FNf)qVRG6m_LB)ui2$?7xH?e%-yXhQ1v6LZBp05y9 zLQ8%(89iR>n$7V|*QMD6Zkz+~N;uO(2V*n~gr|ZC=K0{Th@Wx*%NLzsfiYrU#Ah`T+KwJ0^8KI{aRPm5{tq~;C%%IN|_KcQ8<2L^@Ib@>&g$%I!g0g z|G*)H41U=Hn$a5@>S>z>SQ1N){PZ8eV?=2XGEq1vV)cY`UJ?NKlhL}W1_q6N?D{$s z2SS{yftN;o*uLROPmO*3#c#gzy%+CZ|M-({-hKSdix)4Ry?c2Ydo<`zK-8VqObXl1T=pnp`LY%^a2kFb9l#M*f`CmoJlH)VzP*=pR*YH53nyoJ>EcC zpU0t>Klb$o(h%hEeU;-4r1cH6h35GceWW*#zT6@k%WO*Sj_*x5eHTXWSDo;>nPhmV z>Grkqd7a^WJqmvl#l}A~G-cI;HcA_+M7W@@oc_lbsEn5A>T*#}Tpi5JVT7){7`;Wu znl&r|JDs27a!5wgh3s(+c)Ia%4>t#ct2bh;yVUcq^G(Z{-J)aYvRjz*&K3S5%mc!5 z0c(KVew8C!9k`h}q2XpjfMjDqyrI#fyaNdcdEI zc6uxV%8Z@+s1KVGRFpdtQJu9P6c0Y80eV6qWTF&YbJm=agr$36UPk>~7zNeRM)VAN z;(<;OkwS|E6h$pD;qpM#V#lYaOolqGO$Fq>gP}axQVl?*w6m(O&`=d}V2oHj;lT5{ z2?5YLn)+}!6RXghY^vu61Es1gIVsAOfKioQKUKb|Kmn#nRmemsxaO=mg@t{nqP4E7 zfkCnAZG4?Zh%k`VOhhVhcQuvigXG9)rzK1UQ)rDEi-4{NB%$HycI|Re*&y58MRocd zjvA?8+SBf$lbbZbg;wC%S5Vb+fAs2u&*RS@Ag2}9Iu*gJLU8R7>_etb#p$8WV?W`T z-TmE+FmzjaUyr1QAcuFtkB9M__0uZD=T~&)(>Ofa*!S-}1X-2b9S`I0_F??g8bed) z*4FWrv;KGnAFL-EvyLAz**3w^IOTyl^Q8ycO<;yx(lX+GqNESqp~hGu9nD3APPmUv0KxGR;+PuYB_u zx1vbyc5w|G-uYV;KKfyi+7e;7c-x+83s#IJzFAg~FzFI|mk!W9NATAFm!~`D{%zpx zR<3IWdx^rdY@8p%le#sF=La&H9(0wbE|!f457mQ>`pLtP0)$9a$V6cr#PWp&AI!_> zcz9RRSl$5C6!|&y(^qU5w(RC-Z7re9PvwC=)st0y1yw!w;a&2y&h@%ZLclu090Di8 zLp)I-ik~<`RZqcFN8jXAUk;GqprO`PH82RTW;EB44`J`(w6S$O9HyUt;4HTW@~NPt zZ&eSP#GBoD9eQhm81Uzag49V(66#d9tvUZY`D?weeGebQ89gA(Q%e+H@3bbGdWQo> z?u3?5=BN5WEAZ?qG*pEg7$a6sIPkn~g#&0EwR*VKy^P+|wjQ_&en4c^C=8~EOels+ zM3}&kX@hRcfT61A2mM!g)uQK{tOSgz>@Lq8>Qgm3OcM$r6Quy3v*r{Q_MwW_y1oqz zidApp>oh`yfvjdCQh_H}Q%$(E0)mv$PD_yzuh1H`5&`XPlF;yUi`GZbBy7ri4~OL- zequ?+h+>k3t$}nA3I-7F3AsLs5)1M%}t&_j=I{Ar9?L{5_rxAlL{0t zB2^(1g>ew8CtQ1GGsmZ6Z2_-c5BQVOPLD-EnR|)*s1KVOwkS_}WgmVt8&aL))3NzD z1;J#p4)ZIzjl+d`cuTM|PAb#*`` zgRB4{QWY{$7zeR>!ah_5rT+I^v5cM*FqJR>PAMwNoziD#TqNI=CNN1(f11nUN_I(g zPB}x^hpM0yNst^BVDEuJPh36a*iZin+a literal 0 HcmV?d00001 diff 
--git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-5888324890656768 new file mode 100644 index 0000000000000000000000000000000000000000..86126585263977b28dc771444064df04f8961bdf GIT binary patch literal 233677 zcmeI*&2Agl6$fx;6M%~X-t+;yC?Ewb%n>P34ip&3(tUw0gf4>W4T5IT&dP7$7c$TS z>DG_a`JRj6mFA92i5&7LAh9Axwj|Cy|Bv%O=idKgQM~=7c>DXWZ}0Q_>RmSSCTrA2DzWrtJf&2V#JHKCk(oY|Y|NGy0@ykDdxc}|P`|rxnzs{G7{NdvAU#on1 zoxl6Tn@#s$)q8pM8~q3Kcgx^^X+HVqyYiFjFX^HC!Lp>2yt z+7mPO+&P=hCO|396x@829*naLRR*8CxI|k^IzNRu<8%1jv?oTTRcTe)=^lkjtJ12p zB|RAk9e@H*017~9ni0eD05{?03~{sThiR^LZALxBvRSHCsw1rNJ;|jc5l93QfkYq? zB&h_(%BY;|%}!>q2@*v_M)Ny6pP^l>U0elVc}}9a637W*ZbFO@Bg6LY5S!`5zqtl06owFJcOcJuXApsHN96i_{J!pw%q=_YkSQ(X-lY?zxTkA|Vumadns-+!_ zF)^c}@2{fU-(`Q7{ar4kjoU^ltx9W0P~jB~@&_D54k8DUgIEb1o>P$MwF;^8$fnS0 zfC5ke3P1rU1EyVjftm~Dg8@(oF)8dTw=@Fd?AS}p|E=#&B>57PCR$5gYxd#`Pbfvj@^d`MYZ_=CeCcSArMY*@< zrE&Aq=dX!>2RquzH7nO@O(#|*RwY&?R#hVOSl9<{!cDk2Yux<)Ia$C&TD?koyRySm zrEQbDBg(jGJIL&D-Cx>0NOvfXh6TASS+_dXRy&%9R>Z!Cf3h((==NSd$eh@QT< zzPP@)zPP@)zIX=;@%?)wUtoN|O}Gg+Um7=e1HO{alFu#}k`j>;krI&-krL@OOaESe zu*&lXjDKQv@F9TDq35rEBS0x|Xh`YkM`@zjruY>ta3^^G(5K z@kWzLCX;48nG|Fl>eI}?WMDEd8JG-A1||bDnA>fxLNXj#;Nyr`6(J31)!ZYb-+m>H z2&o9^OGili^#(}>hAV0MA6P6#qk^AM1y5WgGIyZzZH75|uO`}ob{rY)Xo928rnBj6 zI-91ki)SDtEm}~$EKAa{042M*-L6+_fC5ke3P1rUr_!5&go}r6&~Q0;t|>@>5AXp# zz#kMw3KDL@P4oagKo8IZ^x$cFz&>T479EJ`6Fg%HSSjJ6^qh(l)m{S=M8ftiGmCLCwsHG;dJ4X=B=$Hl~effA){r$NCe^u5>WzA%-ip&OASu)v_14zNG!{B$L#wnZtxBuX?nj7a6@?ps zmJ(!7rahT8&>CyQX<1%pVtK7seHvNHrghYZnif4k56}bjplP@b%LB`c<;C(UJ0oOu zY*!-#lYz;=+$|5;wi}z9TM?pi-5Hti0DH|NIDuRZfA!ta9hPYPIwTe^P zUi@HwFh7_d%n#mVF5+_$pNse+OR^ie6&cPV#0W7$j1b#Tpf!3IFoi;)P$(1%g+ig+ zZYY#vP$=%{4ZHlh)hE%dc0E)OppR&M(T#6j#3Ium(;(9jjn?Ls!Wr2!sRSuO>ukHW zN!ljq$1kkn7;+3bh8#nVA;*woSc>{I-V8V4CftOZa1(CAO}H8L>FV-0yP>R4pTA;3 zVbc=XL-vq8WDnUx_K-bf@07@%SM&H@&3Rrr+T{^s{7xd{*KBF>YI-$g@d}Wz=Fonj zU03KRvUnW3<0El!6K=vyxGBRPt<6V)o6L{sYctcnrB~Cd`9!n*`!}6KRX23^Sc2y1 z*ws5ks@`r|>Jg{KX>nSd_TZfM!`<2%rR|yAR_EI+neC!_Gs)MJICk`r+R^mF|#|#gLt4&>P z>S|LXJFWzexP0hA*@2i+9H{mAS?50Gbmc2`)!!AGEJPF)%iFD77MJq?fTb=onIP;dux{ud= zypLM~`lkA(`lkA(`lkA(`lkA(QDe+troqhR7*jNuX)tr0ZP)ZBz4>gt*&CGN;ycqM zEJ>E6_PzGK_I*^P7_%~FRd1LdJRWcpZo*Bt2{#9Y@jT<=A^|KK7R|FQnx<12<|EIu zj;>*-d{P;4sdd}&GfMrxcMC1%*rye zjGnZHxh`FO*B`^`J!!q_p{b^iQFqjx7sLxX4KHZD>Yd|v^@}=Xm}kuA;05u5Tqd&1 zH&OTJQ1|G-fD&XWj-@!$p<4-3g1GLypo6;Z?&)Q%vDTuz7Cysg_za&9iqEbJcU3q) zSi86w!PONHVgb71opPH}9&lQm7N^B&aoTA9j2FZU;ss^5>y44StM`IA7njw&Uq?xEuaorD)uIK#m_NKI7eSUQ%DrDLp(3Fop(3F&=Y&dPv8OlbO?s2wq&Ht$Z@P3;rB!KFT9x*t zD{akbKiru;li5o%Nm*VjFP0a}Yp<496`N1nwbONa*fB%`kw7F639lm(4E7l8xkzl4 z5D7%Wd5eVIF{fHGq>)?$#eE{&rp28_tfUX=L;8?Dqz~!S`C;*8)oG@Fw(S-h?-Cb# zP0DkIhM*z#bJ}ZSuSt(M{d@MBz)gEi65UbqoqV50DEaC9^z-)9`N2=@g9j)81)%VQ z`N90)5lx=z2fG5*6{u4zZwiG%p-_wk>~$;vr^RVsg44Q^ChS&l4^F~8a3xLe5yn!s zl#d;N={o?+XDRK8p?oMG%7^lyeCzDM3n*xcG?t zOlyGyl;Z&uz9L_dugF&%DeSM{E4sz$cu>J^4%%4e`%3<^sr&UWQ(^~Vu41lYuG7yA zZHxt-iP<{g7AN69S%6p;pn8O+&=TcUc|DWz>f1c#mYq9vjlLLtG5Yc*Fy3n-^9oiw z7K8;&P+#36>bddUpq0l=E8V(^R-%8#f^v@0hp-)Mgl%Uv0oR)!eOjAlkNquI!@k>lRTi{2Uy zPy)UIH{mASgqv^^Zog6wwXQjpS&RGk+3i-oji9b1lrZFn7czGR0qrrIDYJ=!WQq6{}69m_DmBcD$4z zB`CKXFJBH!D?*GA_nC|8$P7@wMrR@z$NC*jUR-2pmu61bV^JtDo_5+Ys zuQvw0%B(1`q98qU=4?8fR{Knc?js=ME$;|y?^DzqzRhyE@cA@p|8*04=}@sM9HTNXe3V-}0W zp9j{PyGhcZ-LgieKuF_5WRc_~GNnzkd8b DTKo+f literal 0 HcmV?d00001 From c8e4c2e3e44f5e5ee2cfa6cdd206e74075aa5d51 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 14:02:40 -0400 Subject: [PATCH 31/33] 
regex: reject large patterns when fuzzing Otherwise we risk timeouts. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=61484 --- fuzz/fuzz_targets/fuzz_regex_match.rs | 10 ++++++++-- ...minimized-fuzz_regex_match-6659953212129280 | Bin 0 -> 399135 bytes 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index 6c375510d..a5dda53d6 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -54,6 +54,9 @@ re.is_match({haystack:?}); fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); + if case.pattern.len() > (16 * (1 << 10)) { + return Corpus::Reject; + } if case.haystack.len() > (16 * (1 << 10)) { return Corpus::Reject; } @@ -65,8 +68,11 @@ fuzz_target!(|case: FuzzCase| -> Corpus { .ignore_whitespace(case.ignore_whitespace) .unicode(case.unicode) .octal(case.octal) - .size_limit(1<<18) - .build() else { return Corpus::Reject }; + .size_limit(1 << 18) + .build() + else { + return Corpus::Reject; + }; re.is_match(case.haystack); Corpus::Keep }); diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_match-6659953212129280 new file mode 100644 index 0000000000000000000000000000000000000000..b8cdc138a42791040d908484abc77478ca1defec GIT binary patch literal 399135 zcmeI*Ps^tFS;q1ENi$g5B8ZEMB5fgc^ogStUDTkYbnCKEg)o>bvXDv^QlW)OX|OM3 z-hiv_Y)jsX@1WqU+yrmtAV(g)lRV~{oWJ|;H<)ss(<|royFS0;d|H?$-rPL7;s5=3 z{|gWKi+Sv3x7*$Q3+{Q;$L{Le-2?DX%fC-kyPk>bQaetP$K_n*x@q`+OfeSJS)ANH|Qq!45i z2vaEECcZwbTPacqvI)dJq`)S=K5XntkwTD7AWWfroA~;$Zly>e$R-f?kOG_d`mnJp zMG8SSfiQ*gZQ|?0x|Je@Ae%tkLkeu->%+#b6e$GR1i}=`w~4P0>sE>sf@}hD4=J#T zuMZo$Qlt=M69`i%-zL63tXnBk2(k&pJ*2=UzCLX1N|8d4O(0C6e4F_Cux_PDA;=~W z_mBdc`1-K1D@6)HHi0mO@@?Yl!@89sg&>;@s2O54rd|bq^`fyEnw35M&()Q+V+| zcuMJ)mcj)C#oADQUFYDjvx>~JCYEl zaAar?>au+Jic+6e$D|1OjQ@LkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4L zBMD&&M~3#GPAf$UK?H$7TKA9wU~+Q=fdJZ(gfN98LwiuCl_G^8fmE`7Om2=K5I{SU5T5D26_#vVR<`oV8*j`3n&`hoyL;m?U9g18bC zIO1jW)viP-Qd~R;Q+V+o+za+gOK~Mx@AsbI)w_oj=xv!!PzbUPgeg419~J$0&sGXo z0u<*C;vP}}7wqy20s*um31JFHhW4OND@6)H1c5+W_mBc$a&rWM0NRm+Foh#Sdr+s9 zB84D=Kp?GqNC7apIf6g{?MOnH!jYjpsMAW3LJ&b9kk&n<0GQkyK_Gy3Bq2=U$j~0t zX{AUZh#(M1>mE`7Om2=K5I{SU5T5D27o4=Dg9H%AZ%pdCpFQ#dlT z2X$I0QV1dl1k#?^9zJ{e^IzWk_p2IYT%#Zt19D=!=pH^7aKSES!TKTh->G{@fx@M@ zpRW)5SSfS^#peNG3e6J`1rY=SXh#y_9#UY1a%Ya*nbRtO;(U3T!t?!-UWuPjiWGvJ zK8Slrfz!XUp77`kcJu?3;>E(V!DIx4+cGk=2X$I0QV1dl1k$>P6abT(BM1c0jwFOB z92we!I;|8b1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$?jZ%h(97s;r4U3A2%sHFh_RS*cE9Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np(8QOz7 ztrRH)5d;Eh-9rk1$;}Z20%%7P!W51S?LnPZiWGtf0)e#dAqBwX<_H1-v?B>&3P*ADQUFYDjvx>~JCYEl zaAar?>au+Jic+6e$D|1OjQ@LkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4L zBMD&&M~3#GPAf$UK?H$7TKA9wU~+Q=fdJZ(gfN98LwiuCl_G^8ffJ*M z^tMbVC|4=J!hxid%Z z%xM)salX7v;rV_^uf$I%MG8SqAH+SR!0F#vPw0Y`BE|WFFooy)CA|_qp%mwy&VIG{@f!@6#28AH&K$yad|G`sAzqAxC7$^=8;vP}}SK_(~0s*um31JFHhW4OND@6)H z1c5+W_mBc$a&rWM0NRm+Foh#Sdr+s9B84D=Kp?GqNC7apIf6g{?MOnH!jYjpsMAW3 zLJ&b9kk&n<0GQkyK_Gy3Bq2=U$j~0tX{AUZh#(M1>mE`7Om2=K5I{SU5T5D27o4=Dg9H%AZ%pdCpFQ#dlT2X$I0QV1dl1k$>P6abT(BM1c0jwFOB92we! 
zI;|8b1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$7r2N0&O?U&0v!*@ca%Hx zr4>MNzE|%aQs82)wjXl7>Aap#U<$pn-hB!|1c3nBk%YL16j-6$nIm_0TGJkUI^YF< z>1X^uu^IVNB906_t)Q&r^Ghp$!XRw|aSti5iLVbEyHcbOWD^KeDBmW&KCD|QQV6mM z#66_ICcZvw>`IYBkWC;=p?sV8`mk=LNFm535ciM*oA~;$u`5LiK{kOfh4O9U>%+R0 zB84EEK-@zLY~t&~#;z181la__6w0@WuMg{1iWGuu0&x#1u!*k^8@p1Z5M&bwQz+ji zzCNs5DN+cs3B*05z$U&vZ0t&rLXb@$Ord<6`1-JJrAQ&jCJ^_K0-N~yu(2ye3PCo3 zFop7M;_Jh@l_G^8n?T${3T)!*!^W-@DFoRB!W7E4iLVdqR*DpYYyxo)DX@vJ4;#Bu zq!45i2vaEECcZwbTPacqvI)dJq`)S=K5XntkwTD7AWY$P@_pko{uwrV%IgF{hVHh4 z+|`&$hA)pfw>YK-@zLtWfSuLtKg!g2aI-)Wx7c0IdmG z0pcD~V1;sL8sbu<5F`#vp)Ljm0%%Rh3J~{@0xOg|(-4;;g&=WY3Ux6k5I}1}R)DyN z6j-6$nTEI&DFlfFQ>cqUfdE<)vI4|Cq`(U0&NRfONFhiZm_l6)3Ix!akQE^AAq7?_ zccvjOMG8UUz!d6YP#}QTgscE@4=J!hxibxMDN+a$2c}RLg8~7xCS(POdq{y5%AIM5 zOOZm5I535}7!(MgH6bfN+(QbiQ0`1aT#6Ke#DOW)#h^d{tqEBH;vQ09g>q*a;!>m# zBo0iWE(QexXidlp5ciM*E0jCa5SJo_AaP&{bulOqKx;x)fVhVgSfSjRhPV_d1c?Jv zsEa{?09q5W0>nL}zzXHgG{mJyAxIpULR}0B1kjq06(H^*1y(3`rXemx3PIw)6zXD7 zAb{3{tN?KjDX>DhGYxSmQV0?Urcf7y0s*upWCe(ONP!i~ooR?mkwTC-Fon7p6bPU- zAuB-KLkg@=?o2~miWGvxfhp9*pg;hv30VQ+9#UY1a%URiQlt5so2dvf!69@`(p$L=r=--5h%Z1)j8BrkXDknip``qEOQ5ailG z+(QanTLSvME`KRf91?^nJmkCk*}k+ChfhFX?Jbnz@VDR|Qs9V}(O0_?r8xXb^wo#F z1*Y(j?>@>0`_c-a@am5U;vQ1qh?mhxqrtk!RRP^ILTPa)#P@Fr6dq@FXu*)w91kjEogee>u+Jic+6e$D|1OjQ@ zLkfV&%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4LBMD&&M~3#GPAf$UK?H$7 zTKA9wU~+Q=fdJZ(gfN98LwnE*n3iw3fDbw3CzvVqxjrAQ%&AP`9F9#Q~IZjK-jKs%BUrf_6v z59+j1q!2_92&8onDF7xnM-T|09Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np( z8QOz7trRH)5d;Eh-9rk1$;}Z20%%7P!W51S?Lm(+?I>@*K*yt8o-b{DQwXvN#66_I zCcZvw>`IYBkWC;=p?sV8`mk=LNFm535ciM*oA~;@s2O54rd|bq^`fyEnw35M&()Q+V+|cuMJ)mcj)C#oADQUFYDjvx>~JCYElaAar?>au+Jic+6e$D|1OjQ@LkfV& z%@G6wXh#yl6pjq-L7i5L6oLo>fwb-+1;FIy2m%4LBMD&&M~3#GPAf$UK?H$7TKA9w zU~+Q=fdJZ(gfN98LwnG3nf8s(_-Fg0krog#66_ICcZv=zp{^20EKP=VG4CIC=ftvLRNsdhZI<$ z+?j^B6e$FW15>DrL4g2T6S4xtJ*2=2<<2z3rAQ%29GF5~3yaVb&=5(lPG7lQ%;v?gQ)hA)pfw>YK-@zLtWfSuLtKg!g2aI-)Wx7c0IdmG0pcD~V1;sL8sbu<5F`#v zp)Ljm0%%Rh3J~{@0xOg|(-4;;g&=WY3Ux6k5I}1}R)DyN6j-6$nTEI&DFlfFQ>cqU zfdE<)vI4|Cq`(U0&NRfONFhiZm_l6)3Ix!akQE^AAq7?_ccvjOMG8UUz!d6YP#}QT zgscE@4=J!hxibxMDN+a$2c}RLg8~7xCS(POdq{y5%AIM5OOZm5I535}7!(MgH6bfN z+(QbiQ0`1aT#6Ke#DOW)#h^d{tqEBH;vQ09g>q*a;!>m#Bo0iWE(QexXidlp5ciM* zE0jCa5SJo_AaP&{bulOqKx;x)fVhVgSfSjRhPV_d1c?JvsEa{?09q5W0>nL}zzXHg zG{mJyAxIpULR}0B1kjq06(H^*1y(3`rXemx3PIw)6zXD7Ab{3{tN?KjDX>DhGYxSm zQV0?Urcf7y0s*upWCe(ONP!i~ooR?mkzxbLv!@^Y>fy(?^~!HQ{P20b|J7gLeEh3- ziVcH1^v++-WA2{Ze4fYl2l25xjKjAe?;YEHL=VZ!9XsT^yN$lI6e$F`HW2ra0@s#+ zey_`4iWG+gVG0lV?tZo}Eydvz&{um4r8xX8xQ7%t;$`&Ju0$ygzY=}*A#Z^xJmkBN z^1;5e0w}!tBZ9bx6gc8#^wq9JDNmE`7Om2=K5I{SU5T z5D27o4=Dg9H%AZ%pdCpFQ#dlT2X$I0QV1dl1k$>P6abT(BM1c0jwFOB92we!I;|8b z1Q7%RY28B#fXU4f1OjMB62cUY4DCUkR*DpY2m*n$?jZ%hqxjrAQ%&AP`9F9#Q~IZjK-jKs%BUrf_6v z59+j1q!2_92&8onDF7xnM-T|09Z3jNI5M;cby_J>2qFjs(z=Hf0F#>|2n5iMB!np( z8QOz7trRH)5d;Eh-9rk1$;}Z20%%7P!W51S?LnPZiWGtf0)e#dAqBwX<_H1-v?fFl z2%t3~fDl~JM(8}It{lbh#(L^YeE2mRv$qifYyYZCdfZe{DDtP>y3j& z5J4b-)`SQG0kkGW5D1_(A%Z{vtqBnX0%%Q$AP_)nLIi;TS`%`rApe7d?OQJkCV~h8 z0kkGW5D1_(AukWId;EvH-R|o@-2L(HOZ#H~$M@d*(HoC{^soQ)#>4;Tf8Mx%sq!x` zFYnDK0W(MjDFPQ`U=%lCfz?%#j;@i%{b^VY-NpM2$OyMOzK#}D?u3G?r}x4-nYCvUy}r*FQq`?Gg; z55N7|n~xsv-gxvo|MLCU9z6N(`?r6-|MSuJ-ul7Me)jmiJAeG{?)P@j+vmTP^LKx- z+r7T8zWk57-A{IpU;Xy(o6mp!lYiL{`K$e(kN)<%fBl`mdH=s}-@8kZdIiXfZ^5tl zQb8bq)`VOI$p7x}nfvyOT>URT`shDC`sf!A{H@1N1R(rHcroe=Ue?7-yYV;s&*eA! 
eU)$Z@zJ7aq`=9@{|Cjvn`t94dx9|Mm5B?9F4sLb; literal 0 HcmV?d00001 From 24d08d5899920f42858241d9de96ecf43b0a99ef Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 14:55:59 -0400 Subject: [PATCH 32/33] automata: improve sparse DFA validation This rejiggers some code so that we can more reliably check whether start state IDs are valid or not. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62726 --- ...ta_deserialize_sparse_dfa-4903112680538112 | Bin 0 -> 953 bytes regex-automata/src/dfa/sparse.rs | 159 +++++++++--------- 2 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_automata_deserialize_sparse_dfa-4903112680538112 new file mode 100644 index 0000000000000000000000000000000000000000..3056bca2f335559837ff22c307040e7d200693b5 GIT binary patch literal 953 zcmcgrzb^z)5dPkCdx%D_SrUavRGOo46+(5LlBjgl;_fdHC6V|8G}v1iCd3X&#o?ZHHI$Q{9vE`2)E1aXc z4dV*ibs}S9a$sF5@Ts&8Nm;m!h_LYvQgx@O&0-~*MqvXc*1mdu8%`|_4(ibjUcp(+ zUaNf^t&($zmCZ!j&`hg&V+sEZeFMdJm&UDZ#tk}9nN`%b_4cnaUQ`}Jy`sc(1@{;A cVu5+$3H6G4Prat{2Q-<>jd&&eHq)$ literal 0 HcmV?d00001 diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index 7862d48a2..38096d994 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -992,8 +992,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.special, &dfa.tt)?; + let seen = dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &seen)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1388,63 +1388,8 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { - // In order to validate everything, we not only need to make sure we - // can decode every state, but that every transition in every state - // points to a valid state. There are many duplicative transitions, so - // we record state IDs that we've verified so that we don't redo the - // decoding work. - // - // Except, when in no_std mode, we don't have dynamic memory allocation - // available to us, so we skip this optimization. It's not clear - // whether doing something more clever is worth it just yet. If you're - // profiling this code and need it to run faster, please file an issue. - // - // OK, so we also use this to record the set of valid state IDs. Since - // it is possible for a transition to point to an invalid state ID that - // still (somehow) deserializes to a valid state. So we need to make - // sure our transitions are limited to actually correct state IDs. - // The problem is, I'm not sure how to do this verification step in - // no-std no-alloc mode. I think we'd *have* to store the set of valid - // state IDs in the DFA itself. For now, we don't do this verification - // in no-std no-alloc mode. The worst thing that can happen is an - // incorrect result. But no panics or memory safety problems should - // result. 
Because we still do validate that the state itself is - // "valid" in the sense that everything it points to actually exists. - // - // ---AG - struct Seen { - #[cfg(feature = "alloc")] - set: alloc::collections::BTreeSet, - #[cfg(not(feature = "alloc"))] - set: core::marker::PhantomData, - } - - #[cfg(feature = "alloc")] - impl Seen { - fn new() -> Seen { - Seen { set: alloc::collections::BTreeSet::new() } - } - fn insert(&mut self, id: StateID) { - self.set.insert(id); - } - fn contains(&self, id: &StateID) -> bool { - self.set.contains(id) - } - } - - #[cfg(not(feature = "alloc"))] - impl Seen { - fn new() -> Seen { - Seen { set: core::marker::PhantomData } - } - fn insert(&mut self, _id: StateID) {} - fn contains(&self, _id: &StateID) -> bool { - false - } - } - - let mut verified: Seen = Seen::new(); + fn validate(&self, sp: &Special) -> Result { + let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. @@ -1521,7 +1466,7 @@ impl> Transitions { "mismatching sparse state length", )); } - Ok(()) + Ok(verified) } /// Converts these transitions to a borrowed value. @@ -1659,7 +1604,7 @@ impl> Transitions { let state = &state[nr..]; if npats == 0 { return Err(DeserializeError::generic( - "state marked as a match, but has no pattern IDs", + "state marked as a match, but pattern length is zero", )); } @@ -1681,6 +1626,21 @@ impl> Transitions { } else { (&[][..], state) }; + if is_match && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked special as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) != is_match { + return Err(DeserializeError::generic( + "whether state is a match or not is inconsistent", + )); + } // Now read this state's accelerator info. The first byte is the length // of the accelerator, which is typically 0 (for no acceleration) but @@ -2061,28 +2021,19 @@ impl> StartTable { fn validate( &self, sp: &Special, - trans: &Transitions, + seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { + if !seen.contains(&id) { + return Err(DeserializeError::generic( + "found invalid start state ID", + )); + } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } - // Confirm that the start state points to a valid state. - let state = trans.try_state(sp, id)?; - // And like for the transition table, confirm that the transitions - // on all start states themselves point to a valid state. - // - // It'd probably be better to integrate this validation with the - // transition table, or otherwise store a sorted sequence of all - // valid state IDs in the sparse DFA itself. That way, we could - // check that every pointer to a state corresponds precisely to a - // correct and valid state. - for i in 0..state.ntrans { - let to = state.next_at(i); - let _ = trans.try_state(sp, to)?; - } } Ok(()) } @@ -2537,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +// In order to validate everything, we not only need to make sure we +// can decode every state, but that every transition in every state +// points to a valid state. There are many duplicative transitions, so +// we record state IDs that we've verified so that we don't redo the +// decoding work. 
+// +// Except, when in no_std mode, we don't have dynamic memory allocation +// available to us, so we skip this optimization. It's not clear +// whether doing something more clever is worth it just yet. If you're +// profiling this code and need it to run faster, please file an issue. +// +// OK, so we also use this to record the set of valid state IDs. Since +// it is possible for a transition to point to an invalid state ID that +// still (somehow) deserializes to a valid state. So we need to make +// sure our transitions are limited to actually correct state IDs. +// The problem is, I'm not sure how to do this verification step in +// no-std no-alloc mode. I think we'd *have* to store the set of valid +// state IDs in the DFA itself. For now, we don't do this verification +// in no-std no-alloc mode. The worst thing that can happen is an +// incorrect result. But no panics or memory safety problems should +// result. Because we still do validate that the state itself is +// "valid" in the sense that everything it points to actually exists. +// +// ---AG +#[derive(Debug)] +struct Seen { + #[cfg(feature = "alloc")] + set: alloc::collections::BTreeSet, + #[cfg(not(feature = "alloc"))] + set: core::marker::PhantomData, +} + +#[cfg(feature = "alloc")] +impl Seen { + fn new() -> Seen { + Seen { set: alloc::collections::BTreeSet::new() } + } + fn insert(&mut self, id: StateID) { + self.set.insert(id); + } + fn contains(&self, id: &StateID) -> bool { + self.set.contains(id) + } +} + +#[cfg(not(feature = "alloc"))] +impl Seen { + fn new() -> Seen { + Seen { set: core::marker::PhantomData } + } + fn insert(&mut self, _id: StateID) {} + fn contains(&self, _id: &StateID) -> bool { + false + } +} + /* /// A binary search routine specialized specifically to a sparse DFA state's /// transitions. Specifically, the transitions are defined as a set of pairs From 109c8c450bb860f6118e2c631785cf1829fbe1c7 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 15:12:22 -0400 Subject: [PATCH 33/33] fuzz: add regression test for AST roundtripping I couldn't get this to reproduce. Maybe some of my recent changes to regex-syntax fixed this? Not sure. I'm not a huge fan of this fuzzer in general because it isn't really testing a rock solid guarantee that we provide. And the positions are tough to deal with. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62382 --- ...tcase-minimized-ast_roundtrip-5633607856947200 | Bin 0 -> 491 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 b/fuzz/regressions/clusterfuzz-testcase-minimized-ast_roundtrip-5633607856947200 new file mode 100644 index 0000000000000000000000000000000000000000..726609cf21cf2933ef0fa2a49301f10f675f091a GIT binary patch literal 491 zcma)0!4ZHk2=nO}S-}n1%X1I6tpU6uwYBnSNdhE5i9(c-gjGZd5jX?*