Skip to content

Commit c3ef1de

Browse files
committed
regex-automata: fix bug in DFA quit behavior
It turns out that the way we were dealing with quit states in the DFA was not quite right. Basically, if we entered a quit state and a match had been found, then we were returning the match instead of the error. But the match might not be the correct leftmost-first match, and so, we really shouldn't return it. Otherwise a regex like '\B.*' could match much less than it should. This was caught by a differential fuzzer developed in #848.
1 parent 99f18c4 commit c3ef1de

File tree

6 files changed

+40
-34
lines changed

6 files changed

+40
-34
lines changed

Diff for: regex-automata/src/dfa/dense.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ impl Config {
562562
///
563563
/// // The match occurs before the search ever observes the snowman
564564
/// // character, so no error occurs.
565-
/// let haystack = "foo 123 ☃".as_bytes();
565+
/// let haystack = "foo 123 ☃".as_bytes();
566566
/// let expected = Some(HalfMatch::must(0, 7));
567567
/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
568568
/// assert_eq!(expected, got);
@@ -572,8 +572,8 @@ impl Config {
572572
/// // routines read one byte past the end of the search to account for
573573
/// // look-around, and indeed, this is required here to determine whether
574574
/// // the trailing \b matches.
575-
/// let haystack = "foo 123☃".as_bytes();
576-
/// let expected = MatchError::quit(0xE2, 7);
575+
/// let haystack = "foo 123 ☃".as_bytes();
576+
/// let expected = MatchError::quit(0xE2, 8);
577577
/// let got = dfa.try_search_fwd(&Input::new(haystack));
578578
/// assert_eq!(Err(expected), got);
579579
///

Diff for: regex-automata/src/dfa/search.rs

-12
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,6 @@ fn find_fwd_imp<A: Automaton + ?Sized>(
180180
// actually be tripped even if DFA::from_bytes succeeds and
181181
// returns a supposedly valid DFA.
182182
debug_assert!(dfa.is_quit_state(sid));
183-
if mat.is_some() {
184-
return Ok(mat);
185-
}
186183
return Err(MatchError::quit(input.haystack()[at], at));
187184
}
188185
}
@@ -307,9 +304,6 @@ fn find_rev_imp<A: Automaton + ?Sized>(
307304
return Ok(mat);
308305
} else {
309306
debug_assert!(dfa.is_quit_state(sid));
310-
if mat.is_some() {
311-
return Ok(mat);
312-
}
313307
return Err(MatchError::quit(input.haystack()[at], at));
314308
}
315309
}
@@ -611,9 +605,6 @@ fn eoi_fwd<A: Automaton + ?Sized>(
611605
let pattern = dfa.match_pattern(*sid, 0);
612606
*mat = Some(HalfMatch::new(pattern, sp.end));
613607
} else if dfa.is_quit_state(*sid) {
614-
if mat.is_some() {
615-
return Ok(());
616-
}
617608
return Err(MatchError::quit(b, sp.end));
618609
}
619610
}
@@ -646,9 +637,6 @@ fn eoi_rev<A: Automaton + ?Sized>(
646637
let pattern = dfa.match_pattern(*sid, 0);
647638
*mat = Some(HalfMatch::new(pattern, sp.start));
648639
} else if dfa.is_quit_state(*sid) {
649-
if mat.is_some() {
650-
return Ok(());
651-
}
652640
return Err(MatchError::quit(byte, sp.start - 1));
653641
}
654642
} else {

Diff for: regex-automata/src/hybrid/dfa.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -3226,7 +3226,7 @@ impl Config {
32263226
///
32273227
/// // The match occurs before the search ever observes the snowman
32283228
/// // character, so no error occurs.
3229-
/// let haystack = "foo 123 ☃";
3229+
/// let haystack = "foo 123 ☃";
32303230
/// let expected = Some(HalfMatch::must(0, 7));
32313231
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
32323232
/// assert_eq!(expected, got);
@@ -3236,8 +3236,8 @@ impl Config {
32363236
/// // routines read one byte past the end of the search to account for
32373237
/// // look-around, and indeed, this is required here to determine whether
32383238
/// // the trailing \b matches.
3239-
/// let haystack = "foo 123☃";
3240-
/// let expected = MatchError::quit(0xE2, 7);
3239+
/// let haystack = "foo 123 ☃";
3240+
/// let expected = MatchError::quit(0xE2, 8);
32413241
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
32423242
/// assert_eq!(Err(expected), got);
32433243
///

Diff for: regex-automata/src/hybrid/search.rs

-12
Original file line numberDiff line numberDiff line change
@@ -282,9 +282,6 @@ fn find_fwd_imp(
282282
return Ok(mat);
283283
} else if sid.is_quit() {
284284
cache.search_finish(at);
285-
if mat.is_some() {
286-
return Ok(mat);
287-
}
288285
return Err(MatchError::quit(input.haystack()[at], at));
289286
} else {
290287
debug_assert!(sid.is_unknown());
@@ -432,9 +429,6 @@ fn find_rev_imp(
432429
return Ok(mat);
433430
} else if sid.is_quit() {
434431
cache.search_finish(at);
435-
if mat.is_some() {
436-
return Ok(mat);
437-
}
438432
return Err(MatchError::quit(input.haystack()[at], at));
439433
} else {
440434
debug_assert!(sid.is_unknown());
@@ -730,9 +724,6 @@ fn eoi_fwd(
730724
let pattern = dfa.match_pattern(cache, *sid, 0);
731725
*mat = Some(HalfMatch::new(pattern, sp.end));
732726
} else if sid.is_quit() {
733-
if mat.is_some() {
734-
return Ok(());
735-
}
736727
return Err(MatchError::quit(b, sp.end));
737728
}
738729
}
@@ -770,9 +761,6 @@ fn eoi_rev(
770761
let pattern = dfa.match_pattern(cache, *sid, 0);
771762
*mat = Some(HalfMatch::new(pattern, sp.start));
772763
} else if sid.is_quit() {
773-
if mat.is_some() {
774-
return Ok(());
775-
}
776764
return Err(MatchError::quit(byte, sp.start - 1));
777765
}
778766
} else {

Diff for: regex-automata/src/meta/strategy.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -636,7 +636,7 @@ impl Strategy for Core {
636636
// We manually inline try_search_mayfail here because letting the
637637
// compiler do it seems to produce pretty crappy codegen.
638638
return if let Some(e) = self.dfa.get(input) {
639-
trace!("using full DFA for search at {:?}", input.get_span());
639+
trace!("using full DFA for full search at {:?}", input.get_span());
640640
match e.try_search(input) {
641641
Ok(x) => x,
642642
Err(_err) => {
@@ -645,7 +645,7 @@ impl Strategy for Core {
645645
}
646646
}
647647
} else if let Some(e) = self.hybrid.get(input) {
648-
trace!("using lazy DFA for search at {:?}", input.get_span());
648+
trace!("using lazy DFA for full search at {:?}", input.get_span());
649649
match e.try_search(&mut cache.hybrid, input) {
650650
Ok(x) => x,
651651
Err(_err) => {
@@ -668,7 +668,7 @@ impl Strategy for Core {
668668
// can use a single forward scan without needing to run the reverse
669669
// DFA.
670670
return if let Some(e) = self.dfa.get(input) {
671-
trace!("using full DFA for search at {:?}", input.get_span());
671+
trace!("using full DFA for half search at {:?}", input.get_span());
672672
match e.try_search_half_fwd(input) {
673673
Ok(x) => x,
674674
Err(_err) => {
@@ -677,7 +677,7 @@ impl Strategy for Core {
677677
}
678678
}
679679
} else if let Some(e) = self.hybrid.get(input) {
680-
trace!("using lazy DFA for search at {:?}", input.get_span());
680+
trace!("using lazy DFA for half search at {:?}", input.get_span());
681681
match e.try_search_half_fwd(&mut cache.hybrid, input) {
682682
Ok(x) => x,
683683
Err(_err) => {

Diff for: testdata/regression.toml

+30
Original file line numberDiff line numberDiff line change
@@ -689,3 +689,33 @@ name = "captures-wrong-order"
689689
regex = '(a){0}(a)'
690690
haystack = 'a'
691691
matches = [[[0, 1], [], [0, 1]]]
692+
693+
# This tests a bug in how quit states are handled in the DFA. At some point
694+
# during development, the DFAs were tweaked slightly such that if they hit
695+
# a quit state (which means, they hit a byte that the caller configured should
696+
# stop the search), then it might not return an error necessarily. Namely, if a
697+
# match had already been found, then it would be returned instead of an error.
698+
#
699+
# But this is actually wrong! Why? Because even though a match had been found,
700+
# it wouldn't be fully correct to return it once a quit state has been seen
701+
# because you can't determine whether the match offset returned is the correct
702+
# greedy/leftmost-first match. Since you can't complete the search as requested
703+
# by the caller, the DFA should just stop and return an error.
704+
#
705+
# Interestingly, this does seem to produce an unavoidable difference between
706+
# 'try_is_match().unwrap()' and 'try_find().unwrap().is_some()' for the DFAs.
707+
# The former will stop immediately once a match is known to occur and return
708+
# 'Ok(true)', where as the latter could find the match but quit with an
709+
# 'Err(..)' first.
710+
#
711+
# Thankfully, I believe this inconsistency between 'is_match()' and 'find()'
712+
# cannot be observed in the higher level meta regex API because it specifically
713+
# will try another engine that won't fail in the case of a DFA failing.
714+
#
715+
# This regression happened in the regex crate rewrite, but before anything got
716+
# released.
717+
[[test]]
718+
name = "negated-unicode-word-boundary-dfa-fail"
719+
regex = '\B.*'
720+
haystack = "!\u02D7"
721+
matches = [[0, 3]]

0 commit comments

Comments
 (0)