Skip to content

Commit 3563d73

Browse files
hikotq authored and BurntSushi committed
exec: fix a bug in capture with match
When performing "EndText" matching, the engine must check whether the current position equals the length of the input text. However, when capturing submatches using the match result of the DFA, "EndText" matching was not performed correctly because the input text was sliced, so the end of the slice could be mistaken for the end of the text. With this patch, the match end position is passed via the "end" argument instead of slicing the input when capturing with the match result of the DFA. Fixes #557, Closes #561
1 parent 6152de4 commit 3563d73

File tree

5 files changed

+76
-46
lines changed

5 files changed

+76
-46
lines changed

Diff for: CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ Performance improvements:
88
* [OPT #566](https://github.com/rust-lang/regex/pull/566):
99
Upgrades `aho-corasick` to 0.7 and uses it for `foo|bar|...|quux` regexes.
1010

11+
Bug fixes:
12+
13+
* [BUG #557](https://github.com/rust-lang/regex/issues/557):
14+
Fix a bug where captures could lead to an incorrect match.
15+
1116

1217
1.1.2 (2019-02-27)
1318
==================

Diff for: src/backtrack.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
9898
slots: &'s mut [Slot],
9999
input: I,
100100
start: usize,
101+
end: usize,
101102
) -> bool {
102103
let mut cache = cache.borrow_mut();
103104
let cache = &mut cache.backtrack;
@@ -109,7 +110,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
109110
slots: slots,
110111
m: cache,
111112
};
112-
b.exec_(start)
113+
b.exec_(start, end)
113114
}
114115

115116
/// Clears the cache such that the backtracking engine can be executed
@@ -147,7 +148,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
147148

148149
/// Start backtracking at the given position in the input, but also look
149150
/// for literal prefixes.
150-
fn exec_(&mut self, mut at: InputAt) -> bool {
151+
fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
151152
self.clear();
152153
// If this is an anchored regex at the beginning of the input, then
153154
// we're either already done or we only need to try backtracking once.
@@ -170,7 +171,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
170171
if matched && self.prog.matches.len() == 1 {
171172
return true;
172173
}
173-
if at.is_end() {
174+
if at.pos() == end {
174175
break;
175176
}
176177
at = self.input.at(at.next_pos());

Diff for: src/exec.rs

+57-41
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
use std::cell::RefCell;
1212
use std::collections::HashMap;
13-
use std::cmp;
1413
use std::sync::Arc;
1514

1615
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
@@ -589,7 +588,8 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
589588
match self.ro.match_type {
590589
MatchType::Literal(ty) => {
591590
self.find_literals(ty, text, start).and_then(|(s, e)| {
592-
self.captures_nfa_with_match(slots, text, s, e)
591+
self.captures_nfa_type(
592+
MatchNfaType::Auto, slots, text, s, e)
593593
})
594594
}
595595
MatchType::Dfa => {
@@ -598,17 +598,21 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
598598
} else {
599599
match self.find_dfa_forward(text, start) {
600600
dfa::Result::Match((s, e)) => {
601-
self.captures_nfa_with_match(slots, text, s, e)
601+
self.captures_nfa_type(
602+
MatchNfaType::Auto, slots, text, s, e)
602603
}
603604
dfa::Result::NoMatch(_) => None,
604-
dfa::Result::Quit => self.captures_nfa(slots, text, start),
605+
dfa::Result::Quit => {
606+
self.captures_nfa(slots, text, start)
607+
}
605608
}
606609
}
607610
}
608611
MatchType::DfaAnchoredReverse => {
609612
match self.find_dfa_anchored_reverse(text, start) {
610613
dfa::Result::Match((s, e)) => {
611-
self.captures_nfa_with_match(slots, text, s, e)
614+
self.captures_nfa_type(
615+
MatchNfaType::Auto, slots, text, s, e)
612616
}
613617
dfa::Result::NoMatch(_) => None,
614618
dfa::Result::Quit => self.captures_nfa(slots, text, start),
@@ -617,14 +621,15 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
617621
MatchType::DfaSuffix => {
618622
match self.find_dfa_reverse_suffix(text, start) {
619623
dfa::Result::Match((s, e)) => {
620-
self.captures_nfa_with_match(slots, text, s, e)
624+
self.captures_nfa_type(
625+
MatchNfaType::Auto, slots, text, s, e)
621626
}
622627
dfa::Result::NoMatch(_) => None,
623628
dfa::Result::Quit => self.captures_nfa(slots, text, start),
624629
}
625630
}
626631
MatchType::Nfa(ty) => {
627-
self.captures_nfa_type(ty, slots, text, start)
632+
self.captures_nfa_type(ty, slots, text, start, text.len())
628633
}
629634
MatchType::Nothing => None,
630635
MatchType::DfaMany => {
@@ -867,7 +872,7 @@ impl<'c> ExecNoSync<'c> {
867872
text: &[u8],
868873
start: usize,
869874
) -> bool {
870-
self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
875+
self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len())
871876
}
872877

873878
/// Finds the shortest match using an NFA.
@@ -883,7 +888,15 @@ impl<'c> ExecNoSync<'c> {
883888
start: usize,
884889
) -> Option<usize> {
885890
let mut slots = [None, None];
886-
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
891+
if self.exec_nfa(
892+
ty,
893+
&mut [false],
894+
&mut slots,
895+
true,
896+
text,
897+
start,
898+
text.len()
899+
) {
887900
slots[1]
888901
} else {
889902
None
@@ -898,7 +911,15 @@ impl<'c> ExecNoSync<'c> {
898911
start: usize,
899912
) -> Option<(usize, usize)> {
900913
let mut slots = [None, None];
901-
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
914+
if self.exec_nfa(
915+
ty,
916+
&mut [false],
917+
&mut slots,
918+
false,
919+
text,
920+
start,
921+
text.len()
922+
) {
902923
match (slots[0], slots[1]) {
903924
(Some(s), Some(e)) => Some((s, e)),
904925
_ => None,
@@ -908,26 +929,6 @@ impl<'c> ExecNoSync<'c> {
908929
}
909930
}
910931

911-
/// Like find_nfa, but fills in captures and restricts the search space
912-
/// using previously found match information.
913-
///
914-
/// `slots` should have length equal to `2 * nfa.captures.len()`.
915-
fn captures_nfa_with_match(
916-
&self,
917-
slots: &mut [Slot],
918-
text: &[u8],
919-
match_start: usize,
920-
match_end: usize,
921-
) -> Option<(usize, usize)> {
922-
// We can't use match_end directly, because we may need to examine one
923-
// "character" after the end of a match for lookahead operators. We
924-
// need to move two characters beyond the end, since some look-around
925-
// operations may falsely assume a premature end of text otherwise.
926-
let e = cmp::min(
927-
next_utf8(text, next_utf8(text, match_end)), text.len());
928-
self.captures_nfa(slots, &text[..e], match_start)
929-
}
930-
931932
/// Like find_nfa, but fills in captures.
932933
///
933934
/// `slots` should have length equal to `2 * nfa.captures.len()`.
@@ -937,7 +938,8 @@ impl<'c> ExecNoSync<'c> {
937938
text: &[u8],
938939
start: usize,
939940
) -> Option<(usize, usize)> {
940-
self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
941+
self.captures_nfa_type(
942+
MatchNfaType::Auto, slots, text, start, text.len())
941943
}
942944

943945
/// Like captures_nfa, but allows specification of type of NFA engine.
@@ -947,8 +949,9 @@ impl<'c> ExecNoSync<'c> {
947949
slots: &mut [Slot],
948950
text: &[u8],
949951
start: usize,
952+
end: usize,
950953
) -> Option<(usize, usize)> {
951-
if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
954+
if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) {
952955
match (slots[0], slots[1]) {
953956
(Some(s), Some(e)) => Some((s, e)),
954957
_ => None,
@@ -966,6 +969,7 @@ impl<'c> ExecNoSync<'c> {
966969
quit_after_match: bool,
967970
text: &[u8],
968971
start: usize,
972+
end: usize,
969973
) -> bool {
970974
use self::MatchNfaType::*;
971975
if let Auto = ty {
@@ -977,10 +981,10 @@ impl<'c> ExecNoSync<'c> {
977981
}
978982
match ty {
979983
Auto => unreachable!(),
980-
Backtrack => self.exec_backtrack(matches, slots, text, start),
984+
Backtrack => self.exec_backtrack(matches, slots, text, start, end),
981985
PikeVM => {
982986
self.exec_pikevm(
983-
matches, slots, quit_after_match, text, start)
987+
matches, slots, quit_after_match, text, start, end)
984988
}
985989
}
986990
}
@@ -993,6 +997,7 @@ impl<'c> ExecNoSync<'c> {
993997
quit_after_match: bool,
994998
text: &[u8],
995999
start: usize,
1000+
end: usize,
9961001
) -> bool {
9971002
if self.ro.nfa.uses_bytes() {
9981003
pikevm::Fsm::exec(
@@ -1002,7 +1007,8 @@ impl<'c> ExecNoSync<'c> {
10021007
slots,
10031008
quit_after_match,
10041009
ByteInput::new(text, self.ro.nfa.only_utf8),
1005-
start)
1010+
start,
1011+
end)
10061012
} else {
10071013
pikevm::Fsm::exec(
10081014
&self.ro.nfa,
@@ -1011,7 +1017,8 @@ impl<'c> ExecNoSync<'c> {
10111017
slots,
10121018
quit_after_match,
10131019
CharInput::new(text),
1014-
start)
1020+
start,
1021+
end)
10151022
}
10161023
}
10171024

@@ -1022,6 +1029,7 @@ impl<'c> ExecNoSync<'c> {
10221029
slots: &mut [Slot],
10231030
text: &[u8],
10241031
start: usize,
1032+
end: usize,
10251033
) -> bool {
10261034
if self.ro.nfa.uses_bytes() {
10271035
backtrack::Bounded::exec(
@@ -1030,15 +1038,17 @@ impl<'c> ExecNoSync<'c> {
10301038
matches,
10311039
slots,
10321040
ByteInput::new(text, self.ro.nfa.only_utf8),
1033-
start)
1041+
start,
1042+
end)
10341043
} else {
10351044
backtrack::Bounded::exec(
10361045
&self.ro.nfa,
10371046
self.cache,
10381047
matches,
10391048
slots,
10401049
CharInput::new(text),
1041-
start)
1050+
start,
1051+
end)
10421052
}
10431053
}
10441054

@@ -1082,11 +1092,15 @@ impl<'c> ExecNoSync<'c> {
10821092
&mut [],
10831093
false,
10841094
text,
1085-
start)
1095+
start,
1096+
text.len())
10861097
}
10871098
}
10881099
}
1089-
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1100+
Nfa(ty) => {
1101+
self.exec_nfa(
1102+
ty, matches, &mut [], false, text, start, text.len())
1103+
}
10901104
Nothing => false,
10911105
}
10921106
}
@@ -1118,7 +1132,9 @@ impl Exec {
11181132
/// Get a searcher that isn't Sync.
11191133
#[inline(always)] // reduces constant overhead
11201134
pub fn searcher(&self) -> ExecNoSync {
1121-
let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
1135+
let create = || {
1136+
Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)))
1137+
};
11221138
ExecNoSync {
11231139
ro: &self.ro, // a clone is too expensive here! (and not needed)
11241140
cache: self.cache.get_or(create),

Diff for: src/pikevm.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ impl<'r, I: Input> Fsm<'r, I> {
107107
quit_after_match: bool,
108108
input: I,
109109
start: usize,
110+
end: usize,
110111
) -> bool {
111112
let mut cache = cache.borrow_mut();
112113
let cache = &mut cache.pikevm;
@@ -124,6 +125,7 @@ impl<'r, I: Input> Fsm<'r, I> {
124125
slots,
125126
quit_after_match,
126127
at,
128+
end,
127129
)
128130
}
129131

@@ -135,6 +137,7 @@ impl<'r, I: Input> Fsm<'r, I> {
135137
slots: &mut [Slot],
136138
quit_after_match: bool,
137139
mut at: InputAt,
140+
end: usize,
138141
) -> bool {
139142
let mut matched = false;
140143
let mut all_matched = false;
@@ -212,7 +215,7 @@ impl<'r, I: Input> Fsm<'r, I> {
212215
}
213216
}
214217
}
215-
if at.is_end() {
218+
if at.pos() == end {
216219
break;
217220
}
218221
at = at_next;

Diff for: tests/regression.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,13 @@ ismatch!(reverse_suffix2, r"\d\d\d000", "153.230000\n", true);
8888
matiter!(reverse_suffix3, r"\d\d\d000", "153.230000\n", (4, 10));
8989

9090
// See: https://github.com/rust-lang/regex/issues/334
91-
mat!(captures_after_dfa_premature_end, r"a(b*(X|$))?", "abcbX",
91+
// See: https://github.com/rust-lang/regex/issues/557
92+
mat!(captures_after_dfa_premature_end1, r"a(b*(X|$))?", "abcbX",
9293
Some((0, 1)), None, None);
94+
mat!(captures_after_dfa_premature_end2, r"a(bc*(X|$))?", "abcbX",
95+
Some((0, 1)), None, None);
96+
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz",
97+
Some((0, 0)));
9398

9499
// See: https://github.com/rust-lang/regex/issues/437
95100
ismatch!(

0 commit comments

Comments
 (0)