Skip to content

Commit f003d72

Browse files
committed
impl: fix prefix literal matching bug
This commit fixes a bug where it was possible to report a match where none existed. Basically, in the current regex crate, it just cannot deal with a mixture of look-around assertions in the prefix of a pattern and prefix literal optimizations. Before 1.8, this was handled by simply refusing to extract literals in that case. But in 1.8, with a rewrite of the literal extractor, literals are now extracted for patterns like this: (?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_)) So in 1.8, since it was still using the old engines that can't deal with this, I added some extra logic to throw away any extracted prefix literals if a look-around assertion occurred in the prefix of the pattern. The problem is that the logic I used was "always occurs in the prefix of the pattern" instead of "may occur in the prefix of the pattern." In the pattern above, it's the latter case. So it slipped by and the regex engine tried to use the prefix literals to accelerate the search. This in turn caused mishandling of the `\b` and led to a false positive match. The specific reason why the current regex engines can't deal with this is because they weren't designed to handle searches that took the surrounding context into account when resolving look-around assertions. It was a pretty big oversight on my part many years ago. The new engines we'll be migrating to Real Soon Now don't have this problem and can deal with the prefix literal optimizations while correctly handling look-around assertions in the prefix. Fixes #981
1 parent 93316a3 commit f003d72

File tree

4 files changed

+86
-9
lines changed

4 files changed

+86
-9
lines changed

regex-syntax/src/hir/mod.rs

+53-3
Original file line numberDiff line numberDiff line change
@@ -1854,6 +1854,8 @@ struct PropertiesI {
18541854
look_set: LookSet,
18551855
look_set_prefix: LookSet,
18561856
look_set_suffix: LookSet,
1857+
look_set_prefix_any: LookSet,
1858+
look_set_suffix_any: LookSet,
18571859
utf8: bool,
18581860
explicit_captures_len: usize,
18591861
static_explicit_captures_len: Option<usize>,
@@ -1909,6 +1911,19 @@ impl Properties {
19091911
self.0.look_set_prefix
19101912
}
19111913

1914+
/// Returns a set of all look-around assertions that appear as a _possible_
1915+
/// prefix for this HIR value. That is, the set returned corresponds to the
1916+
/// set of assertions that _may_ be passed before matching any bytes in a
1917+
/// haystack.
1918+
///
1919+
/// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
1920+
/// true if and only if it's possible for the regex to match through an
1921+
/// anchored assertion before consuming any input.
1922+
#[inline]
1923+
pub fn look_set_prefix_any(&self) -> LookSet {
1924+
self.0.look_set_prefix_any
1925+
}
1926+
19121927
/// Returns a set of all look-around assertions that appear as a suffix for
19131928
/// this HIR value. That is, the set returned corresponds to the set of
19141929
/// assertions that must be passed in order to be considered a match after
@@ -1921,6 +1936,19 @@ impl Properties {
19211936
self.0.look_set_suffix
19221937
}
19231938

1939+
/// Returns a set of all look-around assertions that appear as a _possible_
1940+
/// suffix for this HIR value. That is, the set returned corresponds to the
1941+
/// set of assertions that _may_ be passed after matching any bytes in a
1942+
/// haystack.
1943+
///
1944+
/// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
1945+
/// true if and only if it's possible for the regex to match through an
1946+
/// anchored assertion at the end of a match without consuming any input.
1947+
#[inline]
1948+
pub fn look_set_suffix_any(&self) -> LookSet {
1949+
self.0.look_set_suffix_any
1950+
}
1951+
19241952
/// Return true if and only if the corresponding HIR will always match
19251953
/// valid UTF-8.
19261954
///
@@ -2188,6 +2216,8 @@ impl Properties {
21882216
look_set: LookSet::empty(),
21892217
look_set_prefix: fix,
21902218
look_set_suffix: fix,
2219+
look_set_prefix_any: LookSet::empty(),
2220+
look_set_suffix_any: LookSet::empty(),
21912221
utf8: true,
21922222
explicit_captures_len: 0,
21932223
static_explicit_captures_len,
@@ -2201,6 +2231,8 @@ impl Properties {
22012231
props.look_set.set_union(p.look_set());
22022232
props.look_set_prefix.set_intersect(p.look_set_prefix());
22032233
props.look_set_suffix.set_intersect(p.look_set_suffix());
2234+
props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2235+
props.look_set_suffix_any.set_union(p.look_set_suffix_any());
22042236
props.utf8 = props.utf8 && p.is_utf8();
22052237
props.explicit_captures_len = props
22062238
.explicit_captures_len
@@ -2246,6 +2278,8 @@ impl Properties {
22462278
look_set: LookSet::empty(),
22472279
look_set_prefix: LookSet::empty(),
22482280
look_set_suffix: LookSet::empty(),
2281+
look_set_prefix_any: LookSet::empty(),
2282+
look_set_suffix_any: LookSet::empty(),
22492283
// It is debatable whether an empty regex always matches at valid
22502284
// UTF-8 boundaries. Strictly speaking, at a byte oriented view,
22512285
// it is clearly false. There are, for example, many empty strings
@@ -2280,6 +2314,8 @@ impl Properties {
22802314
look_set: LookSet::empty(),
22812315
look_set_prefix: LookSet::empty(),
22822316
look_set_suffix: LookSet::empty(),
2317+
look_set_prefix_any: LookSet::empty(),
2318+
look_set_suffix_any: LookSet::empty(),
22832319
utf8: core::str::from_utf8(&lit.0).is_ok(),
22842320
explicit_captures_len: 0,
22852321
static_explicit_captures_len: Some(0),
@@ -2297,6 +2333,8 @@ impl Properties {
22972333
look_set: LookSet::empty(),
22982334
look_set_prefix: LookSet::empty(),
22992335
look_set_suffix: LookSet::empty(),
2336+
look_set_prefix_any: LookSet::empty(),
2337+
look_set_suffix_any: LookSet::empty(),
23002338
utf8: class.is_utf8(),
23012339
explicit_captures_len: 0,
23022340
static_explicit_captures_len: Some(0),
@@ -2314,6 +2352,8 @@ impl Properties {
23142352
look_set: LookSet::singleton(look),
23152353
look_set_prefix: LookSet::singleton(look),
23162354
look_set_suffix: LookSet::singleton(look),
2355+
look_set_prefix_any: LookSet::singleton(look),
2356+
look_set_suffix_any: LookSet::singleton(look),
23172357
// This requires a little explanation. Basically, we don't consider
23182358
// matching an empty string to be equivalent to matching invalid
23192359
// UTF-8, even though technically matching every empty string will
@@ -2355,15 +2395,17 @@ impl Properties {
23552395
look_set: p.look_set(),
23562396
look_set_prefix: LookSet::empty(),
23572397
look_set_suffix: LookSet::empty(),
2398+
look_set_prefix_any: p.look_set_prefix_any(),
2399+
look_set_suffix_any: p.look_set_suffix_any(),
23582400
utf8: p.is_utf8(),
23592401
explicit_captures_len: p.explicit_captures_len(),
23602402
static_explicit_captures_len: p.static_explicit_captures_len(),
23612403
literal: false,
23622404
alternation_literal: false,
23632405
};
2364-
// The repetition operator can match the empty string, then its lookset
2365-
// prefix and suffixes themselves remain empty since they are no longer
2366-
// required to match.
2406+
// If the repetition operator can match the empty string, then its
2407+
// lookset prefix and suffixes themselves remain empty since they are
2408+
// no longer required to match.
23672409
if rep.min > 0 {
23682410
inner.look_set_prefix = p.look_set_prefix();
23692411
inner.look_set_suffix = p.look_set_suffix();
@@ -2414,6 +2456,8 @@ impl Properties {
24142456
look_set: LookSet::empty(),
24152457
look_set_prefix: LookSet::empty(),
24162458
look_set_suffix: LookSet::empty(),
2459+
look_set_prefix_any: LookSet::empty(),
2460+
look_set_suffix_any: LookSet::empty(),
24172461
utf8: true,
24182462
explicit_captures_len: 0,
24192463
static_explicit_captures_len: Some(0),
@@ -2455,6 +2499,9 @@ impl Properties {
24552499
let mut it = concat.iter();
24562500
while let Some(x) = it.next() {
24572501
props.look_set_prefix.set_union(x.properties().look_set_prefix());
2502+
props
2503+
.look_set_prefix_any
2504+
.set_union(x.properties().look_set_prefix_any());
24582505
if x.properties().maximum_len().map_or(true, |x| x > 0) {
24592506
break;
24602507
}
@@ -2463,6 +2510,9 @@ impl Properties {
24632510
let mut it = concat.iter().rev();
24642511
while let Some(x) = it.next() {
24652512
props.look_set_suffix.set_union(x.properties().look_set_suffix());
2513+
props
2514+
.look_set_suffix_any
2515+
.set_union(x.properties().look_set_suffix_any());
24662516
if x.properties().maximum_len().map_or(true, |x| x > 0) {
24672517
break;
24682518
}

regex-syntax/src/hir/translate.rs

+6
Original file line numberDiff line numberDiff line change
@@ -3287,6 +3287,12 @@ mod tests {
32873287
assert_eq!(p.minimum_len(), Some(1));
32883288
}
32893289

3290+
#[test]
3291+
fn analysis_look_set_prefix_any() {
3292+
let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
3293+
assert!(p.look_set_prefix_any().contains(Look::WordAscii));
3294+
}
3295+
32903296
#[test]
32913297
fn analysis_is_anchored() {
32923298
let is_start = |p| props(p).look_set_prefix().contains(Look::Start);

src/exec.rs

+7-6
Original file line numberDiff line numberDiff line change
@@ -274,18 +274,18 @@ impl ExecBuilder {
274274
// prefixes, so disable them.
275275
prefixes = None;
276276
} else if is_set
277-
&& props.look_set_prefix().contains(Look::Start)
277+
&& props.look_set_prefix_any().contains(Look::Start)
278278
{
279279
// Regex sets with anchors do not go well with literal
280280
// optimizations.
281281
prefixes = None;
282-
} else if props.look_set_prefix().contains_word() {
282+
} else if props.look_set_prefix_any().contains_word() {
283283
// The new literal extractor ignores look-around while
284284
// the old one refused to extract prefixes from regexes
285285
// that began with a \b. These old creaky regex internals
286286
// can't deal with it, so we drop it.
287287
prefixes = None;
288-
} else if props.look_set().contains(Look::StartLF) {
288+
} else if props.look_set_prefix_any().contains(Look::StartLF) {
289289
// Similar to the reasoning for word boundaries, this old
290290
// regex engine can't handle literal prefixes with '(?m:^)'
291291
// at the beginning of a regex.
@@ -298,15 +298,16 @@ impl ExecBuilder {
298298
// Partial anchors unfortunately make it hard to use
299299
// suffixes, so disable them.
300300
suffixes = None;
301-
} else if is_set && props.look_set_suffix().contains(Look::End)
301+
} else if is_set
302+
&& props.look_set_suffix_any().contains(Look::End)
302303
{
303304
// Regex sets with anchors do not go well with literal
304305
// optimizations.
305306
suffixes = None;
306-
} else if props.look_set_suffix().contains_word() {
307+
} else if props.look_set_suffix_any().contains_word() {
307308
// See the prefix case for reasoning here.
308309
suffixes = None;
309-
} else if props.look_set().contains(Look::EndLF) {
310+
} else if props.look_set_suffix_any().contains(Look::EndLF) {
310311
// See the prefix case for reasoning here.
311312
suffixes = None;
312313
}

tests/regression.rs

+20
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,23 @@ matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
220220

221221
// See: https://github.com/rust-lang/regex/issues/862
222222
mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));
223+
224+
// See: https://github.com/rust-lang/regex/issues/981
225+
#[cfg(feature = "unicode")]
226+
#[test]
227+
fn regression_bad_word_boundary() {
228+
let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap();
229+
let hay = "ubi-Darwin-x86_64.tar.gz";
230+
assert!(!re.is_match(text!(hay)));
231+
let hay = "ubi-Windows-x86_64.zip";
232+
assert!(re.is_match(text!(hay)));
233+
}
234+
235+
// See: https://github.com/rust-lang/regex/issues/982
236+
#[cfg(feature = "unicode-perl")]
237+
#[test]
238+
fn regression_unicode_perl_not_enabled() {
239+
let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?";
240+
let re = regex_new!(pat);
241+
assert!(re.is_ok());
242+
}

0 commit comments

Comments
 (0)