Skip to content

Commit 76343f8

Browse files
committed
regex: ban (?-u:\B) for Unicode regexes
The issue with the ASCII version of \B is that it can match between the UTF-8 code units of a single codepoint, which means reported match indices can fall on invalid UTF-8 boundaries. Therefore, similar to things like `(?-u:\xFF)`, we ban negated ASCII word boundaries from Unicode regular expressions. Normal ASCII word boundaries remain accessible from Unicode regular expressions. See #457.
1 parent 9604cc0 commit 76343f8

File tree

4 files changed

+18
-19
lines changed

4 files changed

+18
-19
lines changed

Diff for: regex-syntax/src/hir/translate.rs

+8-12
Original file line numberDiff line numberDiff line change
@@ -724,13 +724,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
724724
// It is possible for negated ASCII word boundaries to
725725
// match at invalid UTF-8 boundaries, even when searching
726726
// valid UTF-8.
727-
//
728-
// TODO(ag): Enable this error when regex goes to 1.0.
729-
// Otherwise, it is too steep of a breaking change.
730-
// if !self.trans().allow_invalid_utf8 {
731-
// return Err(self.error(
732-
// asst.span, ErrorKind::InvalidUtf8));
733-
// }
727+
if !self.trans().allow_invalid_utf8 {
728+
return Err(self.error(
729+
asst.span, ErrorKind::InvalidUtf8));
730+
}
734731
hir::WordBoundary::AsciiNegate
735732
})
736733
}
@@ -1511,11 +1508,10 @@ mod tests {
15111508
t_bytes(r"(?-u)\B"),
15121509
hir_word(hir::WordBoundary::AsciiNegate));
15131510

1514-
// TODO(ag): Enable this tests when regex goes to 1.0.
1515-
// assert_eq!(t_err(r"(?-u)\B"), TestError {
1516-
// kind: hir::ErrorKind::InvalidUtf8,
1517-
// span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
1518-
// });
1511+
assert_eq!(t_err(r"(?-u)\B"), TestError {
1512+
kind: hir::ErrorKind::InvalidUtf8,
1513+
span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
1514+
});
15191515
}
15201516

15211517
#[test]

Diff for: tests/bytes.rs

+10
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,13 @@ matiter!(invalidutf8_anchor3,
6060
fn negated_full_byte_range() {
6161
assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
6262
}
63+
64+
matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
65+
matiter!(word_boundary_ascii2, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
66+
67+
// See: https://github.com/rust-lang/regex/issues/264
68+
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
69+
mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
70+
71+
// See: https://github.com/rust-lang/regex/issues/271
72+
mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));

Diff for: tests/regression.rs

-5
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,6 @@ matiter!(word_boundary_dfa, r"\b", "a b c",
6161
// See: https://github.com/rust-lang/regex/issues/268
6262
matiter!(partial_anchor, r"^a|b", "ba", (0, 1));
6363

64-
// See: https://github.com/rust-lang/regex/issues/264
65-
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
66-
mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
67-
6864
// See: https://github.com/rust-lang/regex/issues/280
6965
ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false);
7066
ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false);
@@ -77,7 +73,6 @@ mat!(lits_unambiguous2, r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$",
7773
"CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));
7874

7975
// See: https://github.com/rust-lang/regex/issues/271
80-
mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));
8176
mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4)));
8277
mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4)));
8378
mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4)));

Diff for: tests/word_boundary_unicode.rs

-2
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,3 @@ matiter!(unicode1, r"\bx\b", "áxβ");
44
matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
55

66
matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
7-
matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
8-
matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));

0 commit comments

Comments
 (0)