Skip to content

Commit 9d03c76

Browse files
committed
regex: disable octal syntax by default
This commit disables octal syntax by default, which will permit us to produce useful error messages if a user tries to invoke a backreference. This commit adds a new `octal` method to RegexBuilder and RegexSetBuilder which permits callers to re-enable octal syntax. See rust-lang#457
1 parent 23afcf9 commit 9d03c76

File tree

4 files changed

+56
-7
lines changed

4 files changed

+56
-7
lines changed

Diff for: src/exec.rs

+1-5
Original file line numberDiff line numberDiff line change
@@ -218,11 +218,7 @@ impl ExecBuilder {
218218
for pat in &self.options.pats {
219219
let mut parser =
220220
ParserBuilder::new()
221-
// TODO(burntsushi): Disable octal in regex 1.0. Nobody
222-
// uses it, and we'll get better error messages when
223-
// someone tries to use a backreference. Provide a new
224-
// opt-in toggle for it though.
225-
.octal(true)
221+
.octal(self.options.octal)
226222
.case_insensitive(self.options.case_insensitive)
227223
.multi_line(self.options.multi_line)
228224
.dot_matches_new_line(self.options.dot_matches_new_line)

Diff for: src/lib.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ assert_eq!(&cap[0], "abc");
445445
\n new line
446446
\r carriage return
447447
\v vertical tab (\x0B)
448-
\123 octal character code (up to three digits)
448+
\123 octal character code (up to three digits) (when enabled)
449449
\x7F hex character code (exactly two digits)
450450
\x{10FFFF} any hex character code corresponding to a Unicode code point
451451
\u007F hex character code (exactly four digits)
@@ -619,7 +619,8 @@ determine whether a byte is a word byte or not.
619619
5. Hexadecimal notation can be used to specify arbitrary bytes instead of
620620
Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
621621
literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
622-
matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation.
622+
matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
623+
enabled.
623624
6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value.
624625
When the `s` flag is enabled, `.` matches any byte.
625626

Diff for: src/re_builder.rs

+42
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ pub struct RegexOptions {
2222
pub swap_greed: bool,
2323
pub ignore_whitespace: bool,
2424
pub unicode: bool,
25+
pub octal: bool,
2526
}
2627

2728
impl Default for RegexOptions {
@@ -37,6 +38,7 @@ impl Default for RegexOptions {
3738
swap_greed: false,
3839
ignore_whitespace: false,
3940
unicode: true,
41+
octal: false,
4042
}
4143
}
4244
}
@@ -142,6 +144,26 @@ impl RegexBuilder {
142144
self
143145
}
144146

147+
/// Whether to support octal syntax or not.
148+
///
149+
/// Octal syntax is a little-known way of writing Unicode codepoints in
150+
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
151+
/// `\141` are all equivalent regular expressions, where the last example
152+
/// shows octal syntax.
153+
///
154+
/// While supporting octal syntax isn't in and of itself a problem, it does
155+
/// make good error messages harder. That is, in PCRE based regex engines,
156+
/// syntax like `\1` invokes a backreference, which is explicitly
157+
/// unsupported in Rust's regex engine. However, many users expect it to
158+
/// be supported. Therefore, when octal support is disabled, the error
159+
/// message will explicitly mention that backreferences aren't supported.
160+
///
161+
/// Octal syntax is disabled by default.
162+
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
163+
self.0.octal = yes;
164+
self
165+
}
166+
145167
/// Set the approximate size limit of the compiled regular expression.
146168
///
147169
/// This roughly corresponds to the number of bytes occupied by a single
@@ -283,6 +305,26 @@ impl RegexSetBuilder {
283305
self
284306
}
285307

308+
/// Whether to support octal syntax or not.
309+
///
310+
/// Octal syntax is a little-known way of writing Unicode codepoints in
311+
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
312+
/// `\141` are all equivalent regular expressions, where the last example
313+
/// shows octal syntax.
314+
///
315+
/// While supporting octal syntax isn't in and of itself a problem, it does
316+
/// make good error messages harder. That is, in PCRE based regex engines,
317+
/// syntax like `\1` invokes a backreference, which is explicitly
318+
/// unsupported in Rust's regex engine. However, many users expect it to
319+
/// be supported. Therefore, when octal support is disabled, the error
320+
/// message will explicitly mention that backreferences aren't supported.
321+
///
322+
/// Octal syntax is disabled by default.
323+
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
324+
self.0.octal = yes;
325+
self
326+
}
327+
286328
/// Set the approximate size limit of the compiled regular expression.
287329
///
288330
/// This roughly corresponds to the number of bytes occupied by a single

Diff for: tests/test_default.rs

+10
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,13 @@ fn disallow_non_utf8() {
7575
assert!(regex::Regex::new(r"(?-u)[\xFF]").is_err());
7676
assert!(regex::Regex::new(r"(?-u)☃").is_err());
7777
}
78+
79+
#[test]
80+
fn disallow_octal() {
81+
assert!(regex::Regex::new(r"\0").is_err());
82+
}
83+
84+
#[test]
85+
fn allow_octal() {
86+
assert!(regex::RegexBuilder::new(r"\0").octal(true).build().is_ok());
87+
}

0 commit comments

Comments
 (0)