Skip to content

Commit 9062f38

Browse files
committed
Disallow empty character class ranges.
The compiler in particular assumes that it never gets an empty character class. The current parser is pretty paranoid about rejecting empty classes, but a few tricky cases made it through. In particular, one can write `[^\d\D]` to correspond to "match nothing." This commit now looks for empty classes explicitly, and if one is found, returns an error. Interestingly, other regex engines allow this particular idiosyncrasy and interpret it as "never match." Even more interesting, expressions like `a{0}` are also allowed (including by this regex library) and are interpreted as "always match the empty string." Both seem semantically the same. In any case, we forbid empty character classes, primarily because that seems like the sensible thing to do but secondarily because it's the conservative choice. It seems plausible that such a construct could be occasionally useful if one were machine generating regexes, because it could be used to indicate "never match." If we do want to support that use case, we'll need to add a new opcode to the regex matching engines. One can still achieve that today using something like `(a|[^a])`. Fixes #257, where using such a form caused an assert to trip in the compiler. A new, more explicit assert has been added.
1 parent cd85664 commit 9062f38

File tree

3 files changed

+22
-11
lines changed

3 files changed

+22
-11
lines changed

regex-syntax/src/lib.rs

+6
Original file line numberDiff line numberDiff line change
@@ -1336,6 +1336,9 @@ pub enum ErrorKind {
13361336
/// This is never returned if the parser is permitted to allow expressions
13371337
/// that match arbitrary bytes.
13381338
InvalidUtf8,
1339+
/// A character class was constructed such that it is empty.
1340+
/// e.g., `[^\d\D]`.
1341+
EmptyClass,
13391342
/// Hints that destructuring should not be exhaustive.
13401343
///
13411344
/// This enum may grow additional variants, so this makes sure clients
@@ -1398,6 +1401,7 @@ impl ErrorKind {
13981401
FlagNotAllowed(_) => "flag not allowed",
13991402
UnicodeNotAllowed => "Unicode features not allowed",
14001403
InvalidUtf8 => "matching arbitrary bytes is not allowed",
1404+
EmptyClass => "empty character class",
14011405
__Nonexhaustive => unreachable!(),
14021406
}
14031407
}
@@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind {
15071511
(u) flag is not set."),
15081512
InvalidUtf8 =>
15091513
write!(f, "Matching arbitrary bytes is not allowed."),
1514+
EmptyClass =>
1515+
write!(f, "Empty character classes are not allowed."),
15101516
__Nonexhaustive => unreachable!(),
15111517
}
15121518
}

regex-syntax/src/parser.rs

+15-11
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,9 @@ impl Parser {
587587
}
588588
}
589589
class = self.class_transform(negated, class).canonicalize();
590+
if class.is_empty() {
591+
return Err(self.err(ErrorKind::EmptyClass));
592+
}
590593
Ok(Build::Expr(if self.flags.unicode {
591594
Expr::Class(class)
592595
} else {
@@ -1277,7 +1280,7 @@ mod tests {
12771280
ErrorKind,
12781281
};
12791282
use unicode::regex::{PERLD, PERLS, PERLW};
1280-
use super::{LOWER, UPPER, Flags, Parser, ascii_class};
1283+
use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};
12811284

12821285
static YI: &'static [(char, char)] = &[
12831286
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
@@ -2127,10 +2130,10 @@ mod tests {
21272130

21282131
#[test]
21292132
fn class_multiple_class_negate_negate() {
2130-
let nperld = class(PERLD).negate();
2133+
let nperlw = class(PERLW).negate();
21312134
let nyi = class(YI).negate();
2132-
let cls = CharClass::empty().merge(nperld).merge(nyi);
2133-
assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate()));
2135+
let cls = CharClass::empty().merge(nperlw).merge(nyi);
2136+
assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
21342137
}
21352138

21362139
#[test]
@@ -2149,10 +2152,10 @@ mod tests {
21492152

21502153
#[test]
21512154
fn class_multiple_class_negate_negate_casei() {
2152-
let nperld = class(PERLD).negate();
2155+
let nperlw = class(PERLW).negate();
21532156
let nyi = class(YI).negate();
2154-
let class = CharClass::empty().merge(nperld).merge(nyi);
2155-
assert_eq!(p(r"(?i)[^\D\P{Yi}]"),
2157+
let class = CharClass::empty().merge(nperlw).merge(nyi);
2158+
assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
21562159
Expr::Class(class.case_fold().negate()));
21572160
}
21582161

@@ -2236,10 +2239,10 @@ mod tests {
22362239

22372240
#[test]
22382241
fn ascii_classes_negate_multiple() {
2239-
let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate());
2240-
let cls = CharClass::empty().merge(nlower).merge(nupper);
2241-
assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone()));
2242-
assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate()));
2242+
let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
2243+
let cls = CharClass::empty().merge(nlower).merge(nword);
2244+
assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
2245+
assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
22432246
}
22442247

22452248
#[test]
@@ -2725,6 +2728,7 @@ mod tests {
27252728
fn error_class_empty_range() {
27262729
test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
27272730
test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
2731+
test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
27282732
}
27292733

27302734
#[test]

src/compile.rs

+1
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ impl Compiler {
372372
}
373373

374374
fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
375+
assert!(!ranges.is_empty());
375376
if self.compiled.uses_bytes() {
376377
CompileClass {
377378
c: self,

0 commit comments

Comments
 (0)