Skip to content

Commit 01c92c8

Browse files
authored
Merge pull request #262 from rust-lang-nursery/fix-bugs
fix several small bugs found from fuzzing
2 parents cf04879 + 84a2bf5 commit 01c92c8

14 files changed

+166
-48
lines changed

regex-syntax/src/lib.rs

+6
Original file line numberDiff line numberDiff line change
@@ -1336,6 +1336,9 @@ pub enum ErrorKind {
13361336
/// This is never returned if the parser is permitted to allow expressions
13371337
/// that match arbitrary bytes.
13381338
InvalidUtf8,
1339+
/// A character class was constructed such that it is empty.
1340+
/// e.g., `[^\d\D]`.
1341+
EmptyClass,
13391342
/// Hints that destructuring should not be exhaustive.
13401343
///
13411344
/// This enum may grow additional variants, so this makes sure clients
@@ -1398,6 +1401,7 @@ impl ErrorKind {
13981401
FlagNotAllowed(_) => "flag not allowed",
13991402
UnicodeNotAllowed => "Unicode features not allowed",
14001403
InvalidUtf8 => "matching arbitrary bytes is not allowed",
1404+
EmptyClass => "empty character class",
14011405
__Nonexhaustive => unreachable!(),
14021406
}
14031407
}
@@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind {
15071511
(u) flag is not set."),
15081512
InvalidUtf8 =>
15091513
write!(f, "Matching arbitrary bytes is not allowed."),
1514+
EmptyClass =>
1515+
write!(f, "Empty character classes are not allowed."),
15101516
__Nonexhaustive => unreachable!(),
15111517
}
15121518
}

regex-syntax/src/parser.rs

+34-12
Original file line numberDiff line numberDiff line change
@@ -581,12 +581,18 @@ impl Parser {
581581
_ => unreachable!(),
582582
},
583583
start => {
584+
if !self.flags.unicode {
585+
let _ = try!(self.codepoint_to_one_byte(start));
586+
}
584587
self.bump();
585588
try!(self.parse_class_range(&mut class, start));
586589
}
587590
}
588591
}
589592
class = self.class_transform(negated, class).canonicalize();
593+
if class.is_empty() {
594+
return Err(self.err(ErrorKind::EmptyClass));
595+
}
590596
Ok(Build::Expr(if self.flags.unicode {
591597
Expr::Class(class)
592598
} else {
@@ -639,7 +645,13 @@ impl Parser {
639645
// Because `parse_escape` can never return `LeftParen`.
640646
_ => unreachable!(),
641647
},
642-
_ => self.bump(),
648+
_ => {
649+
let c = self.bump();
650+
if !self.flags.unicode {
651+
let _ = try!(self.codepoint_to_one_byte(c));
652+
}
653+
c
654+
}
643655
};
644656
if end < start {
645657
// e.g., [z-a]
@@ -1277,7 +1289,7 @@ mod tests {
12771289
ErrorKind,
12781290
};
12791291
use unicode::regex::{PERLD, PERLS, PERLW};
1280-
use super::{LOWER, UPPER, Flags, Parser, ascii_class};
1292+
use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};
12811293

12821294
static YI: &'static [(char, char)] = &[
12831295
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
@@ -2002,6 +2014,8 @@ mod tests {
20022014

20032015
assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
20042016
assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
2017+
assert_eq!(pb(r"(?-u)[\xFF]"),
2018+
Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
20052019
assert_eq!(pb("(?-u)[\n]"),
20062020
Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
20072021
assert_eq!(pb(r"(?-u)[\n]"),
@@ -2127,10 +2141,10 @@ mod tests {
21272141

21282142
#[test]
21292143
fn class_multiple_class_negate_negate() {
2130-
let nperld = class(PERLD).negate();
2144+
let nperlw = class(PERLW).negate();
21312145
let nyi = class(YI).negate();
2132-
let cls = CharClass::empty().merge(nperld).merge(nyi);
2133-
assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate()));
2146+
let cls = CharClass::empty().merge(nperlw).merge(nyi);
2147+
assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
21342148
}
21352149

21362150
#[test]
@@ -2149,10 +2163,10 @@ mod tests {
21492163

21502164
#[test]
21512165
fn class_multiple_class_negate_negate_casei() {
2152-
let nperld = class(PERLD).negate();
2166+
let nperlw = class(PERLW).negate();
21532167
let nyi = class(YI).negate();
2154-
let class = CharClass::empty().merge(nperld).merge(nyi);
2155-
assert_eq!(p(r"(?i)[^\D\P{Yi}]"),
2168+
let class = CharClass::empty().merge(nperlw).merge(nyi);
2169+
assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
21562170
Expr::Class(class.case_fold().negate()));
21572171
}
21582172

@@ -2236,10 +2250,10 @@ mod tests {
22362250

22372251
#[test]
22382252
fn ascii_classes_negate_multiple() {
2239-
let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate());
2240-
let cls = CharClass::empty().merge(nlower).merge(nupper);
2241-
assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone()));
2242-
assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate()));
2253+
let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
2254+
let cls = CharClass::empty().merge(nlower).merge(nword);
2255+
assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
2256+
assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
22432257
}
22442258

22452259
#[test]
@@ -2402,6 +2416,13 @@ mod tests {
24022416
test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags);
24032417
}
24042418

2419+
#[test]
2420+
fn unicode_class_literal_not_allowed() {
2421+
let flags = Flags { allow_bytes: true, .. Flags::default() };
2422+
test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
2423+
test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
2424+
}
2425+
24052426
#[test]
24062427
fn unicode_hex_not_allowed() {
24072428
let flags = Flags { allow_bytes: true, .. Flags::default() };
@@ -2725,6 +2746,7 @@ mod tests {
27252746
fn error_class_empty_range() {
27262747
test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
27272748
test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
2749+
test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
27282750
}
27292751

27302752
#[test]

src/backtrack.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
242242
ip = inst.goto1;
243243
}
244244
EmptyLook(ref inst) => {
245-
let prev = self.input.previous_char(at);
246-
let next = self.input.next_char(at);
247-
if inst.matches(prev, next) {
245+
if self.input.is_empty_match(at, inst) {
248246
ip = inst.goto;
249247
} else {
250248
return false;

src/compile.rs

+1
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ impl Compiler {
372372
}
373373

374374
fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
375+
assert!(!ranges.is_empty());
375376
if self.compiled.uses_bytes() {
376377
CompileClass {
377378
c: self,

src/dfa.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1847,7 +1847,7 @@ mod tests {
18471847
expected == got && state.flags() == StateFlags(flags)
18481848
}
18491849
QuickCheck::new()
1850-
.gen(StdGen::new(self::rand::thread_rng(), 70_000))
1850+
.gen(StdGen::new(self::rand::thread_rng(), 10_000))
18511851
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
18521852
}
18531853

src/exec.rs

+7-3
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,11 @@ impl<'c> ExecNoSync<'c> {
589589
lits.find_start(&text[start..])
590590
.map(|(s, e)| (start + s, start + e))
591591
}
592-
AnchoredEnd => self.ro.suffixes.find_end(&text),
592+
AnchoredEnd => {
593+
let lits = &self.ro.suffixes;
594+
lits.find_end(&text[start..])
595+
.map(|(s, e)| (start + s, start + e))
596+
}
593597
}
594598
}
595599

@@ -917,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
917921
matches,
918922
slots,
919923
quit_after_match,
920-
ByteInput::new(text),
924+
ByteInput::new(text, self.ro.nfa.only_utf8),
921925
start)
922926
} else {
923927
pikevm::Fsm::exec(
@@ -945,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
945949
&self.cache,
946950
matches,
947951
slots,
948-
ByteInput::new(text),
952+
ByteInput::new(text, self.ro.nfa.only_utf8),
949953
start)
950954
} else {
951955
backtrack::Bounded::exec(

src/input.rs

+107-8
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ use std::u32;
1616

1717
use syntax;
1818

19-
use utf8::{decode_utf8, decode_last_utf8};
2019
use literals::LiteralSearcher;
20+
use prog::InstEmptyLook;
21+
use utf8::{decode_utf8, decode_last_utf8};
2122

2223
/// Represents a location in the input.
2324
#[derive(Clone, Copy, Debug)]
@@ -83,6 +84,10 @@ pub trait Input {
8384
/// If no such character could be decoded, then `Char` is absent.
8485
fn previous_char(&self, at: InputAt) -> Char;
8586

87+
/// Return true if the given empty width instruction matches at the
88+
/// input position given.
89+
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
90+
8691
/// Scan the input for a matching prefix.
8792
fn prefix_at(
8893
&self,
@@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {
104109

105110
fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }
106111

112+
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
113+
(**self).is_empty_match(at, empty)
114+
}
115+
107116
fn prefix_at(
108117
&self,
109118
prefixes: &LiteralSearcher,
@@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
155164
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
156165
}
157166

167+
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
168+
use prog::EmptyLook::*;
169+
match empty.look {
170+
StartLine => {
171+
let c = self.previous_char(at);
172+
c.is_none() || c == '\n'
173+
}
174+
EndLine => {
175+
let c = self.next_char(at);
176+
c.is_none() || c == '\n'
177+
}
178+
StartText => self.previous_char(at).is_none(),
179+
EndText => self.next_char(at).is_none(),
180+
WordBoundary => {
181+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
182+
c1.is_word_char() != c2.is_word_char()
183+
}
184+
NotWordBoundary => {
185+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
186+
c1.is_word_char() == c2.is_word_char()
187+
}
188+
WordBoundaryAscii => {
189+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
190+
c1.is_word_byte() != c2.is_word_byte()
191+
}
192+
NotWordBoundaryAscii => {
193+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
194+
c1.is_word_byte() == c2.is_word_byte()
195+
}
196+
}
197+
}
198+
158199
fn prefix_at(
159200
&self,
160201
prefixes: &LiteralSearcher,
@@ -178,20 +219,26 @@ impl<'t> Input for CharInput<'t> {
178219
/// easy access to necessary Unicode decoding (used for word boundary look
179220
/// ahead/look behind).
180221
#[derive(Clone, Copy, Debug)]
181-
pub struct ByteInput<'t>(&'t [u8]);
222+
pub struct ByteInput<'t> {
223+
text: &'t [u8],
224+
only_utf8: bool,
225+
}
182226

183227
impl<'t> ByteInput<'t> {
184228
/// Return a new byte-based input reader for the given string.
185-
pub fn new(s: &'t [u8]) -> ByteInput<'t> {
186-
ByteInput(s)
229+
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
230+
ByteInput {
231+
text: text,
232+
only_utf8: only_utf8,
233+
}
187234
}
188235
}
189236

190237
impl<'t> ops::Deref for ByteInput<'t> {
191238
type Target = [u8];
192239

193240
fn deref(&self) -> &[u8] {
194-
self.0
241+
self.text
195242
}
196243
}
197244

@@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
213260
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
214261
}
215262

263+
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
264+
use prog::EmptyLook::*;
265+
match empty.look {
266+
StartLine => {
267+
let c = self.previous_char(at);
268+
c.is_none() || c == '\n'
269+
}
270+
EndLine => {
271+
let c = self.next_char(at);
272+
c.is_none() || c == '\n'
273+
}
274+
StartText => self.previous_char(at).is_none(),
275+
EndText => self.next_char(at).is_none(),
276+
WordBoundary => {
277+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
278+
c1.is_word_char() != c2.is_word_char()
279+
}
280+
NotWordBoundary => {
281+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
282+
c1.is_word_char() == c2.is_word_char()
283+
}
284+
WordBoundaryAscii => {
285+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
286+
if self.only_utf8 {
287+
// If we must match UTF-8, then we can't match word
288+
// boundaries at invalid UTF-8.
289+
if c1.is_none() && !at.is_start() {
290+
return false;
291+
}
292+
if c2.is_none() && !at.is_end() {
293+
return false;
294+
}
295+
}
296+
c1.is_word_byte() != c2.is_word_byte()
297+
}
298+
NotWordBoundaryAscii => {
299+
let (c1, c2) = (self.previous_char(at), self.next_char(at));
300+
if self.only_utf8 {
301+
// If we must match UTF-8, then we can't match word
302+
// boundaries at invalid UTF-8.
303+
if c1.is_none() && !at.is_start() {
304+
return false;
305+
}
306+
if c2.is_none() && !at.is_end() {
307+
return false;
308+
}
309+
}
310+
c1.is_word_byte() == c2.is_word_byte()
311+
}
312+
}
313+
}
314+
216315
fn prefix_at(
217316
&self,
218317
prefixes: &LiteralSearcher,
@@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
222321
}
223322

224323
fn len(&self) -> usize {
225-
self.0.len()
324+
self.text.len()
226325
}
227326

228327
fn as_bytes(&self) -> &[u8] {
229-
self.0
328+
&self.text
230329
}
231330
}
232331

@@ -276,7 +375,7 @@ impl Char {
276375
pub fn is_word_byte(self) -> bool {
277376
match char::from_u32(self.0) {
278377
None => false,
279-
Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
378+
Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
280379
Some(_) => false,
281380
}
282381
}

src/pikevm.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
322322
nlist.set.insert(ip);
323323
match self.prog[ip] {
324324
EmptyLook(ref inst) => {
325-
let prev = self.input.previous_char(at);
326-
let next = self.input.next_char(at);
327-
if inst.matches(prev, next) {
325+
if self.input.is_empty_match(at, inst) {
328326
ip = inst.goto;
329327
}
330328
}

0 commit comments

Comments
 (0)