Skip to content

Commit c15170a

Browse files
committed
Auto merge of #354 - robinst:fix-panics-in-extended-mode-by-being-more-strict, r=BurntSushi
Fix panics with whitespace in extended mode by being more strict Instead of ignoring space in all the bump/peek methods (as proposed in pull request #349), have an explicit `ignore_space` method that can be used in places where space/comments should be allowed. This makes parsing a bit stricter than before as well.
2 parents 68bc958 + 6f32d2f commit c15170a

File tree

1 file changed

+131
-43
lines changed

1 file changed

+131
-43
lines changed

Diff for: regex-syntax/src/parser.rs

+131-43
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,11 @@ impl Parser {
104104
// Starts at the beginning of the input and consumes until either the end
105105
// of input or an error.
106106
fn parse_expr(mut self) -> Result<Expr> {
107-
while !self.eof() {
107+
loop {
108+
self.ignore_space();
109+
if self.eof() {
110+
break;
111+
}
108112
let build_expr = match self.cur() {
109113
'\\' => try!(self.parse_escape()),
110114
'|' => { let e = try!(self.alternate()); self.bump(); e }
@@ -177,7 +181,7 @@ impl Parser {
177181
return Err(self.err(ErrorKind::UnexpectedEscapeEof));
178182
}
179183
let c = self.cur();
180-
if is_punct(c) {
184+
if is_punct(c) || (self.flags.ignore_space && c.is_whitespace()) {
181185
let c = self.bump();
182186
return Ok(try!(self.lit(c)));
183187
}
@@ -234,6 +238,7 @@ impl Parser {
234238
let chari = self.chari;
235239
let mut name: CaptureName = None;
236240
self.bump();
241+
self.ignore_space();
237242
if self.bump_if("?P<") {
238243
let n = try!(self.parse_group_name());
239244
if self.names.iter().any(|n2| n2 == &n) {
@@ -370,13 +375,16 @@ impl Parser {
370375
return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
371376
}
372377
self.bump();
373-
let min = try!(self.parse_decimal(|c| c != ',' && c != '}'));
378+
self.ignore_space();
379+
let min = try!(self.parse_decimal());
374380
let mut max_opt = Some(min);
381+
self.ignore_space();
375382
if self.bump_if(',') {
383+
self.ignore_space();
376384
if self.peek_is('}') {
377385
max_opt = None;
378386
} else {
379-
let max = try!(self.parse_decimal(|c| c != '}'));
387+
let max = try!(self.parse_decimal());
380388
if min > max {
381389
// e.g., a{2,1}
382390
return Err(self.err(ErrorKind::InvalidRepeatRange {
@@ -387,6 +395,7 @@ impl Parser {
387395
max_opt = Some(max);
388396
}
389397
}
398+
self.ignore_space();
390399
if !self.bump_if('}') {
391400
Err(self.err(ErrorKind::UnclosedRepeat))
392401
} else {
@@ -423,8 +432,8 @@ impl Parser {
423432
//
424433
// Start: `1`
425434
// End: `,` (where `until == ','`)
426-
fn parse_decimal<B: Bumpable>(&mut self, until: B) -> Result<u32> {
427-
match self.bump_get(until) {
435+
fn parse_decimal(&mut self) -> Result<u32> {
436+
match self.bump_get(|c| is_ascii_word(c) || c.is_whitespace()) {
428437
// e.g., a{}
429438
None => Err(self.err(ErrorKind::MissingBase10)),
430439
Some(n) => {
@@ -472,6 +481,7 @@ impl Parser {
472481
// Start: `{`
473482
// End: `b`
474483
fn parse_hex(&mut self) -> Result<Build> {
484+
self.ignore_space();
475485
if self.bump_if('{') {
476486
self.parse_hex_many_digits()
477487
} else {
@@ -486,9 +496,11 @@ impl Parser {
486496
fn parse_hex_many_digits(&mut self) -> Result<Build> {
487497
use std::char;
488498

489-
let s = self.bump_get(|c| c != '}').unwrap_or("".into());
499+
self.ignore_space();
500+
let s = self.bump_get(is_ascii_word).unwrap_or("".into());
490501
let n = try!(u32::from_str_radix(&s, 16)
491502
.map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
503+
self.ignore_space();
492504
if !self.bump_if('}') {
493505
// e.g., a\x{d
494506
return Err(self.err(ErrorKind::UnclosedHex));
@@ -530,12 +542,16 @@ impl Parser {
530542
// End: `+`
531543
fn parse_class(&mut self) -> Result<Build> {
532544
self.bump();
545+
self.ignore_space();
533546
let negated = self.bump_if('^');
547+
self.ignore_space();
534548
let mut class = CharClass::empty();
535549
while self.bump_if('-') {
550+
self.ignore_space();
536551
class.ranges.push(ClassRange::one('-'));
537552
}
538553
loop {
554+
self.ignore_space();
539555
if self.eof() {
540556
// e.g., [a
541557
return Err(self.err(ErrorKind::UnexpectedClassEof));
@@ -631,11 +647,13 @@ impl Parser {
631647
// End: `]`
632648
fn parse_class_range(&mut self, class: &mut CharClass, start: char)
633649
-> Result<()> {
650+
self.ignore_space();
634651
if !self.bump_if('-') {
635652
// Not a range, so just push a singleton range.
636653
class.ranges.push(ClassRange::one(start));
637654
return Ok(());
638655
}
656+
self.ignore_space();
639657
if self.eof() {
640658
// e.g., [a-
641659
return Err(self.err(ErrorKind::UnexpectedClassEof));
@@ -730,9 +748,12 @@ impl Parser {
730748
//
731749
// `negate` is true when the class name is used with `\P`.
732750
fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> {
751+
self.ignore_space();
733752
let name =
734753
if self.bump_if('{') {
735-
let n = self.bump_get(|c| c != '}').unwrap_or("".into());
754+
self.ignore_space();
755+
let n = self.bump_get(is_ascii_word).unwrap_or("".into());
756+
self.ignore_space();
736757
if n.is_empty() || !self.bump_if('}') {
737758
// e.g., \p{Greek
738759
return Err(self.err(ErrorKind::UnclosedUnicodeName));
@@ -796,7 +817,31 @@ impl Parser {
796817
// Auxiliary helper methods.
797818
impl Parser {
798819
fn chars(&self) -> Chars {
799-
Chars::new(&self.chars[self.chari..], self.flags.ignore_space)
820+
Chars::new(&self.chars[self.chari..])
821+
}
822+
823+
fn ignore_space(&mut self) {
824+
if !self.flags.ignore_space {
825+
return;
826+
}
827+
while !self.eof() {
828+
match self.cur() {
829+
'#' => {
830+
self.bump();
831+
while !self.eof() {
832+
match self.bump() {
833+
'\n' => break,
834+
_ => continue,
835+
}
836+
}
837+
},
838+
c => if !c.is_whitespace() {
839+
return;
840+
} else {
841+
self.bump();
842+
}
843+
}
844+
}
800845
}
801846

802847
fn bump(&mut self) -> char {
@@ -924,48 +969,22 @@ impl Parser {
924969
struct Chars<'a> {
925970
chars: &'a [char],
926971
cur: usize,
927-
ignore_space: bool,
928972
}
929973

930974
impl<'a> Iterator for Chars<'a> {
931975
type Item = char;
932976
fn next(&mut self) -> Option<char> {
933-
if !self.ignore_space {
934-
let x = self.c();
935-
self.advance();
936-
return x;
937-
}
938-
while let Some(c) = self.c() {
939-
self.advance();
940-
match c {
941-
'\\' => return match self.c() {
942-
Some('#') => {self.advance(); Some('#')}
943-
_ => Some('\\')
944-
},
945-
'#' => loop {
946-
match self.c() {
947-
Some(c) => {
948-
self.advance();
949-
if c == '\n' {
950-
break;
951-
}
952-
},
953-
None => return None
954-
}
955-
},
956-
_ => if !c.is_whitespace() {return Some(c);}
957-
}
958-
}
959-
None
977+
let x = self.c();
978+
self.advance();
979+
return x;
960980
}
961981
}
962982

963983
impl<'a> Chars<'a> {
964-
fn new(chars: &[char], ignore_space: bool) -> Chars {
984+
fn new(chars: &[char]) -> Chars {
965985
Chars {
966986
chars: chars,
967987
cur: 0,
968-
ignore_space: ignore_space,
969988
}
970989
}
971990

@@ -1221,6 +1240,13 @@ fn is_valid_capture_char(c: char) -> bool {
12211240
|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
12221241
}
12231242

1243+
fn is_ascii_word(c: char) -> bool {
1244+
match c {
1245+
'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true,
1246+
_ => false,
1247+
}
1248+
}
1249+
12241250
/// Returns true if the give character has significance in a regex.
12251251
pub fn is_punct(c: char) -> bool {
12261252
match c {
@@ -2368,10 +2394,49 @@ mod tests {
23682394
}
23692395

23702396
#[test]
2371-
fn ignore_space_escape() {
2372-
assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD)));
2373-
assert_eq!(p(r"(?x)\
2374-
D"), Expr::Class(class(PERLD).negate()));
2397+
fn ignore_space_escape_octal() {
2398+
assert_eq!(p(r"(?x)\12 3"), Expr::Concat(vec![
2399+
lit('\n'),
2400+
lit('3'),
2401+
]));
2402+
}
2403+
2404+
#[test]
2405+
fn ignore_space_escape_hex() {
2406+
assert_eq!(p(r"(?x)\x { 53 }"), lit('S'));
2407+
assert_eq!(p(r"(?x)\x # comment
2408+
{ # comment
2409+
53 # comment
2410+
} # comment"), lit('S'));
2411+
}
2412+
2413+
#[test]
2414+
fn ignore_space_escape_hex2() {
2415+
assert_eq!(p(r"(?x)\x 53"), lit('S'));
2416+
assert_eq!(p(r"(?x)\x # comment
2417+
53 # comment"), lit('S'));
2418+
}
2419+
2420+
#[test]
2421+
fn ignore_space_escape_unicode_name() {
2422+
assert_eq!(p(r"(?x)\p # comment
2423+
{ # comment
2424+
Yi # comment
2425+
} # comment"), Expr::Class(class(YI)));
2426+
}
2427+
2428+
#[test]
2429+
fn ignore_space_repeat_counted() {
2430+
assert_eq!(p("(?x)a # comment
2431+
{ # comment
2432+
5 # comment
2433+
, # comment
2434+
10 # comment
2435+
}"), Expr::Repeat {
2436+
e: b(lit('a')),
2437+
r: Repeater::Range { min: 5, max: Some(10) },
2438+
greedy: true,
2439+
});
23752440
}
23762441

23772442
#[test]
@@ -2424,6 +2489,14 @@ mod tests {
24242489
]));
24252490
}
24262491

2492+
#[test]
2493+
fn ignore_space_escape_space() {
2494+
assert_eq!(p(r"(?x)a\ # hi there"), Expr::Concat(vec![
2495+
lit('a'),
2496+
lit(' '),
2497+
]));
2498+
}
2499+
24272500
// Test every single possible error case.
24282501

24292502
macro_rules! test_err {
@@ -2815,4 +2888,19 @@ mod tests {
28152888
test_err!("(?P<a>.)(?P<a>.)", 14,
28162889
ErrorKind::DuplicateCaptureName("a".into()));
28172890
}
2891+
2892+
#[test]
2893+
fn error_ignore_space_escape_hex() {
2894+
test_err!(r"(?x)\x{ 5 3 }", 10, ErrorKind::UnclosedHex);
2895+
}
2896+
2897+
#[test]
2898+
fn error_ignore_space_escape_hex2() {
2899+
test_err!(r"(?x)\x 5 3", 9, ErrorKind::InvalidBase16("5 ".into()));
2900+
}
2901+
2902+
#[test]
2903+
fn error_ignore_space_escape_unicode_name() {
2904+
test_err!(r"(?x)\p{Y i}", 9, ErrorKind::UnclosedUnicodeName);
2905+
}
28182906
}

0 commit comments

Comments
 (0)