@@ -104,7 +104,11 @@ impl Parser {
104
104
// Starts at the beginning of the input and consumes until either the end
105
105
// of input or an error.
106
106
fn parse_expr ( mut self ) -> Result < Expr > {
107
- while !self . eof ( ) {
107
+ loop {
108
+ self . ignore_space ( ) ;
109
+ if self . eof ( ) {
110
+ break ;
111
+ }
108
112
let build_expr = match self . cur ( ) {
109
113
'\\' => try!( self . parse_escape ( ) ) ,
110
114
'|' => { let e = try!( self . alternate ( ) ) ; self . bump ( ) ; e }
@@ -177,7 +181,7 @@ impl Parser {
177
181
return Err ( self . err ( ErrorKind :: UnexpectedEscapeEof ) ) ;
178
182
}
179
183
let c = self . cur ( ) ;
180
- if is_punct ( c) {
184
+ if is_punct ( c) || ( self . flags . ignore_space && c . is_whitespace ( ) ) {
181
185
let c = self . bump ( ) ;
182
186
return Ok ( try!( self . lit ( c) ) ) ;
183
187
}
@@ -234,6 +238,7 @@ impl Parser {
234
238
let chari = self . chari ;
235
239
let mut name: CaptureName = None ;
236
240
self . bump ( ) ;
241
+ self . ignore_space ( ) ;
237
242
if self . bump_if ( "?P<" ) {
238
243
let n = try!( self . parse_group_name ( ) ) ;
239
244
if self . names . iter ( ) . any ( |n2| n2 == & n) {
@@ -370,13 +375,16 @@ impl Parser {
370
375
return Err ( self . err ( ErrorKind :: RepeaterUnexpectedExpr ( e) ) ) ;
371
376
}
372
377
self . bump ( ) ;
373
- let min = try!( self . parse_decimal ( |c| c != ',' && c != '}' ) ) ;
378
+ self . ignore_space ( ) ;
379
+ let min = try!( self . parse_decimal ( ) ) ;
374
380
let mut max_opt = Some ( min) ;
381
+ self . ignore_space ( ) ;
375
382
if self . bump_if ( ',' ) {
383
+ self . ignore_space ( ) ;
376
384
if self . peek_is ( '}' ) {
377
385
max_opt = None ;
378
386
} else {
379
- let max = try!( self . parse_decimal ( |c| c != '}' ) ) ;
387
+ let max = try!( self . parse_decimal ( ) ) ;
380
388
if min > max {
381
389
// e.g., a{2,1}
382
390
return Err ( self . err ( ErrorKind :: InvalidRepeatRange {
@@ -387,6 +395,7 @@ impl Parser {
387
395
max_opt = Some ( max) ;
388
396
}
389
397
}
398
+ self . ignore_space ( ) ;
390
399
if !self . bump_if ( '}' ) {
391
400
Err ( self . err ( ErrorKind :: UnclosedRepeat ) )
392
401
} else {
@@ -423,8 +432,8 @@ impl Parser {
423
432
//
424
433
// Start: `1`
425
434
// End: `,` (the first character that is neither an ASCII word character nor whitespace)
426
- fn parse_decimal < B : Bumpable > ( & mut self , until : B ) -> Result < u32 > {
427
- match self . bump_get ( until ) {
435
+ fn parse_decimal ( & mut self ) -> Result < u32 > {
436
+ match self . bump_get ( |c| is_ascii_word ( c ) || c . is_whitespace ( ) ) {
428
437
// e.g., a{}
429
438
None => Err ( self . err ( ErrorKind :: MissingBase10 ) ) ,
430
439
Some ( n) => {
@@ -472,6 +481,7 @@ impl Parser {
472
481
// Start: `{`
473
482
// End: `b`
474
483
fn parse_hex ( & mut self ) -> Result < Build > {
484
+ self . ignore_space ( ) ;
475
485
if self . bump_if ( '{' ) {
476
486
self . parse_hex_many_digits ( )
477
487
} else {
@@ -486,9 +496,11 @@ impl Parser {
486
496
fn parse_hex_many_digits ( & mut self ) -> Result < Build > {
487
497
use std:: char;
488
498
489
- let s = self . bump_get ( |c| c != '}' ) . unwrap_or ( "" . into ( ) ) ;
499
+ self . ignore_space ( ) ;
500
+ let s = self . bump_get ( is_ascii_word) . unwrap_or ( "" . into ( ) ) ;
490
501
let n = try!( u32:: from_str_radix ( & s, 16 )
491
502
. map_err ( |_| self . err ( ErrorKind :: InvalidBase16 ( s) ) ) ) ;
503
+ self . ignore_space ( ) ;
492
504
if !self . bump_if ( '}' ) {
493
505
// e.g., a\x{d
494
506
return Err ( self . err ( ErrorKind :: UnclosedHex ) ) ;
@@ -530,12 +542,16 @@ impl Parser {
530
542
// End: `+`
531
543
fn parse_class ( & mut self ) -> Result < Build > {
532
544
self . bump ( ) ;
545
+ self . ignore_space ( ) ;
533
546
let negated = self . bump_if ( '^' ) ;
547
+ self . ignore_space ( ) ;
534
548
let mut class = CharClass :: empty ( ) ;
535
549
while self . bump_if ( '-' ) {
550
+ self . ignore_space ( ) ;
536
551
class. ranges . push ( ClassRange :: one ( '-' ) ) ;
537
552
}
538
553
loop {
554
+ self . ignore_space ( ) ;
539
555
if self . eof ( ) {
540
556
// e.g., [a
541
557
return Err ( self . err ( ErrorKind :: UnexpectedClassEof ) ) ;
@@ -631,11 +647,13 @@ impl Parser {
631
647
// End: `]`
632
648
fn parse_class_range ( & mut self , class : & mut CharClass , start : char )
633
649
-> Result < ( ) > {
650
+ self . ignore_space ( ) ;
634
651
if !self . bump_if ( '-' ) {
635
652
// Not a range, so just push a singleton range.
636
653
class. ranges . push ( ClassRange :: one ( start) ) ;
637
654
return Ok ( ( ) ) ;
638
655
}
656
+ self . ignore_space ( ) ;
639
657
if self . eof ( ) {
640
658
// e.g., [a-
641
659
return Err ( self . err ( ErrorKind :: UnexpectedClassEof ) ) ;
@@ -730,9 +748,12 @@ impl Parser {
730
748
//
731
749
// `neg` is true when the class name is used with `\P`.
732
750
fn parse_unicode_class ( & mut self , neg : bool ) -> Result < CharClass > {
751
+ self . ignore_space ( ) ;
733
752
let name =
734
753
if self . bump_if ( '{' ) {
735
- let n = self . bump_get ( |c| c != '}' ) . unwrap_or ( "" . into ( ) ) ;
754
+ self . ignore_space ( ) ;
755
+ let n = self . bump_get ( is_ascii_word) . unwrap_or ( "" . into ( ) ) ;
756
+ self . ignore_space ( ) ;
736
757
if n. is_empty ( ) || !self . bump_if ( '}' ) {
737
758
// e.g., \p{Greek
738
759
return Err ( self . err ( ErrorKind :: UnclosedUnicodeName ) ) ;
@@ -796,7 +817,31 @@ impl Parser {
796
817
// Auxiliary helper methods.
797
818
impl Parser {
798
819
fn chars ( & self ) -> Chars {
799
- Chars :: new ( & self . chars [ self . chari ..] , self . flags . ignore_space )
820
+ Chars :: new ( & self . chars [ self . chari ..] )
821
+ }
822
+
823
+ fn ignore_space ( & mut self ) {
824
+ if !self . flags . ignore_space {
825
+ return ;
826
+ }
827
+ while !self . eof ( ) {
828
+ match self . cur ( ) {
829
+ '#' => {
830
+ self . bump ( ) ;
831
+ while !self . eof ( ) {
832
+ match self . bump ( ) {
833
+ '\n' => break ,
834
+ _ => continue ,
835
+ }
836
+ }
837
+ } ,
838
+ c => if !c. is_whitespace ( ) {
839
+ return ;
840
+ } else {
841
+ self . bump ( ) ;
842
+ }
843
+ }
844
+ }
800
845
}
801
846
802
847
fn bump ( & mut self ) -> char {
@@ -924,48 +969,22 @@ impl Parser {
924
969
// A cursor over a borrowed slice of characters; it is the iterator type
// handed out by `Parser::chars`.
struct Chars < ' a > {
925
970
// The characters being iterated over.
chars : & ' a [ char ] ,
926
971
// Index into `chars`; `Chars::new` starts it at 0.
cur : usize ,
927
- ignore_space : bool ,
928
972
}
929
973
930
974
impl < ' a > Iterator for Chars < ' a > {
931
975
type Item = char ;
932
976
fn next ( & mut self ) -> Option < char > {
933
- if !self . ignore_space {
934
- let x = self . c ( ) ;
935
- self . advance ( ) ;
936
- return x;
937
- }
938
- while let Some ( c) = self . c ( ) {
939
- self . advance ( ) ;
940
- match c {
941
- '\\' => return match self . c ( ) {
942
- Some ( '#' ) => { self . advance ( ) ; Some ( '#' ) }
943
- _ => Some ( '\\' )
944
- } ,
945
- '#' => loop {
946
- match self . c ( ) {
947
- Some ( c) => {
948
- self . advance ( ) ;
949
- if c == '\n' {
950
- break ;
951
- }
952
- } ,
953
- None => return None
954
- }
955
- } ,
956
- _ => if !c. is_whitespace ( ) { return Some ( c) ; }
957
- }
958
- }
959
- None
977
+ let x = self . c ( ) ;
978
+ self . advance ( ) ;
979
+ return x;
960
980
}
961
981
}
962
982
963
983
impl < ' a > Chars < ' a > {
964
- fn new ( chars : & [ char ] , ignore_space : bool ) -> Chars {
984
+ fn new ( chars : & [ char ] ) -> Chars {
965
985
Chars {
966
986
chars : chars,
967
987
cur : 0 ,
968
- ignore_space : ignore_space,
969
988
}
970
989
}
971
990
@@ -1221,6 +1240,13 @@ fn is_valid_capture_char(c: char) -> bool {
1221
1240
|| ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' )
1222
1241
}
1223
1242
1243
// Returns true if and only if `c` is an ASCII word character: a letter
// (`a-z`, `A-Z`), a digit (`0-9`) or an underscore.
//
// Non-ASCII letters and digits are never accepted.
//
// Written with plain comparisons rather than `...` range patterns: that
// pattern syntax is deprecated (and rejected by newer editions), while
// comparisons compile on every toolchain.
fn is_ascii_word(c: char) -> bool {
    c == '_'
        || (c >= '0' && c <= '9')
        || (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
}
1249
+
1224
1250
/// Returns true if the given character has significance in a regex.
1225
1251
pub fn is_punct ( c : char ) -> bool {
1226
1252
match c {
@@ -2368,10 +2394,49 @@ mod tests {
2368
2394
}
2369
2395
2370
2396
// `ignore_space_escape_octal`: with `(?x)`, the input `\12 3` must parse as
// the octal escape `\12` (a line feed) followed by the literal `3`; the space
// ends the escape and is not part of the expression.
#[ test]
2371
- fn ignore_space_escape ( ) {
2372
- assert_eq ! ( p( r"(?x)\ d" ) , Expr :: Class ( class( PERLD ) ) ) ;
2373
- assert_eq ! ( p( r"(?x)\
2374
- D" ) , Expr :: Class ( class( PERLD ) . negate( ) ) ) ;
2397
+ fn ignore_space_escape_octal ( ) {
2398
+ assert_eq ! ( p( r"(?x)\12 3" ) , Expr :: Concat ( vec![
2399
+ lit( '\n' ) ,
2400
+ lit( '3' ) ,
2401
+ ] ) ) ;
2402
+ }
2403
+
2404
// With `(?x)`, whitespace and `#` comments may appear between `\x`, the
// braces and the hex digits of a braced hex escape; both inputs must parse to
// the single literal `S` (hex 53).
+ #[ test]
2405
+ fn ignore_space_escape_hex ( ) {
2406
+ assert_eq ! ( p( r"(?x)\x { 53 }" ) , lit( 'S' ) ) ;
2407
+ assert_eq ! ( p( r"(?x)\x # comment
2408
+ { # comment
2409
+ 53 # comment
2410
+ } # comment" ) , lit( 'S' ) ) ;
2411
+ }
2412
+
2413
// Brace-less hex escape under `(?x)`: whitespace and `#` comments between
// `\x` and its digits are skipped, so both inputs must parse to the literal
// `S` (hex 53).
+ #[ test]
2414
+ fn ignore_space_escape_hex2 ( ) {
2415
+ assert_eq ! ( p( r"(?x)\x 53" ) , lit( 'S' ) ) ;
2416
+ assert_eq ! ( p( r"(?x)\x # comment
2417
+ 53 # comment" ) , lit( 'S' ) ) ;
2418
+ }
2419
+
2420
// With `(?x)`, whitespace and `#` comments are allowed after `\p` and around
// the braces and the class name; the input must parse to the `Yi` class.
+ #[ test]
2421
+ fn ignore_space_escape_unicode_name ( ) {
2422
+ assert_eq ! ( p( r"(?x)\p # comment
2423
+ { # comment
2424
+ Yi # comment
2425
+ } # comment" ) , Expr :: Class ( class( YI ) ) ) ;
2426
+ }
2427
+
2428
// With `(?x)`, a counted repetition may be spread over several lines with
// whitespace and `#` comments between every token; the input must parse as
// a greedy `a{5,10}`.
+ #[ test]
2429
+ fn ignore_space_repeat_counted ( ) {
2430
+ assert_eq ! ( p( "(?x)a # comment
2431
+ { # comment
2432
+ 5 # comment
2433
+ , # comment
2434
+ 10 # comment
2435
+ }" ) , Expr :: Repeat {
2436
+ e: b( lit( 'a' ) ) ,
2437
+ r: Repeater :: Range { min: 5 , max: Some ( 10 ) } ,
2438
+ greedy: true ,
2439
+ } ) ;
2375
2440
}
2376
2441
2377
2442
#[ test]
@@ -2424,6 +2489,14 @@ mod tests {
2424
2489
] ) ) ;
2425
2490
}
2426
2491
2492
// Under `(?x)`, a backslash-escaped space is a literal space; the
// `# hi there` that follows is still treated as a comment, so the result is
// just `a` followed by a space.
+ #[ test]
2493
+ fn ignore_space_escape_space ( ) {
2494
+ assert_eq ! ( p( r"(?x)a\ # hi there" ) , Expr :: Concat ( vec![
2495
+ lit( 'a' ) ,
2496
+ lit( ' ' ) ,
2497
+ ] ) ) ;
2498
+ }
2499
+
2427
2500
// Test every single possible error case.
2428
2501
2429
2502
macro_rules! test_err {
@@ -2815,4 +2888,19 @@ mod tests {
2815
2888
test_err ! ( "(?P<a>.)(?P<a>.)" , 14 ,
2816
2889
ErrorKind :: DuplicateCaptureName ( "a" . into( ) ) ) ;
2817
2890
}
2891
+
2892
// Whitespace may surround, but not split, the digits of a braced hex escape:
// after reading `5` the parser requires `}`, finds `3` instead, and must
// report `UnclosedHex` at offset 10 (the `3`).
+ #[ test]
2893
+ fn error_ignore_space_escape_hex ( ) {
2894
+ test_err ! ( r"(?x)\x{ 5 3 }" , 10 , ErrorKind :: UnclosedHex ) ;
2895
+ }
2896
+
2897
// Brace-less hex escape with a space between its two digits: the expected
// error carries the text `"5 "`, i.e. the space is taken as part of the
// digits rather than skipped, and `InvalidBase16` is reported at offset 9.
// NOTE(review): the two-digit reader itself is outside this view; the
// description is derived from the expected error only.
+ #[ test]
2898
+ fn error_ignore_space_escape_hex2 ( ) {
2899
+ test_err ! ( r"(?x)\x 5 3" , 9 , ErrorKind :: InvalidBase16 ( "5 " . into( ) ) ) ;
2900
+ }
2901
+
2902
// Whitespace may not split a Unicode class name: the name ends at `Y`, the
// space is skipped, and `i` is found where `}` is required, so the parser
// must report `UnclosedUnicodeName` at offset 9 (the `i`).
+ #[ test]
2903
+ fn error_ignore_space_escape_unicode_name ( ) {
2904
+ test_err ! ( r"(?x)\p{Y i}" , 9 , ErrorKind :: UnclosedUnicodeName ) ;
2905
+ }
2818
2906
}
0 commit comments