@@ -1528,18 +1528,115 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
1528
1528
span,
1529
1529
kind : ast:: AssertionKind :: EndText ,
1530
1530
} ) ) ,
1531
- 'b' => Ok ( Primitive :: Assertion ( ast:: Assertion {
1532
- span,
1533
- kind : ast:: AssertionKind :: WordBoundary ,
1534
- } ) ) ,
1531
+ 'b' => {
1532
+ let mut wb = ast:: Assertion {
1533
+ span,
1534
+ kind : ast:: AssertionKind :: WordBoundary ,
1535
+ } ;
1536
+ // After a \b, we "try" to parse things like \b{start} for
1537
+ // special word boundary assertions.
1538
+ if !self . is_eof ( ) && self . char ( ) == '{' {
1539
+ if let Some ( kind) =
1540
+ self . maybe_parse_special_word_boundary ( start) ?
1541
+ {
1542
+ wb. kind = kind;
1543
+ wb. span . end = self . pos ( ) ;
1544
+ }
1545
+ }
1546
+ Ok ( Primitive :: Assertion ( wb) )
1547
+ }
1535
1548
'B' => Ok ( Primitive :: Assertion ( ast:: Assertion {
1536
1549
span,
1537
1550
kind : ast:: AssertionKind :: NotWordBoundary ,
1538
1551
} ) ) ,
1552
+ '<' => Ok ( Primitive :: Assertion ( ast:: Assertion {
1553
+ span,
1554
+ kind : ast:: AssertionKind :: WordBoundaryStartAngle ,
1555
+ } ) ) ,
1556
+ '>' => Ok ( Primitive :: Assertion ( ast:: Assertion {
1557
+ span,
1558
+ kind : ast:: AssertionKind :: WordBoundaryEndAngle ,
1559
+ } ) ) ,
1539
1560
_ => Err ( self . error ( span, ast:: ErrorKind :: EscapeUnrecognized ) ) ,
1540
1561
}
1541
1562
}
1542
1563
1564
+ /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
1565
+ /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
1566
+ ///
1567
+ /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
1568
+ /// if it fails it will just return `None` with no error. This is done
1569
+ /// because `\b{5}` is a valid expression and we want to let that be parsed
1570
+ /// by the existing counted repetition parsing code. (I thought about just
1571
+ /// invoking the counted repetition code from here, but it seemed a little
1572
+ /// ham-fisted.)
1573
+ ///
1574
+ /// Unlike `maybe_parse_ascii_class` though, this can return an error.
1575
+ /// Namely, if we definitely know it isn't a counted repetition, then we
1576
+ /// return an error specific to the specialty word boundaries.
1577
+ ///
1578
+ /// This assumes the parser is positioned at a `{` immediately following
1579
+ /// a `\b`. When `None` is returned, the parser is returned to the position
1580
+ /// at which it started: pointing at a `{`.
1581
+ ///
1582
+ /// The position given should correspond to the start of the `\b`.
1583
+ fn maybe_parse_special_word_boundary (
1584
+ & self ,
1585
+ wb_start : Position ,
1586
+ ) -> Result < Option < ast:: AssertionKind > > {
1587
+ assert_eq ! ( self . char ( ) , '{' ) ;
1588
+
1589
+ let is_valid_char = |c| match c {
1590
+ 'A' ..='Z' | 'a' ..='z' | '-' => true ,
1591
+ _ => false ,
1592
+ } ;
1593
+ let start = self . pos ( ) ;
1594
+ if !self . bump_and_bump_space ( ) {
1595
+ return Err ( self . error (
1596
+ Span :: new ( wb_start, self . pos ( ) ) ,
1597
+ ast:: ErrorKind :: SpecialWordOrRepetitionUnexpectedEof ,
1598
+ ) ) ;
1599
+ }
1600
+ let start_contents = self . pos ( ) ;
1601
+ // This is one of the critical bits: if the first non-whitespace
1602
+ // character isn't in [-A-Za-z] (i.e., this can't be a special word
1603
+ // boundary), then we bail and let the counted repetition parser deal
1604
+ // with this.
1605
+ if !is_valid_char ( self . char ( ) ) {
1606
+ self . parser ( ) . pos . set ( start) ;
1607
+ return Ok ( None ) ;
1608
+ }
1609
+
1610
+ // Now collect up our chars until we see a '}'.
1611
+ let mut scratch = self . parser ( ) . scratch . borrow_mut ( ) ;
1612
+ scratch. clear ( ) ;
1613
+ while !self . is_eof ( ) && is_valid_char ( self . char ( ) ) {
1614
+ scratch. push ( self . char ( ) ) ;
1615
+ self . bump_and_bump_space ( ) ;
1616
+ }
1617
+ if self . is_eof ( ) || self . char ( ) != '}' {
1618
+ return Err ( self . error (
1619
+ Span :: new ( start, self . pos ( ) ) ,
1620
+ ast:: ErrorKind :: SpecialWordBoundaryUnclosed ,
1621
+ ) ) ;
1622
+ }
1623
+ let end = self . pos ( ) ;
1624
+ self . bump ( ) ;
1625
+ let kind = match scratch. as_str ( ) {
1626
+ "start" => ast:: AssertionKind :: WordBoundaryStart ,
1627
+ "end" => ast:: AssertionKind :: WordBoundaryEnd ,
1628
+ "start-half" => ast:: AssertionKind :: WordBoundaryStartHalf ,
1629
+ "end-half" => ast:: AssertionKind :: WordBoundaryEndHalf ,
1630
+ _ => {
1631
+ return Err ( self . error (
1632
+ Span :: new ( start_contents, end) ,
1633
+ ast:: ErrorKind :: SpecialWordBoundaryUnrecognized ,
1634
+ ) )
1635
+ }
1636
+ } ;
1637
+ Ok ( Some ( kind) )
1638
+ }
1639
+
1543
1640
/// Parse an octal representation of a Unicode codepoint up to 3 digits
1544
1641
/// long. This expects the parser to be positioned at the first octal
1545
1642
/// digit and advances the parser to the first character immediately
@@ -1967,9 +2064,9 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
1967
2064
// because parsing cannot fail with any interesting error. For example,
1968
2065
// in order to use an ASCII character class, it must be enclosed in
1969
2066
// double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1970
- // of it as "ASCII character characters have the syntax `[:NAME:]`
1971
- // which can only appear within character brackets." This means that
1972
- // things like `[[:lower:]A]` are legal constructs.
2067
+ // of it as "ASCII character classes have the syntax `[:NAME:]` which
2068
+ // can only appear within character brackets." This means that things
2069
+ // like `[[:lower:]A]` are legal constructs.
1973
2070
//
1974
2071
// However, if one types an incorrect ASCII character class, e.g.,
1975
2072
// `[[:loower:]]`, then we treat that as a normal nested character
@@ -3295,6 +3392,23 @@ bar
3295
3392
ast: Box :: new( lit( 'a' , 0 ) ) ,
3296
3393
} ) )
3297
3394
) ;
3395
+ assert_eq ! (
3396
+ parser( r"\b{5,9}" ) . parse( ) ,
3397
+ Ok ( Ast :: repetition( ast:: Repetition {
3398
+ span: span( 0 ..7 ) ,
3399
+ op: ast:: RepetitionOp {
3400
+ span: span( 2 ..7 ) ,
3401
+ kind: ast:: RepetitionKind :: Range (
3402
+ ast:: RepetitionRange :: Bounded ( 5 , 9 )
3403
+ ) ,
3404
+ } ,
3405
+ greedy: true ,
3406
+ ast: Box :: new( Ast :: assertion( ast:: Assertion {
3407
+ span: span( 0 ..2 ) ,
3408
+ kind: ast:: AssertionKind :: WordBoundary ,
3409
+ } ) ) ,
3410
+ } ) )
3411
+ ) ;
3298
3412
3299
3413
assert_eq ! (
3300
3414
parser( r"(?i){0}" ) . parse( ) . unwrap_err( ) ,
@@ -4381,6 +4495,48 @@ bar
4381
4495
kind: ast:: AssertionKind :: WordBoundary ,
4382
4496
} ) )
4383
4497
) ;
4498
+ assert_eq ! (
4499
+ parser( r"\b{start}" ) . parse_primitive( ) ,
4500
+ Ok ( Primitive :: Assertion ( ast:: Assertion {
4501
+ span: span( 0 ..9 ) ,
4502
+ kind: ast:: AssertionKind :: WordBoundaryStart ,
4503
+ } ) )
4504
+ ) ;
4505
+ assert_eq ! (
4506
+ parser( r"\b{end}" ) . parse_primitive( ) ,
4507
+ Ok ( Primitive :: Assertion ( ast:: Assertion {
4508
+ span: span( 0 ..7 ) ,
4509
+ kind: ast:: AssertionKind :: WordBoundaryEnd ,
4510
+ } ) )
4511
+ ) ;
4512
+ assert_eq ! (
4513
+ parser( r"\b{start-half}" ) . parse_primitive( ) ,
4514
+ Ok ( Primitive :: Assertion ( ast:: Assertion {
4515
+ span: span( 0 ..14 ) ,
4516
+ kind: ast:: AssertionKind :: WordBoundaryStartHalf ,
4517
+ } ) )
4518
+ ) ;
4519
+ assert_eq ! (
4520
+ parser( r"\b{end-half}" ) . parse_primitive( ) ,
4521
+ Ok ( Primitive :: Assertion ( ast:: Assertion {
4522
+ span: span( 0 ..12 ) ,
4523
+ kind: ast:: AssertionKind :: WordBoundaryEndHalf ,
4524
+ } ) )
4525
+ ) ;
4526
+ assert_eq ! (
4527
+ parser( r"\<" ) . parse_primitive( ) ,
4528
+ Ok ( Primitive :: Assertion ( ast:: Assertion {
4529
+ span: span( 0 ..2 ) ,
4530
+ kind: ast:: AssertionKind :: WordBoundaryStartAngle ,
4531
+ } ) )
4532
+ ) ;
4533
+ assert_eq ! (
4534
+ parser( r"\>" ) . parse_primitive( ) ,
4535
+ Ok ( Primitive :: Assertion ( ast:: Assertion {
4536
+ span: span( 0 ..2 ) ,
4537
+ kind: ast:: AssertionKind :: WordBoundaryEndAngle ,
4538
+ } ) )
4539
+ ) ;
4384
4540
assert_eq ! (
4385
4541
parser( r"\B" ) . parse_primitive( ) ,
4386
4542
Ok ( Primitive :: Assertion ( ast:: Assertion {
@@ -4418,20 +4574,60 @@ bar
4418
4574
kind: ast:: ErrorKind :: EscapeUnrecognized ,
4419
4575
}
4420
4576
) ;
4421
- // But also, < and > are banned, so that we may evolve them into
4422
- // start/end word boundary assertions. (Not sure if we will...)
4577
+
4578
+ // Starting a special word boundary without any non-whitespace chars
4579
+ // after the brace makes it ambiguous whether the user meant to write
4580
+ // a counted repetition (probably not?) or an actual special word
4581
+ // boundary assertion.
4423
4582
assert_eq ! (
4424
- parser( r"\< " ) . parse_escape( ) . unwrap_err( ) ,
4583
+ parser( r"\b{ " ) . parse_escape( ) . unwrap_err( ) ,
4425
4584
TestError {
4426
- span: span( 0 ..2 ) ,
4427
- kind: ast:: ErrorKind :: EscapeUnrecognized ,
4585
+ span: span( 0 ..3 ) ,
4586
+ kind: ast:: ErrorKind :: SpecialWordOrRepetitionUnexpectedEof ,
4428
4587
}
4429
4588
) ;
4430
4589
assert_eq ! (
4431
- parser ( r"\> " ) . parse_escape( ) . unwrap_err( ) ,
4590
+ parser_ignore_whitespace ( r"\b{ " ) . parse_escape( ) . unwrap_err( ) ,
4432
4591
TestError {
4433
- span: span( 0 ..2 ) ,
4434
- kind: ast:: ErrorKind :: EscapeUnrecognized ,
4592
+ span: span( 0 ..4 ) ,
4593
+ kind: ast:: ErrorKind :: SpecialWordOrRepetitionUnexpectedEof ,
4594
+ }
4595
+ ) ;
4596
+ // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
4597
+ // and thus causes the parser to treat it as a counted repetition.
4598
+ assert_eq ! (
4599
+ parser( r"\b{ " ) . parse( ) . unwrap_err( ) ,
4600
+ TestError {
4601
+ span: span( 4 ..4 ) ,
4602
+ kind: ast:: ErrorKind :: RepetitionCountDecimalEmpty ,
4603
+ }
4604
+ ) ;
4605
+ // In this case, we got some valid chars that makes it look like the
4606
+ // user is writing one of the special word boundary assertions, but
4607
+ // they forget to close the brace.
4608
+ assert_eq ! (
4609
+ parser( r"\b{foo" ) . parse_escape( ) . unwrap_err( ) ,
4610
+ TestError {
4611
+ span: span( 2 ..6 ) ,
4612
+ kind: ast:: ErrorKind :: SpecialWordBoundaryUnclosed ,
4613
+ }
4614
+ ) ;
4615
+ // We get the same error as above, except it is provoked by seeing a
4616
+ // char that we know is invalid before seeing a closing brace.
4617
+ assert_eq ! (
4618
+ parser( r"\b{foo!}" ) . parse_escape( ) . unwrap_err( ) ,
4619
+ TestError {
4620
+ span: span( 2 ..6 ) ,
4621
+ kind: ast:: ErrorKind :: SpecialWordBoundaryUnclosed ,
4622
+ }
4623
+ ) ;
4624
+ // And this one occurs when, syntactically, everything looks okay, but
4625
+ // we don't use a valid spelling of a word boundary assertion.
4626
+ assert_eq ! (
4627
+ parser( r"\b{foo}" ) . parse_escape( ) . unwrap_err( ) ,
4628
+ TestError {
4629
+ span: span( 3 ..6 ) ,
4630
+ kind: ast:: ErrorKind :: SpecialWordBoundaryUnrecognized ,
4435
4631
}
4436
4632
) ;
4437
4633
0 commit comments