@@ -399,7 +399,7 @@ private RegExpTree parseCharset() {
399
399
CharRanges ieExplicits = CharRanges .EMPTY ;
400
400
while (pos < limit && pattern .charAt (pos ) != ']' ) {
401
401
char ch = pattern .charAt (pos );
402
- char start ;
402
+ int start ;
403
403
if (ch == '\\' ) {
404
404
++pos ;
405
405
char possibleGroupName = pattern .charAt (pos );
@@ -414,7 +414,7 @@ private RegExpTree parseCharset() {
414
414
start = ch ;
415
415
++pos ;
416
416
}
417
- char end = start ;
417
+ int end = start ;
418
418
if (pos + 1 < limit && pattern .charAt (pos ) == '-'
419
419
&& pattern .charAt (pos + 1 ) != ']' ) {
420
420
++pos ;
@@ -459,20 +459,22 @@ private RegExpTree parseCharset() {
459
459
}
460
460
461
461
/**
462
- * Parses an escape to a code point.
463
- * Some of the characters parsed here have special meanings in various
464
- * contexts, so contexts must filter those instead.
465
- * E.g. '\b' means a different thing inside a charset than without.
462
+ * Parses an escape to a code point. Some of the characters parsed here have special meanings
463
+ * in various contexts, so contexts must filter those instead. E.g. '\b' means a different
464
+ * thing inside a charset than without.
466
465
*/
467
- private char parseEscapeChar () {
466
+ private int parseEscapeChar () {
468
467
char ch = pattern .charAt (pos ++);
469
468
switch (ch ) {
470
469
case 'b' : return '\b' ;
471
470
case 'f' : return '\f' ;
472
471
case 'n' : return '\n' ;
473
472
case 'r' : return '\r' ;
474
473
case 't' : return '\t' ;
475
- case 'u' : return parseHex (4 );
474
+ case 'u' :
475
+ return (flags .contains ("u" ) && pos < limit && pattern .charAt (pos ) == '{' )
476
+ ? parseBracedUnicodeEscape ()
477
+ : parseHex (4 );
476
478
case 'v' : return '\u000b' ;
477
479
case 'x' : return parseHex (2 );
478
480
default :
@@ -599,18 +601,23 @@ private RegExpTree parseEscape() {
599
601
++pos ;
600
602
return new Charset (charGroup , CharRanges .EMPTY );
601
603
}
602
- return new Text ("" + parseEscapeChar ());
604
+ return new Text (new String ( Character . toChars ( parseEscapeChar ()) ));
603
605
}
604
606
}
605
607
606
- /**
607
- * Parses n hex digits to a code-unit.
608
- */
609
- private char parseHex (int n ) {
608
+ /** Parses n hex digits (at most 7) to an integer, which may be a code unit or a full code point. */
609
+ private int parseHex (int n ) {
610
610
if (pos + n > limit ) {
611
611
throw new IllegalArgumentException (
612
612
"Abbreviated hex escape " + pattern .substring (pos ));
613
613
}
614
+ if (n > 7 ) {
615
+ // We need to guard the MSB to prevent overflow.
616
+ throw new IllegalArgumentException (
617
+ "Cannot parse hexadecimal encoding wider than 28 bits: "
618
+ + pattern .substring (pos , pos + n ));
619
+ }
620
+
614
621
int result = 0 ;
615
622
while (--n >= 0 ) {
616
623
char ch = pattern .charAt (pos );
@@ -627,7 +634,31 @@ private char parseHex(int n) {
627
634
++pos ;
628
635
result = (result << 4 ) | digit ;
629
636
}
630
- return (char ) result ;
637
+ return result ;
638
+ }
639
+
640
+ private int parseBracedUnicodeEscape () {
641
+ int openBrace = pos ;
642
+ checkState (pattern .charAt (pos ++) == '{' );
643
+
644
+ int closeBrace = pos ;
645
+ while (closeBrace < limit && pattern .charAt (closeBrace ) != '}' ) {
646
+ closeBrace ++;
647
+ }
648
+ if (closeBrace == limit ) {
649
+ throw new IllegalArgumentException (
650
+ "Malformed unicode escape: expected '}' after " + pattern .substring (openBrace ));
651
+ } else if (closeBrace == pos ) {
652
+ throw new IllegalArgumentException ("Empty unicode escape" );
653
+ }
654
+
655
+ int result = parseHex (closeBrace - pos );
656
+ if (result > 0x10FFFF ) {
657
+ throw new IllegalArgumentException (
658
+ "Unicode must be at most 0x10FFFF: " + pattern .substring (openBrace + 1 , pos ));
659
+ }
660
+ pos ++; // Consume the close brace.
661
+ return result ;
631
662
}
632
663
633
664
private boolean isRepetitionStart (char ch ) {
0 commit comments