Skip to content

Commit 95f3b9a

Browse files
committed
Improve unicode escape in regex
1 parent ca81032 commit 95f3b9a

File tree

3 files changed

+84
-5
lines changed

3 files changed

+84
-5
lines changed

src/com/google/javascript/jscomp/regex/RegExpTree.java

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ private RegExpTree parseCharset() {
399399
CharRanges ieExplicits = CharRanges.EMPTY;
400400
while (pos < limit && pattern.charAt(pos) != ']') {
401401
char ch = pattern.charAt(pos);
402-
char start;
402+
int start;
403403
if (ch == '\\') {
404404
++pos;
405405
char possibleGroupName = pattern.charAt(pos);
@@ -414,7 +414,7 @@ private RegExpTree parseCharset() {
414414
start = ch;
415415
++pos;
416416
}
417-
char end = start;
417+
int end = start;
418418
if (pos + 1 < limit && pattern.charAt(pos) == '-'
419419
&& pattern.charAt(pos + 1) != ']') {
420420
++pos;
@@ -464,15 +464,20 @@ private RegExpTree parseCharset() {
464464
* contexts, so contexts must filter those instead.
465465
* E.g. '\b' means a different thing inside a charset than without.
466466
*/
467-
private char parseEscapeChar() {
467+
private int parseEscapeChar() {
468468
char ch = pattern.charAt(pos++);
469469
switch (ch) {
470470
case 'b': return '\b';
471471
case 'f': return '\f';
472472
case 'n': return '\n';
473473
case 'r': return '\r';
474474
case 't': return '\t';
475-
case 'u': return parseHex(4);
475+
case 'u':
476+
if (flags.contains("u") && pos < limit && pattern.charAt(pos) == '{') {
477+
return parseUnicodeEscape();
478+
} else {
479+
return parseHex(4);
480+
}
476481
case 'v': return '\u000b';
477482
case 'x': return parseHex(2);
478483
default:
@@ -599,7 +604,7 @@ private RegExpTree parseEscape() {
599604
++pos;
600605
return new Charset(charGroup, CharRanges.EMPTY);
601606
}
602-
return new Text("" + parseEscapeChar());
607+
return new Text(new String(Character.toChars(parseEscapeChar())));
603608
}
604609
}
605610

@@ -630,6 +635,42 @@ private char parseHex(int n) {
630635
return (char) result;
631636
}
632637

638+
private int parseUnicodeEscape() {
639+
checkState(pattern.charAt(pos) == '{');
640+
int start = pos++;
641+
int result = 0;
642+
char ch = pattern.charAt(pos);
643+
if (ch == '}') {
644+
throw new IllegalArgumentException("Invalid unicode escape: "
645+
+ pattern.substring(start, ++pos));
646+
}
647+
while (pos < limit) {
648+
int digit;
649+
ch = pattern.charAt(pos++);
650+
if ('0' <= ch && ch <= '9') {
651+
digit = ch - '0';
652+
} else if ('a' <= ch && ch <= 'f') {
653+
digit = ch + (10 - 'a');
654+
} else if ('A' <= ch && ch <= 'F') {
655+
digit = ch + (10 - 'A');
656+
} else if (ch == '}') {
657+
break;
658+
} else {
659+
throw new IllegalArgumentException("Invalid character in unicode escape: " + ch);
660+
}
661+
result = (result << 4) | digit;
662+
}
663+
if (ch != '}') {
664+
throw new IllegalArgumentException("Malformed unicode escape: expected '}' after "
665+
+ pattern.substring(start, pos));
666+
}
667+
if (result > 0x10FFFF) {
668+
throw new IllegalArgumentException("Unicode must not greater than 0x10FFFF: "
669+
+ pattern.substring(start, pos));
670+
}
671+
return result;
672+
}
673+
633674
private boolean isRepetitionStart(char ch) {
634675
switch (ch) {
635676
case '?':

test/com/google/javascript/jscomp/parsing/ParserTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4894,6 +4894,14 @@ public void testRegExpError() {
48944894
parseError("/\b.\\/", "Expected '/' in regular expression literal");
48954895
}
48964896

4897+
@Test
4898+
public void testRegExpUnicode() {
4899+
assertNodeEquality(parse("/\\u10fA/"), script(expr(regex("\\u10fA"))));
4900+
assertNodeEquality(parse("/\\u{10fA}/u"), script(expr(regex("\\u{10fA}", "u"))));
4901+
assertNodeEquality(parse("/\\u{1fA}/u"), script(expr(regex("\\u{1fA}", "u"))));
4902+
assertNodeEquality(parse("/\\u{10FFFF}/u"), script(expr(regex("\\u{10FFFF}", "u"))));
4903+
}
4904+
48974905
@Test
48984906
public void testRegExpFlags() {
48994907
// Various valid combinations.
@@ -6556,6 +6564,10 @@ private static Node regex(String regex) {
65566564
return new Node(Token.REGEXP, Node.newString(regex));
65576565
}
65586566

6567+
private static Node regex(String regex, String flag) {
6568+
return new Node(Token.REGEXP, Node.newString(regex), Node.newString(flag));
6569+
}
6570+
65596571
/**
65606572
* Verify that the given code has the given parse errors.
65616573
* @return If in IDE mode, returns a partial tree.

test/com/google/javascript/jscomp/regex/RegExpTreeTest.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,4 +193,30 @@ public void testBackreferencingTreatedAsStringIfNoGroup() {
193193
// (?: ) in expected output serves same purpose as above test
194194
assertRegexCompilesTo("[(?<foo>)]\\k<foo>", "", "(?:[()<>?fo]k)<foo>");
195195
}
196+
197+
@Test
198+
public void testValidUnicodeEscape() {
199+
assertRegexCompilesTo("\\u0061", "", "a");
200+
assertRegexCompilesTo("\\u10b1", "u", "\\u10b1");
201+
assertRegexCompilesTo("\\u{61}", "u", "a");
202+
assertRegexCompilesTo("\\u{10b1}", "u", "\\u10b1");
203+
assertRegexCompilesTo("\\u{1bc}", "u", "\\u01bc");
204+
assertRegexCompilesTo("\\u{100A3}", "u", "\\ud800\\udca3");
205+
}
206+
207+
@Test
208+
public void testInvalidUnicodeEscape() {
209+
assertRegexThrowsExceptionThat("\\u{a012", "u")
210+
.hasMessageThat()
211+
.isEqualTo("Malformed unicode escape: expected '}' after {a012");
212+
assertRegexThrowsExceptionThat("\\u{}", "u")
213+
.hasMessageThat()
214+
.isEqualTo("Invalid unicode escape: {}");
215+
assertRegexThrowsExceptionThat("\\u{10za}", "u")
216+
.hasMessageThat()
217+
.isEqualTo("Invalid character in unicode escape: z");
218+
assertRegexThrowsExceptionThat("\\u{FFFFFF}", "u")
219+
.hasMessageThat()
220+
.isEqualTo("Unicode must not greater than 0x10FFFF: {FFFFFF}");
221+
}
196222
}

0 commit comments

Comments
 (0)