Skip to content

Commit 114fff4

Browse files
Merge pull request #3656 from HenryRLee:regexUnicode
PiperOrigin-RevId: 326105468
2 parents 508a2c1 + 5345e2b commit 114fff4

File tree

3 files changed

+86
-14
lines changed

3 files changed

+86
-14
lines changed

src/com/google/javascript/jscomp/regex/RegExpTree.java

Lines changed: 45 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -399,7 +399,7 @@ private RegExpTree parseCharset() {
399399
CharRanges ieExplicits = CharRanges.EMPTY;
400400
while (pos < limit && pattern.charAt(pos) != ']') {
401401
char ch = pattern.charAt(pos);
402-
char start;
402+
int start;
403403
if (ch == '\\') {
404404
++pos;
405405
char possibleGroupName = pattern.charAt(pos);
@@ -414,7 +414,7 @@ private RegExpTree parseCharset() {
414414
start = ch;
415415
++pos;
416416
}
417-
char end = start;
417+
int end = start;
418418
if (pos + 1 < limit && pattern.charAt(pos) == '-'
419419
&& pattern.charAt(pos + 1) != ']') {
420420
++pos;
@@ -459,20 +459,22 @@ private RegExpTree parseCharset() {
459459
}
460460

461461
/**
462-
* Parses an escape to a code point.
463-
* Some of the characters parsed here have special meanings in various
464-
* contexts, so contexts must filter those instead.
465-
* E.g. '\b' means a different thing inside a charset than without.
462+
* Parses an escape to a code point. Some of the characters parsed here have special meanings
463+
* in various contexts, so contexts must filter those instead. E.g. '\b' means a different
464+
* thing inside a charset than without.
466465
*/
467-
private char parseEscapeChar() {
466+
private int parseEscapeChar() {
468467
char ch = pattern.charAt(pos++);
469468
switch (ch) {
470469
case 'b': return '\b';
471470
case 'f': return '\f';
472471
case 'n': return '\n';
473472
case 'r': return '\r';
474473
case 't': return '\t';
475-
case 'u': return parseHex(4);
474+
case 'u':
475+
return (flags.contains("u") && pos < limit && pattern.charAt(pos) == '{')
476+
? parseBracedUnicodeEscape()
477+
: parseHex(4);
476478
case 'v': return '\u000b';
477479
case 'x': return parseHex(2);
478480
default:
@@ -599,18 +601,23 @@ private RegExpTree parseEscape() {
599601
++pos;
600602
return new Charset(charGroup, CharRanges.EMPTY);
601603
}
602-
return new Text("" + parseEscapeChar());
604+
return new Text(new String(Character.toChars(parseEscapeChar())));
603605
}
604606
}
605607

606-
/**
607-
* Parses n hex digits to a code-unit.
608-
*/
609-
private char parseHex(int n) {
608+
/** Parses n hex digits (at most 7) to an int value; the result may exceed one UTF-16 code unit. */
609+
private int parseHex(int n) {
610610
if (pos + n > limit) {
611611
throw new IllegalArgumentException(
612612
"Abbreviated hex escape " + pattern.substring(pos));
613613
}
614+
if (n > 7) {
615+
// We need to guard the MSB to prevent overflow.
616+
throw new IllegalArgumentException(
617+
"Cannot parse hexadecimal encoding wider than 28 bits: "
618+
+ pattern.substring(pos, pos + n));
619+
}
620+
614621
int result = 0;
615622
while (--n >= 0) {
616623
char ch = pattern.charAt(pos);
@@ -627,7 +634,31 @@ private char parseHex(int n) {
627634
++pos;
628635
result = (result << 4) | digit;
629636
}
630-
return (char) result;
637+
return result;
638+
}
639+
640+
private int parseBracedUnicodeEscape() {
641+
int openBrace = pos;
642+
checkState(pattern.charAt(pos++) == '{');
643+
644+
int closeBrace = pos;
645+
while (closeBrace < limit && pattern.charAt(closeBrace) != '}') {
646+
closeBrace++;
647+
}
648+
if (closeBrace == limit) {
649+
throw new IllegalArgumentException(
650+
"Malformed unicode escape: expected '}' after " + pattern.substring(openBrace));
651+
} else if (closeBrace == pos) {
652+
throw new IllegalArgumentException("Empty unicode escape");
653+
}
654+
655+
int result = parseHex(closeBrace - pos);
656+
if (result > 0x10FFFF) {
657+
throw new IllegalArgumentException(
658+
"Unicode must be at most 0x10FFFF: " + pattern.substring(openBrace + 1, pos));
659+
}
660+
pos++; // Consume the close brace.
661+
return result;
631662
}
632663

633664
private boolean isRepetitionStart(char ch) {

test/com/google/javascript/jscomp/parsing/ParserTest.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4894,6 +4894,14 @@ public void testRegExpError() {
48944894
parseError("/\b.\\/", "Expected '/' in regular expression literal");
48954895
}
48964896

4897+
/**
 * Parses regex literals that contain a four-digit unicode escape (no flag) and braced unicode
 * escapes (with the "u" flag) and checks that each yields a REGEXP node carrying the escape
 * text unchanged, plus the flag string when one is given.
 */
@Test
4898+
public void testRegExpUnicode() {
4899+
assertNodeEquality(parse("/\\u10fA/"), script(expr(regex("\\u10fA"))));
4900+
assertNodeEquality(parse("/\\u{10fA}/u"), script(expr(regex("\\u{10fA}", "u"))));
4901+
assertNodeEquality(parse("/\\u{1fA}/u"), script(expr(regex("\\u{1fA}", "u"))));
4902+
assertNodeEquality(parse("/\\u{10FFFF}/u"), script(expr(regex("\\u{10FFFF}", "u"))));
4903+
}
4904+
48974905
@Test
48984906
public void testRegExpFlags() {
48994907
// Various valid combinations.
@@ -6556,6 +6564,10 @@ private static Node regex(String regex) {
65566564
return new Node(Token.REGEXP, Node.newString(regex));
65576565
}
65586566

6567+
/** Builds a REGEXP node with two string children: the given pattern text and the flag text. */
private static Node regex(String regex, String flag) {
6568+
return new Node(Token.REGEXP, Node.newString(regex), Node.newString(flag));
6569+
}
6570+
65596571
/**
65606572
* Verify that the given code has the given parse errors.
65616573
* @return If in IDE mode, returns a partial tree.

test/com/google/javascript/jscomp/regex/RegExpTreeTest.java

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,4 +193,33 @@ public void testBackreferencingTreatedAsStringIfNoGroup() {
193193
// (?: ) in expected output serves same purpose as above test
194194
assertRegexCompilesTo("[(?<foo>)]\\k<foo>", "", "(?:[()<>?fo]k)<foo>");
195195
}
196+
197+
/**
 * Checks the compiled output for well-formed unicode escapes, both four-digit and braced.
 * The expected strings show three outcomes: a plain character ("a"), a four-digit escape
 * (zero-padded when the braced form had fewer digits), and for U+100A3 a pair of surrogate
 * escapes.
 */
@Test
198+
public void testValidUnicodeEscape() {
199+
assertRegexCompilesTo("\\u0061", "", "a");
200+
assertRegexCompilesTo("\\u10b1", "u", "\\u10b1");
201+
assertRegexCompilesTo("\\u{61}", "u", "a");
202+
assertRegexCompilesTo("\\u{10b1}", "u", "\\u10b1");
203+
assertRegexCompilesTo("\\u{1bc}", "u", "\\u01bc");
204+
assertRegexCompilesTo("\\u{100A3}", "u", "\\ud800\\udca3");
205+
}
206+
207+
/**
 * Checks the exact exception message for each kind of malformed braced unicode escape, in
 * this order: missing close brace, empty braces, a non-hex character among the digits, a
 * value above 0x10FFFF, and an eight-digit value that trips the 28-bit width guard.
 */
@Test
208+
public void testInvalidUnicodeEscape() {
209+
assertRegexThrowsExceptionThat("\\u{a012", "u")
210+
.hasMessageThat()
211+
.isEqualTo("Malformed unicode escape: expected '}' after {a012");
212+
assertRegexThrowsExceptionThat("\\u{}", "u") //
213+
.hasMessageThat()
214+
.isEqualTo("Empty unicode escape");
215+
assertRegexThrowsExceptionThat("\\u{10za}", "u") //
216+
.hasMessageThat()
217+
.isEqualTo("za}");
218+
assertRegexThrowsExceptionThat("\\u{FFFFFF}", "u")
219+
.hasMessageThat()
220+
.isEqualTo("Unicode must be at most 0x10FFFF: FFFFFF");
221+
assertRegexThrowsExceptionThat("\\u{FF00FFFF}", "u")
222+
.hasMessageThat()
223+
.isEqualTo("Cannot parse hexadecimal encoding wider than 28 bits: FF00FFFF");
224+
}
196225
}

0 commit comments

Comments
 (0)