Skip to content

Commit 96b6c5f

Browse files
Julian Rosse, lydell
Julian Rosse
authored and committed
Fix #4248: Unicode code point escapes (#4498)
1 parent bfce054 commit 96b6c5f

File tree

5 files changed

+241
-24
lines changed

5 files changed

+241
-24
lines changed

lib/coffee-script/lexer.js

+62 -13
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/lexer.coffee

+46 -11
Original file line number | Diff line number | Diff line change
@@ -261,14 +261,14 @@ exports.Lexer = class Lexer
261261
indent = attempt if indent is null or 0 < attempt.length < indent.length
262262
indentRegex = /// \n#{indent} ///g if indent
263263
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
264-
value = @formatString value
264+
value = @formatString value, delimiter: quote
265265
value = value.replace indentRegex, '\n' if indentRegex
266266
value = value.replace LEADING_BLANK_LINE, '' if i is 0
267267
value = value.replace TRAILING_BLANK_LINE, '' if i is $
268268
value
269269
else
270270
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
271-
value = @formatString value
271+
value = @formatString value, delimiter: quote
272272
value = value.replace SIMPLE_STRING_OMIT, (match, offset) ->
273273
if (i is 0 and offset is 0) or
274274
(i is $ and offset + match.length is value.length)
@@ -318,6 +318,7 @@ exports.Lexer = class Lexer
318318
when match = REGEX.exec @chunk
319319
[regex, body, closed] = match
320320
@validateEscapes body, isRegex: yes, offsetInChunk: 1
321+
body = @formatRegex body, delimiter: '/'
321322
index = regex.length
322323
[..., prev] = @tokens
323324
if prev
@@ -632,7 +633,7 @@ exports.Lexer = class Lexer
632633
tokensToPush = value
633634
when 'NEOSTRING'
634635
# Convert 'NEOSTRING' into 'STRING'.
635-
converted = fn token[1], i
636+
converted = fn.call this, token[1], i
636637
# Optimize out empty strings. We ensure that the tokens stream always
637638
# starts with a string token, though, to make sure that the result
638639
# really is a string.
@@ -762,11 +763,37 @@ exports.Lexer = class Lexer
762763
'**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||',
763764
'BIN?', 'THROW', 'EXTENDS']
764765

765-
formatString: (str) ->
766-
str.replace STRING_OMIT, '$1'
766+
formatString: (str, options) ->
767+
@replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options
767768

768769
formatHeregex: (str) ->
769-
str.replace HEREGEX_OMIT, '$1$2'
770+
@formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///'
771+
772+
formatRegex: (str, options) ->
773+
@replaceUnicodeCodePointEscapes str, options
774+
775+
unicodeCodePointToUnicodeEscapes: (codePoint) ->
776+
toUnicodeEscape = (val) ->
777+
str = val.toString 16
778+
"\\u#{repeat '0', 4 - str.length}#{str}"
779+
return toUnicodeEscape(codePoint) if codePoint < 0x10000
780+
# surrogate pair
781+
high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800
782+
low = (codePoint - 0x10000) % 0x400 + 0xDC00
783+
"#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}"
784+
785+
# Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes
786+
replaceUnicodeCodePointEscapes: (str, options) ->
787+
str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) =>
788+
return escapedBackslash if escapedBackslash
789+
790+
codePointDecimal = parseInt codePointHex, 16
791+
if codePointDecimal > 0x10ffff
792+
@error "unicode code point escapes greater than \\u{10ffff} are not allowed",
793+
offset: offset + options.delimiter.length
794+
length: codePointHex.length + 4
795+
796+
@unicodeCodePointToUnicodeEscapes codePointDecimal
770797

771798
# Validates escapes in strings and regexes.
772799
validateEscapes: (str, options = {}) ->
@@ -777,13 +804,13 @@ exports.Lexer = class Lexer
777804
STRING_INVALID_ESCAPE
778805
match = invalidEscapeRegex.exec str
779806
return unless match
780-
[[], before, octal, hex, unicode] = match
807+
[[], before, octal, hex, unicodeCodePoint, unicode] = match
781808
message =
782809
if octal
783810
"octal escape sequences are not allowed"
784811
else
785812
"invalid escape sequence"
786-
invalidEscape = "\\#{octal or hex or unicode}"
813+
invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}"
787814
@error "#{message} #{invalidEscape}",
788815
offset: (options.offsetInChunk ? 0) + match.index + before.length
789816
length: invalidEscape.length
@@ -970,7 +997,7 @@ REGEX = /// ^
970997
///
971998

972999
REGEX_FLAGS = /^\w*/
973-
VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/
1000+
VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/
9741001

9751002
HEREGEX = /// ^(?: [^\\/#] | \\[\s\S] | /(?!//) | \#(?!\{) )* ///
9761003

@@ -994,18 +1021,26 @@ STRING_INVALID_ESCAPE = ///
9941021
\\ (
9951022
?: (0[0-7]|[1-7]) # octal escape
9961023
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
997-
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
1024+
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
1025+
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
9981026
)
9991027
///
10001028
REGEX_INVALID_ESCAPE = ///
10011029
( (?:^|[^\\]) (?:\\\\)* ) # make sure the escape isn’t escaped
10021030
\\ (
10031031
?: (0[0-7]) # octal escape
10041032
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
1005-
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
1033+
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
1034+
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
10061035
)
10071036
///
10081037
1038+
UNICODE_CODE_POINT_ESCAPE = ///
1039+
( \\\\ ) # make sure the escape isn’t escaped
1040+
|
1041+
\\u\{ ( [\da-fA-F]+ ) \}
1042+
///g
1043+
10091044
LEADING_BLANK_LINE = /^[^\n\S]*\n/
10101045
TRAILING_BLANK_LINE = /\n[^\n\S]*$/
10111046

test/error_messages.coffee

+62
Original file line number | Diff line number | Diff line change
@@ -1257,3 +1257,65 @@ test "can't use pattern matches for loop indices", ->
12571257
a for b, {c} in d
12581258
^^^
12591259
'''
1260+
1261+
test "#4248: Unicode code point escapes", ->
1262+
assertErrorFormat '''
1263+
"a
1264+
#{b} \\u{G02}
1265+
c"
1266+
''', '''
1267+
[stdin]:2:8: error: invalid escape sequence \\u{G02}
1268+
#{b} \\u{G02}
1269+
^\^^^^^^
1270+
'''
1271+
assertErrorFormat '''
1272+
/a\\u{}b/
1273+
''', '''
1274+
[stdin]:1:3: error: invalid escape sequence \\u{}
1275+
/a\\u{}b/
1276+
^\^^^
1277+
'''
1278+
assertErrorFormat '''
1279+
///a \\u{01abc///
1280+
''', '''
1281+
[stdin]:1:6: error: invalid escape sequence \\u{01abc
1282+
///a \\u{01abc///
1283+
^\^^^^^^^
1284+
'''
1285+
1286+
assertErrorFormat '''
1287+
/\\u{123} \\u{110000}/
1288+
''', '''
1289+
[stdin]:1:10: error: unicode code point escapes greater than \\u{10ffff} are not allowed
1290+
/\\u{123} \\u{110000}/
1291+
\ ^\^^^^^^^^^
1292+
'''
1293+
1294+
assertErrorFormat '''
1295+
///abc\\\\\\u{123456}///u
1296+
''', '''
1297+
[stdin]:1:9: error: unicode code point escapes greater than \\u{10ffff} are not allowed
1298+
///abc\\\\\\u{123456}///u
1299+
\ \^\^^^^^^^^^
1300+
'''
1301+
1302+
assertErrorFormat '''
1303+
"""
1304+
\\u{123}
1305+
a
1306+
\\u{00110000}
1307+
#{ 'b' }
1308+
"""
1309+
''', '''
1310+
[stdin]:4:5: error: unicode code point escapes greater than \\u{10ffff} are not allowed
1311+
\\u{00110000}
1312+
^\^^^^^^^^^^^
1313+
'''
1314+
1315+
assertErrorFormat '''
1316+
'\\u{a}\\u{1111110000}'
1317+
''', '''
1318+
[stdin]:1:7: error: unicode code point escapes greater than \\u{10ffff} are not allowed
1319+
'\\u{a}\\u{1111110000}'
1320+
\ ^\^^^^^^^^^^^^^
1321+
'''

test/regexps.coffee

+35
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,12 @@
66
# * Regexen
77
# * Heregexen
88

9+
# Helper function
10+
toJS = (str) ->
11+
CoffeeScript.compile str, bare: yes
12+
.replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace
13+
14+
915
test "basic regular expression literals", ->
1016
ok 'a'.match(/a/)
1117
ok 'a'.match /a/
@@ -286,3 +292,32 @@ test "#3795: Escape otherwise invalid characters", ->
286292
ok ///#{a}\
///.test 'a\u2029'
287293
ok ///#{a}\0
288294
1///.test 'a\x001'
295+
296+
test "#4248: Unicode code point escapes", ->
297+
ok /a\u{1ab}c/u.test 'a\u01abc'
298+
ok ///#{ 'a' }\u{000001ab}c///u.test 'a\u{1ab}c'
299+
ok ///a\u{000001ab}c///u.test 'a\u{1ab}c'
300+
ok /a\u{12345}c/u.test 'a\ud808\udf45c'
301+
302+
# and now without u flag
303+
ok /a\u{1ab}c/.test 'a\u01abc'
304+
ok ///#{ 'a' }\u{000001ab}c///.test 'a\u{1ab}c'
305+
ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
306+
ok /a\u{12345}c/.test 'a\ud808\udf45c'
307+
308+
# rewrite code point escapes
309+
input = """
310+
/\\u{bcdef}\\u{abc}/u
311+
"""
312+
output = """
313+
/\\udab3\\uddef\\u0abc/u;
314+
"""
315+
eq toJS(input), output
316+
317+
input = """
318+
///#{ 'a' }\\u{bcdef}///
319+
"""
320+
output = """
321+
/a\\udab3\\uddef/;
322+
"""
323+
eq toJS(input), output

0 commit comments

Comments
 (0)