Skip to content

Fix #4248: Unicode code point escapes #4498

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 62 additions & 13 deletions lib/coffee-script/lexer.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 46 additions & 11 deletions src/lexer.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,14 @@ exports.Lexer = class Lexer
indent = attempt if indent is null or 0 < attempt.length < indent.length
indentRegex = /// \n#{indent} ///g if indent
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
value = @formatString value
value = @formatString value, delimiter: quote
value = value.replace indentRegex, '\n' if indentRegex
value = value.replace LEADING_BLANK_LINE, '' if i is 0
value = value.replace TRAILING_BLANK_LINE, '' if i is $
value
else
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
value = @formatString value
value = @formatString value, delimiter: quote
value = value.replace SIMPLE_STRING_OMIT, (match, offset) ->
if (i is 0 and offset is 0) or
(i is $ and offset + match.length is value.length)
Expand Down Expand Up @@ -318,6 +318,7 @@ exports.Lexer = class Lexer
when match = REGEX.exec @chunk
[regex, body, closed] = match
@validateEscapes body, isRegex: yes, offsetInChunk: 1
body = @formatRegex body, delimiter: '/'
index = regex.length
[..., prev] = @tokens
if prev
Expand Down Expand Up @@ -632,7 +633,7 @@ exports.Lexer = class Lexer
tokensToPush = value
when 'NEOSTRING'
# Convert 'NEOSTRING' into 'STRING'.
converted = fn token[1], i
converted = fn.call this, token[1], i
# Optimize out empty strings. We ensure that the tokens stream always
# starts with a string token, though, to make sure that the result
# really is a string.
Expand Down Expand Up @@ -762,11 +763,37 @@ exports.Lexer = class Lexer
'**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||',
'BIN?', 'THROW', 'EXTENDS']

formatString: (str) ->
str.replace STRING_OMIT, '$1'
formatString: (str, options) ->
@replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options

formatHeregex: (str) ->
str.replace HEREGEX_OMIT, '$1$2'
@formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///'

# Post-process a regex body: hands `str` and `options` unchanged to
# `replaceUnicodeCodePointEscapes` and returns its result.
formatRegex: (str, options) ->
@replaceUnicodeCodePointEscapes str, options

# Convert a numeric Unicode code point into `\uXXXX` escape text.
# Code points below 0x10000 produce a single escape; larger code points
# produce two escapes (a high and a low surrogate) concatenated.
unicodeCodePointToUnicodeEscapes: (codePoint) ->
# Format one value as `\u` followed by its lowercase hex digits, prefixed
# with `repeat '0', 4 - str.length` (assumes `repeat`, defined outside this
# view, repeats a string n times so the result is zero-padded to four
# digits — TODO confirm).
toUnicodeEscape = (val) ->
str = val.toString 16
"\\u#{repeat '0', 4 - str.length}#{str}"
return toUnicodeEscape(codePoint) if codePoint < 0x10000
# surrogate pair
# High surrogate: offset from 0x10000 divided by 0x400, based at 0xD800.
high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800
# Low surrogate: offset from 0x10000 modulo 0x400, based at 0xDC00.
low = (codePoint - 0x10000) % 0x400 + 0xDC00
"#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}"

# Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes
replaceUnicodeCodePointEscapes: (str, options) ->
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found a bug that I hadn't tested for: adjacent code point escapes, e.g. "\u{a}\u{b}", were not both being processed because of the "before" section of UNICODE_CODE_POINT_ESCAPE (( (?:^|[^\\]) (?:\\\\)* ), which I copied from STRING_INVALID_ESCAPE). Basically, because that section requires a non-zero-length match (unless we're at the beginning of the string) to verify that we're not inside an "escaped escape", the desired matches, e.g. \u{a} and \u{b}, end up overlapping: the second match would have to be }\u{b}, but the } has already been consumed by the first match. I couldn't figure out how to make the regex avoid the overlap so that I could keep using str.replace(), so I switched to an (uglier) implementation using UNICODE_CODE_POINT_ESCAPE.exec() and some lastIndex tweaking.

In test/regexps.coffee and test/strings.coffee, I copied the toJS() helper from test/modules.coffee so that I could add some rewriting tests on the generated JS (since code point escapes and regular Unicode escapes seem indistinguishable to, e.g., eq()).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we not copy/paste this helper? You could maybe attach it to global in the runTests function in Cakefile.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@GeoffreyBooth Sure. This was already merged, so I opened a new pull request, #4522, where I refactored toJS() into test/support/helpers.coffee.

str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) =>
return escapedBackslash if escapedBackslash

codePointDecimal = parseInt codePointHex, 16
if codePointDecimal > 0x10ffff
@error "unicode code point escapes greater than \\u{10ffff} are not allowed",
offset: offset + options.delimiter.length
length: codePointHex.length + 4

@unicodeCodePointToUnicodeEscapes codePointDecimal

# Validates escapes in strings and regexes.
validateEscapes: (str, options = {}) ->
Expand All @@ -777,13 +804,13 @@ exports.Lexer = class Lexer
STRING_INVALID_ESCAPE
match = invalidEscapeRegex.exec str
return unless match
[[], before, octal, hex, unicode] = match
[[], before, octal, hex, unicodeCodePoint, unicode] = match
message =
if octal
"octal escape sequences are not allowed"
else
"invalid escape sequence"
invalidEscape = "\\#{octal or hex or unicode}"
invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}"
@error "#{message} #{invalidEscape}",
offset: (options.offsetInChunk ? 0) + match.index + before.length
length: invalidEscape.length
Expand Down Expand Up @@ -970,7 +997,7 @@ REGEX = /// ^
///

REGEX_FLAGS = /^\w*/
VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/
VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/

HEREGEX = /// ^(?: [^\\/#] | \\[\s\S] | /(?!//) | \#(?!\{) )* ///

Expand All @@ -994,18 +1021,26 @@ STRING_INVALID_ESCAPE = ///
\\ (
?: (0[0-7]|[1-7]) # octal escape
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
)
///
REGEX_INVALID_ESCAPE = ///
( (?:^|[^\\]) (?:\\\\)* ) # make sure the escape isn’t escaped
\\ (
?: (0[0-7]) # octal escape
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
)
///

UNICODE_CODE_POINT_ESCAPE = ///
( \\\\ ) # make sure the escape isn’t escaped
|
\\u\{ ( [\da-fA-F]+ ) \}
///g

LEADING_BLANK_LINE = /^[^\n\S]*\n/
TRAILING_BLANK_LINE = /\n[^\n\S]*$/

Expand Down
62 changes: 62 additions & 0 deletions test/error_messages.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -1257,3 +1257,65 @@ test "can't use pattern matches for loop indices", ->
a for b, {c} in d
^^^
'''

test "#4248: Unicode code point escapes", ->
assertErrorFormat '''
"a
#{b} \\u{G02}
c"
''', '''
[stdin]:2:8: error: invalid escape sequence \\u{G02}
#{b} \\u{G02}
^\^^^^^^
'''
assertErrorFormat '''
/a\\u{}b/
''', '''
[stdin]:1:3: error: invalid escape sequence \\u{}
/a\\u{}b/
^\^^^
'''
assertErrorFormat '''
///a \\u{01abc///
''', '''
[stdin]:1:6: error: invalid escape sequence \\u{01abc
///a \\u{01abc///
^\^^^^^^^
'''

assertErrorFormat '''
/\\u{123} \\u{110000}/
''', '''
[stdin]:1:10: error: unicode code point escapes greater than \\u{10ffff} are not allowed
/\\u{123} \\u{110000}/
\ ^\^^^^^^^^^
'''

assertErrorFormat '''
///abc\\\\\\u{123456}///u
''', '''
[stdin]:1:9: error: unicode code point escapes greater than \\u{10ffff} are not allowed
///abc\\\\\\u{123456}///u
\ \^\^^^^^^^^^
'''

assertErrorFormat '''
"""
\\u{123}
a
\\u{00110000}
#{ 'b' }
"""
''', '''
[stdin]:4:5: error: unicode code point escapes greater than \\u{10ffff} are not allowed
\\u{00110000}
^\^^^^^^^^^^^
'''

assertErrorFormat '''
'\\u{a}\\u{1111110000}'
''', '''
[stdin]:1:7: error: unicode code point escapes greater than \\u{10ffff} are not allowed
'\\u{a}\\u{1111110000}'
\ ^\^^^^^^^^^^^^^
'''
35 changes: 35 additions & 0 deletions test/regexps.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
# * Regexen
# * Heregexen

# Helper function
toJS = (str) ->
CoffeeScript.compile str, bare: yes
.replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace


test "basic regular expression literals", ->
ok 'a'.match(/a/)
ok 'a'.match /a/
Expand Down Expand Up @@ -286,3 +292,32 @@ test "#3795: Escape otherwise invalid characters", ->
ok ///#{a}\
///.test 'a\u2029'
ok ///#{a}\0
1///.test 'a\x001'

test "#4248: Unicode code point escapes", ->
ok /a\u{1ab}c/u.test 'a\u01abc'
ok ///#{ 'a' }\u{000001ab}c///u.test 'a\u{1ab}c'
ok ///a\u{000001ab}c///u.test 'a\u{1ab}c'
ok /a\u{12345}c/u.test 'a\ud808\udf45c'

# and now without u flag
ok /a\u{1ab}c/.test 'a\u01abc'
ok ///#{ 'a' }\u{000001ab}c///.test 'a\u{1ab}c'
ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
ok /a\u{12345}c/.test 'a\ud808\udf45c'

# rewrite code point escapes
input = """
/\\u{bcdef}\\u{abc}/u
"""
output = """
/\\udab3\\uddef\\u0abc/u;
"""
eq toJS(input), output

input = """
///#{ 'a' }\\u{bcdef}///
"""
output = """
/a\\udab3\\uddef/;
"""
eq toJS(input), output
Loading