Keep unicode code point escapes as is when possible (#4520)

Julian Rosse · lydell · commit 7ef5cb4a1fe8 · 2017-04-25T19:15:08.000+02:00
diff --git a/lib/coffeescript/lexer.js b/lib/coffeescript/lexer.js
diff --git a/src/lexer.coffee b/src/lexer.coffee
@@ -12,7 +12,7 @@
 {Rewriter, INVERSES} = require './rewriter'
 
 # Import the helpers we need.
-{count, starts, compact, repeat, invertLiterate,
+{count, starts, compact, repeat, invertLiterate, merge,
 locationDataToString,  throwSyntaxError} = require './helpers'
 
 # The Lexer Class
@@ -330,7 +330,6 @@ exports.Lexer = class Lexer
       when match = REGEX.exec @chunk
         [regex, body, closed] = match
         @validateEscapes body, isRegex: yes, offsetInChunk: 1
-        body = @formatRegex body, delimiter: '/'
         index = regex.length
         prev = @prev()
         if prev
@@ -349,13 +348,17 @@ exports.Lexer = class Lexer
       when not VALID_FLAGS.test flags
         @error "invalid regular expression flags #{flags}", offset: index, length: flags.length
       when regex or tokens.length is 1
-        body ?= @formatHeregex tokens[0][1]
+        if body
+          body = @formatRegex body, { flags, delimiter: '/' }
+        else
+          body = @formatHeregex tokens[0][1], { flags }
         @token 'REGEX', "#{@makeDelimitedLiteral body, delimiter: '/'}#{flags}", 0, end, origin
       else
         @token 'REGEX_START', '(', 0, 0, origin
         @token 'IDENTIFIER', 'RegExp', 0, 0
         @token 'CALL_START', '(', 0, 0
-        @mergeInterpolationTokens tokens, {delimiter: '"', double: yes}, @formatHeregex
+        @mergeInterpolationTokens tokens, {delimiter: '"', double: yes}, (str) =>
+          @formatHeregex str, { flags }
         if flags
           @token ',', ',', index - 1, 0
           @token 'STRING', '"' + flags + '"', index - 1, flags.length
@@ -792,8 +795,8 @@ exports.Lexer = class Lexer
   formatString: (str, options) ->
     @replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options
 
-  formatHeregex: (str) ->
-    @formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///'
+  formatHeregex: (str, options) ->
+    @formatRegex str.replace(HEREGEX_OMIT, '$1$2'), merge(options, delimiter: '///')
 
   formatRegex: (str, options) ->
     @replaceUnicodeCodePointEscapes str, options
@@ -808,8 +811,9 @@ exports.Lexer = class Lexer
     low = (codePoint - 0x10000) % 0x400 + 0xDC00
     "#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}"
 
-  # Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes
+  # Replace \u{...} with \uxxxx[\uxxxx] in regexes without `u` flag
   replaceUnicodeCodePointEscapes: (str, options) ->
+    shouldReplace = options.flags? and 'u' not in options.flags
     str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) =>
       return escapedBackslash if escapedBackslash
 
@@ -818,6 +822,7 @@ exports.Lexer = class Lexer
         @error "unicode code point escapes greater than \\u{10ffff} are not allowed",
           offset: offset + options.delimiter.length
           length: codePointHex.length + 4
+      return match unless shouldReplace
 
       @unicodeCodePointToUnicodeEscapes codePointDecimal
 
diff --git a/test/regexps.coffee b/test/regexps.coffee
@@ -305,12 +305,12 @@ test "#4248: Unicode code point escapes", ->
   ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
   ok /a\u{12345}c/.test 'a\ud808\udf45c'
 
-  # rewrite code point escapes
+  # rewrite code point escapes unless u flag is set
   input = """
     /\\u{bcdef}\\u{abc}/u
     """
   output = """
-    /\\udab3\\uddef\\u0abc/u;
+    /\\u{bcdef}\\u{abc}/u;
   """
   eq toJS(input), output
 
diff --git a/test/strings.coffee b/test/strings.coffee
@@ -420,19 +420,19 @@ test "#4248: Unicode code point escapes", ->
   eq '\udab3\uddefc', """\u{bcdef}#{ 'c' }"""
   eq '\\u{123456}', "#{'\\'}#{'u{123456}'}"
 
-  # rewrite code point escapes
+  # don't rewrite code point escapes
   input = """
     '\\u{bcdef}\\u{abc}'
     """
   output = """
-    '\\udab3\\uddef\\u0abc';
+    '\\u{bcdef}\\u{abc}';
   """
   eq toJS(input), output
 
   input = """
     "#{ 'a' }\\u{bcdef}"
     """
   output = """
-    "a\\udab3\\uddef";
+    "a\\u{bcdef}";
   """
   eq toJS(input), output