Skip to content

Commit 7212912

Browse files
Merge pull request #2169 from Microsoft/withANameLikeUnicodeYoudThinkThereWouldntBeSoManyWaysToDoIt
Add support for extended Unicode escape sequences in strings and templates
2 parents e5a8deb + 5c5a489 commit 7212912

File tree

407 files changed

+2787
-77
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

407 files changed

+2787
-77
lines changed

src/compiler/core.ts

-23
Original file line numberDiff line numberDiff line change
@@ -623,29 +623,6 @@ module ts {
623623
"\u0085": "\\u0085" // nextLine
624624
};
625625

626-
/**
627-
* Based heavily on the abstract 'Quote'/ 'QuoteJSONString' operation from ECMA-262 (24.3.2.2),
628-
* but augmented for a few select characters.
629-
* Note that this doesn't actually wrap the input in double quotes.
630-
*/
631-
export function escapeString(s: string): string {
632-
// Prioritize '"' and '\'
633-
s = backslashOrDoubleQuote.test(s) ? s.replace(backslashOrDoubleQuote, getReplacement) : s;
634-
s = escapedCharsRegExp.test(s) ? s.replace(escapedCharsRegExp, getReplacement) : s;
635-
636-
return s;
637-
638-
function getReplacement(c: string) {
639-
return escapedCharsMap[c] || unicodeEscape(c);
640-
}
641-
642-
function unicodeEscape(c: string): string {
643-
var hexCharCode = c.charCodeAt(0).toString(16);
644-
var paddedHexCode = ("0000" + hexCharCode).slice(-4);
645-
return "\\u" + paddedHexCode;
646-
}
647-
}
648-
649626
export function getDefaultLibFileName(options: CompilerOptions): string {
650627
return options.target === ScriptTarget.ES6 ? "lib.es6.d.ts" : "lib.d.ts";
651628
}

src/compiler/diagnosticInformationMap.generated.ts

+2
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ module ts {
155155
Catch_clause_variable_name_must_be_an_identifier: { code: 1195, category: DiagnosticCategory.Error, key: "Catch clause variable name must be an identifier." },
156156
Catch_clause_variable_cannot_have_a_type_annotation: { code: 1196, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have a type annotation." },
157157
Catch_clause_variable_cannot_have_an_initializer: { code: 1197, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have an initializer." },
158+
An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive: { code: 1198, category: DiagnosticCategory.Error, key: "An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive." },
159+
Unterminated_Unicode_escape_sequence: { code: 1199, category: DiagnosticCategory.Error, key: "Unterminated Unicode escape sequence." },
158160
Duplicate_identifier_0: { code: 2300, category: DiagnosticCategory.Error, key: "Duplicate identifier '{0}'." },
159161
Initializer_of_instance_member_variable_0_cannot_reference_identifier_1_declared_in_the_constructor: { code: 2301, category: DiagnosticCategory.Error, key: "Initializer of instance member variable '{0}' cannot reference identifier '{1}' declared in the constructor." },
160162
Static_members_cannot_reference_class_type_parameters: { code: 2302, category: DiagnosticCategory.Error, key: "Static members cannot reference class type parameters." },

src/compiler/diagnosticMessages.json

+8-1
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,14 @@
611611
"category": "Error",
612612
"code": 1197
613613
},
614-
614+
"An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive.": {
615+
"category": "Error",
616+
"code": 1198
617+
},
618+
"Unterminated Unicode escape sequence.": {
619+
"category": "Error",
620+
"code": 1199
621+
},
615622
"Duplicate identifier '{0}'.": {
616623
"category": "Error",
617624
"code": 2300

src/compiler/emitter.ts

+50-16
Original file line numberDiff line numberDiff line change
@@ -2223,36 +2223,70 @@ module ts {
22232223
}
22242224
}
22252225

2226-
function isBinaryOrOctalIntegerLiteral(text: string): boolean {
2227-
if (text.length <= 0) {
2228-
return false;
2229-
}
2230-
2231-
if (text.charCodeAt(1) === CharacterCodes.B || text.charCodeAt(1) === CharacterCodes.b ||
2232-
text.charCodeAt(1) === CharacterCodes.O || text.charCodeAt(1) === CharacterCodes.o) {
2233-
return true;
2226+
function isBinaryOrOctalIntegerLiteral(node: LiteralExpression, text: string): boolean {
2227+
if (node.kind === SyntaxKind.NumericLiteral && text.length > 1) {
2228+
switch (text.charCodeAt(1)) {
2229+
case CharacterCodes.b:
2230+
case CharacterCodes.B:
2231+
case CharacterCodes.o:
2232+
case CharacterCodes.O:
2233+
return true;
2234+
}
22342235
}
2236+
22352237
return false;
22362238
}
22372239

22382240
function emitLiteral(node: LiteralExpression) {
2239-
var text = languageVersion < ScriptTarget.ES6 && isTemplateLiteralKind(node.kind) ? getTemplateLiteralAsStringLiteral(node) :
2240-
node.parent ? getSourceTextOfNodeFromSourceFile(currentSourceFile, node) :
2241-
node.text;
2241+
var text = getLiteralText(node);
2242+
22422243
if (compilerOptions.sourceMap && (node.kind === SyntaxKind.StringLiteral || isTemplateLiteralKind(node.kind))) {
22432244
writer.writeLiteral(text);
22442245
}
2245-
// For version below ES6, emit binary integer literal and octal integer literal in canonical form
2246-
else if (languageVersion < ScriptTarget.ES6 && node.kind === SyntaxKind.NumericLiteral && isBinaryOrOctalIntegerLiteral(text)) {
2246+
// For versions below ES6, emit binary & octal literals in their canonical decimal form.
2247+
else if (languageVersion < ScriptTarget.ES6 && isBinaryOrOctalIntegerLiteral(node, text)) {
22472248
write(node.text);
22482249
}
22492250
else {
22502251
write(text);
22512252
}
22522253
}
2253-
2254-
function getTemplateLiteralAsStringLiteral(node: LiteralExpression): string {
2255-
return '"' + escapeString(node.text) + '"';
2254+
2255+
function getLiteralText(node: LiteralExpression) {
2256+
// Any template literal or string literal with an extended escape
2257+
// (e.g. "\u{0067}") will need to be downleveled as an escaped string literal.
2258+
if (languageVersion < ScriptTarget.ES6 && (isTemplateLiteralKind(node.kind) || node.hasExtendedUnicodeEscape)) {
2259+
return getQuotedEscapedLiteralText('"', node.text, '"');
2260+
}
2261+
2262+
// If we don't need to downlevel and we can reach the original source text using
2263+
// the node's parent reference, then simply get the text as it was originally written.
2264+
if (node.parent) {
2265+
return getSourceTextOfNodeFromSourceFile(currentSourceFile, node);
2266+
}
2267+
2268+
// If we can't reach the original source text, use the canonical form if it's a number,
2269+
// or an escaped quoted form of the original text if it's string-like.
2270+
switch (node.kind) {
2271+
case SyntaxKind.StringLiteral:
2272+
return getQuotedEscapedLiteralText('"', node.text, '"');
2273+
case SyntaxKind.NoSubstitutionTemplateLiteral:
2274+
return getQuotedEscapedLiteralText('`', node.text, '`');
2275+
case SyntaxKind.TemplateHead:
2276+
return getQuotedEscapedLiteralText('`', node.text, '${');
2277+
case SyntaxKind.TemplateMiddle:
2278+
return getQuotedEscapedLiteralText('}', node.text, '${');
2279+
case SyntaxKind.TemplateTail:
2280+
return getQuotedEscapedLiteralText('}', node.text, '`');
2281+
case SyntaxKind.NumericLiteral:
2282+
return node.text;
2283+
}
2284+
2285+
Debug.fail(`Literal kind '${node.kind}' not accounted for.`);
2286+
}
2287+
2288+
function getQuotedEscapedLiteralText(leftQuote: string, text: string, rightQuote: string) {
2289+
return leftQuote + escapeNonAsciiCharacters(escapeString(text)) + rightQuote;
22562290
}
22572291

22582292
function emitDownlevelRawTemplateLiteral(node: LiteralExpression) {

src/compiler/parser.ts

+4
Original file line numberDiff line numberDiff line change
@@ -2163,6 +2163,10 @@ module ts {
21632163
var text = scanner.getTokenValue();
21642164
node.text = internName ? internIdentifier(text) : text;
21652165

2166+
if (scanner.hasExtendedUnicodeEscape()) {
2167+
node.hasExtendedUnicodeEscape = true;
2168+
}
2169+
21662170
if (scanner.isUnterminated()) {
21672171
node.isUnterminated = true;
21682172
}

src/compiler/scanner.ts

+97-13
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ module ts {
1414
getTokenPos(): number;
1515
getTokenText(): string;
1616
getTokenValue(): string;
17+
hasExtendedUnicodeEscape(): boolean;
1718
hasPrecedingLineBreak(): boolean;
1819
isIdentifier(): boolean;
1920
isReservedWord(): boolean;
@@ -556,6 +557,7 @@ module ts {
556557
var token: SyntaxKind;
557558
var tokenValue: string;
558559
var precedingLineBreak: boolean;
560+
var hasExtendedUnicodeEscape: boolean;
559561
var tokenIsUnterminated: boolean;
560562

561563
function error(message: DiagnosticMessage, length?: number): void {
@@ -606,11 +608,27 @@ module ts {
606608
}
607609
return +(text.substring(start, pos));
608610
}
611+
612+
/**
613+
* Scans the given number of hexadecimal digits in the text,
614+
* returning -1 if fewer than the given number of digits are available.
615+
*/
616+
function scanExactNumberOfHexDigits(count: number): number {
617+
return scanHexDigits(/*minCount*/ count, /*scanAsManyAsPossible*/ false);
618+
}
619+
620+
/**
621+
* Scans as many hexadecimal digits as are available in the text,
622+
* returning -1 if fewer than the given minimum number of digits were available.
623+
*/
624+
function scanMinimumNumberOfHexDigits(count: number): number {
625+
return scanHexDigits(/*minCount*/ count, /*scanAsManyAsPossible*/ true);
626+
}
609627

610-
function scanHexDigits(count: number, mustMatchCount?: boolean): number {
628+
function scanHexDigits(minCount: number, scanAsManyAsPossible: boolean): number {
611629
var digits = 0;
612630
var value = 0;
613-
while (digits < count || !mustMatchCount) {
631+
while (digits < minCount || scanAsManyAsPossible) {
614632
var ch = text.charCodeAt(pos);
615633
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
616634
value = value * 16 + ch - CharacterCodes._0;
@@ -627,7 +645,7 @@ module ts {
627645
pos++;
628646
digits++;
629647
}
630-
if (digits < count) {
648+
if (digits < minCount) {
631649
value = -1;
632650
}
633651
return value;
@@ -764,16 +782,20 @@ module ts {
764782
return "\'";
765783
case CharacterCodes.doubleQuote:
766784
return "\"";
767-
case CharacterCodes.x:
768785
case CharacterCodes.u:
769-
var ch = scanHexDigits(ch === CharacterCodes.x ? 2 : 4, /*mustMatchCount*/ true);
770-
if (ch >= 0) {
771-
return String.fromCharCode(ch);
772-
}
773-
else {
774-
error(Diagnostics.Hexadecimal_digit_expected);
775-
return ""
786+
// '\u{DDDDDDDD}'
787+
if (pos < len && text.charCodeAt(pos) === CharacterCodes.openBrace) {
788+
hasExtendedUnicodeEscape = true;
789+
pos++;
790+
return scanExtendedUnicodeEscape();
776791
}
792+
793+
// '\uDDDD'
794+
return scanHexadecimalEscape(/*numDigits*/ 4)
795+
796+
case CharacterCodes.x:
797+
// '\xDD'
798+
return scanHexadecimalEscape(/*numDigits*/ 2)
777799

778800
// when encountering a LineContinuation (i.e. a backslash and a line terminator sequence),
779801
// the line terminator is interpreted to be "the empty code unit sequence".
@@ -790,14 +812,74 @@ module ts {
790812
return String.fromCharCode(ch);
791813
}
792814
}
815+
816+
function scanHexadecimalEscape(numDigits: number): string {
817+
var escapedValue = scanExactNumberOfHexDigits(numDigits);
818+
819+
if (escapedValue >= 0) {
820+
return String.fromCharCode(escapedValue);
821+
}
822+
else {
823+
error(Diagnostics.Hexadecimal_digit_expected);
824+
return ""
825+
}
826+
}
827+
828+
function scanExtendedUnicodeEscape(): string {
829+
var escapedValue = scanMinimumNumberOfHexDigits(1);
830+
var isInvalidExtendedEscape = false;
831+
832+
// Validate the scanned escape value
833+
if (escapedValue < 0) {
834+
error(Diagnostics.Hexadecimal_digit_expected)
835+
isInvalidExtendedEscape = true;
836+
}
837+
else if (escapedValue > 0x10FFFF) {
838+
error(Diagnostics.An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive);
839+
isInvalidExtendedEscape = true;
840+
}
841+
842+
if (pos >= len) {
843+
error(Diagnostics.Unexpected_end_of_text);
844+
isInvalidExtendedEscape = true;
845+
}
846+
else if (text.charCodeAt(pos) == CharacterCodes.closeBrace) {
847+
// Only swallow the following character up if it's a '}'.
848+
pos++;
849+
}
850+
else {
851+
error(Diagnostics.Unterminated_Unicode_escape_sequence);
852+
isInvalidExtendedEscape = true;
853+
}
854+
855+
if (isInvalidExtendedEscape) {
856+
return "";
857+
}
858+
859+
return utf16EncodeAsString(escapedValue);
860+
}
861+
862+
// Derived from the 10.1.1 UTF16Encoding of the ES6 Spec.
863+
function utf16EncodeAsString(codePoint: number): string {
864+
Debug.assert(0x0 <= codePoint && codePoint <= 0x10FFFF);
865+
866+
if (codePoint <= 65535) {
867+
return String.fromCharCode(codePoint);
868+
}
869+
870+
var codeUnit1 = Math.floor((codePoint - 65536) / 1024) + 0xD800;
871+
var codeUnit2 = ((codePoint - 65536) % 1024) + 0xDC00;
872+
873+
return String.fromCharCode(codeUnit1, codeUnit2);
874+
}
793875

794876
// Current character is known to be a backslash. Check for Unicode escape of the form '\uXXXX'
795877
// and return code point value if valid Unicode escape is found. Otherwise return -1.
796878
function peekUnicodeEscape(): number {
797879
if (pos + 5 < len && text.charCodeAt(pos + 1) === CharacterCodes.u) {
798880
var start = pos;
799881
pos += 2;
800-
var value = scanHexDigits(4, /*mustMatchCount*/ true);
882+
var value = scanExactNumberOfHexDigits(4);
801883
pos = start;
802884
return value;
803885
}
@@ -869,6 +951,7 @@ module ts {
869951

870952
function scan(): SyntaxKind {
871953
startPos = pos;
954+
hasExtendedUnicodeEscape = false;
872955
precedingLineBreak = false;
873956
tokenIsUnterminated = false;
874957
while (true) {
@@ -1034,7 +1117,7 @@ module ts {
10341117
case CharacterCodes._0:
10351118
if (pos + 2 < len && (text.charCodeAt(pos + 1) === CharacterCodes.X || text.charCodeAt(pos + 1) === CharacterCodes.x)) {
10361119
pos += 2;
1037-
var value = scanHexDigits(1, /*mustMatchCount*/ false);
1120+
var value = scanMinimumNumberOfHexDigits(1);
10381121
if (value < 0) {
10391122
error(Diagnostics.Hexadecimal_digit_expected);
10401123
value = 0;
@@ -1336,6 +1419,7 @@ module ts {
13361419
getTokenPos: () => tokenPos,
13371420
getTokenText: () => text.substring(tokenPos, pos),
13381421
getTokenValue: () => tokenValue,
1422+
hasExtendedUnicodeEscape: () => hasExtendedUnicodeEscape,
13391423
hasPrecedingLineBreak: () => precedingLineBreak,
13401424
isIdentifier: () => token === SyntaxKind.Identifier || token > SyntaxKind.LastReservedWord,
13411425
isReservedWord: () => token >= SyntaxKind.FirstReservedWord && token <= SyntaxKind.LastReservedWord,

src/compiler/types.ts

+1
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,7 @@ module ts {
655655
export interface LiteralExpression extends PrimaryExpression {
656656
text: string;
657657
isUnterminated?: boolean;
658+
hasExtendedUnicodeEscape?: boolean;
658659
}
659660

660661
export interface StringLiteralExpression extends LiteralExpression {

0 commit comments

Comments
 (0)