Fix spans generated for HTML with higher-plane unicode characters (flutter#109)

cvolzke4 · nshahan · commit 2b392a4b8d73 · 2019-09-19T16:06:20.000-07:00
diff --git a/.travis.yml b/.travis.yml
@@ -2,7 +2,7 @@ language: dart
 
 dart:
   - dev
-  - 2.0.0
+  - 2.3.0
 
 dart_task:
   - test: -p vm
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.14.0+3
+
+- Fix spans generated for HTML with higher-plane Unicode characters (e.g. emojis)
+
 ## 0.14.0+2
 
 - Support `package:css` `>=0.13.2 <0.17.0`.
diff --git a/lib/src/html_input_stream.dart b/lib/src/html_input_stream.dart
@@ -33,7 +33,7 @@ class HtmlInputStream {
   List<int> _rawBytes;
 
   /// Raw UTF-16 codes, used if a Dart String is passed in.
-  Iterable<int> _rawChars;
+  List<int> _rawChars;
 
   Queue<String> errors;
 
@@ -66,7 +66,7 @@ class HtmlInputStream {
       this.sourceUrl])
       : charEncodingName = codecName(encoding) {
     if (source is String) {
-      _rawChars = source.runes.toList();
+      _rawChars = source.codeUnits;
       charEncodingName = 'utf-8';
       charEncodingCertain = true;
     } else if (source is List<int>) {
@@ -96,17 +96,27 @@ class HtmlInputStream {
     }
 
     bool skipNewline = false;
-    for (var c in _rawChars) {
+    bool wasSurrogatePair = false;
+    for (int i = 0; i < _rawChars.length; i++) {
+      int c = _rawChars[i];
       if (skipNewline) {
         skipNewline = false;
         if (c == NEWLINE) continue;
       }
 
-      if (_invalidUnicode(c)) errors.add('invalid-codepoint');
+      final isSurrogatePair = _isSurrogatePair(_rawChars, i);
+      if (!isSurrogatePair && !wasSurrogatePair) {
+        if (_invalidUnicode(c)) {
+          errors.add('invalid-codepoint');
 
-      if (0xD800 <= c && c <= 0xDFFF) {
-        c = 0xFFFD;
-      } else if (c == RETURN) {
+          if (0xD800 <= c && c <= 0xDFFF) {
+            c = 0xFFFD;
+          }
+        }
+      }
+      wasSurrogatePair = isSurrogatePair;
+
+      if (c == RETURN) {
         skipNewline = true;
         c = NEWLINE;
       }
@@ -203,21 +213,38 @@ class HtmlInputStream {
   /// EOF when EOF is reached.
   String char() {
     if (_offset >= _chars.length) return eof;
-    return String.fromCharCodes([_chars[_offset++]]);
+    return _isSurrogatePair(_chars, _offset)
+        ? String.fromCharCodes([_chars[_offset++], _chars[_offset++]])
+        : String.fromCharCodes([_chars[_offset++]]);
   }
 
   String peekChar() {
     if (_offset >= _chars.length) return eof;
-    return String.fromCharCodes([_chars[_offset]]);
+    return _isSurrogatePair(_chars, _offset)
+        ? String.fromCharCodes([_chars[_offset], _chars[_offset + 1]])
+        : String.fromCharCodes([_chars[_offset]]);
+  }
+
+  // Whether chars[i] and chars[i + 1] form a UTF-16 surrogate pair.
+  bool _isSurrogatePair(List<int> chars, int i) {
+    return i + 1 < chars.length &&
+        _isLeadSurrogate(chars[i]) &&
+        _isTrailSurrogate(chars[i + 1]);
   }
 
+  // Whether the code (a 16-bit unsigned integer) is a UTF-16 lead surrogate.
+  bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800;
+
+  // Whether the code (a 16-bit unsigned integer) is a UTF-16 trail surrogate.
+  bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00;
+
   /// Returns a string of characters from the stream up to but not
   /// including any character in 'characters' or EOF.
   String charsUntil(String characters, [bool opposite = false]) {
     int start = _offset;
     String c;
     while ((c = peekChar()) != null && characters.contains(c) == opposite) {
-      _offset++;
+      _offset += c.codeUnits.length;
     }
 
     return String.fromCharCodes(_chars.sublist(start, _offset));
@@ -227,7 +254,7 @@ class HtmlInputStream {
     // Only one character is allowed to be ungotten at once - it must
     // be consumed again before any further call to unget
     if (ch != null) {
-      _offset--;
+      _offset -= ch.codeUnits.length;
       assert(peekChar() == ch);
     }
   }
@@ -304,18 +331,18 @@ bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
       bytes[offset + 2] == 0xBF;
 }
 
-/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
+/// Decodes the [bytes] with the provided [encoding] and returns a list for
 /// the codepoints. Supports the major unicode encodings as well as ascii and
 /// and windows-1252 encodings.
-Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
+List<int> _decodeBytes(String encoding, List<int> bytes) {
   switch (encoding) {
     case 'ascii':
-      return ascii.decode(bytes).runes;
+      return ascii.decode(bytes).codeUnits;
 
     case 'utf-8':
       // NOTE: To match the behavior of the other decode functions, we eat the
       // UTF-8 BOM here. This is the default behavior of `utf8.decode`.
-      return utf8.decode(bytes).runes;
+      return utf8.decode(bytes).codeUnits;
 
     default:
       throw ArgumentError('Encoding $encoding not supported');
diff --git a/pubspec.yaml b/pubspec.yaml
@@ -6,7 +6,7 @@ author: Dart Team <misc@dartlang.org>
 homepage: https://github.com/dart-lang/html
 
 environment:
-  sdk: '>=2.0.0 <3.0.0'
+  sdk: '>=2.3.0 <3.0.0'
 
 dependencies:
   csslib: '>=0.13.2 <0.17.0'
diff --git a/test/data/tokenizer/unicodeCharsSurrogates.test b/test/data/tokenizer/unicodeCharsSurrogates.test
@@ -0,0 +1,24 @@
+{"tests" : [
+{"description": "Unicode surrogate (emoji)",
+"input": "\uD83D\uDC3C",
+"output":[["Character", "\uD83D\uDC3C"]]},
+
+{"description": "Unicode surrogate (emoji) prefixed by characters",
+"input": "before\uD83D\uDC3C",
+"output":[["Character", "before\uD83D\uDC3C"]]},
+
+{"description": "Unicode surrogate (emoji) suffixed by characters",
+"input": "\uD83D\uDC3Cafter",
+"output":[["Character", "\uD83D\uDC3Cafter"]]},
+
+{"description":"Quoted attribute with surrogate unicode content",
+"generateSpans": true,
+"input":"<a href='\uD83D\uDC3C'/>",
+"output":[["StartTag","a",{"href":"\uD83D\uDC3C"},true,0,14]]},
+
+{"description":"Surrogate unicode content followed by attribute",
+"generateSpans": true,
+"input":"\uD83D\uDC3C<a href='b'/>",
+"output":[["Character", "\uD83D\uDC3C", 0, 2],["StartTag","a",{"href":"b"},true,2,15]]}
+]
+}
diff --git a/test/tokenizer_test.dart b/test/tokenizer_test.dart
@@ -16,16 +16,20 @@ import 'support.dart';
 class TokenizerTestParser {
   final String _state;
   final String _lastStartTag;
+  final bool _generateSpans;
   List outputTokens;
 
-  TokenizerTestParser(String initialState, [String lastStartTag])
+  TokenizerTestParser(String initialState,
+      [String lastStartTag, bool generateSpans = false])
       : _state = initialState,
-        _lastStartTag = lastStartTag;
+        _lastStartTag = lastStartTag,
+        _generateSpans = generateSpans;
 
   List parse(String str) {
     // Note: we need to pass bytes to the tokenizer if we want it to handle BOM.
     var bytes = utf8.encode(str);
-    var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8');
+    var tokenizer =
+        HtmlTokenizer(bytes, encoding: 'utf-8', generateSpans: _generateSpans);
     outputTokens = [];
 
     // Note: we can't get a closure of the state method. However, we can
@@ -68,28 +72,29 @@ class TokenizerTestParser {
   }
 
   void processDoctype(DoctypeToken token) {
-    outputTokens.add(
+    addOutputToken(token,
         ["DOCTYPE", token.name, token.publicId, token.systemId, token.correct]);
   }
 
   void processStartTag(StartTagToken token) {
-    outputTokens.add(["StartTag", token.name, token.data, token.selfClosing]);
+    addOutputToken(
+        token, ["StartTag", token.name, token.data, token.selfClosing]);
   }
 
   void processEndTag(EndTagToken token) {
-    outputTokens.add(["EndTag", token.name, token.selfClosing]);
+    addOutputToken(token, ["EndTag", token.name, token.selfClosing]);
   }
 
   void processComment(StringToken token) {
-    outputTokens.add(["Comment", token.data]);
+    addOutputToken(token, ["Comment", token.data]);
   }
 
   void processSpaceCharacters(StringToken token) {
     processCharacters(token);
   }
 
   void processCharacters(StringToken token) {
-    outputTokens.add(["Character", token.data]);
+    addOutputToken(token, ["Character", token.data]);
   }
 
   void processEOF(token) {}
@@ -98,7 +103,15 @@ class TokenizerTestParser {
     // TODO(jmesserly): when debugging test failures it can be useful to add
     // logging here like `print('ParseError $token');`. It would be nice to
     // use the actual logging library.
-    outputTokens.add(["ParseError", token.data]);
+    addOutputToken(token, ["ParseError", token.data]);
+  }
+
+  void addOutputToken(Token token, List array) {
+    outputTokens.add([
+      ...array,
+      if (token.span != null && _generateSpans) token.span.start.offset,
+      if (token.span != null && _generateSpans) token.span.end.offset,
+    ]);
   }
 }
 
@@ -138,16 +151,18 @@ List normalizeTokens(List tokens) {
 void expectTokensMatch(
     List expectedTokens, List receivedTokens, bool ignoreErrorOrder,
     [bool ignoreErrors = false, String message]) {
-  var checkSelfClosing = false;
+  // If the 'selfClosing' attribute is not included in the expected test tokens,
+  // remove it from the received token.
+  var removeSelfClosing = false;
   for (var token in expectedTokens) {
-    if (token[0] == "StartTag" && token.length == 4 ||
-        token[0] == "EndTag" && token.length == 3) {
-      checkSelfClosing = true;
+    if (token[0] == "StartTag" && token.length == 3 ||
+        token[0] == "EndTag" && token.length == 2) {
+      removeSelfClosing = true;
       break;
     }
   }
 
-  if (!checkSelfClosing) {
+  if (removeSelfClosing) {
     for (var token in receivedTokens) {
       if (token[0] == "StartTag" || token[0] == "EndTag") {
         token.removeLast();
@@ -182,8 +197,8 @@ void runTokenizerTest(Map testInfo) {
   if (!testInfo.containsKey('lastStartTag')) {
     testInfo['lastStartTag'] = null;
   }
-  var parser =
-      TokenizerTestParser(testInfo['initialState'], testInfo['lastStartTag']);
+  var parser = TokenizerTestParser(testInfo['initialState'],
+      testInfo['lastStartTag'], testInfo['generateSpans'] ?? false);
   var tokens = parser.parse(testInfo['input']);
   tokens = concatenateCharacterTokens(tokens);
   var received = normalizeTokens(tokens);