Skip to content

Commit 2b392a4

Browse files
cvolzke4 authored and
nshahan committed
Fix spans generated for HTML with higher-plane unicode characters (flutter#109)
1 parent d37f588 commit 2b392a4

File tree

6 files changed

+103
-33
lines changed

6 files changed

+103
-33
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ language: dart
22

33
dart:
44
- dev
5-
- 2.0.0
5+
- 2.3.0
66

77
dart_task:
88
- test: -p vm

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.14.0+3
2+
3+
- Fix spans generated for HTML with higher-plane Unicode characters (e.g. emojis)
4+
15
## 0.14.0+2
26

37
- Support `package:css` `>=0.13.2 <0.17.0`.

lib/src/html_input_stream.dart

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class HtmlInputStream {
3333
List<int> _rawBytes;
3434

3535
/// Raw UTF-16 codes, used if a Dart String is passed in.
36-
Iterable<int> _rawChars;
36+
List<int> _rawChars;
3737

3838
Queue<String> errors;
3939

@@ -66,7 +66,7 @@ class HtmlInputStream {
6666
this.sourceUrl])
6767
: charEncodingName = codecName(encoding) {
6868
if (source is String) {
69-
_rawChars = source.runes.toList();
69+
_rawChars = source.codeUnits;
7070
charEncodingName = 'utf-8';
7171
charEncodingCertain = true;
7272
} else if (source is List<int>) {
@@ -96,17 +96,27 @@ class HtmlInputStream {
9696
}
9797

9898
bool skipNewline = false;
99-
for (var c in _rawChars) {
99+
bool wasSurrogatePair = false;
100+
for (int i = 0; i < _rawChars.length; i++) {
101+
int c = _rawChars[i];
100102
if (skipNewline) {
101103
skipNewline = false;
102104
if (c == NEWLINE) continue;
103105
}
104106

105-
if (_invalidUnicode(c)) errors.add('invalid-codepoint');
107+
final isSurrogatePair = _isSurrogatePair(_rawChars, i);
108+
if (!isSurrogatePair && !wasSurrogatePair) {
109+
if (_invalidUnicode(c)) {
110+
errors.add('invalid-codepoint');
106111

107-
if (0xD800 <= c && c <= 0xDFFF) {
108-
c = 0xFFFD;
109-
} else if (c == RETURN) {
112+
if (0xD800 <= c && c <= 0xDFFF) {
113+
c = 0xFFFD;
114+
}
115+
}
116+
}
117+
wasSurrogatePair = isSurrogatePair;
118+
119+
if (c == RETURN) {
110120
skipNewline = true;
111121
c = NEWLINE;
112122
}
@@ -203,21 +213,38 @@ class HtmlInputStream {
203213
/// EOF when EOF is reached.
204214
String char() {
205215
if (_offset >= _chars.length) return eof;
206-
return String.fromCharCodes([_chars[_offset++]]);
216+
return _isSurrogatePair(_chars, _offset)
217+
? String.fromCharCodes([_chars[_offset++], _chars[_offset++]])
218+
: String.fromCharCodes([_chars[_offset++]]);
207219
}
208220

209221
String peekChar() {
210222
if (_offset >= _chars.length) return eof;
211-
return String.fromCharCodes([_chars[_offset]]);
223+
return _isSurrogatePair(_chars, _offset)
224+
? String.fromCharCodes([_chars[_offset], _chars[_offset + 1]])
225+
: String.fromCharCodes([_chars[_offset]]);
226+
}
227+
228+
// Whether the current and next chars indicate a surrogate pair.
229+
bool _isSurrogatePair(List<int> chars, int i) {
230+
return i + 1 < chars.length &&
231+
_isLeadSurrogate(chars[i]) &&
232+
_isTrailSurrogate(chars[i + 1]);
212233
}
213234

235+
// Whether the code (a 16-bit unsigned integer) is a UTF-16 lead surrogate.
236+
bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800;
237+
238+
// Whether the code (a 16-bit unsigned integer) is a UTF-16 trail surrogate.
239+
bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00;
240+
214241
/// Returns a string of characters from the stream up to but not
215242
/// including any character in 'characters' or EOF.
216243
String charsUntil(String characters, [bool opposite = false]) {
217244
int start = _offset;
218245
String c;
219246
while ((c = peekChar()) != null && characters.contains(c) == opposite) {
220-
_offset++;
247+
_offset += c.codeUnits.length;
221248
}
222249

223250
return String.fromCharCodes(_chars.sublist(start, _offset));
@@ -227,7 +254,7 @@ class HtmlInputStream {
227254
// Only one character is allowed to be ungotten at once - it must
228255
// be consumed again before any further call to unget
229256
if (ch != null) {
230-
_offset--;
257+
_offset -= ch.codeUnits.length;
231258
assert(peekChar() == ch);
232259
}
233260
}
@@ -304,18 +331,18 @@ bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
304331
bytes[offset + 2] == 0xBF;
305332
}
306333

307-
/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
334+
/// Decodes the [bytes] with the provided [encoding] and returns a list for
308335
/// the codepoints. Supports the major unicode encodings as well as ascii and
309336
/// windows-1252 encodings.
310-
Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
337+
List<int> _decodeBytes(String encoding, List<int> bytes) {
311338
switch (encoding) {
312339
case 'ascii':
313-
return ascii.decode(bytes).runes;
340+
return ascii.decode(bytes).codeUnits;
314341

315342
case 'utf-8':
316343
// NOTE: To match the behavior of the other decode functions, we eat the
317344
// UTF-8 BOM here. This is the default behavior of `utf8.decode`.
318-
return utf8.decode(bytes).runes;
345+
return utf8.decode(bytes).codeUnits;
319346

320347
default:
321348
throw ArgumentError('Encoding $encoding not supported');

pubspec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ author: Dart Team <[email protected]>
66
homepage: https://github.com/dart-lang/html
77

88
environment:
9-
sdk: '>=2.0.0 <3.0.0'
9+
sdk: '>=2.3.0 <3.0.0'
1010

1111
dependencies:
1212
csslib: '>=0.13.2 <0.17.0'
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{"tests" : [
2+
{"description": "Unicode surrogate (emoji)",
3+
"input": "\uD83D\uDC3C",
4+
"output":[["Character", "\uD83D\uDC3C"]]},
5+
6+
{"description": "Unicode surrogate (emoji) prefixed by characters",
7+
"input": "before\uD83D\uDC3C",
8+
"output":[["Character", "before\uD83D\uDC3C"]]},
9+
10+
{"description": "Unicode surrogate (emoji) suffixed by characters",
11+
"input": "\uD83D\uDC3Cafter",
12+
"output":[["Character", "\uD83D\uDC3Cafter"]]},
13+
14+
{"description":"Quoted attribute with surrogate unicode content",
15+
"generateSpans": true,
16+
"input":"<a href='\uD83D\uDC3C'/>",
17+
"output":[["StartTag","a",{"href":"\uD83D\uDC3C"},true,0,14]]},
18+
19+
{"description":"Surrogate unicode content followed by attribute",
20+
"generateSpans": true,
21+
"input":"\uD83D\uDC3C<a href='b'/>",
22+
"output":[["Character", "\uD83D\uDC3C", 0, 2],["StartTag","a",{"href":"b"},true,2,15]]}
23+
]
24+
}

test/tokenizer_test.dart

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,20 @@ import 'support.dart';
1616
class TokenizerTestParser {
1717
final String _state;
1818
final String _lastStartTag;
19+
final bool _generateSpans;
1920
List outputTokens;
2021

21-
TokenizerTestParser(String initialState, [String lastStartTag])
22+
TokenizerTestParser(String initialState,
23+
[String lastStartTag, bool generateSpans = false])
2224
: _state = initialState,
23-
_lastStartTag = lastStartTag;
25+
_lastStartTag = lastStartTag,
26+
_generateSpans = generateSpans;
2427

2528
List parse(String str) {
2629
// Note: we need to pass bytes to the tokenizer if we want it to handle BOM.
2730
var bytes = utf8.encode(str);
28-
var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8');
31+
var tokenizer =
32+
HtmlTokenizer(bytes, encoding: 'utf-8', generateSpans: _generateSpans);
2933
outputTokens = [];
3034

3135
// Note: we can't get a closure of the state method. However, we can
@@ -68,28 +72,29 @@ class TokenizerTestParser {
6872
}
6973

7074
void processDoctype(DoctypeToken token) {
71-
outputTokens.add(
75+
addOutputToken(token,
7276
["DOCTYPE", token.name, token.publicId, token.systemId, token.correct]);
7377
}
7478

7579
void processStartTag(StartTagToken token) {
76-
outputTokens.add(["StartTag", token.name, token.data, token.selfClosing]);
80+
addOutputToken(
81+
token, ["StartTag", token.name, token.data, token.selfClosing]);
7782
}
7883

7984
void processEndTag(EndTagToken token) {
80-
outputTokens.add(["EndTag", token.name, token.selfClosing]);
85+
addOutputToken(token, ["EndTag", token.name, token.selfClosing]);
8186
}
8287

8388
void processComment(StringToken token) {
84-
outputTokens.add(["Comment", token.data]);
89+
addOutputToken(token, ["Comment", token.data]);
8590
}
8691

8792
void processSpaceCharacters(StringToken token) {
8893
processCharacters(token);
8994
}
9095

9196
void processCharacters(StringToken token) {
92-
outputTokens.add(["Character", token.data]);
97+
addOutputToken(token, ["Character", token.data]);
9398
}
9499

95100
void processEOF(token) {}
@@ -98,7 +103,15 @@ class TokenizerTestParser {
98103
// TODO(jmesserly): when debugging test failures it can be useful to add
99104
// logging here like `print('ParseError $token');`. It would be nice to
100105
// use the actual logging library.
101-
outputTokens.add(["ParseError", token.data]);
106+
addOutputToken(token, ["ParseError", token.data]);
107+
}
108+
109+
void addOutputToken(Token token, List array) {
110+
outputTokens.add([
111+
...array,
112+
if (token.span != null && _generateSpans) token.span.start.offset,
113+
if (token.span != null && _generateSpans) token.span.end.offset,
114+
]);
102115
}
103116
}
104117

@@ -138,16 +151,18 @@ List normalizeTokens(List tokens) {
138151
void expectTokensMatch(
139152
List expectedTokens, List receivedTokens, bool ignoreErrorOrder,
140153
[bool ignoreErrors = false, String message]) {
141-
var checkSelfClosing = false;
154+
// If the 'selfClosing' attribute is not included in the expected test tokens,
155+
// remove it from the received token.
156+
var removeSelfClosing = false;
142157
for (var token in expectedTokens) {
143-
if (token[0] == "StartTag" && token.length == 4 ||
144-
token[0] == "EndTag" && token.length == 3) {
145-
checkSelfClosing = true;
158+
if (token[0] == "StartTag" && token.length == 3 ||
159+
token[0] == "EndTag" && token.length == 2) {
160+
removeSelfClosing = true;
146161
break;
147162
}
148163
}
149164

150-
if (!checkSelfClosing) {
165+
if (removeSelfClosing) {
151166
for (var token in receivedTokens) {
152167
if (token[0] == "StartTag" || token[0] == "EndTag") {
153168
token.removeLast();
@@ -182,8 +197,8 @@ void runTokenizerTest(Map testInfo) {
182197
if (!testInfo.containsKey('lastStartTag')) {
183198
testInfo['lastStartTag'] = null;
184199
}
185-
var parser =
186-
TokenizerTestParser(testInfo['initialState'], testInfo['lastStartTag']);
200+
var parser = TokenizerTestParser(testInfo['initialState'],
201+
testInfo['lastStartTag'], testInfo['generateSpans'] ?? false);
187202
var tokens = parser.parse(testInfo['input']);
188203
tokens = concatenateCharacterTokens(tokens);
189204
var received = normalizeTokens(tokens);

0 commit comments

Comments
 (0)