-
-
Notifications
You must be signed in to change notification settings - Fork 670
optimize UTF8 conversion routines #1022
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
2d22cb0
9fd51ce
6e31949
88c9bd6
17949a6
dcb025c
642b90a
835a081
4fa33af
c690031
7465d2e
8d435c7
cb6bb7f
187340a
1ed875b
a9968cb
845156b
6977e76
89734da
77e95a7
a56cd98
016c2c5
3e3babb
0fefbef
9f7cd5c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -625,23 +625,25 @@ export namespace String { | |
export function byteLength(str: string, nullTerminated: bool = false): i32 { | ||
var strOff = changetype<usize>(str); | ||
var strEnd = strOff + <usize>changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize; | ||
var bufLen = nullTerminated ? 1 : 0; | ||
var bufLen = <i32>nullTerminated; | ||
while (strOff < strEnd) { | ||
let c1 = <u32>load<u16>(strOff); | ||
if (c1 < 128) { | ||
if (nullTerminated && !c1) break; | ||
bufLen += 1; strOff += 2; | ||
// @ts-ignore: cast | ||
if (nullTerminated & !c1) break; | ||
bufLen += 1; | ||
} else if (c1 < 2048) { | ||
bufLen += 2; strOff += 2; | ||
bufLen += 2; | ||
} else { | ||
if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) { | ||
if ((<u32>load<u16>(strOff, 2) & 0xFC00) == 0xDC00) { | ||
strOff += 4; bufLen += 4; | ||
bufLen += 4; strOff += 4; | ||
continue; | ||
} | ||
} | ||
strOff += 2; bufLen += 3; | ||
bufLen += 3; | ||
} | ||
strOff += 2; | ||
MaxGraey marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return bufLen; | ||
} | ||
|
@@ -654,7 +656,8 @@ export namespace String { | |
while (strOff < strEnd) { | ||
let c1 = <u32>load<u16>(strOff); | ||
if (c1 < 128) { | ||
if (nullTerminated && !c1) break; | ||
// @ts-ignore: cast | ||
if (nullTerminated & !c1) break; | ||
store<u8>(bufOff, c1); | ||
bufOff += 1; strOff += 2; | ||
} else if (c1 < 2048) { | ||
|
@@ -703,36 +706,38 @@ export namespace String { | |
var str = __alloc(len << 1, idof<String>()); // max is one u16 char per u8 byte | ||
var strOff = str; | ||
while (bufOff < bufEnd) { | ||
let cp = <u32>load<u8>(bufOff++); | ||
if (cp < 128) { | ||
if (nullTerminated && !cp) break; | ||
store<u16>(strOff, cp); | ||
strOff += 2; | ||
} else if (cp > 191 && cp < 224) { | ||
if (bufEnd - bufOff < 1) break; | ||
store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63); | ||
strOff += 2; | ||
} else if (cp > 239 && cp < 365) { | ||
if (bufEnd - bufOff < 3) break; | ||
cp = ( | ||
(cp & 7) << 18 | | ||
(load<u8>(bufOff) & 63) << 12 | | ||
(load<u8>(bufOff, 1) & 63) << 6 | | ||
load<u8>(bufOff, 2) & 63 | ||
) - 0x10000; | ||
bufOff += 3; | ||
store<u16>(strOff, 0xD800 | (cp >> 10)); | ||
store<u16>(strOff, 0xDC00 | (cp & 1023), 2); | ||
strOff += 4; | ||
let u0 = <u32>load<u8>(bufOff); ++bufOff; | ||
if (!(u0 & 128)) { | ||
// @ts-ignore: cast | ||
if (nullTerminated & !u0) break; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar |
||
store<u16>(strOff, u0); | ||
} else { | ||
if (bufEnd - bufOff < 2) break; | ||
store<u16>(strOff, | ||
(cp & 15) << 12 | | ||
(load<u8>(bufOff) & 63) << 6 | | ||
load<u8>(bufOff, 1) & 63 | ||
); | ||
bufOff += 2; strOff += 2; | ||
if (bufEnd == bufOff) break; | ||
let u1 = <u32>load<u8>(bufOff) & 63; ++bufOff; | ||
if ((u0 & 224) == 192) { | ||
store<u16>(strOff, (u0 & 31) << 6 | u1); | ||
} else { | ||
if (bufEnd == bufOff) break; | ||
let u2 = <u32>load<u8>(bufOff) & 63; ++bufOff; | ||
if ((u0 & 240) == 224) { | ||
u0 = (u0 & 15) << 12 | u1 << 6 | u2; | ||
} else { | ||
if (bufEnd == bufOff) break; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At first glance this seems to be trading branches: where we previously checked that we can process 3 bytes (never emitting truncated bytes), we now check per byte (potentially emitting truncated bytes?). Makes me wonder what's better. |
||
u0 = (u0 & 7) << 18 | u1 << 12 | u2 << 6 | <u32>load<u8>(bufOff) & 63; | ||
++bufOff; | ||
} | ||
if (u0 < 0x10000) { | ||
store<u16>(strOff, u0); | ||
} else { | ||
u0 -= 0x10000; | ||
let lo = u0 >> 10 | 0xD800; | ||
let hi = (u0 & 0x03FF) | 0xDC00; | ||
store<u32>(strOff, lo | (hi << 16)); | ||
strOff += 2; | ||
} | ||
} | ||
} | ||
strOff += 2; | ||
} | ||
return changetype<String>(__realloc(str, strOff - str)); // retains | ||
} | ||
|
@@ -745,7 +750,7 @@ export namespace String { | |
} | ||
|
||
export function encode(str: string): ArrayBuffer { | ||
var size = changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize; | ||
var size = UTF16.byteLength(str); | ||
var buf = __alloc(size, idof<ArrayBuffer>()); | ||
memory.copy(buf, changetype<usize>(str), <usize>size); | ||
return changetype<ArrayBuffer>(buf); // retains | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can also do
u32(nullTerminated) & u32(!c1)
here to avoid the ts-ignore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, and in the first version I used exactly that, but later we decided to just add a ts-ignore comment instead :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was in other PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, hmm. Do you remember if that was due to wrapping?