Skip to content

optimize UTF8 conversion routines #1022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jan 25, 2020
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 41 additions & 36 deletions std/assembly/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -625,23 +625,25 @@ export namespace String {
export function byteLength(str: string, nullTerminated: bool = false): i32 {
var strOff = changetype<usize>(str);
var strEnd = strOff + <usize>changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
var bufLen = nullTerminated ? 1 : 0;
var bufLen = <i32>nullTerminated;
while (strOff < strEnd) {
let c1 = <u32>load<u16>(strOff);
if (c1 < 128) {
if (nullTerminated && !c1) break;
bufLen += 1; strOff += 2;
// @ts-ignore: cast
if (nullTerminated & !c1) break;
Copy link
Member

@dcodeIO dcodeIO Jan 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can also do u32(nullTerminated) & u32(!c1) here to avoid the ts-ignore.

Copy link
Member Author

@MaxGraey MaxGraey Jan 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and in the first version I used exactly that, but later we decided to just add a ts-ignore comment instead :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was in another PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, hmm. Do you remember if that was due to wrapping?

bufLen += 1;
} else if (c1 < 2048) {
bufLen += 2; strOff += 2;
bufLen += 2;
} else {
if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
if ((<u32>load<u16>(strOff, 2) & 0xFC00) == 0xDC00) {
strOff += 4; bufLen += 4;
bufLen += 4; strOff += 4;
continue;
}
}
strOff += 2; bufLen += 3;
bufLen += 3;
}
strOff += 2;
}
return bufLen;
}
Expand All @@ -654,7 +656,8 @@ export namespace String {
while (strOff < strEnd) {
let c1 = <u32>load<u16>(strOff);
if (c1 < 128) {
if (nullTerminated && !c1) break;
// @ts-ignore: cast
if (nullTerminated & !c1) break;
store<u8>(bufOff, c1);
bufOff += 1; strOff += 2;
} else if (c1 < 2048) {
Expand Down Expand Up @@ -703,36 +706,38 @@ export namespace String {
var str = __alloc(len << 1, idof<String>()); // max is one u16 char per u8 byte
var strOff = str;
while (bufOff < bufEnd) {
let cp = <u32>load<u8>(bufOff++);
if (cp < 128) {
if (nullTerminated && !cp) break;
store<u16>(strOff, cp);
strOff += 2;
} else if (cp > 191 && cp < 224) {
if (bufEnd - bufOff < 1) break;
store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
strOff += 2;
} else if (cp > 239 && cp < 365) {
if (bufEnd - bufOff < 3) break;
cp = (
(cp & 7) << 18 |
(load<u8>(bufOff) & 63) << 12 |
(load<u8>(bufOff, 1) & 63) << 6 |
load<u8>(bufOff, 2) & 63
) - 0x10000;
bufOff += 3;
store<u16>(strOff, 0xD800 | (cp >> 10));
store<u16>(strOff, 0xDC00 | (cp & 1023), 2);
strOff += 4;
let u0 = <u32>load<u8>(bufOff); ++bufOff;
if (!(u0 & 128)) {
// @ts-ignore: cast
if (nullTerminated & !u0) break;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to the comment above: `u32(nullTerminated) & u32(!u0)` would avoid the ts-ignore here as well.

store<u16>(strOff, u0);
} else {
if (bufEnd - bufOff < 2) break;
store<u16>(strOff,
(cp & 15) << 12 |
(load<u8>(bufOff) & 63) << 6 |
load<u8>(bufOff, 1) & 63
);
bufOff += 2; strOff += 2;
if (bufEnd == bufOff) break;
let u1 = <u32>load<u8>(bufOff) & 63; ++bufOff;
if ((u0 & 224) == 192) {
store<u16>(strOff, (u0 & 31) << 6 | u1);
} else {
if (bufEnd == bufOff) break;
let u2 = <u32>load<u8>(bufOff) & 63; ++bufOff;
if ((u0 & 240) == 224) {
u0 = (u0 & 15) << 12 | u1 << 6 | u2;
} else {
if (bufEnd == bufOff) break;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At first glance this seems to be trading branches: where we previously checked up front that we could process 3 bytes (never emitting truncated bytes), we now check per byte (potentially emitting truncated bytes?). Makes me wonder which is better.

u0 = (u0 & 7) << 18 | u1 << 12 | u2 << 6 | <u32>load<u8>(bufOff) & 63;
++bufOff;
}
if (u0 < 0x10000) {
store<u16>(strOff, u0);
} else {
u0 -= 0x10000;
let lo = u0 >> 10 | 0xD800;
let hi = (u0 & 0x03FF) | 0xDC00;
store<u32>(strOff, lo | (hi << 16));
strOff += 2;
}
}
}
strOff += 2;
}
return changetype<String>(__realloc(str, strOff - str)); // retains
}
Expand All @@ -745,7 +750,7 @@ export namespace String {
}

export function encode(str: string): ArrayBuffer {
var size = changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
var size = UTF16.byteLength(str);
var buf = __alloc(size, idof<ArrayBuffer>());
memory.copy(buf, changetype<usize>(str), <usize>size);
return changetype<ArrayBuffer>(buf); // retains
Expand Down
Loading