Skip to content

Commit 00ecb51

Browse files
committed
finalize: strip segments that contain only EM_ASM/EM_JS data
If we find a data segment whose entire contents is EM_JS or EM_ASM strings then strip it from the binary. See: emscripten-core/emscripten#13443
1 parent e283300 commit 00ecb51

File tree

5 files changed

+217
-54
lines changed

5 files changed

+217
-54
lines changed

src/binaryen-c.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1633,9 +1633,9 @@ BINARYEN_API void BinaryenMemoryInitSetSize(BinaryenExpressionRef expr,
16331633

16341634
// DataDrop
16351635

1636-
// Gets the index of the segment being dropped by a `memory.drop` expression.
1636+
// Gets the index of the segment being dropped by a `data.drop` expression.
16371637
BINARYEN_API uint32_t BinaryenDataDropGetSegment(BinaryenExpressionRef expr);
1638-
// Sets the index of the segment being dropped by a `memory.drop` expression.
1638+
// Sets the index of the segment being dropped by a `data.drop` expression.
16391639
BINARYEN_API void BinaryenDataDropSetSegment(BinaryenExpressionRef expr,
16401640
uint32_t segmentIndex);
16411641

src/wasm/wasm-emscripten.cpp

+142-47
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,7 @@ Global* getStackPointerGlobal(Module& wasm) {
7676

7777
const Address UNKNOWN_OFFSET(uint32_t(-1));
7878

79-
std::string escape(const char* input) {
80-
std::string code = input;
79+
std::string escape(std::string code) {
8180
// replace newlines quotes with escaped newlines
8281
size_t curr = 0;
8382
while ((curr = code.find("\\n", curr)) != std::string::npos) {
@@ -109,14 +108,21 @@ class StringConstantTracker {
109108
public:
110109
StringConstantTracker(Module& wasm) : wasm(wasm) { calcSegmentOffsets(); }
111110

112-
std::string codeForConstAddr(int64_t address) {
113-
const char* str = stringAtAddr(address);
114-
if (!str) {
115-
Fatal() << "unable to find data for ASM/EM_JS const at: " << address;
111+
// Returns a pointer to the byte stored at `address`, taken from the first
// data segment whose known offset range contains it. Segments whose offset
// is UNKNOWN_OFFSET are never matched. Reports a fatal error if no segment
// covers the address.
const char* stringAtAddr(Address address) {
  const size_t numSegments = wasm.memory.segments.size();
  for (size_t index = 0; index < numSegments; ++index) {
    const Address base = segmentOffsets[index];
    if (base == UNKNOWN_OFFSET) {
      continue;
    }
    Memory::Segment& candidate = wasm.memory.segments[index];
    if (address < base || address >= base + candidate.data.size()) {
      continue;
    }
    return &candidate.data[address - base];
  }
  Fatal() << "unable to find data for ASM/EM_JS const at: " << address;
  return nullptr;
}
119123

124+
std::vector<Address> segmentOffsets; // segment index => address offset
125+
120126
private:
121127
void calcSegmentOffsets() {
122128
std::unordered_map<Index, Address> passiveOffsets;
@@ -172,32 +178,19 @@ class StringConstantTracker {
172178
}
173179
}
174180

175-
const char* stringAtAddr(Address address) {
176-
for (unsigned i = 0; i < wasm.memory.segments.size(); ++i) {
177-
Memory::Segment& segment = wasm.memory.segments[i];
178-
Address offset = segmentOffsets[i];
179-
if (offset != UNKNOWN_OFFSET && address >= offset &&
180-
address < offset + segment.data.size()) {
181-
return &segment.data[address - offset];
182-
}
183-
}
184-
return nullptr;
185-
}
186-
187181
Module& wasm;
188-
std::vector<Address> segmentOffsets; // segment index => address offset
182+
};
183+
184+
// One EM_ASM constant collected from the module.
struct AsmConst {
185+
// Address of the string; printed as the key of the entry in the "asmConsts"
// metadata.
Address id;
186+
// The string found at that address. It is stored as-is and only passed
// through escape() when the metadata is printed.
std::string code;
189187
};
190188

191189
struct AsmConstWalker : public LinearExecutionWalker<AsmConstWalker> {
192190
Module& wasm;
193191
bool minimizeWasmChanges;
194192
StringConstantTracker stringTracker;
195193

196-
struct AsmConst {
197-
Address id;
198-
std::string code;
199-
};
200-
201194
std::vector<AsmConst> asmConsts;
202195
// last sets in the current basic block, per index
203196
std::map<Index, LocalSet*> sets;
@@ -292,9 +285,8 @@ void AsmConstWalker::visitCall(Call* curr) {
292285
}
293286

294287
auto* value = arg->cast<Const>();
295-
int64_t address = value->value.getInteger();
296-
auto code = stringTracker.codeForConstAddr(address);
297-
createAsmConst(address, code);
288+
Address address = value->value.getInteger();
289+
asmConsts.push_back({address, stringTracker.stringAtAddr(address)});
298290
}
299291

300292
void AsmConstWalker::process() {
@@ -305,24 +297,105 @@ void AsmConstWalker::process() {
305297
addImports();
306298
}
307299

308-
void AsmConstWalker::createAsmConst(uint64_t id, std::string code) {
309-
AsmConst asmConst;
310-
asmConst.id = id;
311-
asmConst.code = code;
312-
asmConsts.push_back(asmConst);
313-
}
314-
315300
void AsmConstWalker::addImports() {
316301
for (auto& import : queuedImports) {
317302
wasm.addFunction(import.release());
318303
}
319304
}
320305

321-
static AsmConstWalker findEmAsmConstsAndReturnWalker(Module& wasm,
322-
bool minimizeWasmChanges) {
323-
AsmConstWalker walker(wasm, minimizeWasmChanges);
324-
walker.process();
325-
return walker;
306+
// Function-parallel pass that rewrites every instruction referring to one
// particular data segment so that the segment's contents are no longer used.
struct SegmentRemover : WalkerPass<PostWalker<SegmentRemover>> {
  // Index of the data segment whose uses are rewritten.
  Index segment;

  SegmentRemover(Index segment) : segment(segment) {}

  bool isFunctionParallel() override { return true; }

  // Every instance created for parallel execution targets the same segment.
  Pass* create() override { return new SegmentRemover(segment); }

  // A memory.init of the target segment is replaced by a block that simply
  // drops its three operands (dest, offset, size).
  void visitMemoryInit(MemoryInit* curr) {
    if (curr->segment != segment) {
      return;
    }
    Builder builder(*getModule());
    auto* dropDest = builder.makeDrop(curr->dest);
    auto* dropOffset = builder.makeDrop(curr->offset);
    auto* dropSize = builder.makeDrop(curr->size);
    replaceCurrent(builder.blockify(dropDest, dropOffset, dropSize));
  }

  // A data.drop of the target segment is replaced by a nop.
  void visitDataDrop(DataDrop* curr) {
    if (curr->segment != segment) {
      return;
    }
    replaceCurrent(Builder(*getModule()).makeNop());
  }
};
331+
332+
// Strips the contents of data segment `segment` from `wasm`: first rewrites
// all instructions that reference it (see SegmentRemover), then empties its
// data.
static void removeSegment(Module& wasm, Index segment) {
  PassRunner passRunner(&wasm);
  SegmentRemover remover(segment);
  remover.run(&passRunner, &wasm);

  // The segment is shrunk to zero bytes rather than erased. Erasing it would
  // require re-numbering all the segments that follow, which is non-trivial.
  auto& bytes = wasm.memory.segments[segment].data;
  bytes.resize(0);
}
340+
341+
// Returns the address held by the global that `export_` refers to.
//
// The global's initializer must be a constant expression node (Const); its
// integer value is returned. If the global has no initializer, or the
// initializer is not a Const, a fatal error is reported instead of
// dereferencing a null pointer.
static Address getExportedAddress(Module& wasm, Export* export_) {
  Global* g = wasm.getGlobal(export_->value);
  // dynCast yields nullptr when the initializer is not a Const.
  auto* addrConst = g->init ? g->init->dynCast<Const>() : nullptr;
  if (!addrConst) {
    Fatal() << "unable to determine constant address of exported global: "
            << export_->name;
  }
  return addrConst->value.getInteger();
}
346+
347+
// Collects all EM_ASM constants (address + string) from `wasm`.
//
// When the module exports both __start_em_asm and __stop_em_asm, the strings
// are read directly from the data segment covering that address range, the
// string data is stripped from the module, and the two exports are removed.
// Otherwise AsmConstWalker is used to derive the strings from the code.
static std::vector<AsmConst> findEmAsmConsts(Module& wasm,
                                             bool minimizeWasmChanges) {
  Export* startExport = wasm.getExportOrNull("__start_em_asm");
  Export* stopExport = wasm.getExportOrNull("__stop_em_asm");

  // Older versions of emscripten don't export these symbols. Instead we run
  // AsmConstWalker in an attempt to derive the string addresses from the
  // code.
  if (!startExport || !stopExport) {
    AsmConstWalker walker(wasm, minimizeWasmChanges);
    walker.process();
    return walker.asmConsts;
  }

  // Newer versions of emscripten export these symbols and we can use them to
  // find all the EM_ASM constants. Sadly __start_em_asm and __stop_em_asm
  // don't always mark the start and end of a segment, because in dynamic
  // linking we merge all data segments into one.
  std::vector<AsmConst> asmConsts;
  StringConstantTracker stringTracker(wasm);
  Address startAddress = getExportedAddress(wasm, startExport);
  Address endAddress = getExportedAddress(wasm, stopExport);

  for (Index i = 0; i < wasm.memory.segments.size(); ++i) {
    Address segmentStart = stringTracker.segmentOffsets[i];
    size_t segmentSize = wasm.memory.segments[i].data.size();
    const auto segmentEnd = segmentStart + segmentSize;

    // Only the first segment that fully covers [startAddress, endAddress)
    // is of interest.
    if (segmentStart > startAddress || segmentEnd < endAddress) {
      continue;
    }

    // The range is a sequence of NUL-terminated strings laid out back to
    // back; each string's address is its id.
    Address cursor = startAddress;
    while (cursor < endAddress) {
      const char* text = stringTracker.stringAtAddr(cursor);
      asmConsts.push_back({cursor, text});
      cursor.addr += strlen(text) + 1;
    }

    const bool coversWholeSegment =
      segmentStart == startAddress && segmentEnd == endAddress;
    if (coversWholeSegment) {
      removeSegment(wasm, i);
    } else {
      // If we can't remove the whole segment then just set the string data
      // to zero.
      size_t offsetInSegment = startAddress - segmentStart;
      char* firstByte = &wasm.memory.segments[i].data[offsetInSegment];
      memset(firstByte, 0, endAddress - startAddress);
    }
    break;
  }

  assert(!asmConsts.empty());
  wasm.removeExport("__start_em_asm");
  wasm.removeExport("__stop_em_asm");
  return asmConsts;
}
327400

328401
struct EmJsWalker : public PostWalker<EmJsWalker> {
@@ -331,6 +404,7 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
331404
std::vector<Export> toRemove;
332405

333406
std::map<std::string, std::string> codeByName;
407+
std::map<Address, size_t> codeAddresses; // map from address to string len
334408

335409
EmJsWalker(Module& _wasm) : wasm(_wasm), stringTracker(_wasm) {}
336410

@@ -353,8 +427,9 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
353427
}
354428
auto* addrConst = consts.list[0];
355429
int64_t address = addrConst->value.getInteger();
356-
auto code = stringTracker.codeForConstAddr(address);
430+
auto code = stringTracker.stringAtAddr(address);
357431
codeByName[funcName] = code;
432+
codeAddresses[address] = strlen(code) + 1;
358433
}
359434
};
360435

@@ -366,6 +441,27 @@ EmJsWalker findEmJsFuncsAndReturnWalker(Module& wasm) {
366441
wasm.removeExport(exp.name);
367442
wasm.removeFunction(exp.value);
368443
}
444+
445+
// With newer versions of emscripten/llvm we pack all EM_JS strings into
446+
// a single segment.
447+
// We can detect this by checking for segments that contain only JS strings.
448+
// When we find such segments we remove them from the final binary.
449+
for (Index i = 0; i < wasm.memory.segments.size(); i++) {
450+
Address start = walker.stringTracker.segmentOffsets[0];
451+
Address cur = start;
452+
453+
while (cur < start + wasm.memory.segments[i].data.size()) {
454+
if (walker.codeAddresses.count(cur) == 0) {
455+
break;
456+
}
457+
cur.addr += walker.codeAddresses[cur];
458+
}
459+
460+
if (cur == start + wasm.memory.segments[i].data.size()) {
461+
// Entire segment contains only JS strings. Remove it.
462+
removeSegment(wasm, i);
463+
}
464+
}
369465
return walker;
370466
}
371467

@@ -383,16 +479,15 @@ std::string EmscriptenGlueGenerator::generateEmscriptenMetadata() {
383479
std::stringstream meta;
384480
meta << "{\n";
385481

386-
AsmConstWalker emAsmWalker =
387-
findEmAsmConstsAndReturnWalker(wasm, minimizeWasmChanges);
482+
std::vector<AsmConst> asmConsts = findEmAsmConsts(wasm, minimizeWasmChanges);
388483

389484
// print
390485
commaFirst = true;
391-
if (!emAsmWalker.asmConsts.empty()) {
486+
if (!asmConsts.empty()) {
392487
meta << " \"asmConsts\": {";
393-
for (auto& asmConst : emAsmWalker.asmConsts) {
488+
for (auto& asmConst : asmConsts) {
394489
meta << nextElement();
395-
meta << '"' << asmConst.id << "\": \"" << asmConst.code << "\"";
490+
meta << '"' << asmConst.id << "\": \"" << escape(asmConst.code) << "\"";
396491
}
397492
meta << "\n },\n";
398493
}
@@ -405,7 +500,7 @@ std::string EmscriptenGlueGenerator::generateEmscriptenMetadata() {
405500
auto& name = pair.first;
406501
auto& code = pair.second;
407502
meta << nextElement();
408-
meta << '"' << name << "\": \"" << code << '"';
503+
meta << '"' << name << "\": \"" << escape(code) << '"';
409504
}
410505
meta << "\n },\n";
411506
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
;; Test that em_asm strings are extracted correctly when the __start_em_asm
2+
;; and __stop_em_asm globals are exported.
3+
4+
;; RUN: wasm-emscripten-finalize %s -S | filecheck %s
5+
6+
;; Check that the data segment that contains only EM_ASM strings is resized to
7+
;; zero, and that the strings are extracted into the metadata.
8+
9+
;; CHECK: (data (i32.const 100) "normal data")
10+
;; CHECK-NEXT: (data (i32.const 512) "")
11+
;; CHECK-NEXT: (data (i32.const 1024) "more data")
12+
13+
;; CHECK: "asmConsts": {
14+
;; CHECK-NEXT: "512": "{ console.log('JS hello'); }",
15+
;; CHECK-NEXT: "541": "{ console.log('hello again'); }"
16+
;; CHECK-NEXT: },
17+
18+
;; Check that the exports are removed
19+
;; CHECK-NOT: export
20+
21+
(module
22+
(memory 1 1)
23+
(global (export "__start_em_asm") i32 (i32.const 512))
24+
(global (export "__stop_em_asm") i32 (i32.const 573))
25+
26+
(data (i32.const 100) "normal data")
27+
(data (i32.const 512) "{ console.log('JS hello'); }\00{ console.log('hello again'); }\00")
28+
(data (i32.const 1024) "more data")
29+
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
;; Test that em_asm strings are extracted correctly when the __start_em_asm
2+
;; and __stop_em_asm globals are exported.
3+
4+
;; RUN: wasm-emscripten-finalize %s -S | filecheck %s
5+
6+
;; Check for the case when __start_em_asm and __stop_em_asm don't define an
7+
;; entire segment. In this case we preserve the segment but zero the data.
8+
9+
;; CHECK: (data (i32.const 512) "xx\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00yy")
10+
11+
;; CHECK: "asmConsts": {
12+
;; CHECK-NEXT: "514": "{ console.log('JS hello'); }",
13+
;; CHECK-NEXT: "543": "{ console.log('hello again'); }"
14+
;; CHECK-NEXT: },
15+
16+
;; Check that the exports are removed
17+
;; CHECK-NOT: export
18+
19+
(module
20+
(memory 1 1)
21+
(global (export "__start_em_asm") i32 (i32.const 514))
22+
(global (export "__stop_em_asm") i32 (i32.const 575))
23+
(data (i32.const 512) "xx{ console.log('JS hello'); }\00{ console.log('hello again'); }\00yy")
24+
)

test/lit/wasm-emscripten-finalize/em_js.wat

+20-5
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,32 @@
33

44
;; RUN: wasm-emscripten-finalize %s -S | filecheck %s
55

6-
;; Both functions should be stripped from the binary
6+
;; All functions should be stripped from the binary, regardless
7+
;; of internal name
78
;; CHECK-NOT: (func
89

10+
;; The data section that contains only em_js strings should
11+
;; be stripped.
12+
;; CHECK-NOT: (i32.const 512) "Only em_js strings here\00")
13+
14+
;; Data sections that also contain other stuff should not be stripped
15+
;; CHECK: (data (i32.const 1024) "some JS string data\00xxx")
16+
;; CHECK: (data (i32.const 2048) "more JS string data\00yyy")
17+
918
;; CHECK: "emJsFuncs": {
10-
;; CHECK-NEXT: "bar": "more JS string dara",
11-
;; CHECK-NEXT: "foo": "some JS string"
19+
;; CHECK-NEXT: "bar": "more JS string data",
20+
;; CHECK-NEXT: "baz": "Only em_js strings here
21+
;; CHECK-NEXT: "foo": "some JS string data"
1222
;; CHECK-NEXT: },
1323

1424
(module
1525
(memory 1 1)
16-
(data (i32.const 1024) "some JS string\00")
17-
(data (i32.const 2048) "more JS string dara\00")
26+
(data (i32.const 512) "Only em_js strings here\00")
27+
(data (i32.const 1024) "some JS string data\00xxx")
28+
(data (i32.const 2048) "more JS string data\00yyy")
1829
(export "__em_js__foo" (func $__em_js__foo))
1930
(export "__em_js__bar" (func $bar))
31+
(export "__em_js__baz" (func $baz))
2032
;; Name matches export name
2133
(func $__em_js__foo (result i32)
2234
(i32.const 1024)
@@ -25,4 +37,7 @@
2537
(func $bar (result i32)
2638
(i32.const 2048)
2739
)
40+
(func $baz (result i32)
41+
(i32.const 512)
42+
)
2843
)

0 commit comments

Comments
 (0)