Skip to content

Commit 01b115a

Browse files
committed
finalize: strip segments that contain only EM_ASM/EM_JS data
If we find a data segment whose entire contents are EM_JS or EM_ASM strings, then strip it from the binary. See: emscripten-core/emscripten#13443
1 parent 74ccdb8 commit 01b115a

File tree

3 files changed

+76
-8
lines changed

3 files changed

+76
-8
lines changed

src/binaryen-c.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1625,9 +1625,9 @@ BINARYEN_API void BinaryenMemoryInitSetSize(BinaryenExpressionRef expr,
16251625

16261626
// DataDrop
16271627

1628-
// Gets the index of the segment being dropped by a `memory.drop` expression.
1628+
// Gets the index of the segment being dropped by a `data.drop` expression.
16291629
BINARYEN_API uint32_t BinaryenDataDropGetSegment(BinaryenExpressionRef expr);
1630-
// Sets the index of the segment being dropped by a `memory.drop` expression.
1630+
// Sets the index of the segment being dropped by a `data.drop` expression.
16311631
BINARYEN_API void BinaryenDataDropSetSegment(BinaryenExpressionRef expr,
16321632
uint32_t segmentIndex);
16331633

src/wasm/wasm-emscripten.cpp

+54-1
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ class StringConstantTracker {
117117
return escape(str);
118118
}
119119

120+
std::vector<Address> segmentOffsets; // segment index => address offset
121+
120122
private:
121123
void calcSegmentOffsets() {
122124
std::unordered_map<Index, Address> passiveOffsets;
@@ -185,7 +187,6 @@ class StringConstantTracker {
185187
}
186188

187189
Module& wasm;
188-
std::vector<Address> segmentOffsets; // segment index => address offset
189190
};
190191

191192
enum class Proxying {
@@ -380,6 +381,7 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
380381
std::vector<Export> toRemove;
381382

382383
std::map<std::string, std::string> codeByName;
384+
std::map<Address, size_t> codeAddresses; // map from address to string len
383385

384386
EmJsWalker(Module& _wasm) : wasm(_wasm), stringTracker(_wasm) {}
385387

@@ -404,7 +406,32 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
404406
int64_t address = addrConst->value.getInteger();
405407
auto code = stringTracker.codeForConstAddr(address);
406408
codeByName[funcName] = code;
409+
codeAddresses[address] = code.size() + 1;
410+
}
411+
};
412+
413+
// Walker pass that neutralizes every instruction referring to one particular
// data segment: each `MemoryInit` and `DataDrop` whose segment index equals
// `segment` is replaced with a nop. Expressions that refer to other segments
// are left untouched.
struct SegmentRemover : WalkerPass<PostWalker<SegmentRemover>> {
414+
// `segment` is the index of the data segment whose users should be removed.
SegmentRemover(Index segment) : segment(segment) {}
415+
416+
// Declares this pass as function-parallel.
bool isFunctionParallel() override { return true; }
417+
418+
// Produces a new pass instance that targets the same segment index
// (presumably used by the pass runner when running functions in parallel).
Pass* create() override { return new SegmentRemover(segment); }
419+
420+
// Replaces a `MemoryInit` of the targeted segment with a nop.
// NOTE(review): the operands of the replaced expression are discarded along
// with it; confirm they can never carry side effects here.
void visitMemoryInit(MemoryInit* curr) {
421+
if (segment == curr->segment) {
422+
Builder builder(*getModule());
423+
replaceCurrent(builder.makeNop());
424+
}
407425
}
426+
427+
// Replaces a `DataDrop` of the targeted segment with a nop.
void visitDataDrop(DataDrop* curr) {
428+
if (segment == curr->segment) {
429+
Builder builder(*getModule());
430+
replaceCurrent(builder.makeNop());
431+
}
432+
}
433+
434+
// Index of the data segment whose `MemoryInit`/`DataDrop` users are removed.
Index segment;
408435
};
409436

410437
EmJsWalker fixEmJsFuncsAndReturnWalker(Module& wasm) {
@@ -415,6 +442,32 @@ EmJsWalker fixEmJsFuncsAndReturnWalker(Module& wasm) {
415442
wasm.removeExport(exp.name);
416443
wasm.removeFunction(exp.value);
417444
}
445+
446+
// With newer versions of emscripten/llvm we pack all EM_JS strings into
447+
// a single segment.
448+
// We can detect this by checking for segments that contain only JS strings.
449+
// When we find such segments we remove them from the final binary.
450+
// Scan every data segment and strip those that consist solely of
// back-to-back EM_JS strings.
for (Index i = 0; i < wasm.memory.segments.size(); i++) {
451+
// Use the base address of *this* segment (segmentOffsets is indexed by
// segment index). Reading entry 0 here would scan every segment from the
// first segment's address and so only ever work for segment 0.
Address start = walker.stringTracker.segmentOffsets[i];
452+
Address cur = start;
453+
454+
// Hop from one recorded EM_JS string to the next. codeAddresses maps a
// string's start address to its length including the NUL terminator, so the
// walk reaches the segment end exactly when the segment holds nothing else.
while (cur < start + wasm.memory.segments[i].data.size()) {
455+
auto found = walker.codeAddresses.find(cur);
if (found == walker.codeAddresses.end()) {
456+
break;
457+
}
458+
cur.addr += found->second;
459+
}
460+
461+
if (cur == start + wasm.memory.segments[i].data.size()) {
462+
// The entire segment contains only JS strings. Remove it.
463+
PassRunner runner(&wasm);
464+
SegmentRemover(i).run(&runner, &wasm);
465+
// Resize the segment to zero. In theory we should completely remove it,
466+
// but that would mean renumbering all the segments that follow, along
467+
// with every instruction that refers to them by index.
468+
wasm.memory.segments[i].data.resize(0);
469+
}
470+
}
418471
return walker;
419472
}
420473

test/lit/wasm-emscripten-finalize/em_js.wat

+20-5
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,32 @@
33

44
;; RUN: wasm-emscripten-finalize %s -S | filecheck %s
55

6-
;; Both functions should be stripped from the binary
6+
;; All functions should be stripped from the binary, regardless
7+
;; of internal name
78
;; CHECK-NOT: (func
89

10+
;; The data section that contains only em_js strings should
11+
;; be stripped.
12+
;; CHECK-NOT: (i32.const 512) "Only em_js strings here\00")
13+
14+
;; Data sections that also contain other stuff should not be stripped
15+
;; CHECK: (data (i32.const 1024) "some JS string data\00xxx")
16+
;; CHECK: (data (i32.const 2048) "more JS string data\00yyy")
17+
918
;; CHECK: "emJsFuncs": {
10-
;; CHECK-NEXT: "bar": "more JS string dara",
11-
;; CHECK-NEXT: "foo": "some JS string"
19+
;; CHECK-NEXT: "bar": "more JS string data",
20+
;; CHECK-NEXT: "baz": "Only em_js strings here
21+
;; CHECK-NEXT: "foo": "some JS string data"
1222
;; CHECK-NEXT: },
1323

1424
(module
1525
(memory 1 1)
16-
(data (i32.const 1024) "some JS string\00")
17-
(data (i32.const 2048) "more JS string dara\00")
26+
(data (i32.const 512) "Only em_js strings here\00")
27+
(data (i32.const 1024) "some JS string data\00xxx")
28+
(data (i32.const 2048) "more JS string data\00yyy")
1829
(export "__em_js__foo" (func $__em_js__foo))
1930
(export "__em_js__bar" (func $bar))
31+
(export "__em_js__baz" (func $baz))
2032
;; Name matches export name
2133
(func $__em_js__foo (result i32)
2234
(i32.const 1024)
@@ -25,4 +37,7 @@
2537
(func $bar (result i32)
2638
(i32.const 2048)
2739
)
40+
(func $baz (result i32)
41+
(i32.const 512)
42+
)
2843
)

0 commit comments

Comments
 (0)