Skip to content

Commit 0dea714

Browse files
committed
finalize: Strip segments that contain only EM_ASM/EM_JS data
If we find a data segment whose entire contents is EM_JS or EM_ASM strings then strip it from the binary. See: emscripten-core/emscripten#13443
1 parent 0540d2c commit 0dea714

File tree

3 files changed

+79
-10
lines changed

3 files changed

+79
-10
lines changed

src/binaryen-c.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1625,9 +1625,9 @@ BINARYEN_API void BinaryenMemoryInitSetSize(BinaryenExpressionRef expr,
16251625

16261626
// DataDrop
16271627

1628-
// Gets the index of the segment being dropped by a `memory.drop` expression.
1628+
// Gets the index of the segment being dropped by a `data.drop` expression.
16291629
BINARYEN_API uint32_t BinaryenDataDropGetSegment(BinaryenExpressionRef expr);
1630-
// Sets the index of the segment being dropped by a `memory.drop` expression.
1630+
// Sets the index of the segment being dropped by a `data.drop` expression.
16311631
BINARYEN_API void BinaryenDataDropSetSegment(BinaryenExpressionRef expr,
16321632
uint32_t segmentIndex);
16331633

src/wasm/wasm-emscripten.cpp

+57-3
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,16 @@ class StringConstantTracker {
111111
: wasm(wasm) { calcSegmentOffsets(); }
112112

113113
std::string codeForConstAddr(int64_t address) {
114-
const char* str = stringAtAddr(address);
114+
char* str = stringAtAddr(address);
115115
if (!str) {
116116
Fatal() << "unable to find data for ASM/EM_JS const at: " << address;
117117
}
118118
return escape(str);
119119
}
120120

121+
std::set<Index> segmentsUsed;
122+
std::vector<Address> segmentOffsets; // segment index => address offset
123+
121124
private:
122125
void calcSegmentOffsets() {
123126
std::unordered_map<Index, Address> passiveOffsets;
@@ -173,20 +176,20 @@ class StringConstantTracker {
173176
}
174177
}
175178

176-
const char* stringAtAddr(Address address) {
179+
// Finds the data segment whose address range contains `address`.
// On a match, the segment's index is recorded in `segmentsUsed` and a pointer
// to the byte at that address inside the segment data is returned. Returns
// nullptr when no segment matches.
char* stringAtAddr(Address address) {
  auto& segments = wasm.memory.segments;
  for (unsigned index = 0; index < segments.size(); ++index) {
    Address base = segmentOffsets[index];
    // Skip segments whose offset is marked UNKNOWN_OFFSET.
    if (base == UNKNOWN_OFFSET) {
      continue;
    }
    Memory::Segment& candidate = segments[index];
    // Skip segments whose [base, base + size) range does not hold the address.
    if (address < base || address >= base + candidate.data.size()) {
      continue;
    }
    segmentsUsed.insert(index);
    return &candidate.data[address - base];
  }
  return nullptr;
}
187191

188192
Module& wasm;
189-
std::vector<Address> segmentOffsets; // segment index => address offset
190193
};
191194

192195
enum class Proxying {
@@ -381,6 +384,7 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
381384
std::vector<Export> toRemove;
382385

383386
std::map<std::string, std::string> codeByName;
387+
std::map<Address, size_t> codeAddresses; // map from address to string len
384388

385389
EmJsWalker(Module& _wasm)
386390
: wasm(_wasm), stringTracker(_wasm) {}
@@ -406,9 +410,34 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
406410
int64_t address = addrConst->value.getInteger();
407411
auto code = stringTracker.codeForConstAddr(address);
408412
codeByName[funcName] = code;
413+
codeAddresses[address] = code.size() + 1;
409414
}
410415
};
411416

417+
// Function-parallel pass that neutralizes every `memory.init` and `data.drop`
// that refers to one particular data segment, so that the segment's contents
// can be discarded afterwards.
struct SegmentRemover : WalkerPass<PostWalker<SegmentRemover>> {
  SegmentRemover(Index segment) : segment(segment) {}

  bool isFunctionParallel() override { return true; }

  // Produces a fresh pass instance configured for the same segment.
  Pass* create() override { return new SegmentRemover(segment); }

  // A memory.init of the removed segment must no longer copy anything, but
  // its operands may have side effects. Keep evaluating them and drop the
  // results instead of discarding the whole expression.
  void visitMemoryInit(MemoryInit* curr) {
    if (segment == curr->segment) {
      Builder builder(*getModule());
      replaceCurrent(builder.blockify(builder.makeDrop(curr->dest),
                                      builder.makeDrop(curr->offset),
                                      builder.makeDrop(curr->size)));
    }
  }

  // data.drop carries no operands, so it can simply become a nop.
  void visitDataDrop(DataDrop* curr) {
    if (segment == curr->segment) {
      Builder builder(*getModule());
      replaceCurrent(builder.makeNop());
    }
  }

  // Index of the data segment whose uses are being removed.
  Index segment;
};
440+
412441
EmJsWalker fixEmJsFuncsAndReturnWalker(Module& wasm) {
413442
EmJsWalker walker(wasm);
414443
walker.walkModule(&wasm);
@@ -417,6 +446,31 @@ EmJsWalker fixEmJsFuncsAndReturnWalker(Module& wasm) {
417446
wasm.removeExport(exp.name);
418447
wasm.removeFunction(exp.value);
419448
}
449+
450+
451+
for (Index segIndex : walker.stringTracker.segmentsUsed) {
  // Measure from this segment's own offset. Using segment 0's offset for
  // every segment would compare the recorded string addresses against the
  // wrong address range for all segments but the first.
  Address start = walker.stringTracker.segmentOffsets[segIndex];
  const auto end = start + wasm.memory.segments[segIndex].data.size();
  Address cur = start;

  // With newer versions of emscripten/llvm we pack all EM_JS strings into a
  // single segment.
  // We can detect this by checking whether the strings found cover the
  // entire segment: starting at the segment's first byte, hop from one
  // recorded string to the next until we reach the end or hit a gap.
  while (cur < end) {
    auto found = walker.codeAddresses.find(cur);
    if (found == walker.codeAddresses.end()) {
      break;
    }
    // The mapped value is the string length including its terminator.
    cur.addr += found->second;
  }

  if (cur == end) {
    PassRunner runner(&wasm);
    SegmentRemover(segIndex).run(&runner, &wasm);
    // Resize the segment to zero. In theory we should completely remove it
    // but that would mean renumbering the segments that follow.
    wasm.memory.segments[segIndex].data.resize(0);
  }
}
420474
return walker;
421475
}
422476

test/lit/wasm-emscripten-finalize/em_js.wat

+20-5
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,32 @@
33

44
;; RUN: wasm-emscripten-finalize %s -S | filecheck %s
55

6-
;; Both functions should be stripped from the binary
6+
;; All functions should be stripped from the binary, regardless
7+
;; of internal name
78
;; CHECK-NOT: (func
89

10+
;; The data section that contains only em_js strings should
11+
;; be stripped.
12+
;; CHECK-NOT: (i32.const 512) "Only em_js strings here\00")
13+
14+
;; Data sections that also contain other stuff should not be stripped
15+
;; CHECK: (data (i32.const 1024) "some JS string data\00xxx")
16+
;; CHECK: (data (i32.const 2048) "more JS string data\00yyy")
17+
918
;; CHECK: "emJsFuncs": {
10-
;; CHECK-NEXT: "bar": "more JS string dara",
11-
;; CHECK-NEXT: "foo": "some JS string"
19+
;; CHECK-NEXT: "bar": "more JS string data",
20+
;; CHECK-NEXT: "baz": "Only em_js strings here
21+
;; CHECK-NEXT: "foo": "some JS string data"
1222
;; CHECK-NEXT: },
1323

1424
(module
1525
(memory 1 1)
16-
(data (i32.const 1024) "some JS string\00")
17-
(data (i32.const 2048) "more JS string dara\00")
26+
(data (i32.const 512) "Only em_js strings here\00")
27+
(data (i32.const 1024) "some JS string data\00xxx")
28+
(data (i32.const 2048) "more JS string data\00yyy")
1829
(export "__em_js__foo" (func $__em_js__foo))
1930
(export "__em_js__bar" (func $bar))
31+
(export "__em_js__baz" (func $baz))
2032
;; Name matches export name
2133
(func $__em_js__foo (result i32)
2234
(i32.const 1024)
@@ -25,4 +37,7 @@
2537
(func $bar (result i32)
2638
(i32.const 2048)
2739
)
40+
(func $baz (result i32)
41+
(i32.const 512)
42+
)
2843
)

0 commit comments

Comments
 (0)