Skip to content

Commit 2124bd4

Browse files
committed
finalize: Strip segments that contain only EM_ASM/EM_JS data
If we find a data segment whose entire contents are EM_JS or EM_ASM strings, then strip it from the binary. See: emscripten-core/emscripten#13443
1 parent 12d8090 commit 2124bd4

File tree

3 files changed

+79
-10
lines changed

3 files changed

+79
-10
lines changed

src/binaryen-c.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1625,9 +1625,9 @@ BINARYEN_API void BinaryenMemoryInitSetSize(BinaryenExpressionRef expr,
16251625

16261626
// DataDrop
16271627

1628-
// Gets the index of the segment being dropped by a `memory.drop` expression.
1628+
// Gets the index of the segment being dropped by a `data.drop` expression.
16291629
BINARYEN_API uint32_t BinaryenDataDropGetSegment(BinaryenExpressionRef expr);
1630-
// Sets the index of the segment being dropped by a `memory.drop` expression.
1630+
// Sets the index of the segment being dropped by a `data.drop` expression.
16311631
BINARYEN_API void BinaryenDataDropSetSegment(BinaryenExpressionRef expr,
16321632
uint32_t segmentIndex);
16331633

src/wasm/wasm-emscripten.cpp

+57-3
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,16 @@ class StringConstantTracker {
110110
StringConstantTracker(Module& wasm) : wasm(wasm) { calcSegmentOffsets(); }
111111

112112
std::string codeForConstAddr(int64_t address) {
113-
const char* str = stringAtAddr(address);
113+
char* str = stringAtAddr(address);
114114
if (!str) {
115115
Fatal() << "unable to find data for ASM/EM_JS const at: " << address;
116116
}
117117
return escape(str);
118118
}
119119

120+
std::set<Index> segmentsUsed;
121+
std::vector<Address> segmentOffsets; // segment index => address offset
122+
120123
private:
121124
void calcSegmentOffsets() {
122125
std::unordered_map<Index, Address> passiveOffsets;
@@ -172,20 +175,20 @@ class StringConstantTracker {
172175
}
173176
}
174177

175-
const char* stringAtAddr(Address address) {
178+
char* stringAtAddr(Address address) {
176179
for (unsigned i = 0; i < wasm.memory.segments.size(); ++i) {
177180
Memory::Segment& segment = wasm.memory.segments[i];
178181
Address offset = segmentOffsets[i];
179182
if (offset != UNKNOWN_OFFSET && address >= offset &&
180183
address < offset + segment.data.size()) {
184+
segmentsUsed.insert(i);
181185
return &segment.data[address - offset];
182186
}
183187
}
184188
return nullptr;
185189
}
186190

187191
Module& wasm;
188-
std::vector<Address> segmentOffsets; // segment index => address offset
189192
};
190193

191194
enum class Proxying {
@@ -380,6 +383,7 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
380383
std::vector<Export> toRemove;
381384

382385
std::map<std::string, std::string> codeByName;
386+
std::map<Address, size_t> codeAddresses; // map from address to string len
383387

384388
EmJsWalker(Module& _wasm) : wasm(_wasm), stringTracker(_wasm) {}
385389

@@ -404,9 +408,34 @@ struct EmJsWalker : public PostWalker<EmJsWalker> {
404408
int64_t address = addrConst->value.getInteger();
405409
auto code = stringTracker.codeForConstAddr(address);
406410
codeByName[funcName] = code;
411+
codeAddresses[address] = code.size() + 1;
407412
}
408413
};
409414

415+
// Walker pass that neutralizes every instruction referring to one particular
// data segment, so that the segment's contents can subsequently be emptied.
//
// `segment` is the index of the data segment whose uses are removed. The pass
// reports itself as function-parallel, and create() hands back a new instance
// configured for the same segment index.
struct SegmentRemover : WalkerPass<PostWalker<SegmentRemover>> {
  SegmentRemover(Index segment) : segment(segment) {}

  bool isFunctionParallel() override { return true; }

  Pass* create() override { return new SegmentRemover(segment); }

  // Replaces a `memory.init` of the target segment. The operands are kept
  // alive as drops rather than discarded, so that any side effects they carry
  // are preserved; only the copy out of the segment itself goes away.
  void visitMemoryInit(MemoryInit* curr) {
    if (segment == curr->segment) {
      Builder builder(*getModule());
      replaceCurrent(builder.makeBlock(
        std::vector<Expression*>{builder.makeDrop(curr->dest),
                                 builder.makeDrop(curr->offset),
                                 builder.makeDrop(curr->size)}));
    }
  }

  // Replaces a `data.drop` of the target segment with a nop; it has no
  // operands, so there is nothing else to preserve.
  void visitDataDrop(DataDrop* curr) {
    if (segment == curr->segment) {
      Builder builder(*getModule());
      replaceCurrent(builder.makeNop());
    }
  }

  // Index of the data segment whose uses are being removed.
  Index segment;
};
438+
410439
EmJsWalker fixEmJsFuncsAndReturnWalker(Module& wasm) {
411440
EmJsWalker walker(wasm);
412441
walker.walkModule(&wasm);
@@ -415,6 +444,31 @@ EmJsWalker fixEmJsFuncsAndReturnWalker(Module& wasm) {
415444
wasm.removeExport(exp.name);
416445
wasm.removeFunction(exp.value);
417446
}
447+
448+
449+
// With newer versions of emscripten/llvm all EM_JS strings are packed into a
// single segment. We can detect this by checking whether the strings found
// cover the entire segment, and if so strip that segment from the binary.
for (Index segIndex : walker.stringTracker.segmentsUsed) {
  // Each segment must be scanned from its *own* base address; using the
  // offset of segment 0 for every segment would miss any all-EM_JS segment
  // that is not the first one.
  const Address start = walker.stringTracker.segmentOffsets[segIndex];
  const Address end = start + wasm.memory.segments[segIndex].data.size();

  // Hop from string to string (each recorded length includes the trailing
  // NUL) until we either hit a byte that is not the start of a known EM_JS
  // string or reach the end of the segment.
  Address cur = start;
  while (cur < end) {
    auto found = walker.codeAddresses.find(cur);
    if (found == walker.codeAddresses.end()) {
      break;
    }
    cur.addr += found->second;
  }

  // Only an exact landing on the end means the strings tile the segment.
  if (cur == end) {
    PassRunner runner(&wasm);
    SegmentRemover(segIndex).run(&runner, &wasm);
    // Resize the segment to zero. In theory we should completely remove it,
    // but that would mean renumbering all the segments that follow.
    wasm.memory.segments[segIndex].data.resize(0);
  }
}
418472
return walker;
419473
}
420474

test/lit/wasm-emscripten-finalize/em_js.wat

+20-5
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,32 @@
33

44
;; RUN: wasm-emscripten-finalize %s -S | filecheck %s
55

6-
;; Both functions should be stripped from the binary
6+
;; All functions should be stripped from the binary, regardless
7+
;; of internal name
78
;; CHECK-NOT: (func
89

10+
;; The data section that contains only em_js strings should
11+
;; be stripped.
12+
;; CHECK-NOT: (i32.const 512) "Only em_js strings here\00")
13+
14+
;; Data sections that also contain other stuff should not be stripped
15+
;; CHECK: (data (i32.const 1024) "some JS string data\00xxx")
16+
;; CHECK: (data (i32.const 2048) "more JS string data\00yyy")
17+
918
;; CHECK: "emJsFuncs": {
10-
;; CHECK-NEXT: "bar": "more JS string dara",
11-
;; CHECK-NEXT: "foo": "some JS string"
19+
;; CHECK-NEXT: "bar": "more JS string data",
20+
;; CHECK-NEXT: "baz": "Only em_js strings here
21+
;; CHECK-NEXT: "foo": "some JS string data"
1222
;; CHECK-NEXT: },
1323

1424
(module
1525
(memory 1 1)
16-
(data (i32.const 1024) "some JS string\00")
17-
(data (i32.const 2048) "more JS string dara\00")
26+
(data (i32.const 512) "Only em_js strings here\00")
27+
(data (i32.const 1024) "some JS string data\00xxx")
28+
(data (i32.const 2048) "more JS string data\00yyy")
1829
(export "__em_js__foo" (func $__em_js__foo))
1930
(export "__em_js__bar" (func $bar))
31+
(export "__em_js__baz" (func $baz))
2032
;; Name matches export name
2133
(func $__em_js__foo (result i32)
2234
(i32.const 1024)
@@ -25,4 +37,7 @@
2537
(func $bar (result i32)
2638
(i32.const 2048)
2739
)
40+
(func $baz (result i32)
41+
(i32.const 512)
42+
)
2843
)

0 commit comments

Comments
 (0)