Skip to content

Commit 35bbb4e

Browse files
authored
Add experimental implementation of metadata extraction in python. NFC (#15918)
This change is NFC because the default extraction method is still 'binaryen' Fixes: #15250
1 parent c5548ad commit 35bbb4e

File tree

4 files changed

+453
-15
lines changed

4 files changed

+453
-15
lines changed

.circleci/config.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,10 @@ jobs:
363363
test_targets: "posixtest"
364364
test-core0:
365365
executor: bionic
366+
# Temporarily set EMCC_READ_METADATA to compare to ensure that the python
367+
# code matches precisely the output of wasm-emscripten-finalize.
368+
environment:
369+
EMCC_READ_METADATA: "compare"
366370
steps:
367371
- run-tests:
368372
test_targets: "core0"

emscripten.py

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from tools import utils
2828
from tools import gen_struct_info
2929
from tools import webassembly
30+
from tools import extract_metadata
3031
from tools.utils import exit_with_error, path_from_root
3132
from tools.shared import DEBUG, WINDOWS, asmjs_mangle
3233
from tools.shared import treat_as_user_function, strip_prefix
@@ -390,6 +391,62 @@ def remove_trailing_zeros(memfile):
390391
utils.write_binary(memfile, mem_data[:end])
391392

392393

394+
def get_metadata_binaryen(infile, outfile, modify_wasm, args):
  """Extract metadata by running wasm-emscripten-finalize.

  The tool's stdout is captured and parsed with `load_metadata_json`.
  `outfile` is only handed to the tool when `modify_wasm` is set; otherwise
  `None` is passed as the output file.
  """
  finalize_outfile = outfile if modify_wasm else None
  raw_json = building.run_binaryen_command('wasm-emscripten-finalize',
                                           infile=infile,
                                           outfile=finalize_outfile,
                                           args=args,
                                           stdout=subprocess.PIPE)
  return load_metadata_json(raw_json)
402+
403+
404+
def get_metadata_python(infile, outfile, modify_wasm, args):
  """Extract metadata using the python implementation in `extract_metadata`.

  When `modify_wasm` is set, wasm-emscripten-finalize is still run to write
  `outfile`, and the import/export parts of the metadata are then refreshed
  from that output file.

  Returns the metadata dict.
  """
  metadata = extract_metadata.extract_metadata(infile)

  if not modify_wasm:
    if 'main' in metadata['exports']:
      # Mimic a bug in wasm-emscripten-finalize where we don't correctly
      # detect the presence of the main wrapper function unless we are
      # modifying the binary. This is because binaryen doesn't read
      # the function bodies in this mode.
      # TODO(sbc): Remove this once we make the switch away from
      # binaryen metadata.
      metadata['mainReadsParams'] = 1
    return metadata

  # In some cases we still need to modify the wasm file
  # using wasm-emscripten-finalize.
  building.run_binaryen_command('wasm-emscripten-finalize',
                                infile=infile,
                                outfile=outfile,
                                args=args,
                                stdout=subprocess.PIPE)
  # Finalizing can generate new imports, so re-read those parts of the
  # metadata from the finalized binary.
  extract_metadata.update_metadata(outfile, metadata)
  return metadata
426+
427+
428+
# Test function for comparing binaryen vs python metadata.
429+
# Remove this once we go back to having just one method.
430+
def compare_metadata(metadata, pymetadata):
  """Check that binaryen and python metadata agree; report and exit if not.

  Args:
    metadata: metadata dict produced by the binaryen extraction path.
    pymetadata: metadata dict produced by the python extraction path.

  The key sets must be equal, and every value must compare equal (the
  'features' lists are compared order-insensitively). On the first value
  mismatch both values are pretty-printed to stdout and written to
  `first.txt` / `second.txt` (resolved via `path_from_root`) before calling
  `exit_with_error`.
  """
  old_keys = sorted(metadata.keys())
  new_keys = sorted(pymetadata.keys())
  if old_keys != new_keys:
    print(old_keys)
    print(new_keys)
    exit_with_error('metadata keys mismatch')

  for key in metadata:
    old = metadata[key]
    new = pymetadata[key]
    if key == 'features':
      # Feature order is not significant.
      old = sorted(old)
      new = sorted(new)
    if old != new:
      print(key)
      old_text = pprint.pformat(old)
      new_text = pprint.pformat(new)
      # Use context managers so the dump files are flushed and closed
      # before we exit.
      with open(path_from_root('first.txt'), 'w') as f:
        f.write(old_text)
      with open(path_from_root('second.txt'), 'w') as f:
        f.write(new_text)
      print(old_text)
      print(new_text)
      exit_with_error('metadata mismatch')
448+
449+
393450
def finalize_wasm(infile, outfile, memfile):
394451
building.save_intermediate(infile, 'base.wasm')
395452
# tell binaryen to look at the features section, and if there isn't one, to use MVP
@@ -445,11 +502,31 @@ def finalize_wasm(infile, outfile, memfile):
445502

446503
if settings.DEBUG_LEVEL >= 3:
447504
args.append('--dwarf')
448-
stdout = building.run_binaryen_command('wasm-emscripten-finalize',
449-
infile=infile,
450-
outfile=outfile if modify_wasm else None,
451-
args=args,
452-
stdout=subprocess.PIPE)
505+
506+
# Currently we have two different ways to extract the metadata from the
507+
# wasm binary:
508+
# 1. via wasm-emscripten-finalize (binaryen)
509+
# 2. via local python code
510+
# We also have a 'compare' mode that runs both extraction methods and
511+
# checks that they produce identical results.
512+
read_metadata = os.environ.get('EMCC_READ_METADATA', 'binaryen')
513+
if read_metadata == 'binaryen':
514+
metadata = get_metadata_binaryen(infile, outfile, modify_wasm, args)
515+
elif read_metadata == 'python':
516+
metadata = get_metadata_python(infile, outfile, modify_wasm, args)
517+
elif read_metadata == 'compare':
518+
shutil.copy2(infile, infile + '.bak')
519+
if settings.GENERATE_SOURCE_MAP:
520+
shutil.copy2(infile + '.map', infile + '.map.bak')
521+
pymetadata = get_metadata_python(infile, outfile, modify_wasm, args)
522+
shutil.move(infile + '.bak', infile)
523+
if settings.GENERATE_SOURCE_MAP:
524+
shutil.move(infile + '.map.bak', infile + '.map')
525+
metadata = get_metadata_binaryen(infile, outfile, modify_wasm, args)
526+
compare_metadata(metadata, pymetadata)
527+
else:
528+
assert False
529+
453530
if modify_wasm:
454531
building.save_intermediate(infile, 'post_finalize.wasm')
455532
elif infile != outfile:
@@ -464,7 +541,7 @@ def finalize_wasm(infile, outfile, memfile):
464541
# the dynamic linking case, our loader zeros it out)
465542
remove_trailing_zeros(memfile)
466543

467-
return load_metadata_wasm(stdout)
544+
return metadata
468545

469546

470547
def create_asm_consts(metadata):
@@ -762,7 +839,7 @@ def create_module(sending, receiving, invoke_funcs, metadata):
762839
return module
763840

764841

765-
def load_metadata_wasm(metadata_raw):
842+
def load_metadata_json(metadata_raw):
766843
try:
767844
metadata_json = json.loads(metadata_raw)
768845
except Exception:

tools/extract_metadata.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
# Copyright 2022 The Emscripten Authors. All rights reserved.
2+
# Emscripten is available under two separate licenses, the MIT license and the
3+
# University of Illinois/NCSA Open Source License. Both these licenses can be
4+
# found in the LICENSE file.
5+
6+
from . import webassembly
7+
from .shared import exit_with_error
8+
from .settings import settings
9+
10+
11+
def is_wrapper_function(module, function):
  """Return True if the function body is nothing but `call`s followed by `end`.

  The local declarations at the start of the body are read and skipped.
  Any other opcode makes this return False; a byte that is not a known
  `webassembly.OpCode` is printed and also yields False.
  """
  module.seek(function.offset)

  # Skip over the local declarations; their contents are not needed.
  for _ in range(module.readULEB()):
    module.readULEB()   # local count
    module.read_type()  # local type

  body_end = function.offset + function.size
  while module.tell() != body_end:
    raw = module.readByte()
    try:
      op = webassembly.OpCode(raw)
    except ValueError as e:
      print(e)
      return False
    if op == webassembly.OpCode.END:
      break
    if op != webassembly.OpCode.CALL:
      return False
    module.readULEB()  # callee index, unused

  assert op == webassembly.OpCode.END
  return True
34+
35+
36+
def get_const_expr_value(expr):
  """Evaluate a two-instruction constant expression (`<op> ; end`).

  `expr` is a sequence of `(opcode, immediates)` pairs. For `i32.const` and
  `i64.const` the single immediate is returned. For `global.get` 0 is
  returned (presumably because the referenced global's value is not known
  here). Any other opcode is reported through `exit_with_error`.
  """
  assert len(expr) == 2
  first, last = expr
  assert last[0] == webassembly.OpCode.END

  opcode, immediates = first
  if opcode in (webassembly.OpCode.GLOBAL_GET,):
    return 0
  if opcode not in (webassembly.OpCode.I32_CONST, webassembly.OpCode.I64_CONST):
    exit_with_error('unexpected opcode in const expr: ' + str(opcode))
    return None
  assert len(immediates) == 1
  return immediates[0]
47+
48+
49+
def get_global_value(globl):
  """Return the value of the constant initializer expression of `globl`."""
  init_expr = globl.init
  return get_const_expr_value(init_expr)
51+
52+
53+
def find_segment_with_address(module, address, size=0):
  """Locate the data segment containing `address`.

  Segments with an init expression are searched first: a segment matches when
  `address` equals its start offset or lies strictly inside it. If none
  match, the first segment without an init expression whose size equals
  `size` is used.

  Returns a `(segment, offset_within_segment)` tuple, or None if nothing
  matches.
  """
  all_segments = module.get_segments()

  for candidate in all_segments:
    if not candidate.init:
      continue
    base = get_const_expr_value(candidate.init)
    if base is None:
      continue
    if base == address or (base < address < base + candidate.size):
      return (candidate, address - base)

  for candidate in all_segments:
    if not candidate.init and candidate.size == size:
      return (candidate, 0)

  return None
70+
71+
72+
def data_to_string(data):
  """Decode UTF-8 `data` and collapse each doubled backslash into one.

  At least one test (tests/utf8.cpp) writes a double backslash in the C++
  source to represent a single backslash, because these strings were
  historically round-tripped through JSON, where a single backslash is an
  escape character. That escaping is technically no longer needed, but the
  doubled backslashes are still stripped here to stay compatible.
  """
  text = data.decode('utf8')
  return text.replace('\\\\', '\\')
83+
84+
85+
def get_asm_strings(module, globls, export_map, imported_globals):
  """Collect the EM_ASM strings stored between __start_em_asm and __stop_em_asm.

  Args:
    module: the parsed wasm module (used to find and read data segments).
    globls: the module's defined globals.
    export_map: mapping of export name to export.
    imported_globals: number of imported globals; subtracted from an export's
      index to index into `globls`.

  Returns:
    A dict mapping each string's address (as a decimal string) to its decoded
    text. Empty if either bounding symbol is not exported.
  """
  if '__start_em_asm' not in export_map or '__stop_em_asm' not in export_map:
    return {}

  start_export = export_map['__start_em_asm']
  stop_export = export_map['__stop_em_asm']
  start_addr = get_global_value(globls[start_export.index - imported_globals])
  end_addr = get_global_value(globls[stop_export.index - imported_globals])
  size = end_addr - start_addr

  found = find_segment_with_address(module, start_addr, size)
  if not found:
    exit_with_error('unable to find segment starting at __start_em_asm: %s' % start_addr)
  seg, seg_offset = found

  data = module.readAt(seg.offset, seg.size)
  # Address that corresponds to data[0].
  seg_base_addr = start_addr - seg_offset
  region_end = seg_offset + size

  asm_strings = {}
  str_start = seg_offset
  while str_start < region_end:
    str_end = data.find(b'\0', str_start)
    if str_end == -1:
      # No terminator left in the segment data. Previously the -1 was used as
      # a slice bound (dropping the last byte) and then reset str_start to 0,
      # which could loop forever. Treat the remaining bytes as the final
      # string and stop.
      if str_start < len(data):
        asm_strings[str(seg_base_addr + str_start)] = data_to_string(data[str_start:])
      break
    asm_strings[str(seg_base_addr + str_start)] = data_to_string(data[str_start:str_end])
    str_start = str_end + 1
  return asm_strings
111+
112+
113+
def get_main_reads_params(module, export_map, imported_funcs):
  """Return 1 if `main` is considered to read its parameters, else 0.

  Always 1 when `settings.STANDALONE_WASM` is set. Otherwise 0 when there is
  no function export named 'main', or when that function is a pure wrapper
  according to `is_wrapper_function`; 1 in all remaining cases.
  """
  if settings.STANDALONE_WASM:
    return 1

  main_export = export_map.get('main')
  if not main_export or main_export.kind != webassembly.ExternType.FUNC:
    return 0

  # Export indices include imported functions; the functions list does not.
  defined_funcs = module.get_functions()
  main_body = defined_funcs[main_export.index - imported_funcs]
  return 0 if is_wrapper_function(module, main_body) else 1
127+
128+
129+
def get_names_globals(globls, exports, imported_globals):
  """Map each exported global's name to its initial value, as a string.

  The `__start_em_asm` / `__stop_em_asm` markers and any `__em_js__*` exports
  are left out.
  """
  result = {}
  for exp in exports:
    if exp.kind != webassembly.ExternType.GLOBAL:
      continue
    name = exp.name
    if name in ('__start_em_asm', '__stop_em_asm') or name.startswith('__em_js__'):
      continue
    defined_global = globls[exp.index - imported_globals]
    result[name] = str(get_global_value(defined_global))
  return result
138+
139+
140+
def update_metadata(filename, metadata):
  """Refresh the import/export entries of `metadata` from the wasm at `filename`.

  Overwrites `metadata['declares']`, `metadata['exports']` and
  `metadata['invokeFuncs']` in place. Function imports starting with
  'invoke_' go to invokeFuncs; other function imports go to declares unless
  they are named in `metadata['emJsFuncs']`.
  """
  em_js_names = set(metadata['emJsFuncs'])
  module = webassembly.Module(filename)

  func_imports = [imp.field for imp in module.get_imports()
                  if imp.kind == webassembly.ExternType.FUNC]
  invokes = [name for name in func_imports if name.startswith('invoke_')]
  declared = [name for name in func_imports
              if not name.startswith('invoke_') and name not in em_js_names]

  func_exports = []
  for exp in module.get_exports():
    if exp.kind == webassembly.ExternType.FUNC:
      func_exports.append(exp.name)

  metadata['declares'] = declared
  metadata['exports'] = func_exports
  metadata['invokeFuncs'] = invokes
156+
157+
158+
def get_string_at(module, address):
  """Read the NUL-terminated string stored at `address` in the module's data.

  Args:
    module: the parsed wasm module.
    address: memory address of the first byte of the string.

  Returns:
    The decoded string (see `data_to_string`).

  Reports an error via `exit_with_error` if no data segment contains
  `address`.
  """
  found = find_segment_with_address(module, address)
  if not found:
    # Previously this surfaced as an opaque TypeError when unpacking None.
    exit_with_error('unable to find segment containing address: %s' % address)
  seg, offset = found
  data = module.readAt(seg.offset, seg.size)
  str_end = data.find(b'\0', offset)
  if str_end == -1:
    # No terminator inside the segment: take everything up to the end of the
    # segment rather than letting -1 silently chop off the last byte.
    str_end = len(data)
  return data_to_string(data[offset:str_end])
163+
164+
165+
def extract_metadata(filename):
  """Build the emscripten metadata dict by parsing the wasm file directly.

  Args:
    filename: path of the wasm binary to inspect.

  Returns:
    A dict with the keys 'asmConsts', 'declares', 'emJsFuncs', 'exports',
    'features', 'globalImports', 'invokeFuncs', 'mainReadsParams' and
    'namedGlobals'.
  """
  module = webassembly.Module(filename)
  exports = module.get_exports()
  imports = module.get_imports()
  globls = module.get_globals()

  # Imported globals must be counted up front: export indices are offset by
  # this count when indexing into the list of defined globals below.
  global_imports = [i.field for i in imports if i.kind == webassembly.ExternType.GLOBAL]
  imported_globals = len(global_imports)

  # Collect EM_JS functions before classifying function imports, so that the
  # `not in em_js_funcs` test below actually has something to check against.
  # (Previously the dict was still empty when imports were scanned, so EM_JS
  # imports always ended up in `declares`.)
  em_js_funcs = {}
  for e in exports:
    if e.kind == webassembly.ExternType.GLOBAL and e.name.startswith('__em_js__'):
      name = e.name[len('__em_js__'):]
      globl = globls[e.index - imported_globals]
      string_address = get_global_value(globl)
      em_js_funcs[name] = get_string_at(module, string_address)

  declares = []
  invoke_funcs = []
  imported_funcs = 0
  for i in imports:
    if i.kind != webassembly.ExternType.FUNC:
      continue
    imported_funcs += 1
    if i.field.startswith('invoke_'):
      invoke_funcs.append(i.field)
    elif i.field not in em_js_funcs:
      declares.append(i.field)

  export_map = {e.name: e for e in exports}
  export_names = [e.name for e in exports if e.kind == webassembly.ExternType.FUNC]

  # Turn enabled ('+') entries of the features section into --enable-* flags,
  # renaming the ones whose flag name differs from the feature name.
  features = module.parse_features_section()
  features = ['--enable-' + f[1] for f in features if f[0] == '+']
  flag_renames = (
    ('--enable-atomics', '--enable-threads'),
    ('--enable-simd128', '--enable-simd'),
    ('--enable-nontrapping-fptoint', '--enable-nontrapping-float-to-int'),
  )
  for old_flag, new_flag in flag_renames:
    features = [f.replace(old_flag, new_flag) for f in features]

  metadata = {}
  metadata['asmConsts'] = get_asm_strings(module, globls, export_map, imported_globals)
  metadata['declares'] = declares
  metadata['emJsFuncs'] = em_js_funcs
  metadata['exports'] = export_names
  metadata['features'] = features
  metadata['globalImports'] = global_imports
  metadata['invokeFuncs'] = invoke_funcs
  # If main does not read its parameters, it will just be a stub that
  # calls __original_main (which has no parameters).
  metadata['mainReadsParams'] = get_main_reads_params(module, export_map, imported_funcs)
  metadata['namedGlobals'] = get_names_globals(globls, exports, imported_globals)
  return metadata

0 commit comments

Comments
 (0)