Skip to content

Commit c0cda71

Browse files
authored
Add name section and object symbol table support to emsymbolizer (#21367)
With llvm/llvm-project#82083, llvm-symbolizer works correctly with name sections, so emsymbolizer can now use it to look up symbol names from a name section. This change does the same for object files that have symbol tables. However, llvm-symbolizer does not yet handle object files correctly when code addresses overlap with data addresses; once that is fixed, emsymbolizer will start working for object files as well, and a test should be added for that case.
1 parent 40252f5 commit c0cda71

File tree

2 files changed

+84
-49
lines changed

2 files changed

+84
-49
lines changed

emsymbolizer.py

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
# line/column number, potentially including inlining.
99
# If the wasm has separate DWARF info, do the above with the side file
1010
# If there is a source map, we can parse it to get file and line number.
11-
# If there is an emscripten symbol map, we can parse that to get the symbol name
12-
# If there is a name section or symbol table, llvm-nm can show the symbol name.
11+
# If there is an emscripten symbol map, we can use that to get the symbol name
12+
# If there is a name section or symbol table, llvm-symbolizer can show the
13+
# symbol name.
14+
# Separate DWARF and emscripten symbol maps are not supported yet.
1315

1416
import argparse
1517
import json
@@ -50,21 +52,30 @@ def get_codesec_offset(module):
5052

5153

5254
def has_debug_line_section(module):
53-
for sec in module.sections():
54-
if sec.name == ".debug_line":
55-
return True
56-
return False
55+
return module.get_custom_section('.debug_line') is not None
56+
57+
58+
def has_name_section(module):
59+
return module.get_custom_section('name') is not None
60+
5761

62+
def has_linking_section(module):
63+
return module.get_custom_section('linking') is not None
5864

59-
def symbolize_address_dwarf(module, address):
60-
vma_adjust = get_codesec_offset(module)
65+
66+
def symbolize_address_symbolizer(module, address, is_dwarf):
67+
if is_dwarf:
68+
vma_adjust = get_codesec_offset(module)
69+
else:
70+
vma_adjust = 0
6171
cmd = [LLVM_SYMBOLIZER, '-e', module.filename, f'--adjust-vma={vma_adjust}',
6272
str(address)]
6373
out = shared.run_process(cmd, stdout=subprocess.PIPE).stdout.strip()
6474
out_lines = out.splitlines()
75+
6576
# Source location regex, e.g., /abc/def.c:3:5
6677
SOURCE_LOC_RE = re.compile(r'(.+):(\d+):(\d+)$')
67-
# llvm-dwarfdump prints two lines per location. The first line contains a
78+
# llvm-symbolizer prints two lines per location. The first line contains a
6879
# function name, and the second contains a source location like
6980
# '/abc/def.c:3:5'. If the function or source info is not available, it will
7081
# be printed as '??', in which case we store None. If the line and column info
@@ -210,30 +221,32 @@ def main(args):
210221
with webassembly.Module(args.wasm_file) as module:
211222
base = 16 if args.address.lower().startswith('0x') else 10
212223
address = int(args.address, base)
213-
symbolized = 0
214224

215225
if args.addrtype == 'code':
216226
address += get_codesec_offset(module)
217227

218228
if ((has_debug_line_section(module) and not args.source) or
219229
'dwarf' in args.source):
220-
symbolize_address_dwarf(module, address)
221-
symbolized += 1
222-
223-
if ((get_sourceMappingURL_section(module) and not args.source) or
224-
'sourcemap' in args.source):
230+
symbolize_address_symbolizer(module, address, is_dwarf=True)
231+
elif ((get_sourceMappingURL_section(module) and not args.source) or
232+
'sourcemap' in args.source):
225233
symbolize_address_sourcemap(module, address, args.file)
226-
symbolized += 1
227-
228-
if not symbolized:
234+
elif ((has_name_section(module) and not args.source) or
235+
'names' in args.source):
236+
symbolize_address_symbolizer(module, address, is_dwarf=False)
237+
elif ((has_linking_section(module) and not args.source) or
238+
'symtab' in args.source):
239+
symbolize_address_symbolizer(module, address, is_dwarf=False)
240+
else:
229241
raise Error('No .debug_line or sourceMappingURL section found in '
230242
f'{module.filename}.'
231243
" I don't know how to symbolize this file yet")
232244

233245

234246
def get_args():
235247
parser = argparse.ArgumentParser()
236-
parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap'],
248+
parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap',
249+
'names', 'symtab'],
237250
help='Force debug info source type', default=())
238251
parser.add_argument('-f', '--file', action='store',
239252
help='Force debug info source file')

test/test_other.py

Lines changed: 52 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9771,7 +9771,33 @@ def test(dump_file):
97719771
test('foo.wasm.dump')
97729772
test('bar.wasm.dump')
97739773

9774-
def test_emsymbolizer(self):
9774+
def get_instr_addr(self, text, filename):
9775+
'''
9776+
Runs llvm-objdump on the given file to get the address of the first
9777+
disassembly line containing the specified text. llvm-objdump's output format
9778+
example is as follows:
9779+
...
9780+
00000004 <foo>:
9781+
...
9782+
6: 41 00 i32.const 0
9783+
...
9784+
The addresses here are the offsets to the start of the file. Returns
9785+
the address string in hexadecimal.
9786+
'''
9787+
out = self.run_process([common.LLVM_OBJDUMP, '-d', filename],
9788+
stdout=PIPE).stdout.strip()
9789+
out_lines = out.splitlines()
9790+
found = False
9791+
for line in out_lines:
9792+
if text in line:
9793+
offset = line.strip().split(':')[0]
9794+
found = True
9795+
break
9796+
assert found
9797+
return '0x' + offset
9798+
9799+
def test_emsymbolizer_srcloc(self):
9800+
'Test emsymbolizer use cases that provide src location granularity info'
97759801
def check_dwarf_loc_info(address, funcs, locs):
97769802
out = self.run_process(
97779803
[emsymbolizer, '-s', 'dwarf', 'test_dwarf.wasm', address],
@@ -9783,45 +9809,19 @@ def check_dwarf_loc_info(address, funcs, locs):
97839809

97849810
def check_source_map_loc_info(address, loc):
97859811
out = self.run_process(
9786-
[emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm',
9787-
address],
9812+
[emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address],
97889813
stdout=PIPE).stdout
97899814
self.assertIn(loc, out)
97909815

9791-
# Runs llvm-objdump to get the address of the first occurrence of the
9792-
# specified line within the given function. llvm-objdump's output format
9793-
# example is as follows:
9794-
# ...
9795-
# 00000004 <foo>:
9796-
# ...
9797-
# 6: 41 00 i32.const 0
9798-
# ...
9799-
# The addresses here are the offsets to the start of the file. Returns
9800-
# the address string in hexadecimal.
9801-
def get_addr(text):
9802-
out = self.run_process([common.LLVM_OBJDUMP, '-d', 'test_dwarf.wasm'],
9803-
stdout=PIPE).stdout.strip()
9804-
out_lines = out.splitlines()
9805-
found = False
9806-
for line in out_lines:
9807-
if text in line:
9808-
offset = line.strip().split(':')[0]
9809-
found = True
9810-
break
9811-
assert found
9812-
return '0x' + offset
9813-
98149816
# We test two locations within test_dwarf.c:
98159817
# out_to_js(0); // line 6
98169818
# __builtin_trap(); // line 13
9817-
9818-
# 1. Test DWARF + source map together
98199819
self.run_process([EMCC, test_file('core/test_dwarf.c'),
98209820
'-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js'])
98219821
# Address of out_to_js(0) within foo(), uninlined
9822-
out_to_js_call_addr = get_addr('call\t0')
9822+
out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm')
98239823
# Address of __builtin_trap() within bar(), inlined into main()
9824-
unreachable_addr = get_addr('unreachable')
9824+
unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm')
98259825

98269826
# Function name of out_to_js(0) within foo(), uninlined
98279827
out_to_js_call_func = ['foo']
@@ -9835,6 +9835,7 @@ def get_addr(text):
98359835
# The first one corresponds to the innermost inlined location.
98369836
unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3']
98379837

9838+
# 1. Test DWARF + source map together
98389839
# For DWARF, we check for the full inlined info for both function names and
98399840
# source locations. Source maps provide neither function names nor inlined
98409841
# info. So we only check for the source location of the outermost function.
@@ -9860,6 +9861,27 @@ def get_addr(text):
98609861
out_to_js_call_loc)
98619862
check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)
98629863

9864+
def test_emsymbolizer_functions(self):
9865+
'Test emsymbolizer use cases that only provide function-granularity info'
9866+
def check_func_info(filename, address, func):
9867+
out = self.run_process(
9868+
[emsymbolizer, filename, address], stdout=PIPE).stdout
9869+
self.assertIn(func, out)
9870+
9871+
# 1. Test name section only
9872+
self.run_process([EMCC, test_file('core/test_dwarf.c'),
9873+
'--profiling-funcs', '-O1', '-o', 'test_dwarf.js'])
9874+
with webassembly.Module('test_dwarf.wasm') as wasm:
9875+
self.assertTrue(wasm.has_name_section())
9876+
self.assertIsNone(wasm.get_custom_section('.debug_info'))
9877+
# Address of out_to_js(0) within foo(), uninlined
9878+
out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm')
9879+
# Address of __builtin_trap() within bar(), inlined into main()
9880+
unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm')
9881+
check_func_info('test_dwarf.wasm', out_to_js_call_addr, 'foo')
9882+
# The name section will not show bar, as it's inlined into main
9883+
check_func_info('test_dwarf.wasm', unreachable_addr, '__original_main')
9884+
98639885
def test_separate_dwarf(self):
98649886
self.run_process([EMCC, test_file('hello_world.c'), '-g'])
98659887
self.assertExists('a.out.wasm')

0 commit comments

Comments
 (0)