emscripten-core · dschuff · Jan 27, 2022 · Jan 22, 2022 · Jan 26, 2022 · Jan 26, 2022
diff --git a/emsymbolizer.py b/emsymbolizer.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+# This is a utility for looking up the symbol names and/or file+line numbers
+# of code addresses. There are several possible sources of this information,
+# with varying granularity (listed here in approximate preference order).
+
+# If the wasm has DWARF info, llvm-symbolizer can show the symbol, file, and
+# line/column number, potentially including inlining.
+# If there is a source map, we can parse it to get file and line number.
+# If there is an emscripten symbol map, we can parse it to get the symbol name.
+# If there is a name section or symbol table, llvm-nm can show the symbol name.
+
+import os
+import sys
+from tools import shared
+from tools import webassembly
+from tools.shared import check_call
+
+# Path to the llvm-symbolizer executable, built via the shared tool-path
+# helpers (with any leading `~` expanded).
+LLVM_SYMBOLIZER = os.path.expanduser(
+    shared.build_llvm_tool_path(shared.exe_suffix('llvm-symbolizer')))
+
+
+class Error(Exception):
+  """Expected failure in this tool; reported at top level without a traceback."""
+
+
+def get_codesec_offset(module):
+  """Return the `offset` of the first CODE section found in `module`.
+
+  Raises Error if the module has no code section.
+  """
+  code_sections = (s for s in module.sections()
+                   if s.type == webassembly.SecType.CODE)
+  first = next(code_sections, None)
+  if first is None:
+    raise Error(f'No code section found in {module.filename}')
+  return first.offset
+
+
+def has_debug_line_section(module):
+  """Return True if any section of `module` is named .debug_line."""
+  return any(section.name == ".debug_line" for section in module.sections())
+
+
+def symbolize_address_dwarf(module, address):
+  """Run llvm-symbolizer on `module`'s file to look up `address`.
+
+  `address` is an integer; it is passed to the tool in decimal form.
+  Raises Error if the module has no code section.
+  """
+  # The code section offset is handed to the tool as --adjust-vma.
+  # NOTE(review): presumably so that `address` can be given as an offset into
+  # the wasm file rather than into the code section - confirm.
+  code_offset = get_codesec_offset(module)
+  check_call([
+      LLVM_SYMBOLIZER,
+      '-e', module.filename,
+      f'--adjust-vma={code_offset}',
+      str(address),
+  ])
+
+
+def main(argv):
+  """Symbolize one code address of a wasm file using its DWARF info.
+
+  `argv` is the full argument vector: argv[1] is the wasm file and argv[2] is
+  the address to look up, written in hexadecimal (e.g. 0x101). Any further
+  arguments are ignored.
+
+  Returns 0 on success. Raises Error when arguments are missing, when the
+  address is not valid hexadecimal, or when the file has no .debug_line
+  section.
+  """
+  # Report bad command lines as Error so the caller prints a one-line message
+  # instead of an IndexError/ValueError traceback.
+  if len(argv) < 3:
+    raise Error('expected arguments: <wasm file> <hex address>')
+  wasm_file = argv[1]
+  print('Warning: the command-line and output format of this file are not '
+        'finalized yet', file=sys.stderr)
+  module = webassembly.Module(wasm_file)
+
+  if not has_debug_line_section(module):
+    raise Error(f"No .debug_line section found in {module.filename}."
+                " I don't know how to symbolize this file yet")
+
+  try:
+    address = int(argv[2], 16)
+  except ValueError:
+    raise Error(f'Invalid hexadecimal address: {argv[2]}') from None
+
+  symbolize_address_dwarf(module, address)
+  return 0
+
+
+if __name__ == '__main__':
+  # Expected failures are reported as a single line on stderr and turned into
+  # exit status 1; anything else propagates as a normal traceback.
+  exit_code = 1
+  try:
+    exit_code = main(sys.argv)
+  except (Error, webassembly.InvalidWasmError, OSError) as err:
+    print(f'{sys.argv[0]}: {err}', file=sys.stderr)
+  sys.exit(exit_code)
diff --git a/tests/core/test_dwarf.c b/tests/core/test_dwarf.c
@@ -2,14 +2,20 @@
 
 EM_JS(int, out_to_js, (int x), {})
 
-void foo() {
+void __attribute__((noinline)) foo() {
   out_to_js(0); // line 5
   out_to_js(1); // line 6
   out_to_js(2); // line 7
   // A silly possible recursion to avoid binaryen doing any inlining.
   if (out_to_js(3)) foo();
 }
 
+void __attribute__((always_inline)) bar() { // Forced inline so the emsymbolizer test can check an inlined frame.
+  out_to_js(3);
+  __builtin_trap(); // Keep line numbers in this file stable: tests match hard-coded file:line:col.
+}
+
 int main() {
   foo();
+  bar();
 }
diff --git a/tests/test_other.py b/tests/test_other.py
@@ -8219,6 +8219,30 @@ def test(infile, source_map_added_dir=''):
     ensure_dir('inner')
     test('inner/a.cpp', 'inner')
 
+  def test_emsymbolizer(self):
+    """Check that emsymbolizer.py reports DWARF locations for code addresses."""
+    # Test DWARF output
+    build_cmd = [EMCC, test_file('core/test_dwarf.c'),
+                 '-g', '-O1', '-o', 'test_dwarf.js']
+    self.run_process(build_cmd)
+
+    # The addresses below are hard-coded. This is potentially brittle, but
+    # LLVM's O1 output is pretty minimal so hopefully it won't break too much.
+    # An alternative would be to disassemble the binary and search for
+    # particular instructions or code sequences.
+
+    def get_addr(address):
+      # Run the symbolizer on one address and hand back what it printed.
+      cmd = [PYTHON, path_from_root('emsymbolizer.py'), 'test_dwarf.wasm', address]
+      proc = self.run_process(cmd, stdout=PIPE)
+      return proc.stdout
+
+    # A location in foo(), which is not inlined.
+    foo_output = get_addr('0x101')
+    self.assertIn('test_dwarf.c:6:3', foo_output)
+
+    # Both bar (the inlined function) and main (the function it was inlined
+    # into) should appear in the output, as described by the DWARF.
+    # TODO: consider also checking the function names once the output format
+    # stabilizes more
+    inlined_output = get_addr('0x124').replace('\n', '')
+    self.assertRegex(inlined_output, 'test_dwarf.c:15:3.*test_dwarf.c:20:3')
+
   def test_separate_dwarf(self):
     self.run_process([EMCC, test_file('hello_world.c'), '-g'])
     self.assertExists('a.out.wasm')

diff --git a/tools/webassembly.py b/tools/webassembly.py
@@ -108,6 +108,10 @@ class DylinkType(IntEnum):
   IMPORT_INFO = 4
 
 
+class InvalidWasmError(Exception):
+  """Raised when a file does not start with the wasm magic number and version."""
+
+
 Section = namedtuple('Section', ['type', 'size', 'offset', 'name'])
 Limits = namedtuple('Limits', ['flags', 'initial', 'maximum'])
 Import = namedtuple('Import', ['kind', 'module', 'field'])
@@ -123,15 +127,18 @@ class Module:
   """Extremely minimal wasm module reader.  Currently only used
   for parsing the dylink section."""
   def __init__(self, filename):
+    """Open `filename` for binary reading and validate its 8-byte header.
+
+    Raises InvalidWasmError if the magic number or version does not match.
+    Errors from os.path.getsize() and open() are not caught here.
+    """
+    self.buf = None # Set this before FS calls below in case they throw.
+    self.filename = filename
     self.size = os.path.getsize(filename)
     self.buf = open(filename, 'rb')
     magic = self.buf.read(4)
     version = self.buf.read(4)
-    assert magic == MAGIC
-    assert version == VERSION
+    if magic != MAGIC or version != VERSION:
+      raise InvalidWasmError(f'{filename} is not a valid wasm file')
 
   def __del__(self):
-    self.buf.close()
+    # buf is still None if __init__ failed before the file was opened.
+    if self.buf:
+      self.buf.close()
 
   def readAt(self, offset, count):
     self.buf.seek(offset)