Skip to content

Commit de5b418

Browse files
zzkcode authored and cherrymui committed
runtime: let the faulting thread crash the process
Let the faulting thread crash the program so that, when the resulting core dump is opened in gdb, the correct backtrace is shown in thread number one. Fixes #63277. Change-Id: Ie4473f76f0feba596091433918bcd35a4ff7e11b GitHub-Last-Rev: f4615c2 GitHub-Pull-Request: #63666 Reviewed-on: https://go-review.googlesource.com/c/go/+/536895 Reviewed-by: Michael Pratt <[email protected]> Reviewed-by: Cherry Mui <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 58bfef8 commit de5b418

File tree

2 files changed

+206
-47
lines changed

2 files changed

+206
-47
lines changed

src/runtime/runtime-gdb_unix_test.go

+180-36
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,43 @@ import (
2020
"testing"
2121
)
2222

23+
// canGenerateCore checks that the environment lets a child process write a
// core file named "core" (or "core.<pid>") into its working directory.
//
// An unsuitable environment is reported through t, never through the result:
// the calling test is skipped when the RLIMIT_CORE hard limit is below 100 MB,
// when core_pattern is not exactly "core", or when core_uses_pid holds an
// unexpected value, and it fails when the rlimit or core_pattern cannot be
// read.
//
// The returned bool is true when /proc/sys/kernel/core_uses_pid is "1", in
// which case callers look for "core.<pid>" instead of "core". It is false when
// the value is "0" or the file cannot be read.
func canGenerateCore(t *testing.T) bool {
	// Attribute skips and failures to the calling test, not to this helper.
	t.Helper()

	// Ensure there is enough RLIMIT_CORE available to generate a full core.
	var lim syscall.Rlimit
	err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim)
	if err != nil {
		t.Fatalf("error getting rlimit: %v", err)
	}
	// Minimum RLIMIT_CORE max to allow. This is a conservative estimate.
	// Most systems allow infinity.
	const minRlimitCore = 100 << 20 // 100 MB
	if lim.Max < minRlimitCore {
		t.Skipf("RLIMIT_CORE max too low: %#+v", lim)
	}

	// Make sure core pattern will send core to the current directory.
	b, err := os.ReadFile("/proc/sys/kernel/core_pattern")
	if err != nil {
		t.Fatalf("error reading core_pattern: %v", err)
	}
	if string(b) != "core\n" {
		t.Skipf("Unexpected core pattern %q", string(b))
	}

	// A missing or unreadable core_uses_pid is treated the same as "0".
	coreUsesPID := false
	b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid")
	if err == nil {
		switch string(bytes.TrimSpace(b)) {
		case "0":
		case "1":
			coreUsesPID = true
		default:
			t.Skipf("unexpected core_uses_pid value %q", string(b))
		}
	}
	return coreUsesPID
}
59+
2360
const coreSignalSource = `
2461
package main
2562
@@ -81,45 +118,12 @@ func TestGdbCoreSignalBacktrace(t *testing.T) {
81118
t.Parallel()
82119
checkGdbVersion(t)
83120

84-
// Ensure there is enough RLIMIT_CORE available to generate a full core.
85-
var lim syscall.Rlimit
86-
err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim)
87-
if err != nil {
88-
t.Fatalf("error getting rlimit: %v", err)
89-
}
90-
// Minimum RLIMIT_CORE max to allow. This is a conservative estimate.
91-
// Most systems allow infinity.
92-
const minRlimitCore = 100 << 20 // 100 MB
93-
if lim.Max < minRlimitCore {
94-
t.Skipf("RLIMIT_CORE max too low: %#+v", lim)
95-
}
96-
97-
// Make sure core pattern will send core to the current directory.
98-
b, err := os.ReadFile("/proc/sys/kernel/core_pattern")
99-
if err != nil {
100-
t.Fatalf("error reading core_pattern: %v", err)
101-
}
102-
if string(b) != "core\n" {
103-
t.Skipf("Unexpected core pattern %q", string(b))
104-
}
105-
106-
coreUsesPID := false
107-
b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid")
108-
if err == nil {
109-
switch string(bytes.TrimSpace(b)) {
110-
case "0":
111-
case "1":
112-
coreUsesPID = true
113-
default:
114-
t.Skipf("unexpected core_uses_pid value %q", string(b))
115-
}
116-
}
117-
118-
dir := t.TempDir()
121+
coreUsesPID := canGenerateCore(t)
119122

120123
// Build the source code.
124+
dir := t.TempDir()
121125
src := filepath.Join(dir, "main.go")
122-
err = os.WriteFile(src, []byte(coreSignalSource), 0644)
126+
err := os.WriteFile(src, []byte(coreSignalSource), 0644)
123127
if err != nil {
124128
t.Fatalf("failed to create file: %v", err)
125129
}
@@ -230,3 +234,143 @@ func TestGdbCoreSignalBacktrace(t *testing.T) {
230234
t.Fatalf("could not find runtime symbol in backtrace after signal handler:\n%s", rest)
231235
}
232236
}
237+
238+
// coreCrashThreadSource is the cgo program that
// TestGdbCoreCrashThreadBacktrace writes to main.go, builds, and runs.
// The program sets the traceback level to "crash", raises the soft
// RLIMIT_CORE to the hard limit, and then calls the C function
// trigger_crash, which stores through a NULL pointer.
const coreCrashThreadSource = `
package main

/*
#cgo CFLAGS: -g -O0
#include <stdio.h>
#include <stddef.h>
void trigger_crash()
{
	int* ptr = NULL;
	*ptr = 1024;
}
*/
import "C"
import (
	"flag"
	"fmt"
	"os"
	"runtime/debug"
	"syscall"
)

func enableCore() {
	debug.SetTraceback("crash")

	var lim syscall.Rlimit
	err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim)
	if err != nil {
		panic(fmt.Sprintf("error getting rlimit: %v", err))
	}
	lim.Cur = lim.Max
	fmt.Fprintf(os.Stderr, "Setting RLIMIT_CORE = %+#v\n", lim)
	err = syscall.Setrlimit(syscall.RLIMIT_CORE, &lim)
	if err != nil {
		panic(fmt.Sprintf("error setting rlimit: %v", err))
	}
}

func main() {
	flag.Parse()

	enableCore()

	C.trigger_crash()
}
`
284+
285+
// TestGdbCoreCrashThreadBacktrace tests that runtime could let the fault thread to crash process
286+
// and make fault thread as number one thread while gdb in a core file
287+
func TestGdbCoreCrashThreadBacktrace(t *testing.T) {
288+
if runtime.GOOS != "linux" {
289+
// N.B. This test isn't fundamentally Linux-only, but it needs
290+
// to know how to enable/find core files on each OS.
291+
t.Skip("Test only supported on Linux")
292+
}
293+
if runtime.GOARCH != "386" && runtime.GOARCH != "amd64" {
294+
// TODO(go.dev/issue/25218): Other architectures use sigreturn
295+
// via VDSO, which we somehow don't handle correctly.
296+
t.Skip("Backtrace through signal handler only works on 386 and amd64")
297+
}
298+
299+
checkGdbEnvironment(t)
300+
t.Parallel()
301+
checkGdbVersion(t)
302+
303+
coreUsesPID := canGenerateCore(t)
304+
305+
// Build the source code.
306+
dir := t.TempDir()
307+
src := filepath.Join(dir, "main.go")
308+
err := os.WriteFile(src, []byte(coreCrashThreadSource), 0644)
309+
if err != nil {
310+
t.Fatalf("failed to create file: %v", err)
311+
}
312+
cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go")
313+
cmd.Dir = dir
314+
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
315+
if err != nil {
316+
t.Fatalf("building source %v\n%s", err, out)
317+
}
318+
319+
// Start the test binary.
320+
cmd = testenv.Command(t, "./a.exe")
321+
cmd.Dir = dir
322+
var output bytes.Buffer
323+
cmd.Stdout = &output // for test logging
324+
cmd.Stderr = &output
325+
326+
if err := cmd.Start(); err != nil {
327+
t.Fatalf("error starting test binary: %v", err)
328+
}
329+
330+
pid := cmd.Process.Pid
331+
332+
err = cmd.Wait()
333+
t.Logf("child output:\n%s", output.String())
334+
if err == nil {
335+
t.Fatalf("Wait succeeded, want SIGABRT")
336+
}
337+
ee, ok := err.(*exec.ExitError)
338+
if !ok {
339+
t.Fatalf("Wait err got %T %v, want exec.ExitError", ee, ee)
340+
}
341+
ws, ok := ee.Sys().(syscall.WaitStatus)
342+
if !ok {
343+
t.Fatalf("Sys got %T %v, want syscall.WaitStatus", ee.Sys(), ee.Sys())
344+
}
345+
if ws.Signal() != syscall.SIGABRT {
346+
t.Fatalf("Signal got %d want SIGABRT", ws.Signal())
347+
}
348+
if !ws.CoreDump() {
349+
t.Fatalf("CoreDump got %v want true", ws.CoreDump())
350+
}
351+
352+
coreFile := "core"
353+
if coreUsesPID {
354+
coreFile += fmt.Sprintf(".%d", pid)
355+
}
356+
357+
// Execute gdb commands.
358+
args := []string{"-nx", "-batch",
359+
"-iex", "add-auto-load-safe-path " + filepath.Join(testenv.GOROOT(t), "src", "runtime"),
360+
"-ex", "backtrace",
361+
filepath.Join(dir, "a.exe"),
362+
filepath.Join(dir, coreFile),
363+
}
364+
cmd = testenv.Command(t, "gdb", args...)
365+
366+
got, err := cmd.CombinedOutput()
367+
t.Logf("gdb output:\n%s", got)
368+
if err != nil {
369+
t.Fatalf("gdb exited with error: %v", err)
370+
}
371+
372+
re := regexp.MustCompile(`#.* trigger_crash`)
373+
if found := re.Find(got) != nil; !found {
374+
t.Fatalf("could not find trigger_crash in backtrace")
375+
}
376+
}

src/runtime/signal_unix.go

+26-11
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,7 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool {
597597

598598
// crashing is the number of m's we have waited for when implementing
599599
// GOTRACEBACK=crash when a signal is received.
600-
var crashing int32
600+
var crashing atomic.Int32
601601

602602
// testSigtrap and testSigusr1 are used by the runtime tests. If
603603
// non-nil, it is called on SIGTRAP/SIGUSR1. If it returns true, the
@@ -730,7 +730,7 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
730730
mp.throwing = throwTypeRuntime
731731
mp.caughtsig.set(gp)
732732

733-
if crashing == 0 {
733+
if crashing.Load() == 0 {
734734
startpanic_m()
735735
}
736736

@@ -740,32 +740,47 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
740740
if level > 0 {
741741
goroutineheader(gp)
742742
tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp)
743-
if crashing > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning {
743+
if crashing.Load() > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning {
744744
// tracebackothers on original m skipped this one; trace it now.
745745
goroutineheader(mp.curg)
746746
traceback(^uintptr(0), ^uintptr(0), 0, mp.curg)
747-
} else if crashing == 0 {
747+
} else if crashing.Load() == 0 {
748748
tracebackothers(gp)
749749
print("\n")
750750
}
751751
dumpregs(c)
752752
}
753753

754754
if docrash {
755-
crashing++
756-
if crashing < mcount()-int32(extraMLength.Load()) {
755+
isCrashThread := false
756+
if crashing.CompareAndSwap(0, 1) {
757+
isCrashThread = true
758+
} else {
759+
crashing.Add(1)
760+
}
761+
if crashing.Load() < mcount()-int32(extraMLength.Load()) {
757762
// There are other m's that need to dump their stacks.
758763
// Relay SIGQUIT to the next m by sending it to the current process.
759764
// All m's that have already received SIGQUIT have signal masks blocking
760765
// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
761-
// When the last m receives the SIGQUIT, it will fall through to the call to
762-
// crash below. Just in case the relaying gets botched, each m involved in
766+
// The first m waits until all ms have received the SIGQUIT, then does the crash/exit.
767+
// Just in case the relaying gets botched, each m involved in
763768
// the relay sleeps for 5 seconds and then does the crash/exit itself.
764-
// In expected operation, the last m has received the SIGQUIT and run
765-
// crash/exit and the process is gone, all long before any of the
766-
// 5-second sleeps have finished.
769+
// The faulting m is crashing first so it is the faulting thread in the core dump (see issue #63277):
770+
// in expected operation, the first m will wait until the last m has received the SIGQUIT,
771+
// and then run crash/exit and the process is gone.
772+
// However, if relaying SIGQUIT to all ms takes more than 5 seconds,
773+
// any of the ms may crash/exit the process once its own 5-second wait ends.
767774
print("\n-----\n\n")
768775
raiseproc(_SIGQUIT)
776+
}
777+
if isCrashThread {
778+
i := 0
779+
for (crashing.Load() < mcount()-int32(extraMLength.Load())) && i < 10 {
780+
i++
781+
usleep(500 * 1000)
782+
}
783+
} else {
769784
usleep(5 * 1000 * 1000)
770785
}
771786
printDebugLog()

0 commit comments

Comments
 (0)