roachtest: use atomic pointer for logger

DarrylWong · DarrylWong · commit 3ca9845cc23d · 2025-02-06T15:40:58.000-05:00
The test runner swaps out the test logger when running post-test
artifacts collection and checks. However, in the case of a timeout,
the test goroutine may still be running and still have access to
the test logger. This leads to a data race where the logger is
replaced while it is being used by the test.

This change switches the test logger to use an atomic pointer
instead.
diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go
@@ -139,12 +139,12 @@ func TestCreatePostRequest(t *testing.T) {
 
 		ti := &testImpl{
 			spec:        testSpec,
-			l:           nilLogger(),
 			start:       time.Date(2023, time.July, 21, 16, 34, 3, 817, time.UTC),
 			end:         time.Date(2023, time.July, 21, 16, 42, 13, 137, time.UTC),
 			cockroach:   "cockroach",
 			cockroachEA: "cockroach-ea",
 		}
+		ti.ReplaceL(nilLogger())
 
 		testClusterImpl := &clusterImpl{spec: clusterSpec, arch: vm.ArchAMD64, name: "foo"}
 		vo := vm.DefaultCreateOpts()
diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
@@ -14,6 +14,7 @@ import (
 	"regexp"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
@@ -77,7 +78,11 @@ type testImpl struct {
 	buildVersion *version.Version
 
 	// l is the logger that the test will use for its output.
-	l *logger.Logger
+	//
+	// N.B. We need to use an atomic pointer here since the test
+	// runner can swap the logger out when running post-test assertions
+	// and artifacts collection, while the test may still be using it.
+	l atomic.Pointer[logger.Logger]
 
 	// taskManager manages tasks (goroutines) for tests.
 	taskManager task.Manager
@@ -172,7 +177,7 @@ func (t *testImpl) Cockroach() string {
 			// If the test is a benchmark test, we don't want to enable assertions
 			// as it will slow down performance.
 			if t.spec.Benchmark {
-				t.l.Printf("Benchmark test, running with standard cockroach")
+				t.L().Printf("Benchmark test, running with standard cockroach")
 				t.randomizedCockroach = t.StandardCockroach()
 				return
 			}
@@ -181,20 +186,20 @@ func (t *testImpl) Cockroach() string {
 				// The build with runtime assertions should exist in every nightly
 				// CI build, but we can't assume it exists in every roachtest call.
 				if path := t.RuntimeAssertionsCockroach(); path != "" {
-					t.l.Printf("Runtime assertions enabled")
+					t.L().Printf("Runtime assertions enabled")
 					t.randomizedCockroach = path
 					return
 				} else {
-					t.l.Printf("WARNING: running without runtime assertions since the corresponding binary was not specified")
+					t.L().Printf("WARNING: running without runtime assertions since the corresponding binary was not specified")
 				}
 			}
-			t.l.Printf("Runtime assertions disabled")
+			t.L().Printf("Runtime assertions disabled")
 			t.randomizedCockroach = t.StandardCockroach()
 		case registry.StandardCockroach:
-			t.l.Printf("Runtime assertions disabled: registry.StandardCockroach set")
+			t.L().Printf("Runtime assertions disabled: registry.StandardCockroach set")
 			t.randomizedCockroach = t.StandardCockroach()
 		case registry.RuntimeAssertionsCockroach:
-			t.l.Printf("Runtime assertions enabled: registry.RuntimeAssertionsCockroach set")
+			t.L().Printf("Runtime assertions enabled: registry.RuntimeAssertionsCockroach set")
 			t.randomizedCockroach = t.RuntimeAssertionsCockroach()
 		default:
 			t.Fatal("Specified cockroach binary does not exist.")
@@ -259,13 +264,12 @@ func (t *testImpl) SnapshotPrefix() string {
 
 // L returns the test's logger.
 func (t *testImpl) L() *logger.Logger {
-	return t.l
+	return t.l.Load()
 }
 
 // ReplaceL replaces the test's logger.
 func (t *testImpl) ReplaceL(l *logger.Logger) {
-	// TODO(tbg): get rid of this, this is racy & hacky.
-	t.l = l
+	t.l.Store(l)
 }
 
 func (t *testImpl) status(ctx context.Context, id int64, args ...interface{}) {
diff --git a/pkg/cmd/roachtest/test_impl_test.go b/pkg/cmd/roachtest/test_impl_test.go
@@ -179,9 +179,8 @@ func Test_failuresMatchingError(t *testing.T) {
 }
 
 func Test_failureSpecifyOwnerAndAddFailureCombination(t *testing.T) {
-	ti := testImpl{
-		l: nilLogger(),
-	}
+	ti := testImpl{}
+	ti.ReplaceL(nilLogger())
 	ti.addFailure(0, "", vmPreemptionError("my_VM"))
 	errWithOwnership := failuresAsErrorWithOwnership(ti.failures())
 
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -830,14 +830,14 @@ func (r *testRunner) runWorker(
 			buildVersion:           binaryVersion,
 			artifactsDir:           testArtifactsDir,
 			artifactsSpec:          artifactsSpec,
-			l:                      testL,
 			versionsBinaryOverride: topt.versionsBinaryOverride,
 			skipInit:               topt.skipInit,
 			debug:                  clustersOpt.debugMode.IsDebug(),
 			goCoverEnabled:         topt.goCoverEnabled,
 			exportOpenmetrics:      topt.exportOpenMetrics,
 			runID:                  generateRunID(clustersOpt),
 		}
+		t.ReplaceL(testL)
 		github := newGithubIssues(r.config.disableIssue, c, vmCreateOpts)
 
 		// handleClusterCreationFailure can be called when the `err` given
diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go
@@ -768,3 +768,38 @@ func TestVMPreemptionPolling(t *testing.T) {
 		require.NoError(t, err)
 	})
 }
+
+// TestRunnerFailureAfterTimeout checks that adding a failure to a test
+// after the test has timed out works as expected.
+//
+// Specifically, this is a regression test ensuring that replacing the test
+// logger for post-test artifacts collection or assertion checks is atomic
+// and doesn't race with the logger potentially still being used by the test.
+func TestRunnerFailureAfterTimeout(t *testing.T) {
+	ctx := context.Background()
+	stopper := stop.NewStopper()
+	defer stopper.Stop(ctx)
+	cr := newClusterRegistry()
+	runner := newUnitTestRunner(cr, stopper)
+
+	var buf syncedBuffer
+	copt := defaultClusterOpt()
+	lopt := defaultLoggingOpt(&buf)
+	test := registry.TestSpec{
+		Name:  `timeout`,
+		Owner: OwnerUnitTest,
+		// Set the timeout very low so that the timeout races with the
+		// failure reported by the test body.
+		Timeout:          1 * time.Nanosecond,
+		Cluster:          spec.MakeClusterSpec(0),
+		CompatibleClouds: registry.AllExceptAWS,
+		Suites:           registry.Suites(registry.Nightly),
+		CockroachBinary:  registry.StandardCockroach,
+		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+			t.Error("test failed")
+		},
+	}
+	err := runner.Run(ctx, []registry.TestSpec{test}, 1, /* count */
+		defaultParallelism, copt, testOpts{}, lopt)
+	require.Error(t, err)
+}