Skip to content

Commit 4fe1d00

Browse files
craig[bot]xinhaoz
craig[bot]
andcommitted
Merge #138343
138343: sql: setup http router after version initialization r=xinhaoz a=xinhaoz Previously the http routes were set up after sql version initialization. As of #127170, the version initialization occurs in the sql server prestart, which occurs just after router setup. It became possible to hit the http routes before the version was set, leading to panics as the http routes rely on the sql version being set. Epic: none Fixes: #138342 Co-authored-by: Xin Hao Zhang <[email protected]>
2 parents 31e5601 + 331596c commit 4fe1d00

File tree

3 files changed

+107
-10
lines changed

3 files changed

+107
-10
lines changed

pkg/cmd/roachtest/tests/registry.go

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ func RegisterTests(r registry.Registry) {
6363
registerHibernate(r, hibernateOpts)
6464
registerHibernate(r, hibernateSpatialOpts)
6565
registerHotSpotSplits(r)
66+
registerHTTPRestart(r)
6667
registerImportCancellation(r)
6768
registerImportDecommissioned(r)
6869
registerImportMixedVersions(r)

pkg/cmd/roachtest/tests/versionupgrade.go

+93
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ import (
1414

1515
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1616
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
17+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
18+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1719
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
1820
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion"
1921
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
2022
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2123
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2224
"github.com/cockroachdb/cockroach/pkg/storage"
2325
"github.com/cockroachdb/cockroach/pkg/testutils/release"
26+
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
2427
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2528
"github.com/cockroachdb/cockroach/pkg/util/version"
2629
)
@@ -108,6 +111,7 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) {
108111
}
109112

110113
mvt := mixedversion.NewTest(testCtx, t, t.L(), c, c.All(), opts...)
114+
111115
mvt.InMixedVersion(
112116
"maybe run backup",
113117
func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
@@ -287,3 +291,92 @@ for i in 1 2 3 4; do
287291
done
288292
`)
289293
}
294+
295+
// This is a regression test for a race detailed in
296+
// https://github.com/cockroachdb/cockroach/issues/138342, where it became
297+
// possible for an HTTP request to cause a fatal error if the sql server
298+
// did not initialize the cluster version in time.
299+
func registerHTTPRestart(r registry.Registry) {
300+
r.Add(registry.TestSpec{
301+
Name: "http-register-routes/mixed-version",
302+
Owner: registry.OwnerObservability,
303+
Cluster: r.MakeClusterSpec(4),
304+
CompatibleClouds: registry.AllClouds,
305+
Suites: registry.Suites(registry.MixedVersion, registry.Nightly),
306+
Randomized: true,
307+
Run: runHTTPRestart,
308+
Timeout: 1 * time.Hour,
309+
})
310+
}
311+
312+
func runHTTPRestart(ctx context.Context, t test.Test, c cluster.Cluster) {
313+
mvt := mixedversion.NewTest(ctx, t, t.L(), c,
314+
c.CRDBNodes(),
315+
mixedversion.AlwaysUseLatestPredecessors,
316+
)
317+
318+
// Any http request requiring auth will do.
319+
httpReq := tspb.TimeSeriesQueryRequest{
320+
StartNanos: timeutil.Now().UnixNano() - 10*time.Second.Nanoseconds(),
321+
EndNanos: timeutil.Now().UnixNano(),
322+
// Ask for 10s intervals.
323+
SampleNanos: (10 * time.Second).Nanoseconds(),
324+
Queries: []tspb.Query{{
325+
Name: "cr.node.sql.service.latency-p90",
326+
SourceAggregator: tspb.TimeSeriesQueryAggregator_MAX.Enum(),
327+
}},
328+
}
329+
330+
httpCall := func(ctx context.Context, node int, l *logger.Logger, useSystemTenant bool) {
331+
logEvery := roachtestutil.Every(1 * time.Second)
332+
var clientOpts []func(opts *roachtestutil.RoachtestHTTPOptions)
333+
var urlOpts []option.OptionFunc
334+
if useSystemTenant {
335+
clientOpts = append(clientOpts, roachtestutil.VirtualCluster(install.SystemInterfaceName))
336+
urlOpts = append(urlOpts, option.VirtualClusterName(install.SystemInterfaceName))
337+
}
338+
client := roachtestutil.DefaultHTTPClient(c, l, clientOpts...)
339+
adminUrls, err := c.ExternalAdminUIAddr(ctx, l, c.Node(node), urlOpts...)
340+
if err != nil {
341+
t.Fatal(err)
342+
}
343+
url := "https://" + adminUrls[0] + "/ts/query"
344+
l.Printf("Sending requests to %s", url)
345+
346+
var response tspb.TimeSeriesQueryResponse
347+
// Eventually we should see a successful request.
348+
reqSuccess := false
349+
for {
350+
select {
351+
case <-ctx.Done():
352+
if !reqSuccess {
353+
t.Fatalf("n%d: No successful http requests made.", node)
354+
}
355+
return
356+
default:
357+
}
358+
if err := client.PostProtobuf(ctx, url, &httpReq, &response); err != nil {
359+
if logEvery.ShouldLog() {
360+
l.Printf("n%d: Error posting protobuf: %s", node, err)
361+
}
362+
continue
363+
}
364+
reqSuccess = true
365+
}
366+
}
367+
368+
for _, n := range c.CRDBNodes() {
369+
mvt.BackgroundFunc("HTTP requests to system tenant", func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
370+
httpCall(ctx, n, l, true /* useSystemTenant */)
371+
return nil
372+
})
373+
mvt.BackgroundFunc("HTTP requests to secondary tenant", func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
374+
if h.DeploymentMode() == mixedversion.SystemOnlyDeployment {
375+
return nil
376+
}
377+
httpCall(ctx, n, l, false /* useSystemTenant */)
378+
return nil
379+
})
380+
}
381+
mvt.Run()
382+
}

pkg/server/tenant.go

+13-10
Original file line numberDiff line numberDiff line change
@@ -799,9 +799,22 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
799799
return err
800800
}
801801

802+
// Start the SQL subsystem.
803+
if err := s.sqlServer.preStart(
804+
workersCtx,
805+
s.stopper,
806+
s.sqlServer.cfg.TestingKnobs,
807+
orphanedLeasesTimeThresholdNanos,
808+
); err != nil {
809+
return err
810+
}
811+
802812
// Connect the HTTP endpoints. This also wraps the privileged HTTP
803813
// endpoints served by gwMux by the HTTP cookie authentication
804814
// check.
815+
// NB: This must occur after sqlServer.preStart() which initializes
816+
// the cluster version from storage as the http auth server relies on
817+
// the cluster version being initialized.
805818
if err := s.http.setupRoutes(ctx,
806819
s.authentication, /* authnServer */
807820
s.adminAuthzCheck, /* adminAuthzCheck */
@@ -828,16 +841,6 @@ func (s *SQLServerWrapper) PreStart(ctx context.Context) error {
828841
return err
829842
}
830843

831-
// Start the SQL subsystem.
832-
if err := s.sqlServer.preStart(
833-
workersCtx,
834-
s.stopper,
835-
s.sqlServer.cfg.TestingKnobs,
836-
orphanedLeasesTimeThresholdNanos,
837-
); err != nil {
838-
return err
839-
}
840-
841844
// Initialize the external storage builders configuration params now that the
842845
// engines have been created. The object can be used to create ExternalStorage
843846
// objects hereafter.

0 commit comments

Comments
 (0)