Skip to content

Commit 0ecd2ec

Browse files
craig[bot]fqazixinhaoz
committed
143669: sql: add timeout for PCR reader catalog lease acquisition r=fqazi a=fqazi Previously, the logic to determine if a PCR reader catalog was in use could become stuck if an availability issue occurred with the leasing subsystem. This was because we could end up waiting indefinitely for the lease in failure scenarios like TestUnavailableZipDir, and the statement_timeout is not active this early. To address this, this patch adds a 30-second timeout for obtaining a lease on the system database when detecting PCR reader catalogs. Fixes: #141565 Release note: None 143768: sqlstats: generalize insight.ConcurrentBufferIngester r=xinhaoz a=xinhaoz This commit copies the ConcurrentBufferIngester from insights into the sqlstats pkg in preparation to have it consume the sql execution event and handle the writing to both sqlstats and insights systems, instead of just for insights. At this stage we are preserving how this ingester consumes events. The ingester is still only used for insights writing. Previously, the insights registry had the responsibility of buffering statements by session id and processing them once the entire transaction completes (via receicing a transaction event). The responsibility of buffering statements by session has now moved to the ingester. As a result we remove the `ObserveStatement` function from the insights registry which was only used to insert into the statement buffer. Part of: #141024 Epic: none Release note: None Co-authored-by: Faizan Qazi <[email protected]> Co-authored-by: Xin Hao Zhang <[email protected]>
3 parents 3bed9a9 + 59f3557 + 695432f commit 0ecd2ec

22 files changed

+690
-724
lines changed

Diff for: pkg/base/testing_knobs.go

-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ type TestingKnobs struct {
5454
KeyVisualizer ModuleTestingKnobs
5555
TenantCapabilitiesTestingKnobs ModuleTestingKnobs
5656
TableStatsKnobs ModuleTestingKnobs
57-
Insights ModuleTestingKnobs
5857
TableMetadata ModuleTestingKnobs
5958
LicenseTestingKnobs ModuleTestingKnobs
6059
VecIndexTestingKnobs ModuleTestingKnobs

Diff for: pkg/cli/zip_test.go

-2
Original file line numberDiff line numberDiff line change
@@ -497,8 +497,6 @@ func TestUnavailableZip(t *testing.T) {
497497
defer leaktest.AfterTest(t)()
498498
defer log.Scope(t).Close(t)
499499

500-
skip.WithIssue(t, 141565)
501-
502500
skip.UnderShort(t)
503501
// Race builds make the servers so slow that they report spurious
504502
// unavailability.

Diff for: pkg/server/server_sql.go

-5
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ import (
105105
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slinstance"
106106
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slprovider"
107107
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
108-
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/insights"
109108
"github.com/cockroachdb/cockroach/pkg/sql/stats"
110109
"github.com/cockroachdb/cockroach/pkg/sql/stmtdiagnostics"
111110
"github.com/cockroachdb/cockroach/pkg/sql/syntheticprivilegecache"
@@ -1144,10 +1143,6 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) {
11441143
execCfg.ExternalConnectionTestingKnobs = externalConnKnobs.(*externalconn.TestingKnobs)
11451144
}
11461145

1147-
if insightsKnobs := cfg.TestingKnobs.Insights; insightsKnobs != nil {
1148-
execCfg.InsightsTestingKnobs = insightsKnobs.(*insights.TestingKnobs)
1149-
1150-
}
11511146
var tableStatsTestingKnobs *stats.TableStatsTestingKnobs
11521147
if tableStatsKnobs := cfg.TestingKnobs.TableStatsKnobs; tableStatsKnobs != nil {
11531148
tableStatsTestingKnobs = tableStatsKnobs.(*stats.TableStatsTestingKnobs)

Diff for: pkg/sql/conn_executor.go

+44-24
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,9 @@ type Server struct {
343343
// sqlStatsController is the control-plane interface for sqlStats.
344344
sqlStatsController *persistedsqlstats.Controller
345345

346+
// sqlStatsIngester provides the interface to consume stats about a sql execution.
347+
sqlStatsIngester *sslocal.SQLStatsIngester
348+
346349
// schemaTelemetryController is the control-plane interface for schema
347350
// telemetry.
348351
schemaTelemetryController *schematelemetrycontroller.Controller
@@ -438,13 +441,7 @@ type ServerMetrics struct {
438441
func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
439442
metrics := makeMetrics(false /* internal */, &cfg.Settings.SV)
440443
serverMetrics := makeServerMetrics(cfg)
441-
insightsProvider := insights.New(cfg.Settings, serverMetrics.InsightsMetrics, cfg.InsightsTestingKnobs)
442-
// TODO(117690): Unify StmtStatsEnable and TxnStatsEnable into a single cluster setting.
443-
sqlstats.TxnStatsEnable.SetOnChange(&cfg.Settings.SV, func(_ context.Context) {
444-
if !sqlstats.TxnStatsEnable.Get(&cfg.Settings.SV) {
445-
insightsProvider.Writer().Clear()
446-
}
447-
})
444+
insightsProvider := insights.New(cfg.Settings, serverMetrics.InsightsMetrics)
448445
reportedSQLStats := sslocal.New(
449446
cfg.Settings,
450447
sqlstats.MaxMemReportedSQLStatsStmtFingerprints,
@@ -466,6 +463,13 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
466463
reportedSQLStats,
467464
cfg.SQLStatsTestingKnobs,
468465
)
466+
sqlStatsIngester := sslocal.NewSQLStatsIngester(cfg.SQLStatsTestingKnobs, insightsProvider)
467+
// TODO(117690): Unify StmtStatsEnable and TxnStatsEnable into a single cluster setting.
468+
sqlstats.TxnStatsEnable.SetOnChange(&cfg.Settings.SV, func(_ context.Context) {
469+
if !sqlstats.TxnStatsEnable.Get(&cfg.Settings.SV) {
470+
sqlStatsIngester.Clear()
471+
}
472+
})
469473
s := &Server{
470474
cfg: cfg,
471475
Metrics: metrics,
@@ -474,6 +478,7 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
474478
pool: pool,
475479
localSqlStats: memSQLStats,
476480
reportedStats: reportedSQLStats,
481+
sqlStatsIngester: sqlStatsIngester,
477482
reportedStatsController: reportedSQLStatsController,
478483
insights: insightsProvider,
479484
reCache: tree.NewRegexpCache(512),
@@ -657,7 +662,7 @@ func (s *Server) Start(ctx context.Context, stopper *stop.Stopper) {
657662
// should be accounted for in their costs.
658663
ctx = multitenant.WithTenantCostControlExemption(ctx)
659664

660-
s.insights.Start(ctx, stopper)
665+
s.sqlStatsIngester.Start(ctx, stopper)
661666
s.sqlStats.Start(ctx, stopper)
662667

663668
s.schemaTelemetryController.Start(ctx, stopper)
@@ -1219,14 +1224,10 @@ func (s *Server) newConnExecutor(
12191224
ex.applicationStats = applicationStats
12201225
// We ignore statements and transactions run by the internal executor by
12211226
// passing a nil writer.
1222-
var writer *insights.ConcurrentBufferIngester
1223-
if !ex.sessionData().Internal {
1224-
writer = ex.server.insights.Writer()
1225-
}
12261227
ex.statsCollector = sslocal.NewStatsCollector(
12271228
s.cfg.Settings,
12281229
applicationStats,
1229-
writer,
1230+
s.sqlStatsIngester,
12301231
ex.phaseTimes,
12311232
s.localSqlStats.GetCounters(),
12321233
underOuterTxn,
@@ -1273,17 +1274,8 @@ func (s *Server) newConnExecutor(
12731274

12741275
ex.extraTxnState.hasAdminRoleCache = HasAdminRoleCache{}
12751276

1276-
if lm := ex.server.cfg.LeaseManager; executorType == executorTypeExec && lm != nil {
1277-
if desc, err := lm.Acquire(ctx, ex.server.cfg.Clock.Now(), keys.SystemDatabaseID); err != nil {
1278-
log.Infof(ctx, "unable to lease system database to determine if PCR reader is in use: %s", err)
1279-
} else {
1280-
defer desc.Release(ctx)
1281-
// The system database ReplicatedPCRVersion is set during reader tenant bootstrap,
1282-
// which guarantees that all user tenant sql connections to the reader tenant will
1283-
// correctly set this
1284-
ex.isPCRReaderCatalog = desc.Underlying().(catalog.DatabaseDescriptor).GetReplicatedPCRVersion() != 0
1285-
}
1286-
}
1277+
// Determine if we are running on a PCR reader catalog.
1278+
ex.initPCRReaderCatalog(ctx)
12871279

12881280
if postSetupFn != nil {
12891281
postSetupFn(ex)
@@ -3852,6 +3844,34 @@ func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalCo
38523844
evalCtx.copyFromExecCfg(ex.server.cfg)
38533845
}
38543846

3847+
// initPCRReaderCatalog leases the system database to determine if
3848+
// we are connecting to a PCR reader catalog, if this has not been attempted
3849+
// before.
3850+
func (ex *connExecutor) initPCRReaderCatalog(ctx context.Context) {
3851+
// Wait up to 10 seconds attempting to acquire the lease on the system
3852+
// database. Normally we should already have a lease on this object,
3853+
// unless there is some availability issue.
3854+
const initPCRReaderCatalogTimeout = 10 * time.Second
3855+
err := timeutil.RunWithTimeout(ctx, "detect-pcr-reader-catalog", initPCRReaderCatalogTimeout,
3856+
func(ctx context.Context) error {
3857+
if lm := ex.server.cfg.LeaseManager; ex.executorType == executorTypeExec && lm != nil {
3858+
desc, err := lm.Acquire(ctx, ex.server.cfg.Clock.Now(), keys.SystemDatabaseID)
3859+
if err != nil {
3860+
return err
3861+
}
3862+
defer desc.Release(ctx)
3863+
// The system database ReplicatedPCRVersion is set during reader tenant bootstrap,
3864+
// which guarantees that all user tenant sql connections to the reader tenant will
3865+
// correctly set this
3866+
ex.isPCRReaderCatalog = desc.Underlying().(catalog.DatabaseDescriptor).GetReplicatedPCRVersion() != 0
3867+
}
3868+
return nil
3869+
})
3870+
if err != nil {
3871+
log.Infof(ctx, "unable to lease system database to determine if PCR reader is in use: %s", err)
3872+
}
3873+
}
3874+
38553875
// GetPCRReaderTimestamp if the system database is setup as PCR
38563876
// catalog reader, then this function will return an non-zero timestamp
38573877
// to use for all read operations.

Diff for: pkg/sql/exec_util.go

-2
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ import (
9999
"github.com/cockroachdb/cockroach/pkg/sql/sessionphase"
100100
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness"
101101
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
102-
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/insights"
103102
"github.com/cockroachdb/cockroach/pkg/sql/stats"
104103
"github.com/cockroachdb/cockroach/pkg/sql/stmtdiagnostics"
105104
"github.com/cockroachdb/cockroach/pkg/sql/syntheticprivilegecache"
@@ -1408,7 +1407,6 @@ type ExecutorConfig struct {
14081407
UnusedIndexRecommendationsKnobs *idxusage.UnusedIndexRecommendationTestingKnobs
14091408
ExternalConnectionTestingKnobs *externalconn.TestingKnobs
14101409
EventLogTestingKnobs *EventLogTestingKnobs
1411-
InsightsTestingKnobs *insights.TestingKnobs
14121410
TableMetadataKnobs *tablemetadatacache_util.TestingKnobs
14131411

14141412
// HistogramWindowInterval is (server.Config).HistogramWindowInterval.

Diff for: pkg/sql/sqlstats/insights/BUILD.bazel

-7
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,11 @@ go_library(
77
srcs = [
88
"causes.go",
99
"detector.go",
10-
"ingester.go",
1110
"insights.go",
1211
"pool.go",
1312
"provider.go",
1413
"registry.go",
1514
"store.go",
16-
"test_utils.go",
1715
"util.go",
1816
],
1917
embed = [":insights_go_proto"],
@@ -24,14 +22,12 @@ go_library(
2422
"//pkg/settings/cluster",
2523
"//pkg/sql/appstatspb",
2624
"//pkg/sql/clusterunique",
27-
"//pkg/sql/contention/contentionutils",
2825
"//pkg/sql/pgwire/pgerror",
2926
"//pkg/sql/sqlstats",
3027
"//pkg/util/cache",
3128
"//pkg/util/intsets",
3229
"//pkg/util/metric",
3330
"//pkg/util/quantile",
34-
"//pkg/util/stop",
3531
"//pkg/util/syncutil",
3632
"//pkg/util/uuid",
3733
"@com_github_cockroachdb_redact//:redact",
@@ -44,7 +40,6 @@ go_test(
4440
srcs = [
4541
"causes_test.go",
4642
"detector_test.go",
47-
"ingester_test.go",
4843
"insights_test.go",
4944
"registry_test.go",
5045
"store_test.go",
@@ -59,8 +54,6 @@ go_test(
5954
"//pkg/sql/pgwire/pgcode",
6055
"//pkg/sql/pgwire/pgerror",
6156
"//pkg/sql/sqlstats",
62-
"//pkg/util/leaktest",
63-
"//pkg/util/log",
6457
"//pkg/util/stop",
6558
"//pkg/util/uint128",
6659
"//pkg/util/uuid",

0 commit comments

Comments
 (0)