Commit ef63425

backup: add checkpointing to backup compactions
This patch adds periodic checkpointing to backup compaction jobs.

Epic: None
Release note: None
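
At its core, the change is a time-gated write: every progress update may trigger a checkpoint, but only if enough time has passed since the last one. A minimal standalone sketch of that gating pattern (checkpointEvery is a made-up helper for illustration, not the CockroachDB API):

package main

import (
	"fmt"
	"time"
)

// checkpointEvery runs write at most once per interval, returning the new
// last-checkpoint time. It mirrors the shape of maybeWriteBackupCheckpoint
// in the diff below.
func checkpointEvery(
	interval time.Duration, last time.Time, write func() error,
) (time.Time, error) {
	if time.Since(last) < interval {
		return last, nil // too soon; skip this checkpoint
	}
	if err := write(); err != nil {
		return last, err
	}
	return time.Now(), nil
}

func main() {
	var last time.Time
	for i := 0; i < 5; i++ {
		var err error
		last, err = checkpointEvery(50*time.Millisecond, last, func() error {
			fmt.Println("checkpoint written")
			return nil
		})
		if err != nil {
			fmt.Println("checkpoint failed:", err)
		}
		time.Sleep(20 * time.Millisecond)
	}
}

Note that a zero time.Time makes the first call always write, so a job checkpoints once immediately and then at most every interval.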
Parent: ca256da

4 files changed: +182, −12

Diff for: pkg/backup/compaction_job.go (+67, −12)

@@ -40,6 +40,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/retry"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/cockroachdb/errors"
 	"github.com/gogo/protobuf/types"
@@ -218,8 +219,8 @@ func (b *backupResumer) ResumeCompaction(

 	var backupManifest *backuppb.BackupManifest
 	updatedDetails := initialDetails
+	testingKnobs := execCtx.ExecCfg().BackupRestoreTestingKnobs
 	if initialDetails.URI == "" {
-		testingKnobs := execCtx.ExecCfg().BackupRestoreTestingKnobs
 		if testingKnobs != nil && testingKnobs.RunBeforeResolvingCompactionDest != nil {
 			if err := testingKnobs.RunBeforeResolvingCompactionDest(); err != nil {
 				return err
@@ -334,6 +335,10 @@
 		}
 	}

+	if testingKnobs != nil && testingKnobs.AfterLoadingManifestOnResume != nil {
+		testingKnobs.AfterLoadingManifestOnResume(backupManifest)
+	}
+
 	// We retry on pretty generic failures -- any rpc error. If a worker node were
 	// to restart, it would produce this kind of error, but there may be other
 	// errors that are also rpc errors. Don't retry too aggressively.
@@ -342,9 +347,8 @@
 		MaxRetries: 5,
 	}

-	if execCtx.ExecCfg().BackupRestoreTestingKnobs != nil &&
-		execCtx.ExecCfg().BackupRestoreTestingKnobs.BackupDistSQLRetryPolicy != nil {
-		retryOpts = *execCtx.ExecCfg().BackupRestoreTestingKnobs.BackupDistSQLRetryPolicy
+	if testingKnobs != nil && testingKnobs.BackupDistSQLRetryPolicy != nil {
+		retryOpts = *testingKnobs.BackupDistSQLRetryPolicy
 	}

 	if err := execCtx.ExecCfg().JobRegistry.CheckPausepoint("backup.before.flow"); err != nil {
@@ -376,8 +380,6 @@

 	// Reload the backup manifest to pick up any spans we may have completed on
 	// previous attempts.
-	// TODO (kev-cao): Compactions currently do not create checkpoints, but this
-	// can be used to reload the manifest once we add checkpointing.
 	var reloadBackupErr error
 	mem.Shrink(ctx, memSize)
 	backupManifest, memSize, reloadBackupErr = b.readManifestOnResume(ctx, &mem, execCtx.ExecCfg(),
@@ -752,9 +754,13 @@
 // the associated manifest.
 func processProgress(
 	ctx context.Context,
+	execCtx sql.JobExecContext,
+	details jobspb.BackupDetails,
 	manifest *backuppb.BackupManifest,
 	progCh <-chan *execinfrapb.RemoteProducerMetadata_BulkProcessorProgress,
+	kmsEnv cloud.KMSEnv,
 ) error {
+	var lastCheckpointTime time.Time
 	// When a processor is done exporting a span, it will send a progress update
 	// to progCh.
 	for progress := range progCh {
@@ -763,15 +769,22 @@
 			log.Errorf(ctx, "unable to unmarshal backup progress details: %+v", err)
 			return err
 		}
-		for _, file := range progDetails.Files {
-			manifest.Files = append(manifest.Files, file)
-			manifest.EntryCounts.Add(file.EntryCounts)
-		}
+		updateManifestWithProgress(progDetails, manifest)
+
 		// TODO (kev-cao): Add per node progress updates.
+
+		if wroteCheckpoint, err := maybeWriteBackupCheckpoint(
+			ctx, execCtx, details, manifest, lastCheckpointTime, kmsEnv,
+		); err != nil {
+			log.Errorf(ctx, "unable to checkpoint compaction: %+v", err)
+		} else if wroteCheckpoint {
+			lastCheckpointTime = timeutil.Now()
+		}
 	}
 	return nil
 }

+// compactionJobDescription generates a redacted description of the job.
 func compactionJobDescription(details jobspb.BackupDetails) (string, error) {
 	fmtCtx := tree.NewFmtCtx(tree.FmtSimple)
 	redactedURIs, err := sanitizeURIList(details.Destination.To)
@@ -822,8 +835,7 @@ func doCompaction(
 		)
 	}
 	checkpointLoop := func(ctx context.Context) error {
-		// TODO (kev-cao): Add logic for checkpointing during loop.
-		return processProgress(ctx, manifest, progCh)
+		return processProgress(ctx, execCtx, details, manifest, progCh, kmsEnv)
 	}
 	// TODO (kev-cao): Add trace aggregator loop.

@@ -838,6 +850,49 @@
 	)
 }

+// updateManifestWithProgress takes a progress update from the processors and
+// updates the backup manifest accordingly.
+func updateManifestWithProgress(
+	progDetails backuppb.BackupManifest_Progress, manifest *backuppb.BackupManifest,
+) {
+	for _, file := range progDetails.Files {
+		manifest.Files = append(manifest.Files, file)
+		manifest.EntryCounts.Add(file.EntryCounts)
+	}
+}
+
+// maybeWriteBackupCheckpoint writes a checkpoint for the backup if
+// the time since the last checkpoint exceeds the configured interval. If a
+// checkpoint is written, the function returns true.
+func maybeWriteBackupCheckpoint(
+	ctx context.Context,
+	execCtx sql.JobExecContext,
+	details jobspb.BackupDetails,
+	manifest *backuppb.BackupManifest,
+	lastCheckpointTime time.Time,
+	kmsEnv cloud.KMSEnv,
+) (bool, error) {
+	if details.URI == "" {
+		return false, errors.New("backup details does not contain a default URI")
+	}
+	execCfg := execCtx.ExecCfg()
+	interval := BackupCheckpointInterval.Get(&execCfg.Settings.SV)
+	if timeutil.Since(lastCheckpointTime) < interval {
+		return false, nil
+	}
+	if err := backupinfo.WriteBackupManifestCheckpoint(
+		ctx, details.URI, details.EncryptionOptions, kmsEnv,
+		manifest, execCfg, execCtx.User(),
+	); err != nil {
+		return false, err
+	}
+	backupRestoreKnobs := execCfg.BackupRestoreTestingKnobs
+	if backupRestoreKnobs != nil && backupRestoreKnobs.AfterCompactBackupsCheckpoint != nil {
+		backupRestoreKnobs.AfterCompactBackupsCheckpoint()
+	}
+	return true, nil
+}
+
 func init() {
 	builtins.StartCompactionJob = StartCompactionJob
 }
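
The new processProgress folds each producer update into the in-memory manifest and then opportunistically checkpoints; a failed checkpoint is only logged, so the job never stalls on a transient storage error. A hedged sketch of that drain-and-checkpoint loop with stand-in types (progressUpdate and the channel wiring are illustrative, not the real execinfrapb types):

package main

import (
	"fmt"
	"time"
)

type progressUpdate struct{ files int }

// drainProgress mirrors the shape of processProgress: fold each update into
// local state, then checkpoint at most once per interval.
func drainProgress(
	progCh <-chan progressUpdate,
	interval time.Duration,
	checkpoint func(totalFiles int) error,
) {
	var totalFiles int
	var lastCheckpoint time.Time
	for p := range progCh {
		totalFiles += p.files // analogous to updateManifestWithProgress
		if time.Since(lastCheckpoint) < interval {
			continue
		}
		if err := checkpoint(totalFiles); err != nil {
			// Log and keep going: the next update retries the checkpoint.
			fmt.Println("checkpoint failed:", err)
			continue
		}
		lastCheckpoint = time.Now()
	}
}

func main() {
	progCh := make(chan progressUpdate)
	go func() {
		defer close(progCh)
		for i := 0; i < 3; i++ {
			progCh <- progressUpdate{files: 1}
			time.Sleep(10 * time.Millisecond)
		}
	}()
	drainProgress(progCh, 5*time.Millisecond, func(n int) error {
		fmt.Println("checkpointed", n, "files")
		return nil
	})
}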

Diff for: pkg/backup/compaction_test.go (+105)

@@ -16,6 +16,7 @@ import (
 	"time"

 	"github.com/cockroachdb/cockroach/pkg/backup/backupinfo"
+	"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/jobs"
 	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
@@ -592,6 +593,110 @@ func TestScheduledBackupCompaction(t *testing.T) {
 	require.Equal(t, 4, numBackups)
 }

+func TestCompactionCheckpointing(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// These two channels are for having the test wait until the compaction job
+	// writes a checkpoint and to pause the compaction job until the test is
+	// ready to resume, respectively.
+	// We block the job from continuing to prevent the job from completing before
+	// we can pause it.
+	hitCheckpoint := make(chan struct{})
+	blockCompaction := make(chan struct{})
+	doBlockCheckpoint := true
+
+	// These two channels are for having the test wait until the compaction job
+	// has loaded a manifest after resuming a job and to pause the job until the
+	// test is ready to resume, respectively.
+	// We block resuming because otherwise we have a race condition where the
+	// manifest is updated by the job before we can check its files.
+	loadedManifest := make(chan struct{})
+	blockResume := make(chan struct{})
+	doBlockResume := true
+	var manifest *backuppb.BackupManifest
+
+	defer func() {
+		close(hitCheckpoint)
+		close(blockCompaction)
+		close(loadedManifest)
+		close(blockResume)
+	}()
+	// Need a large enough backup so that it doesn't finish in one iteration.
+	const numAccounts = 10000
+	tc, db, _, cleanup := backupRestoreTestSetupWithParams(
+		t, singleNode, numAccounts, InitManualReplication, base.TestClusterArgs{
+			ServerArgs: base.TestServerArgs{
+				Knobs: base.TestingKnobs{
+					BackupRestore: &sql.BackupRestoreTestingKnobs{
+						AfterLoadingManifestOnResume: func(m *backuppb.BackupManifest) {
+							if doBlockResume {
+								manifest = m
+								loadedManifest <- struct{}{}
+								<-blockResume
+							}
+						},
+						AfterCompactBackupsCheckpoint: func() {
+							if doBlockCheckpoint {
+								hitCheckpoint <- struct{}{}
+								<-blockCompaction
+							}
+						},
+					},
+				},
+			},
+		},
+	)
+	defer cleanup()
+	writeQueries := func() {
+		db.Exec(t, "UPDATE data.bank SET balance = balance + 1")
+	}
+	db.Exec(t, "SET CLUSTER SETTING bulkio.backup.checkpoint_interval = '10ms'")
+	start := getTime(t)
+	db.Exec(t, fmt.Sprintf("BACKUP INTO 'nodelocal://1/backup' AS OF SYSTEM TIME %d", start))
+	writeQueries()
+	db.Exec(t, "BACKUP INTO LATEST IN 'nodelocal://1/backup'")
+	writeQueries()
+	end := getTime(t)
+	db.Exec(t, fmt.Sprintf("BACKUP INTO LATEST IN 'nodelocal://1/backup' AS OF SYSTEM TIME %d", end))
+
+	var backupPath string
+	db.QueryRow(t, "SHOW BACKUPS IN 'nodelocal://1/backup'").Scan(&backupPath)
+
+	var jobID jobspb.JobID
+	db.QueryRow(
+		t,
+		"SELECT crdb_internal.backup_compaction(ARRAY['nodelocal://1/backup'], $1, ''::BYTES, $2, $3)",
+		backupPath, start, end,
+	).Scan(&jobID)
+	// Ensure that the very first manifest when the job is initially started has
+	// no files.
+	<-loadedManifest
+	require.Equal(t, 0, len(manifest.Files))
+	blockResume <- struct{}{}
+
+	// Pause after first checkpoint and then resume the job from that checkpoint.
+	<-hitCheckpoint
+	db.Exec(t, "PAUSE JOB $1", jobID)
+	// Wait for the job to pause.
+	jobutils.WaitForJobToPause(t, db, jobID)
+	doBlockCheckpoint = false // Don't bother pausing on other checkpoints.
+	blockCompaction <- struct{}{}
+	db.Exec(t, "RESUME JOB $1", jobID)
+	jobutils.WaitForJobToRun(t, db, jobID)
+
+	// Now that the job has been paused and resumed after previously hitting a
+	// checkpoint, the initial manifest at the start of the job should have
+	// some files from before the pause.
+	<-loadedManifest
+	require.NotEmpty(t, manifest.Files)
+	doBlockResume = false
+	blockResume <- struct{}{}
+
+	waitForSuccessfulJob(t, tc, jobID)
+	validateCompactedBackupForTables(t, db, []string{"bank"}, "'nodelocal://1/backup'", start, end)
+}
+
 // Start and end are unix epoch in nanoseconds.
 func validateCompactedBackupForTables(
 	t *testing.T, db *sqlutils.SQLRunner, tables []string, collectionURIs string, start, end int64,
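
The test drives the job through the two new knobs with a signal-then-block channel pattern: each hook tells the test it has reached an interesting point, then parks until the test releases it. A distilled, runnable sketch of just that synchronization (names are illustrative):

package main

import "fmt"

func main() {
	hit := make(chan struct{})
	unblock := make(chan struct{})

	// Stand-in for the job calling a knob such as
	// AfterCompactBackupsCheckpoint.
	hook := func() {
		hit <- struct{}{} // signal: reached the checkpoint
		<-unblock         // park until the test is ready
	}

	done := make(chan struct{})
	go func() { // the "job"
		hook()
		close(done)
	}()

	<-hit // the "test" observes the signal...
	fmt.Println("job parked at checkpoint; safe to PAUSE or inspect state")
	unblock <- struct{}{} // ...then releases the job
	<-done
	fmt.Println("job finished")
}

The doBlockCheckpoint/doBlockResume booleans in the real test serve a similar purpose: once the interesting transition has been exercised, later invocations of the hooks fall straight through.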

Diff for: pkg/sql/BUILD.bazel (+1)

@@ -301,6 +301,7 @@ go_library(
     importpath = "github.com/cockroachdb/cockroach/pkg/sql",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/backup/backuppb",
         "//pkg/base",
         "//pkg/build",
         "//pkg/cloud",

Diff for: pkg/sql/exec_util.go (+9)

@@ -21,6 +21,7 @@ import (
 	"time"

 	apd "github.com/cockroachdb/apd/v3"
+	"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/cloud/externalconn"
 	"github.com/cockroachdb/cockroach/pkg/clusterversion"
@@ -1873,6 +1874,14 @@ type BackupRestoreTestingKnobs struct {
 	// is written.
 	AfterBackupCheckpoint func()

+	// AfterCompactBackupsCheckpoint if set will be called after a BACKUP-CHECKPOINT
+	// for compaction is written.
+	AfterCompactBackupsCheckpoint func()
+
+	// AfterLoadingManifestOnResume is run once the backup manifest has been
+	// loaded/created on the resumption of a compaction job.
+	AfterLoadingManifestOnResume func(manifest *backuppb.BackupManifest)
+
 	// CaptureResolvedTableDescSpans allows for intercepting the spans which are
 	// resolved during backup planning, and will eventually be backed up during
 	// execution.
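
These new fields follow the usual CockroachDB testing-knob convention: optional function hooks that production code invokes only behind nil checks, as maybeWriteBackupCheckpoint does above. A tiny sketch of that convention with illustrative types:

package main

import "fmt"

// Knobs mimics the shape of BackupRestoreTestingKnobs: optional hooks that
// stay nil in production.
type Knobs struct {
	AfterCheckpoint func()
}

func writeCheckpoint(knobs *Knobs) {
	fmt.Println("checkpoint written")
	// Guard both the struct and the field, so production pays only a branch.
	if knobs != nil && knobs.AfterCheckpoint != nil {
		knobs.AfterCheckpoint()
	}
}

func main() {
	writeCheckpoint(nil) // production path: no hook installed
	writeCheckpoint(&Knobs{AfterCheckpoint: func() {
		fmt.Println("test hook observed the checkpoint")
	}})
}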
