@@ -40,6 +40,7 @@ import (
40
40
"github.com/cockroachdb/cockroach/pkg/util/hlc"
41
41
"github.com/cockroachdb/cockroach/pkg/util/log"
42
42
"github.com/cockroachdb/cockroach/pkg/util/retry"
43
+ "github.com/cockroachdb/cockroach/pkg/util/timeutil"
43
44
"github.com/cockroachdb/cockroach/pkg/util/uuid"
44
45
"github.com/cockroachdb/errors"
45
46
"github.com/gogo/protobuf/types"
@@ -231,8 +232,8 @@ func (b *backupResumer) ResumeCompaction(
231
232
232
233
var backupManifest * backuppb.BackupManifest
233
234
updatedDetails := initialDetails
235
+ testingKnobs := execCtx .ExecCfg ().BackupRestoreTestingKnobs
234
236
if initialDetails .URI == "" {
235
- testingKnobs := execCtx .ExecCfg ().BackupRestoreTestingKnobs
236
237
if testingKnobs != nil && testingKnobs .RunBeforeResolvingCompactionDest != nil {
237
238
if err := testingKnobs .RunBeforeResolvingCompactionDest (); err != nil {
238
239
return err
@@ -263,21 +264,13 @@ func (b *backupResumer) ResumeCompaction(
263
264
return err
264
265
}
265
266
266
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup.before.write_first_checkpoint" ); err != nil {
267
- return err
268
- }
269
-
270
267
if err := backupinfo .WriteBackupManifestCheckpoint (
271
268
ctx , updatedDetails .URI , updatedDetails .EncryptionOptions , kmsEnv ,
272
269
backupManifest , execCtx .ExecCfg (), execCtx .User (),
273
270
); err != nil {
274
271
return err
275
272
}
276
273
277
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup.after.write_first_checkpoint" ); err != nil {
278
- return err
279
- }
280
-
281
274
description := maybeUpdateJobDescription (
282
275
initialDetails , updatedDetails , b .job .Payload ().Description ,
283
276
)
@@ -297,7 +290,7 @@ func (b *backupResumer) ResumeCompaction(
297
290
return err
298
291
}
299
292
300
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup .after.details_has_checkpoint" ); err != nil {
293
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction .after.details_has_checkpoint" ); err != nil {
301
294
return err
302
295
}
303
296
// TODO (kev-cao): Add telemetry for backup compactions.
@@ -347,6 +340,10 @@ func (b *backupResumer) ResumeCompaction(
347
340
}
348
341
}
349
342
343
+ if testingKnobs != nil && testingKnobs .AfterLoadingCompactionManifestOnResume != nil {
344
+ testingKnobs .AfterLoadingCompactionManifestOnResume (backupManifest )
345
+ }
346
+
350
347
// We retry on pretty generic failures -- any rpc error. If a worker node were
351
348
// to restart, it would produce this kind of error, but there may be other
352
349
// errors that are also rpc errors. Don't retry too aggressively.
@@ -355,13 +352,8 @@ func (b *backupResumer) ResumeCompaction(
355
352
MaxRetries : 5 ,
356
353
}
357
354
358
- if execCtx .ExecCfg ().BackupRestoreTestingKnobs != nil &&
359
- execCtx .ExecCfg ().BackupRestoreTestingKnobs .BackupDistSQLRetryPolicy != nil {
360
- retryOpts = * execCtx .ExecCfg ().BackupRestoreTestingKnobs .BackupDistSQLRetryPolicy
361
- }
362
-
363
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup.before.flow" ); err != nil {
364
- return err
355
+ if testingKnobs != nil && testingKnobs .BackupDistSQLRetryPolicy != nil {
356
+ retryOpts = * testingKnobs .BackupDistSQLRetryPolicy
365
357
}
366
358
367
359
// We want to retry a backup if there are transient failures (i.e. worker nodes
@@ -389,8 +381,6 @@ func (b *backupResumer) ResumeCompaction(
389
381
390
382
// Reload the backup manifest to pick up any spans we may have completed on
391
383
// previous attempts.
392
- // TODO (kev-cao): Compactions currently do not create checkpoints, but this
393
- // can be used to reload the manifest once we add checkpointing.
394
384
var reloadBackupErr error
395
385
mem .Shrink (ctx , memSize )
396
386
backupManifest , memSize , reloadBackupErr = b .readManifestOnResume (ctx , & mem , execCtx .ExecCfg (),
@@ -765,9 +755,13 @@ func concludeBackupCompaction(
765
755
// the associated manifest.
766
756
func processProgress (
767
757
ctx context.Context ,
758
+ execCtx sql.JobExecContext ,
759
+ details jobspb.BackupDetails ,
768
760
manifest * backuppb.BackupManifest ,
769
761
progCh <- chan * execinfrapb.RemoteProducerMetadata_BulkProcessorProgress ,
762
+ kmsEnv cloud.KMSEnv ,
770
763
) error {
764
+ var lastCheckpointTime time.Time
771
765
// When a processor is done exporting a span, it will send a progress update
772
766
// to progCh.
773
767
for progress := range progCh {
@@ -780,11 +774,24 @@ func processProgress(
780
774
manifest .Files = append (manifest .Files , file )
781
775
manifest .EntryCounts .Add (file .EntryCounts )
782
776
}
777
+
783
778
// TODO (kev-cao): Add per node progress updates.
779
+
780
+ if wroteCheckpoint , err := maybeWriteBackupCheckpoint (
781
+ ctx , execCtx , details , manifest , lastCheckpointTime , kmsEnv ,
782
+ ); err != nil {
783
+ log .Errorf (ctx , "unable to checkpoint compaction: %+v" , err )
784
+ } else if wroteCheckpoint {
785
+ lastCheckpointTime = timeutil .Now ()
786
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction.after.write_checkpoint" ); err != nil {
787
+ return err
788
+ }
789
+ }
784
790
}
785
791
return nil
786
792
}
787
793
794
+ // compactionJobDescription generates a redacted description of the job.
788
795
func compactionJobDescription (details jobspb.BackupDetails ) (string , error ) {
789
796
fmtCtx := tree .NewFmtCtx (tree .FmtSimple )
790
797
redactedURIs , err := sanitizeURIList (details .Destination .To )
@@ -835,8 +842,7 @@ func doCompaction(
835
842
)
836
843
}
837
844
checkpointLoop := func (ctx context.Context ) error {
838
- // TODO (kev-cao): Add logic for checkpointing during loop.
839
- return processProgress (ctx , manifest , progCh )
845
+ return processProgress (ctx , execCtx , details , manifest , progCh , kmsEnv )
840
846
}
841
847
// TODO (kev-cao): Add trace aggregator loop.
842
848
@@ -851,6 +857,34 @@ func doCompaction(
851
857
)
852
858
}
853
859
860
+ // maybeWriteBackupCheckpoint writes a checkpoint for the backup if
861
+ // the time since the last checkpoint exceeds the configured interval. If a
862
+ // checkpoint is written, the function returns true.
863
+ func maybeWriteBackupCheckpoint (
864
+ ctx context.Context ,
865
+ execCtx sql.JobExecContext ,
866
+ details jobspb.BackupDetails ,
867
+ manifest * backuppb.BackupManifest ,
868
+ lastCheckpointTime time.Time ,
869
+ kmsEnv cloud.KMSEnv ,
870
+ ) (bool , error ) {
871
+ if details .URI == "" {
872
+ return false , errors .New ("backup details does not contain a default URI" )
873
+ }
874
+ execCfg := execCtx .ExecCfg ()
875
+ interval := BackupCheckpointInterval .Get (& execCfg .Settings .SV )
876
+ if timeutil .Since (lastCheckpointTime ) < interval {
877
+ return false , nil
878
+ }
879
+ if err := backupinfo .WriteBackupManifestCheckpoint (
880
+ ctx , details .URI , details .EncryptionOptions , kmsEnv ,
881
+ manifest , execCfg , execCtx .User (),
882
+ ); err != nil {
883
+ return false , err
884
+ }
885
+ return true , nil
886
+ }
887
+
854
888
func init () {
855
889
builtins .StartCompactionJob = StartCompactionJob
856
890
}
0 commit comments