@@ -40,6 +40,7 @@ import (
40
40
"github.com/cockroachdb/cockroach/pkg/util/hlc"
41
41
"github.com/cockroachdb/cockroach/pkg/util/log"
42
42
"github.com/cockroachdb/cockroach/pkg/util/retry"
43
+ "github.com/cockroachdb/cockroach/pkg/util/timeutil"
43
44
"github.com/cockroachdb/cockroach/pkg/util/uuid"
44
45
"github.com/cockroachdb/errors"
45
46
"github.com/gogo/protobuf/types"
@@ -231,8 +232,8 @@ func (b *backupResumer) ResumeCompaction(
231
232
232
233
var backupManifest * backuppb.BackupManifest
233
234
updatedDetails := initialDetails
235
+ testingKnobs := execCtx .ExecCfg ().BackupRestoreTestingKnobs
234
236
if initialDetails .URI == "" {
235
- testingKnobs := execCtx .ExecCfg ().BackupRestoreTestingKnobs
236
237
if testingKnobs != nil && testingKnobs .RunBeforeResolvingCompactionDest != nil {
237
238
if err := testingKnobs .RunBeforeResolvingCompactionDest (); err != nil {
238
239
return err
@@ -263,7 +264,7 @@ func (b *backupResumer) ResumeCompaction(
263
264
return err
264
265
}
265
266
266
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup .before.write_first_checkpoint" ); err != nil {
267
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction .before.write_first_checkpoint" ); err != nil {
267
268
return err
268
269
}
269
270
@@ -274,7 +275,7 @@ func (b *backupResumer) ResumeCompaction(
274
275
return err
275
276
}
276
277
277
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup .after.write_first_checkpoint" ); err != nil {
278
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction .after.write_first_checkpoint" ); err != nil {
278
279
return err
279
280
}
280
281
@@ -297,7 +298,7 @@ func (b *backupResumer) ResumeCompaction(
297
298
return err
298
299
}
299
300
300
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup .after.details_has_checkpoint" ); err != nil {
301
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction .after.details_has_checkpoint" ); err != nil {
301
302
return err
302
303
}
303
304
// TODO (kev-cao): Add telemetry for backup compactions.
@@ -347,6 +348,10 @@ func (b *backupResumer) ResumeCompaction(
347
348
}
348
349
}
349
350
351
+ if testingKnobs != nil && testingKnobs .AfterLoadingCompactionManifestOnResume != nil {
352
+ testingKnobs .AfterLoadingCompactionManifestOnResume (backupManifest )
353
+ }
354
+
350
355
// We retry on pretty generic failures -- any rpc error. If a worker node were
351
356
// to restart, it would produce this kind of error, but there may be other
352
357
// errors that are also rpc errors. Don't retry too aggressively.
@@ -355,12 +360,11 @@ func (b *backupResumer) ResumeCompaction(
355
360
MaxRetries : 5 ,
356
361
}
357
362
358
- if execCtx .ExecCfg ().BackupRestoreTestingKnobs != nil &&
359
- execCtx .ExecCfg ().BackupRestoreTestingKnobs .BackupDistSQLRetryPolicy != nil {
360
- retryOpts = * execCtx .ExecCfg ().BackupRestoreTestingKnobs .BackupDistSQLRetryPolicy
363
+ if testingKnobs != nil && testingKnobs .BackupDistSQLRetryPolicy != nil {
364
+ retryOpts = * testingKnobs .BackupDistSQLRetryPolicy
361
365
}
362
366
363
- if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup .before.flow" ); err != nil {
367
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction .before.flow" ); err != nil {
364
368
return err
365
369
}
366
370
@@ -389,8 +393,6 @@ func (b *backupResumer) ResumeCompaction(
389
393
390
394
// Reload the backup manifest to pick up any spans we may have completed on
391
395
// previous attempts.
392
- // TODO (kev-cao): Compactions currently do not create checkpoints, but this
393
- // can be used to reload the manifest once we add checkpointing.
394
396
var reloadBackupErr error
395
397
mem .Shrink (ctx , memSize )
396
398
backupManifest , memSize , reloadBackupErr = b .readManifestOnResume (ctx , & mem , execCtx .ExecCfg (),
@@ -765,9 +767,13 @@ func concludeBackupCompaction(
765
767
// the associated manifest.
766
768
func processProgress (
767
769
ctx context.Context ,
770
+ execCtx sql.JobExecContext ,
771
+ details jobspb.BackupDetails ,
768
772
manifest * backuppb.BackupManifest ,
769
773
progCh <- chan * execinfrapb.RemoteProducerMetadata_BulkProcessorProgress ,
774
+ kmsEnv cloud.KMSEnv ,
770
775
) error {
776
+ var lastCheckpointTime time.Time
771
777
// When a processor is done exporting a span, it will send a progress update
772
778
// to progCh.
773
779
for progress := range progCh {
@@ -780,11 +786,24 @@ func processProgress(
780
786
manifest .Files = append (manifest .Files , file )
781
787
manifest .EntryCounts .Add (file .EntryCounts )
782
788
}
789
+
783
790
// TODO (kev-cao): Add per node progress updates.
791
+
792
+ if wroteCheckpoint , err := maybeWriteBackupCheckpoint (
793
+ ctx , execCtx , details , manifest , lastCheckpointTime , kmsEnv ,
794
+ ); err != nil {
795
+ log .Errorf (ctx , "unable to checkpoint compaction: %+v" , err )
796
+ } else if wroteCheckpoint {
797
+ lastCheckpointTime = timeutil .Now ()
798
+ if err := execCtx .ExecCfg ().JobRegistry .CheckPausepoint ("backup_compaction.after.write_checkpoint" ); err != nil {
799
+ return err
800
+ }
801
+ }
784
802
}
785
803
return nil
786
804
}
787
805
806
+ // compactionJobDescription generates a redacted description of the job.
788
807
func compactionJobDescription (details jobspb.BackupDetails ) (string , error ) {
789
808
fmtCtx := tree .NewFmtCtx (tree .FmtSimple )
790
809
redactedURIs , err := sanitizeURIList (details .Destination .To )
@@ -835,8 +854,7 @@ func doCompaction(
835
854
)
836
855
}
837
856
checkpointLoop := func (ctx context.Context ) error {
838
- // TODO (kev-cao): Add logic for checkpointing during loop.
839
- return processProgress (ctx , manifest , progCh )
857
+ return processProgress (ctx , execCtx , details , manifest , progCh , kmsEnv )
840
858
}
841
859
// TODO (kev-cao): Add trace aggregator loop.
842
860
@@ -851,6 +869,34 @@ func doCompaction(
851
869
)
852
870
}
853
871
872
+ // maybeWriteBackupCheckpoint writes a checkpoint for the backup if
873
+ // the time since the last checkpoint exceeds the configured interval. If a
874
+ // checkpoint is written, the function returns true.
875
+ func maybeWriteBackupCheckpoint (
876
+ ctx context.Context ,
877
+ execCtx sql.JobExecContext ,
878
+ details jobspb.BackupDetails ,
879
+ manifest * backuppb.BackupManifest ,
880
+ lastCheckpointTime time.Time ,
881
+ kmsEnv cloud.KMSEnv ,
882
+ ) (bool , error ) {
883
+ if details .URI == "" {
884
+ return false , errors .New ("backup details does not contain a default URI" )
885
+ }
886
+ execCfg := execCtx .ExecCfg ()
887
+ interval := BackupCheckpointInterval .Get (& execCfg .Settings .SV )
888
+ if timeutil .Since (lastCheckpointTime ) < interval {
889
+ return false , nil
890
+ }
891
+ if err := backupinfo .WriteBackupManifestCheckpoint (
892
+ ctx , details .URI , details .EncryptionOptions , kmsEnv ,
893
+ manifest , execCfg , execCtx .User (),
894
+ ); err != nil {
895
+ return false , err
896
+ }
897
+ return true , nil
898
+ }
899
+
854
900
func init () {
855
901
builtins .StartCompactionJob = StartCompactionJob
856
902
}
0 commit comments