Skip to content

Commit 2a54e00

Browse files
committed
roachtest/backup_restore: Use a schedule for backup layers.
Using a schedule puts down a protected timestamp, so data the backup still needs cannot be garbage-collected while a previous backup is running. Epic: none Release note: None
1 parent cc4cc90 commit 2a54e00

File tree

2 files changed

+58
-10
lines changed

2 files changed

+58
-10
lines changed

pkg/cmd/roachtest/operations/BUILD.bazel

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ go_library(
4242
"//pkg/testutils/fingerprintutils",
4343
"//pkg/util/hlc",
4444
"//pkg/util/randutil",
45+
"//pkg/util/retry",
4546
"//pkg/util/timeutil",
4647
"@com_github_cockroachdb_errors//:errors",
4748
"@com_github_lib_pq//oid",

pkg/cmd/roachtest/operations/backup_restore.go

+57-10
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/cockroachdb/cockroach/pkg/testutils/fingerprintutils"
2323
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2424
"github.com/cockroachdb/cockroach/pkg/util/randutil"
25+
"github.com/cockroachdb/cockroach/pkg/util/retry"
2526
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2627
)
2728

@@ -74,27 +75,73 @@ outer:
7475
return nil
7576
}
7677

77-
o.Status(fmt.Sprintf("backing db %s (full)", dbName))
78+
o.Status(fmt.Sprintf("creating backup schedule for db %s", dbName))
7879
bucket := fmt.Sprintf("gs://%s/operation-backup-restore/%d/?AUTH=implicit", testutils.BackupTestingBucket(), timeutil.Now().UnixNano())
7980

80-
backupTS := hlc.Timestamp{WallTime: timeutil.Now().Add(-10 * time.Second).UTC().UnixNano()}
81+
var backupTS *hlc.Timestamp
8182

8283
if !online {
83-
_, err = conn.ExecContext(ctx, fmt.Sprintf("BACKUP DATABASE %s INTO '%s' AS OF SYSTEM TIME '%s' WITH revision_history", dbName, bucket, backupTS.AsOfSystemTime()))
84+
// Back up by creating a schedule, to lay down a protected timestamp.
85+
// Then take 1 full and 24 incrementals, as rapidly as possible - that is, one every minute.
86+
_, err = conn.ExecContext(
87+
ctx, fmt.Sprintf(
88+
"CREATE SCHEDULE IF NOT EXISTS backup_restore_operation FOR BACKUP DATABASE %s INTO '%s' WITH revision_history RECURRING '* * * * *' FULL BACKUP '@weekly' with schedule options first_run='now'", dbName, bucket))
8489
if err != nil {
8590
o.Fatal(err)
8691
}
87-
for i := range 24 {
88-
o.Status(fmt.Sprintf("backing up db %s (incremental layer %d)", dbName, i))
89-
// Update backupTS to match the latest layer.
90-
backupTS = hlc.Timestamp{WallTime: timeutil.Now().Add(-10 * time.Second).UTC().UnixNano()}
91-
_, err = conn.ExecContext(ctx, fmt.Sprintf("BACKUP DATABASE %s INTO LATEST IN '%s' AS OF SYSTEM TIME '%s' WITH revision_history", dbName, bucket, backupTS.AsOfSystemTime()))
92+
defer func() {
93+
_, _ = conn.Exec(fmt.Sprintf("DROP SCHEDULES WITH x AS (SHOW SCHEDULES) SELECT id FROM x WHERE label = 'backup_restore_operation';"))
94+
}()
95+
96+
retryOpts := retry.Options{
97+
InitialBackoff: 1 * time.Minute,
98+
MaxBackoff: 1 * time.Minute,
99+
MaxRetries: 24 * 60, // 24 Hours
100+
}
101+
102+
var endTime time.Time
103+
104+
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
105+
// Take only the latest collection, if >1 exist.
106+
rows, err := conn.QueryContext(ctx, "WITH t AS (SHOW BACKUPS IN $1) SELECT * FROM t ORDER BY path DESC LIMIT 1", bucket)
92107
if err != nil {
93108
o.Fatal(err)
94109
}
110+
111+
path := ""
112+
if !rows.Next() {
113+
o.Status("no backups found, retrying")
114+
continue
115+
}
116+
if err := rows.Scan(&path); err != nil {
117+
o.Fatal(err)
118+
}
119+
120+
res := conn.QueryRowContext(ctx, "WITH t AS (SHOW BACKUP $1 in $2) SELECT COUNT(*) FROM t WHERE t.object_type='database' AND t.object_name=$3", path, bucket, dbName)
121+
var count int
122+
if err := res.Scan(&count); err != nil {
123+
o.Fatal(err)
124+
}
125+
126+
if count < 25 {
127+
o.Status(fmt.Sprintf("found %d layers, need 25", count))
128+
continue
129+
}
130+
o.Status("found 25 layers, proceeding")
131+
132+
res = conn.QueryRowContext(ctx, "WITH t AS (SHOW BACKUP $1 in $2) SELECT end_time FROM t WHERE t.object_type='database' ORDER BY end_time DESC LIMIT 1", path, bucket)
133+
if err := res.Scan(&endTime); err != nil {
134+
o.Fatal(err)
135+
}
136+
137+
backupTS = &hlc.Timestamp{WallTime: endTime.UTC().UnixNano()}
138+
break
95139
}
140+
96141
} else {
97-
// Revision history doesn't work with online restore.
142+
// Revision history and incrementals don't work with online restore.
143+
backupTS = &hlc.Timestamp{WallTime: timeutil.Now().Add(-10 * time.Second).UTC().UnixNano()}
144+
98145
_, err = conn.ExecContext(ctx, fmt.Sprintf("BACKUP DATABASE %s INTO '%s' AS OF SYSTEM TIME '%s'", dbName, bucket, backupTS.AsOfSystemTime()))
99146
if err != nil {
100147
o.Fatal(err)
@@ -136,7 +183,7 @@ outer:
136183

137184
if validate {
138185
o.Status(fmt.Sprintf("verifying db %s matches %s", dbName, restoreDBName))
139-
sourceFingerprints, err := fingerprintutils.FingerprintDatabase(ctx, conn, dbName, fingerprintutils.AOST(backupTS), fingerprintutils.Stripped())
186+
sourceFingerprints, err := fingerprintutils.FingerprintDatabase(ctx, conn, dbName, fingerprintutils.AOST(*backupTS), fingerprintutils.Stripped())
140187
if err != nil {
141188
o.Fatal(err)
142189
}

0 commit comments

Comments
 (0)