Skip to content

Commit 223eeaa

Browse files
committed
Fix SLM check for restore in progress (elastic#50868)
* Fix SLM check for restore in progress This commit fixes the check in SLM where the `RestoreInProgress` metadata was checked for existence. Rather than check existence we should instead check the `isEmpty` method. Prior to this, a successful restore for a repository that used SLM retention would prevent SLM retention from running in subsequent invocations, due to SLM thinking that a restore was still running.
1 parent 3bac1dc commit 223eeaa

File tree

3 files changed

+88
-5
lines changed

3 files changed

+88
-5
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotRetentionTask.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ public static boolean okayToDeleteSnapshots(ClusterState state) {
464464

465465
// Cannot delete during a restore
466466
final RestoreInProgress restoreInProgress = state.custom(RestoreInProgress.TYPE);
467-
if (restoreInProgress != null) {
467+
if (restoreInProgress != null && restoreInProgress.isEmpty() == false) {
468468
return false;
469469
}
470470

@@ -498,6 +498,7 @@ public void onNewClusterState(ClusterState state) {
498498
logger.debug("retrying SLM snapshot retention deletion after snapshot operation has completed");
499499
reRun.accept(state);
500500
} else {
501+
logger.trace("received new cluster state but a snapshot operation is still running");
501502
observer.waitForNextChange(this);
502503
}
503504
} catch (Exception e) {

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

+73
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
import org.elasticsearch.action.ActionFuture;
1010
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
11+
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotAction;
12+
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest;
13+
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
1114
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
1215
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
1316
import org.elasticsearch.action.index.IndexRequestBuilder;
@@ -22,6 +25,7 @@
2225
import org.elasticsearch.plugins.Plugin;
2326
import org.elasticsearch.repositories.RepositoriesService;
2427
import org.elasticsearch.repositories.RepositoryException;
28+
import org.elasticsearch.rest.RestStatus;
2529
import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
2630
import org.elasticsearch.snapshots.SnapshotInfo;
2731
import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -31,6 +35,7 @@
3135
import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
3236
import org.elasticsearch.xpack.core.XPackSettings;
3337
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
38+
import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
3439
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
3540
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyItem;
3641
import org.elasticsearch.xpack.core.slm.SnapshotRetentionConfiguration;
@@ -415,6 +420,74 @@ private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Ex
415420
}
416421
}
417422

423+
public void testSLMRetentionAfterRestore() throws Exception {
424+
final String indexName = "test";
425+
final String policyName = "test-policy";
426+
int docCount = 20;
427+
for (int i = 0; i < docCount; i++) {
428+
index(indexName, i + "", Collections.singletonMap("foo", "bar"));
429+
}
430+
431+
// Create a snapshot repo
432+
initializeRepo(REPO);
433+
434+
logger.info("--> creating policy {}", policyName);
435+
createSnapshotPolicy(policyName, "snap", NEVER_EXECUTE_CRON_SCHEDULE, REPO, indexName, true, false,
436+
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null));
437+
438+
logger.info("--> executing snapshot lifecycle");
439+
final String snapshotName = executePolicy(policyName);
440+
441+
// Check that the executed snapshot shows up in the SLM output
442+
assertBusy(() -> {
443+
GetSnapshotLifecycleAction.Response getResp =
444+
client().execute(GetSnapshotLifecycleAction.INSTANCE, new GetSnapshotLifecycleAction.Request(policyName)).get();
445+
logger.info("--> checking for in progress snapshot...");
446+
447+
assertThat(getResp.getPolicies().size(), greaterThan(0));
448+
SnapshotLifecyclePolicyItem item = getResp.getPolicies().get(0);
449+
SnapshotInvocationRecord lastSuccess = item.getLastSuccess();
450+
assertNotNull(lastSuccess);
451+
assertThat(lastSuccess.getSnapshotName(), equalTo(snapshotName));
452+
});
453+
454+
logger.info("--> restoring index");
455+
RestoreSnapshotRequest restoreReq = new RestoreSnapshotRequest(REPO, snapshotName);
456+
restoreReq.indices(indexName);
457+
restoreReq.renamePattern("(.+)");
458+
restoreReq.renameReplacement("restored_$1");
459+
restoreReq.waitForCompletion(true);
460+
RestoreSnapshotResponse resp = client().execute(RestoreSnapshotAction.INSTANCE, restoreReq).get();
461+
assertThat(resp.status(), equalTo(RestStatus.OK));
462+
463+
logger.info("--> executing SLM retention");
464+
assertAcked(client().execute(ExecuteSnapshotRetentionAction.INSTANCE, new ExecuteSnapshotRetentionAction.Request()).get());
465+
logger.info("--> waiting for {} snapshot to be deleted", snapshotName);
466+
assertBusy(() -> {
467+
try {
468+
try {
469+
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
470+
.prepareGetSnapshots(REPO).setSnapshots(snapshotName).get();
471+
assertThat(snapshotsStatusResponse.getSnapshots(REPO), empty());
472+
} catch (SnapshotMissingException e) {
473+
// This is what we want to happen
474+
}
475+
logger.info("--> snapshot [{}] has been deleted", snapshotName);
476+
} catch (RepositoryException re) {
477+
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
478+
// TODO: Remove this hack once tracking the current repository generation has been made consistent
479+
throw new AssertionError(re);
480+
}
481+
});
482+
483+
// Cancel/delete the snapshot
484+
try {
485+
client().admin().cluster().prepareDeleteSnapshot(REPO, snapshotName).get();
486+
} catch (SnapshotMissingException e) {
487+
// ignore
488+
}
489+
}
490+
418491
private SnapshotsStatusResponse getSnapshotStatus(String snapshotName) {
419492
try {
420493
return client().admin().cluster().prepareSnapshotStatus(REPO).setSnapshots(snapshotName).get();

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionTaskTests.java

+13-4
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
import static org.hamcrest.Matchers.hasSize;
6868
import static org.hamcrest.Matchers.not;
6969
import static org.mockito.Mockito.mock;
70+
import static org.mockito.Mockito.when;
7071

7172
public class SnapshotRetentionTaskTests extends ESTestCase {
7273

@@ -363,6 +364,14 @@ public void testOkToDeleteSnapshots() {
363364
.build();
364365

365366
assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(false));
367+
368+
restoreInProgress = mock(RestoreInProgress.class);
369+
when(restoreInProgress.isEmpty()).thenReturn(true);
370+
state = ClusterState.builder(new ClusterName("cluster"))
371+
.putCustom(RestoreInProgress.TYPE, restoreInProgress)
372+
.build();
373+
374+
assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(true));
366375
}
367376

368377
public void testSkipWhileStopping() throws Exception {
@@ -420,10 +429,10 @@ private void doTestRunManuallyDuringMode(OperationMode mode) throws Exception {
420429
final String repoId = "repo";
421430
SnapshotLifecyclePolicy policy = new SnapshotLifecyclePolicy(policyId, "snap", "1 * * * * ?",
422431
repoId, null, new SnapshotRetentionConfiguration(TimeValue.timeValueDays(30), null, null));
423-
432+
424433
ClusterState state = createState(mode, policy);
425434
ClusterServiceUtils.setState(clusterService, state);
426-
435+
427436
AtomicBoolean retentionWasRun = new AtomicBoolean(false);
428437
MockSnapshotRetentionTask task = new MockSnapshotRetentionTask(noOpClient, clusterService,
429438
new SnapshotLifecycleTaskTests.VerifyingHistoryStore(noOpClient, ZoneOffset.UTC, (historyItem) -> {
@@ -436,10 +445,10 @@ private void doTestRunManuallyDuringMode(OperationMode mode) throws Exception {
436445
(deletionPolicyId, repo, snapId, slmStats, listener) -> {
437446
},
438447
System::nanoTime);
439-
448+
440449
long time = System.currentTimeMillis();
441450
task.triggered(new SchedulerEngine.Event(SnapshotRetentionService.SLM_RETENTION_MANUAL_JOB_ID, time, time));
442-
451+
443452
assertTrue("retention should be run manually even if SLM is disabled", retentionWasRun.get());
444453
} finally {
445454
threadPool.shutdownNow();

0 commit comments

Comments
 (0)