Skip to content

Commit c83aceb

Browse files
authored
Fix SLM check for restore in progress (#50868)
* Fix SLM check for restore in progress This commit fixes the check in SLM where the `RestoreInProgress` metadata was checked for existence. Rather than check existence we should instead check the `isEmpty` method. Prior to this, a successful restore for a repository that used SLM retention would prevent SLM retention from running in subsequent invocations, due to SLM thinking that a restore was still running.
1 parent 5a76fac commit c83aceb

File tree

3 files changed

+88
-5
lines changed

3 files changed

+88
-5
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotRetentionTask.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ public static boolean okayToDeleteSnapshots(ClusterState state) {
446446

447447
// Cannot delete during a restore
448448
final RestoreInProgress restoreInProgress = state.custom(RestoreInProgress.TYPE);
449-
if (restoreInProgress != null) {
449+
if (restoreInProgress != null && restoreInProgress.isEmpty() == false) {
450450
return false;
451451
}
452452

@@ -480,6 +480,7 @@ public void onNewClusterState(ClusterState state) {
480480
logger.debug("retrying SLM snapshot retention deletion after snapshot operation has completed");
481481
reRun.accept(state);
482482
} else {
483+
logger.trace("received new cluster state but a snapshot operation is still running");
483484
observer.waitForNextChange(this);
484485
}
485486
} catch (Exception e) {

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

+73
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
import org.elasticsearch.action.ActionFuture;
1010
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
11+
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotAction;
12+
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest;
13+
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
1114
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
1215
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
1316
import org.elasticsearch.action.index.IndexRequestBuilder;
@@ -22,6 +25,7 @@
2225
import org.elasticsearch.plugins.Plugin;
2326
import org.elasticsearch.repositories.RepositoriesService;
2427
import org.elasticsearch.repositories.RepositoryException;
28+
import org.elasticsearch.rest.RestStatus;
2529
import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
2630
import org.elasticsearch.snapshots.SnapshotInfo;
2731
import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -30,6 +34,7 @@
3034
import org.elasticsearch.test.ESIntegTestCase;
3135
import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
3236
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
37+
import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
3338
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
3439
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyItem;
3540
import org.elasticsearch.xpack.core.slm.SnapshotRetentionConfiguration;
@@ -385,6 +390,74 @@ private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Ex
385390
}
386391
}
387392

393+
public void testSLMRetentionAfterRestore() throws Exception {
394+
final String indexName = "test";
395+
final String policyName = "test-policy";
396+
int docCount = 20;
397+
for (int i = 0; i < docCount; i++) {
398+
index(indexName, i + "", Collections.singletonMap("foo", "bar"));
399+
}
400+
401+
// Create a snapshot repo
402+
initializeRepo(REPO);
403+
404+
logger.info("--> creating policy {}", policyName);
405+
createSnapshotPolicy(policyName, "snap", NEVER_EXECUTE_CRON_SCHEDULE, REPO, indexName, true, false,
406+
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null));
407+
408+
logger.info("--> executing snapshot lifecycle");
409+
final String snapshotName = executePolicy(policyName);
410+
411+
// Check that the executed snapshot shows up in the SLM output
412+
assertBusy(() -> {
413+
GetSnapshotLifecycleAction.Response getResp =
414+
client().execute(GetSnapshotLifecycleAction.INSTANCE, new GetSnapshotLifecycleAction.Request(policyName)).get();
415+
logger.info("--> checking for in progress snapshot...");
416+
417+
assertThat(getResp.getPolicies().size(), greaterThan(0));
418+
SnapshotLifecyclePolicyItem item = getResp.getPolicies().get(0);
419+
SnapshotInvocationRecord lastSuccess = item.getLastSuccess();
420+
assertNotNull(lastSuccess);
421+
assertThat(lastSuccess.getSnapshotName(), equalTo(snapshotName));
422+
});
423+
424+
logger.info("--> restoring index");
425+
RestoreSnapshotRequest restoreReq = new RestoreSnapshotRequest(REPO, snapshotName);
426+
restoreReq.indices(indexName);
427+
restoreReq.renamePattern("(.+)");
428+
restoreReq.renameReplacement("restored_$1");
429+
restoreReq.waitForCompletion(true);
430+
RestoreSnapshotResponse resp = client().execute(RestoreSnapshotAction.INSTANCE, restoreReq).get();
431+
assertThat(resp.status(), equalTo(RestStatus.OK));
432+
433+
logger.info("--> executing SLM retention");
434+
assertAcked(client().execute(ExecuteSnapshotRetentionAction.INSTANCE, new ExecuteSnapshotRetentionAction.Request()).get());
435+
logger.info("--> waiting for {} snapshot to be deleted", snapshotName);
436+
assertBusy(() -> {
437+
try {
438+
try {
439+
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
440+
.prepareGetSnapshots(REPO).setSnapshots(snapshotName).get();
441+
assertThat(snapshotsStatusResponse.getSnapshots(REPO), empty());
442+
} catch (SnapshotMissingException e) {
443+
// This is what we want to happen
444+
}
445+
logger.info("--> snapshot [{}] has been deleted", snapshotName);
446+
} catch (RepositoryException re) {
447+
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
448+
// TODO: Remove this hack once tracking the current repository generation has been made consistent
449+
throw new AssertionError(re);
450+
}
451+
});
452+
453+
// Cancel/delete the snapshot
454+
try {
455+
client().admin().cluster().prepareDeleteSnapshot(REPO, snapshotName).get();
456+
} catch (SnapshotMissingException e) {
457+
// ignore
458+
}
459+
}
460+
388461
private SnapshotsStatusResponse getSnapshotStatus(String snapshotName) {
389462
try {
390463
return client().admin().cluster().prepareSnapshotStatus(REPO).setSnapshots(snapshotName).get();

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionTaskTests.java

+13-4
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
import static org.hamcrest.Matchers.hasSize;
6868
import static org.hamcrest.Matchers.not;
6969
import static org.mockito.Mockito.mock;
70+
import static org.mockito.Mockito.when;
7071

7172
public class SnapshotRetentionTaskTests extends ESTestCase {
7273

@@ -363,6 +364,14 @@ public void testOkToDeleteSnapshots() {
363364
.build();
364365

365366
assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(false));
367+
368+
restoreInProgress = mock(RestoreInProgress.class);
369+
when(restoreInProgress.isEmpty()).thenReturn(true);
370+
state = ClusterState.builder(new ClusterName("cluster"))
371+
.putCustom(RestoreInProgress.TYPE, restoreInProgress)
372+
.build();
373+
374+
assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(true));
366375
}
367376

368377
public void testSkipWhileStopping() throws Exception {
@@ -420,10 +429,10 @@ private void doTestRunManuallyDuringMode(OperationMode mode) throws Exception {
420429
final String repoId = "repo";
421430
SnapshotLifecyclePolicy policy = new SnapshotLifecyclePolicy(policyId, "snap", "1 * * * * ?",
422431
repoId, null, new SnapshotRetentionConfiguration(TimeValue.timeValueDays(30), null, null));
423-
432+
424433
ClusterState state = createState(mode, policy);
425434
ClusterServiceUtils.setState(clusterService, state);
426-
435+
427436
AtomicBoolean retentionWasRun = new AtomicBoolean(false);
428437
MockSnapshotRetentionTask task = new MockSnapshotRetentionTask(noOpClient, clusterService,
429438
new SnapshotLifecycleTaskTests.VerifyingHistoryStore(noOpClient, ZoneOffset.UTC, (historyItem) -> {
@@ -436,10 +445,10 @@ private void doTestRunManuallyDuringMode(OperationMode mode) throws Exception {
436445
(deletionPolicyId, repo, snapId, slmStats, listener) -> {
437446
},
438447
System::nanoTime);
439-
448+
440449
long time = System.currentTimeMillis();
441450
task.triggered(new SchedulerEngine.Event(SnapshotRetentionService.SLM_RETENTION_MANUAL_JOB_ID, time, time));
442-
451+
443452
assertTrue("retention should be run manually even if SLM is disabled", retentionWasRun.get());
444453
} finally {
445454
threadPool.shutdownNow();

0 commit comments

Comments
 (0)