Skip to content

Commit 325d2ed

Browse files
committed
Handle failure to retrieve ILM policy step better
This commit wraps the calls to retrieve the current step in a try/catch so that the exception does not bubble up. Instead, step info is added containing the exception to the existing step. Semi-related to #49128
1 parent 74a9407 commit 325d2ed

File tree

2 files changed

+110
-8
lines changed

2 files changed

+110
-8
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java

+31-8
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,14 @@ boolean isReadyToTransitionToThisPhase(final String policy, final IndexMetaData
104104
public void runPeriodicStep(String policy, IndexMetaData indexMetaData) {
105105
String index = indexMetaData.getIndex().getName();
106106
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
107-
Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
107+
final Step currentStep;
108+
try {
109+
currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
110+
} catch (Exception e) {
111+
markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e);
112+
return;
113+
}
114+
108115
if (currentStep == null) {
109116
if (stepRegistry.policyExists(policy) == false) {
110117
markPolicyDoesNotExist(policy, indexMetaData.getIndex(), lifecycleState);
@@ -194,7 +201,13 @@ public void onFailure(String source, Exception e) {
194201
public void maybeRunAsyncAction(ClusterState currentState, IndexMetaData indexMetaData, String policy, StepKey expectedStepKey) {
195202
String index = indexMetaData.getIndex().getName();
196203
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
197-
Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
204+
final Step currentStep;
205+
try {
206+
currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
207+
} catch (Exception e) {
208+
markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e);
209+
return;
210+
}
198211
if (currentStep == null) {
199212
logger.warn("current step [{}] for index [{}] with policy [{}] is not recognized",
200213
getCurrentStepKey(lifecycleState), index, policy);
@@ -237,7 +250,13 @@ public void onFailure(Exception e) {
237250
public void runPolicyAfterStateChange(String policy, IndexMetaData indexMetaData) {
238251
String index = indexMetaData.getIndex().getName();
239252
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
240-
Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
253+
final Step currentStep;
254+
try {
255+
currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
256+
} catch (Exception e) {
257+
markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e);
258+
return;
259+
}
241260
if (currentStep == null) {
242261
if (stepRegistry.policyExists(policy) == false) {
243262
markPolicyDoesNotExist(policy, indexMetaData.getIndex(), lifecycleState);
@@ -596,10 +615,14 @@ private static IndexMetaData.Builder removePolicyForIndex(IndexMetaData indexMet
596615
}
597616

598617
private void markPolicyDoesNotExist(String policyName, Index index, LifecycleExecutionState executionState) {
599-
logger.debug("policy [{}] for index [{}] does not exist, recording this in step_info for this index",
600-
policyName, index.getName());
601-
setStepInfo(index, policyName, getCurrentStepKey(executionState),
602-
new SetStepInfoUpdateTask.ExceptionWrapper(
603-
new IllegalArgumentException("policy [" + policyName + "] does not exist")));
618+
markPolicyRetrievalError(policyName, index, executionState,
619+
new IllegalArgumentException("policy [" + policyName + "] does not exist"));
620+
}
621+
622+
private void markPolicyRetrievalError(String policyName, Index index, LifecycleExecutionState executionState, Exception e) {
623+
logger.debug(
624+
new ParameterizedMessage("unable to retrieve policy [{}] for index [{}], recording this in step_info for this index",
625+
policyName, index.getName()), e);
626+
setStepInfo(index, policyName, getCurrentStepKey(executionState), new SetStepInfoUpdateTask.ExceptionWrapper(e));
604627
}
605628
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java

+79
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
import java.util.TreeMap;
7373
import java.util.concurrent.CountDownLatch;
7474
import java.util.concurrent.TimeUnit;
75+
import java.util.concurrent.atomic.AtomicBoolean;
7576
import java.util.concurrent.atomic.AtomicLong;
7677
import java.util.function.BiFunction;
7778
import java.util.function.Function;
@@ -356,6 +357,84 @@ public void testRunStateChangePolicyWithNextStep() throws Exception {
356357
threadPool.shutdownNow();
357358
}
358359

360+
public void testRunPeriodicPolicyWithFailureToReadPolicy() throws Exception {
361+
doTestRunPolicyWithFailureToReadPolicy(false, true);
362+
}
363+
364+
public void testRunStateChangePolicyWithFailureToReadPolicy() throws Exception {
365+
doTestRunPolicyWithFailureToReadPolicy(false, false);
366+
}
367+
368+
public void testRunAsyncActionPolicyWithFailureToReadPolicy() throws Exception {
369+
doTestRunPolicyWithFailureToReadPolicy(true, false);
370+
}
371+
372+
public void doTestRunPolicyWithFailureToReadPolicy(boolean asyncAction, boolean periodicAction) throws Exception {
373+
String policyName = "foo";
374+
StepKey stepKey = new StepKey("phase", "action", "cluster_state_action_step");
375+
StepKey nextStepKey = new StepKey("phase", "action", "next_cluster_state_action_step");
376+
MockClusterStateActionStep step = new MockClusterStateActionStep(stepKey, nextStepKey);
377+
MockClusterStateActionStep nextStep = new MockClusterStateActionStep(nextStepKey, null);
378+
MockPolicyStepsRegistry stepRegistry = createOneStepPolicyStepRegistry(policyName, step);
379+
AtomicBoolean resolved = new AtomicBoolean(false);
380+
stepRegistry.setResolver((i, k) -> {
381+
resolved.set(true);
382+
throw new IllegalArgumentException("fake failure retrieving step");
383+
});
384+
ThreadPool threadPool = new TestThreadPool("name");
385+
LifecycleExecutionState les = LifecycleExecutionState.builder()
386+
.setPhase("phase")
387+
.setAction("action")
388+
.setStep("cluster_state_action_step")
389+
.build();
390+
IndexMetaData indexMetaData = IndexMetaData.builder("test")
391+
.settings(Settings.builder()
392+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
393+
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
394+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)
395+
.put(LifecycleSettings.LIFECYCLE_NAME, policyName))
396+
.putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, les.asMap())
397+
.build();
398+
ClusterService clusterService = ClusterServiceUtils.createClusterService(threadPool);
399+
DiscoveryNode node = clusterService.localNode();
400+
IndexLifecycleMetadata ilm = new IndexLifecycleMetadata(Collections.emptyMap(), OperationMode.RUNNING);
401+
ClusterState state = ClusterState.builder(new ClusterName("cluster"))
402+
.metaData(MetaData.builder()
403+
.put(indexMetaData, true)
404+
.putCustom(IndexLifecycleMetadata.TYPE, ilm))
405+
.nodes(DiscoveryNodes.builder()
406+
.add(node)
407+
.masterNodeId(node.getId())
408+
.localNodeId(node.getId()))
409+
.build();
410+
ClusterServiceUtils.setState(clusterService, state);
411+
long stepTime = randomLong();
412+
IndexLifecycleRunner runner = new IndexLifecycleRunner(stepRegistry, clusterService, threadPool, () -> stepTime);
413+
414+
ClusterState before = clusterService.state();
415+
if (asyncAction) {
416+
runner.maybeRunAsyncAction(before, indexMetaData, policyName, stepKey);
417+
} else if (periodicAction) {
418+
runner.runPeriodicStep(policyName, indexMetaData);
419+
} else {
420+
runner.runPolicyAfterStateChange(policyName, indexMetaData);
421+
}
422+
423+
// The cluster state can take a few extra milliseconds to update after the steps are executed
424+
assertBusy(() -> assertNotEquals(before, clusterService.state()));
425+
LifecycleExecutionState newExecutionState = LifecycleExecutionState
426+
.fromIndexMetadata(clusterService.state().metaData().index(indexMetaData.getIndex()));
427+
assertThat(newExecutionState.getPhase(), equalTo("phase"));
428+
assertThat(newExecutionState.getAction(), equalTo("action"));
429+
assertThat(newExecutionState.getStep(), equalTo("cluster_state_action_step"));
430+
assertThat(step.getExecuteCount(), equalTo(0L));
431+
assertThat(nextStep.getExecuteCount(), equalTo(0L));
432+
assertThat(newExecutionState.getStepInfo(),
433+
containsString("{\"type\":\"illegal_argument_exception\",\"reason\":\"fake failure retrieving step\"}"));
434+
clusterService.close();
435+
threadPool.shutdownNow();
436+
}
437+
359438
public void testRunAsyncActionDoesNotRun() {
360439
String policyName = "foo";
361440
StepKey stepKey = new StepKey("phase", "action", "async_action_step");

0 commit comments

Comments
 (0)