Skip to content

Commit d2faab1

Browse files
authored
Handle failure to retrieve ILM policy step better (#49193) (#49317)
This commit wraps the calls that retrieve the current step in a try/catch so that an exception no longer bubbles up. Instead, step info containing the exception is recorded on the existing step. Semi-related to #49128. (Cherry picked from commit 72530f8.) Signed-off-by: Andrei Dan <[email protected]>
1 parent 2df55e0 commit d2faab1

File tree

2 files changed

+110
-8
lines changed

2 files changed

+110
-8
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java

+31-8
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,14 @@ boolean isReadyToTransitionToThisPhase(final String policy, final IndexMetaData
102102
public void runPeriodicStep(String policy, IndexMetaData indexMetaData) {
103103
String index = indexMetaData.getIndex().getName();
104104
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
105-
Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
105+
final Step currentStep;
106+
try {
107+
currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
108+
} catch (Exception e) {
109+
markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e);
110+
return;
111+
}
112+
106113
if (currentStep == null) {
107114
if (stepRegistry.policyExists(policy) == false) {
108115
markPolicyDoesNotExist(policy, indexMetaData.getIndex(), lifecycleState);
@@ -160,7 +167,13 @@ public void onFailure(Exception e) {
160167
public void maybeRunAsyncAction(ClusterState currentState, IndexMetaData indexMetaData, String policy, StepKey expectedStepKey) {
161168
String index = indexMetaData.getIndex().getName();
162169
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
163-
Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
170+
final Step currentStep;
171+
try {
172+
currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
173+
} catch (Exception e) {
174+
markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e);
175+
return;
176+
}
164177
if (currentStep == null) {
165178
logger.warn("current step [{}] for index [{}] with policy [{}] is not recognized",
166179
getCurrentStepKey(lifecycleState), index, policy);
@@ -203,7 +216,13 @@ public void onFailure(Exception e) {
203216
public void runPolicyAfterStateChange(String policy, IndexMetaData indexMetaData) {
204217
String index = indexMetaData.getIndex().getName();
205218
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
206-
Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
219+
final Step currentStep;
220+
try {
221+
currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState);
222+
} catch (Exception e) {
223+
markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e);
224+
return;
225+
}
207226
if (currentStep == null) {
208227
if (stepRegistry.policyExists(policy) == false) {
209228
markPolicyDoesNotExist(policy, indexMetaData.getIndex(), lifecycleState);
@@ -521,10 +540,14 @@ private static IndexMetaData.Builder removePolicyForIndex(IndexMetaData indexMet
521540
}
522541

523542
private void markPolicyDoesNotExist(String policyName, Index index, LifecycleExecutionState executionState) {
524-
logger.debug("policy [{}] for index [{}] does not exist, recording this in step_info for this index",
525-
policyName, index.getName());
526-
setStepInfo(index, policyName, getCurrentStepKey(executionState),
527-
new SetStepInfoUpdateTask.ExceptionWrapper(
528-
new IllegalArgumentException("policy [" + policyName + "] does not exist")));
543+
markPolicyRetrievalError(policyName, index, executionState,
544+
new IllegalArgumentException("policy [" + policyName + "] does not exist"));
545+
}
546+
547+
private void markPolicyRetrievalError(String policyName, Index index, LifecycleExecutionState executionState, Exception e) {
548+
logger.debug(
549+
new ParameterizedMessage("unable to retrieve policy [{}] for index [{}], recording this in step_info for this index",
550+
policyName, index.getName()), e);
551+
setStepInfo(index, policyName, getCurrentStepKey(executionState), new SetStepInfoUpdateTask.ExceptionWrapper(e));
529552
}
530553
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java

+79
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
import java.util.TreeMap;
7070
import java.util.concurrent.CountDownLatch;
7171
import java.util.concurrent.TimeUnit;
72+
import java.util.concurrent.atomic.AtomicBoolean;
7273
import java.util.concurrent.atomic.AtomicLong;
7374
import java.util.function.BiFunction;
7475
import java.util.function.Function;
@@ -295,6 +296,84 @@ public void testRunStateChangePolicyWithNextStep() throws Exception {
295296
threadPool.shutdownNow();
296297
}
297298

299+
public void testRunPeriodicPolicyWithFailureToReadPolicy() throws Exception {
300+
doTestRunPolicyWithFailureToReadPolicy(false, true);
301+
}
302+
303+
public void testRunStateChangePolicyWithFailureToReadPolicy() throws Exception {
304+
doTestRunPolicyWithFailureToReadPolicy(false, false);
305+
}
306+
307+
public void testRunAsyncActionPolicyWithFailureToReadPolicy() throws Exception {
308+
doTestRunPolicyWithFailureToReadPolicy(true, false);
309+
}
310+
311+
public void doTestRunPolicyWithFailureToReadPolicy(boolean asyncAction, boolean periodicAction) throws Exception {
312+
String policyName = "foo";
313+
StepKey stepKey = new StepKey("phase", "action", "cluster_state_action_step");
314+
StepKey nextStepKey = new StepKey("phase", "action", "next_cluster_state_action_step");
315+
MockClusterStateActionStep step = new MockClusterStateActionStep(stepKey, nextStepKey);
316+
MockClusterStateActionStep nextStep = new MockClusterStateActionStep(nextStepKey, null);
317+
MockPolicyStepsRegistry stepRegistry = createOneStepPolicyStepRegistry(policyName, step);
318+
AtomicBoolean resolved = new AtomicBoolean(false);
319+
stepRegistry.setResolver((i, k) -> {
320+
resolved.set(true);
321+
throw new IllegalArgumentException("fake failure retrieving step");
322+
});
323+
ThreadPool threadPool = new TestThreadPool("name");
324+
LifecycleExecutionState les = LifecycleExecutionState.builder()
325+
.setPhase("phase")
326+
.setAction("action")
327+
.setStep("cluster_state_action_step")
328+
.build();
329+
IndexMetaData indexMetaData = IndexMetaData.builder("test")
330+
.settings(Settings.builder()
331+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
332+
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
333+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)
334+
.put(LifecycleSettings.LIFECYCLE_NAME, policyName))
335+
.putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, les.asMap())
336+
.build();
337+
ClusterService clusterService = ClusterServiceUtils.createClusterService(threadPool);
338+
DiscoveryNode node = clusterService.localNode();
339+
IndexLifecycleMetadata ilm = new IndexLifecycleMetadata(Collections.emptyMap(), OperationMode.RUNNING);
340+
ClusterState state = ClusterState.builder(new ClusterName("cluster"))
341+
.metaData(MetaData.builder()
342+
.put(indexMetaData, true)
343+
.putCustom(IndexLifecycleMetadata.TYPE, ilm))
344+
.nodes(DiscoveryNodes.builder()
345+
.add(node)
346+
.masterNodeId(node.getId())
347+
.localNodeId(node.getId()))
348+
.build();
349+
ClusterServiceUtils.setState(clusterService, state);
350+
long stepTime = randomLong();
351+
IndexLifecycleRunner runner = new IndexLifecycleRunner(stepRegistry, clusterService, threadPool, () -> stepTime);
352+
353+
ClusterState before = clusterService.state();
354+
if (asyncAction) {
355+
runner.maybeRunAsyncAction(before, indexMetaData, policyName, stepKey);
356+
} else if (periodicAction) {
357+
runner.runPeriodicStep(policyName, indexMetaData);
358+
} else {
359+
runner.runPolicyAfterStateChange(policyName, indexMetaData);
360+
}
361+
362+
// The cluster state can take a few extra milliseconds to update after the steps are executed
363+
assertBusy(() -> assertNotEquals(before, clusterService.state()));
364+
LifecycleExecutionState newExecutionState = LifecycleExecutionState
365+
.fromIndexMetadata(clusterService.state().metaData().index(indexMetaData.getIndex()));
366+
assertThat(newExecutionState.getPhase(), equalTo("phase"));
367+
assertThat(newExecutionState.getAction(), equalTo("action"));
368+
assertThat(newExecutionState.getStep(), equalTo("cluster_state_action_step"));
369+
assertThat(step.getExecuteCount(), equalTo(0L));
370+
assertThat(nextStep.getExecuteCount(), equalTo(0L));
371+
assertThat(newExecutionState.getStepInfo(),
372+
containsString("{\"type\":\"illegal_argument_exception\",\"reason\":\"fake failure retrieving step\"}"));
373+
clusterService.close();
374+
threadPool.shutdownNow();
375+
}
376+
298377
public void testRunAsyncActionDoesNotRun() {
299378
String policyName = "foo";
300379
StepKey stepKey = new StepKey("phase", "action", "async_action_step");

0 commit comments

Comments
 (0)