Skip to content

Commit dbec4e7

Browse files
committed
Don't halt policy execution on policy trigger exception (elastic#49128)
When triggered either by becoming master, a new cluster state, or a periodic schedule, an ILM policy execution through `maybeRunAsyncAction`, `runPolicyAfterStateChange`, or `runPeriodicStep` throwing an exception will cause the loop the terminate. This means that any indices that would have been processed after the index where the exception was thrown will not be processed by ILM. For most execution this is not a problem because the actual running of steps is protected by a try/catch that moves the index to the ERROR step in the event of a problem. If an exception occurs prior to step execution (for example, in fetching and parsing the current policy/step) however, it causes the loop termination previously mentioned. This commit wraps the invocation of the methods specified above in a try/catch block that provides better logging and does not bubble the exception up.
1 parent d6f2903 commit dbec4e7

File tree

3 files changed

+164
-25
lines changed

3 files changed

+164
-25
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleService.java

+55-23
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import com.carrotsearch.hppc.cursors.ObjectCursor;
99
import org.apache.logging.log4j.LogManager;
1010
import org.apache.logging.log4j.Logger;
11+
import org.apache.logging.log4j.message.ParameterizedMessage;
1112
import org.apache.lucene.util.SetOnce;
1213
import org.elasticsearch.client.Client;
1314
import org.elasticsearch.cluster.ClusterChangedEvent;
@@ -114,19 +115,35 @@ public void onMaster() {
114115
final LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
115116
StepKey stepKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
116117

117-
if (OperationMode.STOPPING == currentMode) {
118-
if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.getName())) {
119-
logger.info("waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]",
120-
idxMeta.getIndex().getName(), policyName, stepKey.getName());
118+
try {
119+
if (OperationMode.STOPPING == currentMode) {
120+
if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.getName())) {
121+
logger.info("waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]",
122+
idxMeta.getIndex().getName(), policyName, stepKey.getName());
123+
lifecycleRunner.maybeRunAsyncAction(clusterState, idxMeta, policyName, stepKey);
124+
// ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop
125+
safeToStop = false;
126+
} else {
127+
logger.info("skipping policy execution of step [{}] for index [{}] with policy [{}]" +
128+
" because ILM is stopping",
129+
stepKey == null ? "n/a" : stepKey.getName(), idxMeta.getIndex().getName(), policyName);
130+
}
131+
} else {
121132
lifecycleRunner.maybeRunAsyncAction(clusterState, idxMeta, policyName, stepKey);
122-
// ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop
123-
safeToStop = false;
133+
}
134+
} catch (Exception e) {
135+
if (logger.isTraceEnabled()) {
136+
logger.warn(new ParameterizedMessage("async action execution failed during master election trigger" +
137+
" for index [{}] with policy [{}] in step [{}], lifecycle state: [{}]",
138+
idxMeta.getIndex().getName(), policyName, stepKey, lifecycleState.asMap()), e);
124139
} else {
125-
logger.info("skipping policy execution of step [{}] for index [{}] with policy [{}] because ILM is stopping",
126-
stepKey == null ? "n/a" : stepKey.getName(), idxMeta.getIndex().getName(), policyName);
140+
logger.warn(new ParameterizedMessage("async action execution failed during master election trigger" +
141+
" for index [{}] with policy [{}] in step [{}]",
142+
idxMeta.getIndex().getName(), policyName, stepKey), e);
143+
127144
}
128-
} else {
129-
lifecycleRunner.maybeRunAsyncAction(clusterState, idxMeta, policyName, stepKey);
145+
// Don't rethrow the exception, we don't want a failure for one index to be
146+
// called to cause actions not to be triggered for further indices
130147
}
131148
}
132149
}
@@ -252,27 +269,42 @@ void triggerPolicies(ClusterState clusterState, boolean fromClusterStateChange)
252269
final LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
253270
StepKey stepKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
254271

255-
if (OperationMode.STOPPING == currentMode) {
256-
if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.getName())) {
257-
logger.info("waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]",
258-
idxMeta.getIndex().getName(), policyName, stepKey.getName());
272+
try {
273+
if (OperationMode.STOPPING == currentMode) {
274+
if (stepKey != null && IGNORE_STEPS_MAINTENANCE_REQUESTED.contains(stepKey.getName())) {
275+
logger.info("waiting to stop ILM because index [{}] with policy [{}] is currently in step [{}]",
276+
idxMeta.getIndex().getName(), policyName, stepKey.getName());
277+
if (fromClusterStateChange) {
278+
lifecycleRunner.runPolicyAfterStateChange(policyName, idxMeta);
279+
} else {
280+
lifecycleRunner.runPeriodicStep(policyName, idxMeta);
281+
}
282+
// ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop
283+
safeToStop = false;
284+
} else {
285+
logger.info("skipping policy execution of step [{}] for index [{}] with policy [{}] because ILM is stopping",
286+
stepKey == null ? "n/a" : stepKey.getName(), idxMeta.getIndex().getName(), policyName);
287+
}
288+
} else {
259289
if (fromClusterStateChange) {
260290
lifecycleRunner.runPolicyAfterStateChange(policyName, idxMeta);
261291
} else {
262292
lifecycleRunner.runPeriodicStep(policyName, idxMeta);
263293
}
264-
// ILM is trying to stop, but this index is in a Shrink step (or other dangerous step) so we can't stop
265-
safeToStop = false;
266-
} else {
267-
logger.info("skipping policy execution of step [{}] for index [{}] with policy [{}] because ILM is stopping",
268-
stepKey == null ? "n/a" : stepKey.getName(), idxMeta.getIndex().getName(), policyName);
269294
}
270-
} else {
271-
if (fromClusterStateChange) {
272-
lifecycleRunner.runPolicyAfterStateChange(policyName, idxMeta);
295+
} catch (Exception e) {
296+
if (logger.isTraceEnabled()) {
297+
logger.warn(new ParameterizedMessage("async action execution failed during policy trigger" +
298+
" for index [{}] with policy [{}] in step [{}], lifecycle state: [{}]",
299+
idxMeta.getIndex().getName(), policyName, stepKey, lifecycleState.asMap()), e);
273300
} else {
274-
lifecycleRunner.runPeriodicStep(policyName, idxMeta);
301+
logger.warn(new ParameterizedMessage("async action execution failed during policy trigger" +
302+
" for index [{}] with policy [{}] in step [{}]",
303+
idxMeta.getIndex().getName(), policyName, stepKey), e);
304+
275305
}
306+
// Don't rethrow the exception, we don't want a failure for one index to be
307+
// called to cause actions not to be triggered for further indices
276308
}
277309
}
278310
}

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -1338,7 +1338,7 @@ private void assertClusterStateStepInfo(ClusterState oldClusterState, Index inde
13381338
assertEquals(newLifecycleState.getStepTime(), newLifecycleState.getStepTime());
13391339
}
13401340

1341-
private static class MockAsyncActionStep extends AsyncActionStep {
1341+
static class MockAsyncActionStep extends AsyncActionStep {
13421342

13431343
private Exception exception;
13441344
private boolean willComplete;
@@ -1387,7 +1387,7 @@ public void performAction(IndexMetaData indexMetaData, ClusterState currentState
13871387

13881388
}
13891389

1390-
private static class MockAsyncWaitStep extends AsyncWaitStep {
1390+
static class MockAsyncWaitStep extends AsyncWaitStep {
13911391

13921392
private Exception exception;
13931393
private boolean willComplete;

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleServiceTests.java

+107
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@
5050
import java.util.Collections;
5151
import java.util.SortedMap;
5252
import java.util.TreeMap;
53+
import java.util.concurrent.CountDownLatch;
5354
import java.util.concurrent.ExecutorService;
55+
import java.util.concurrent.TimeUnit;
5456

5557
import static org.elasticsearch.node.Node.NODE_MASTER_SETTING;
5658
import static org.elasticsearch.xpack.core.ilm.AbstractStepTestCase.randomStepKey;
@@ -299,6 +301,111 @@ public void testRequestedStopOnSafeAction() {
299301
assertTrue(moveToMaintenance.get());
300302
}
301303

304+
public void testExceptionStillProcessesOtherIndices() {
305+
doTestExceptionStillProcessesOtherIndices(false);
306+
}
307+
308+
public void testExceptionStillProcessesOtherIndicesOnMaster() {
309+
doTestExceptionStillProcessesOtherIndices(true);
310+
}
311+
312+
@SuppressWarnings("unchecked")
313+
public void doTestExceptionStillProcessesOtherIndices(boolean useOnMaster) {
314+
String policy1 = randomAlphaOfLengthBetween(1, 20);
315+
Step.StepKey i1currentStepKey = randomStepKey();
316+
final Step i1mockStep;
317+
if (useOnMaster) {
318+
i1mockStep = new IndexLifecycleRunnerTests.MockAsyncActionStep(i1currentStepKey, randomStepKey());
319+
} else {
320+
i1mockStep = new IndexLifecycleRunnerTests.MockClusterStateActionStep(i1currentStepKey, randomStepKey());
321+
}
322+
MockAction i1mockAction = new MockAction(Collections.singletonList(i1mockStep));
323+
Phase i1phase = new Phase("phase", TimeValue.ZERO, Collections.singletonMap("action", i1mockAction));
324+
LifecyclePolicy i1policy = newTestLifecyclePolicy(policy1, Collections.singletonMap(i1phase.getName(), i1phase));
325+
Index index1 = new Index(randomAlphaOfLengthBetween(1, 20), randomAlphaOfLengthBetween(1, 20));
326+
LifecycleExecutionState.Builder i1lifecycleState = LifecycleExecutionState.builder();
327+
i1lifecycleState.setPhase(i1currentStepKey.getPhase());
328+
i1lifecycleState.setAction(i1currentStepKey.getAction());
329+
i1lifecycleState.setStep(i1currentStepKey.getName());
330+
331+
String policy2 = randomValueOtherThan(policy1, () -> randomAlphaOfLengthBetween(1, 20));
332+
Step.StepKey i2currentStepKey = randomStepKey();
333+
final Step i2mockStep;
334+
if (useOnMaster) {
335+
i2mockStep = new IndexLifecycleRunnerTests.MockAsyncActionStep(i2currentStepKey, randomStepKey());
336+
} else {
337+
i2mockStep = new IndexLifecycleRunnerTests.MockClusterStateActionStep(i2currentStepKey, randomStepKey());
338+
}
339+
MockAction mockAction = new MockAction(Collections.singletonList(i2mockStep));
340+
Phase i2phase = new Phase("phase", TimeValue.ZERO, Collections.singletonMap("action", mockAction));
341+
LifecyclePolicy i2policy = newTestLifecyclePolicy(policy1, Collections.singletonMap(i2phase.getName(), i1phase));
342+
Index index2 = new Index(randomAlphaOfLengthBetween(1, 20), randomAlphaOfLengthBetween(1, 20));
343+
LifecycleExecutionState.Builder i2lifecycleState = LifecycleExecutionState.builder();
344+
i2lifecycleState.setPhase(i2currentStepKey.getPhase());
345+
i2lifecycleState.setAction(i2currentStepKey.getAction());
346+
i2lifecycleState.setStep(i2currentStepKey.getName());
347+
348+
CountDownLatch stepLatch = new CountDownLatch(2);
349+
boolean failStep1 = randomBoolean();
350+
if (useOnMaster) {
351+
((IndexLifecycleRunnerTests.MockAsyncActionStep) i1mockStep).setLatch(stepLatch);
352+
((IndexLifecycleRunnerTests.MockAsyncActionStep) i1mockStep)
353+
.setException(failStep1 ? new IllegalArgumentException("forcing a failure for index 1") : null);
354+
((IndexLifecycleRunnerTests.MockAsyncActionStep) i2mockStep).setLatch(stepLatch);
355+
((IndexLifecycleRunnerTests.MockAsyncActionStep) i2mockStep)
356+
.setException(failStep1 ? null : new IllegalArgumentException("forcing a failure for index 2"));
357+
} else {
358+
((IndexLifecycleRunnerTests.MockClusterStateActionStep) i1mockStep).setLatch(stepLatch);
359+
((IndexLifecycleRunnerTests.MockClusterStateActionStep) i1mockStep)
360+
.setException(failStep1 ? new IllegalArgumentException("forcing a failure for index 1") : null);
361+
((IndexLifecycleRunnerTests.MockClusterStateActionStep) i1mockStep).setLatch(stepLatch);
362+
((IndexLifecycleRunnerTests.MockClusterStateActionStep) i1mockStep)
363+
.setException(failStep1 ? null : new IllegalArgumentException("forcing a failure for index 2"));
364+
}
365+
366+
SortedMap<String, LifecyclePolicyMetadata> policyMap = new TreeMap<>();
367+
policyMap.put(policy1, new LifecyclePolicyMetadata(i1policy, Collections.emptyMap(),
368+
randomNonNegativeLong(), randomNonNegativeLong()));
369+
policyMap.put(policy2, new LifecyclePolicyMetadata(i2policy, Collections.emptyMap(),
370+
randomNonNegativeLong(), randomNonNegativeLong()));
371+
372+
IndexMetaData i1indexMetadata = IndexMetaData.builder(index1.getName())
373+
.settings(settings(Version.CURRENT).put(LifecycleSettings.LIFECYCLE_NAME_SETTING.getKey(), policy1))
374+
.putCustom(ILM_CUSTOM_METADATA_KEY, i1lifecycleState.build().asMap())
375+
.numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build();
376+
IndexMetaData i2indexMetadata = IndexMetaData.builder(index2.getName())
377+
.settings(settings(Version.CURRENT).put(LifecycleSettings.LIFECYCLE_NAME_SETTING.getKey(), policy1))
378+
.putCustom(ILM_CUSTOM_METADATA_KEY, i2lifecycleState.build().asMap())
379+
.numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build();
380+
ImmutableOpenMap.Builder<String, IndexMetaData> indices = ImmutableOpenMap.<String, IndexMetaData> builder()
381+
.fPut(index1.getName(), i1indexMetadata)
382+
.fPut(index2.getName(), i2indexMetadata);
383+
384+
MetaData metaData = MetaData.builder()
385+
.putCustom(IndexLifecycleMetadata.TYPE, new IndexLifecycleMetadata(policyMap, OperationMode.RUNNING))
386+
.indices(indices.build())
387+
.persistentSettings(settings(Version.CURRENT).build())
388+
.build();
389+
390+
ClusterState currentState = ClusterState.builder(ClusterName.DEFAULT)
391+
.metaData(metaData)
392+
.nodes(DiscoveryNodes.builder().localNodeId(nodeId).masterNodeId(nodeId).add(masterNode).build())
393+
.build();
394+
395+
if (useOnMaster) {
396+
when(clusterService.state()).thenReturn(currentState);
397+
indexLifecycleService.onMaster();
398+
} else {
399+
indexLifecycleService.triggerPolicies(currentState, randomBoolean());
400+
}
401+
try {
402+
stepLatch.await(5, TimeUnit.SECONDS);
403+
} catch (InterruptedException e) {
404+
logger.error("failure while waiting for step execution", e);
405+
fail("both steps should have been executed, even with an exception");
406+
}
407+
}
408+
302409
public void testTriggeredDifferentJob() {
303410
Mockito.reset(clusterService);
304411
SchedulerEngine.Event schedulerEvent = new SchedulerEngine.Event("foo", randomLong(), randomLong());

0 commit comments

Comments
 (0)