Skip to content

Commit 3d439f2

Browse files
committed
ILM fix the init step to actually be retryable (elastic#52076)
We marked the `init` ILM step as retryable but our test used `waitUntil` without an assert so we didn’t catch the fact that we were not actually able to retry this step as our ILM state didn’t contain any information about the policy execution (as we were in the process of initialising it). This commit manually sets the current step to `init` when we’re moving the ilm policy into the ERROR step (this enables us to successfully move to the error step and later retry the step) * ShrunkenIndexCheckStep: Use correct logger (cherry picked from commit f78d4b3) Signed-off-by: Andrei Dan <[email protected]>
1 parent db1e65f commit 3d439f2

File tree

6 files changed

+75
-40
lines changed

6 files changed

+75
-40
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,26 @@ public ClusterState performAction(Index index, ClusterState clusterState) {
3838
return clusterState;
3939
}
4040

41-
LifecycleExecutionState lifecycleState = LifecycleExecutionState
42-
.fromIndexMetadata(indexMetaData);
43-
44-
if (lifecycleState.getLifecycleDate() != null) {
45-
return clusterState;
46-
}
47-
4841
IndexMetaData.Builder indexMetadataBuilder = IndexMetaData.builder(indexMetaData);
49-
if (shouldParseIndexName(indexMetaData.getSettings())) {
50-
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
51-
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
52-
.settings(Settings.builder()
53-
.put(indexMetaData.getSettings())
54-
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
55-
.build()
56-
);
42+
LifecycleExecutionState lifecycleState;
43+
try {
44+
lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
45+
if (lifecycleState.getLifecycleDate() != null) {
46+
return clusterState;
47+
}
48+
49+
if (shouldParseIndexName(indexMetaData.getSettings())) {
50+
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
51+
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
52+
.settings(Settings.builder()
53+
.put(indexMetaData.getSettings())
54+
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
55+
.build()
56+
);
57+
}
58+
} catch (Exception e) {
59+
String policy = indexMetaData.getSettings().get(LifecycleSettings.LIFECYCLE_NAME);
60+
throw new InitializePolicyException(policy, index.getName(), e);
5761
}
5862

5963
ClusterState.Builder newClusterStateBuilder = ClusterState.builder(clusterState);
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.core.ilm;
7+
8+
import org.elasticsearch.ElasticsearchException;
9+
10+
import java.util.Locale;
11+
12+
/**
13+
* Exception thrown when a problem is encountered while initialising an ILM policy for an index.
14+
*/
15+
public class InitializePolicyException extends ElasticsearchException {
16+
17+
public InitializePolicyException(String policy, String index, Throwable cause) {
18+
super(String.format(Locale.ROOT, "unable to initialize policy [%s] for index [%s]", policy, index), cause);
19+
}
20+
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
*/
2626
public class ShrunkenIndexCheckStep extends ClusterStateWaitStep {
2727
public static final String NAME = "is-shrunken-index";
28-
private static final Logger logger = LogManager.getLogger(InitializePolicyContextStep.class);
28+
private static final Logger logger = LogManager.getLogger(ShrunkenIndexCheckStep.class);
2929
private String shrunkIndexPrefix;
3030

3131
public ShrunkenIndexCheckStep(StepKey key, StepKey nextStepKey, String shrunkIndexPrefix) {

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ public InitializePolicyContextStep mutateInstance(InitializePolicyContextStep in
3333
StepKey nextKey = instance.getNextStepKey();
3434

3535
switch (between(0, 1)) {
36-
case 0:
37-
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
38-
break;
39-
case 1:
40-
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
41-
break;
42-
default:
43-
throw new AssertionError("Illegal randomisation branch");
36+
case 0:
37+
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
38+
break;
39+
case 1:
40+
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
41+
break;
42+
default:
43+
throw new AssertionError("Illegal randomisation branch");
4444
}
4545

4646
return new InitializePolicyContextStep(key, nextKey);

x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,26 +1123,26 @@ public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Except
11231123
// {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy}
11241124
// increases the wait time between calls exponentially, we might miss the window where the policy is on
11251125
// {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful.
1126-
waitUntil(() -> {
1126+
assertTrue(waitUntil(() -> {
11271127
try {
11281128
return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200;
11291129
} catch (IOException e) {
11301130
return false;
11311131
}
1132-
}, 30, TimeUnit.SECONDS);
1132+
}, 30, TimeUnit.SECONDS));
11331133

11341134
// Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being
11351135
// retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step)
1136-
waitUntil(() -> {
1136+
assertTrue("ILM did not start retrying the attempt-rollover step", waitUntil(() -> {
11371137
try {
11381138
Map<String, Object> explainIndexResponse = explainIndex(index);
1139-
String step = (String) explainIndexResponse.get("step");
1139+
String failedStep = (String) explainIndexResponse.get("failed_step");
11401140
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1141-
return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
1141+
return failedStep != null && failedStep.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
11421142
} catch (IOException e) {
11431143
return false;
11441144
}
1145-
}, 30, TimeUnit.SECONDS);
1145+
}, 30, TimeUnit.SECONDS));
11461146

11471147
deleteIndex(rolledIndex);
11481148

@@ -1184,16 +1184,17 @@ public void testUpdateRolloverLifecycleDateStepRetriesWhenRolloverInfoIsMissing(
11841184
"}");
11851185
client().performRequest(moveToStepRequest);
11861186

1187-
waitUntil(() -> {
1187+
assertTrue("ILM did not start retrying the update-rollover-lifecycle-date step", waitUntil(() -> {
11881188
try {
11891189
Map<String, Object> explainIndexResponse = explainIndex(index);
1190-
String step = (String) explainIndexResponse.get("step");
1190+
String failedStep = (String) explainIndexResponse.get("failed_step");
11911191
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1192-
return step != null && step.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null && retryCount >= 1;
1192+
return failedStep != null && failedStep.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null
1193+
&& retryCount >= 1;
11931194
} catch (IOException e) {
11941195
return false;
11951196
}
1196-
});
1197+
}, 30, TimeUnit.SECONDS));
11971198

11981199
index(client(), index, "1", "foo", "bar");
11991200
Request refreshIndex = new Request("POST", "/" + index + "/_refresh");
@@ -1344,16 +1345,17 @@ public void testRetryableInitializationStep() throws Exception {
13441345
assertOK(client().performRequest(startReq));
13451346

13461347
// Wait until an error has occurred.
1347-
waitUntil(() -> {
1348+
assertTrue("ILM did not start retrying the init step", waitUntil(() -> {
13481349
try {
13491350
Map<String, Object> explainIndexResponse = explainIndex(index);
1350-
String step = (String) explainIndexResponse.get("step");
1351+
String failedStep = (String) explainIndexResponse.get("failed_step");
13511352
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1352-
return step != null && step.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null && retryCount >= 1;
1353+
return failedStep != null && failedStep.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null
1354+
&& retryCount >= 1;
13531355
} catch (IOException e) {
13541356
return false;
13551357
}
1356-
}, 30, TimeUnit.SECONDS);
1358+
}, 30, TimeUnit.SECONDS));
13571359

13581360
// Turn origination date parsing back off
13591361
updateIndexSettings(index, Settings.builder()

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.xpack.core.ilm.ErrorStep;
2525
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
2626
import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
27+
import org.elasticsearch.xpack.core.ilm.InitializePolicyException;
2728
import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
2829
import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
2930
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
@@ -132,8 +133,16 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste
132133
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
133134
causeXContentBuilder.endObject();
134135
LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
135-
Step.StepKey currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
136-
"unable to move to an error step where there is no current step, state: " + currentState);
136+
Step.StepKey currentStep;
137+
// if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
138+
// as we haven't yet initialised the policy, so we'll manually set the current step to be the "initialize policy" step so we can
139+
// record the error (and later retry the init policy step)
140+
if (cause instanceof InitializePolicyException) {
141+
currentStep = InitializePolicyContextStep.KEY;
142+
} else {
143+
currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
144+
"unable to move to an error step where there is no current step, state: " + currentState);
145+
}
137146
LifecycleExecutionState nextStepState = updateExecutionStateToStep(policyMetadata, currentState,
138147
new Step.StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);
139148

0 commit comments

Comments
 (0)