Skip to content

Commit f78d4b3

Browse files
authored
ILM fix the init step to actually be retryable (#52076)
We marked the `init` ILM step as retryable but our test used `waitUntil` without an assert so we didn’t catch the fact that we were not actually able to retry this step as our ILM state didn’t contain any information about the policy execution (as we were in the process of initialising it). This commit manually sets the current step to `init` when we’re moving the ilm policy into the ERROR step (this enables us to successfully move to the error step and later retry the step) * ShrunkenIndexCheckStep: Use correct logger
1 parent fd99542 commit f78d4b3

File tree

6 files changed

+75
-40
lines changed

6 files changed

+75
-40
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,26 @@ public ClusterState performAction(Index index, ClusterState clusterState) {
3838
return clusterState;
3939
}
4040

41-
LifecycleExecutionState lifecycleState = LifecycleExecutionState
42-
.fromIndexMetadata(indexMetaData);
43-
44-
if (lifecycleState.getLifecycleDate() != null) {
45-
return clusterState;
46-
}
47-
4841
IndexMetaData.Builder indexMetadataBuilder = IndexMetaData.builder(indexMetaData);
49-
if (shouldParseIndexName(indexMetaData.getSettings())) {
50-
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
51-
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
52-
.settings(Settings.builder()
53-
.put(indexMetaData.getSettings())
54-
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
55-
.build()
56-
);
42+
LifecycleExecutionState lifecycleState;
43+
try {
44+
lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
45+
if (lifecycleState.getLifecycleDate() != null) {
46+
return clusterState;
47+
}
48+
49+
if (shouldParseIndexName(indexMetaData.getSettings())) {
50+
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
51+
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
52+
.settings(Settings.builder()
53+
.put(indexMetaData.getSettings())
54+
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
55+
.build()
56+
);
57+
}
58+
} catch (Exception e) {
59+
String policy = indexMetaData.getSettings().get(LifecycleSettings.LIFECYCLE_NAME);
60+
throw new InitializePolicyException(policy, index.getName(), e);
5761
}
5862

5963
ClusterState.Builder newClusterStateBuilder = ClusterState.builder(clusterState);
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.core.ilm;
7+
8+
import org.elasticsearch.ElasticsearchException;
9+
10+
import java.util.Locale;
11+
12+
/**
13+
* Exception thrown when a problem is encountered while initialising an ILM policy for an index.
14+
*/
15+
public class InitializePolicyException extends ElasticsearchException {
16+
17+
public InitializePolicyException(String policy, String index, Throwable cause) {
18+
super(String.format(Locale.ROOT, "unable to initialize policy [%s] for index [%s]", policy, index), cause);
19+
}
20+
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
*/
2626
public class ShrunkenIndexCheckStep extends ClusterStateWaitStep {
2727
public static final String NAME = "is-shrunken-index";
28-
private static final Logger logger = LogManager.getLogger(InitializePolicyContextStep.class);
28+
private static final Logger logger = LogManager.getLogger(ShrunkenIndexCheckStep.class);
2929
private String shrunkIndexPrefix;
3030

3131
public ShrunkenIndexCheckStep(StepKey key, StepKey nextStepKey, String shrunkIndexPrefix) {

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ public InitializePolicyContextStep mutateInstance(InitializePolicyContextStep in
3333
StepKey nextKey = instance.getNextStepKey();
3434

3535
switch (between(0, 1)) {
36-
case 0:
37-
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
38-
break;
39-
case 1:
40-
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
41-
break;
42-
default:
43-
throw new AssertionError("Illegal randomisation branch");
36+
case 0:
37+
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
38+
break;
39+
case 1:
40+
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
41+
break;
42+
default:
43+
throw new AssertionError("Illegal randomisation branch");
4444
}
4545

4646
return new InitializePolicyContextStep(key, nextKey);

x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,26 +1176,26 @@ public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Except
11761176
// {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy}
11771177
// increases the wait time between calls exponentially, we might miss the window where the policy is on
11781178
// {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful.
1179-
waitUntil(() -> {
1179+
assertTrue(waitUntil(() -> {
11801180
try {
11811181
return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200;
11821182
} catch (IOException e) {
11831183
return false;
11841184
}
1185-
}, 30, TimeUnit.SECONDS);
1185+
}, 30, TimeUnit.SECONDS));
11861186

11871187
// Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being
11881188
// retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step)
1189-
waitUntil(() -> {
1189+
assertTrue("ILM did not start retrying the attempt-rollover step", waitUntil(() -> {
11901190
try {
11911191
Map<String, Object> explainIndexResponse = explainIndex(index);
1192-
String step = (String) explainIndexResponse.get("step");
1192+
String failedStep = (String) explainIndexResponse.get("failed_step");
11931193
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1194-
return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
1194+
return failedStep != null && failedStep.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
11951195
} catch (IOException e) {
11961196
return false;
11971197
}
1198-
}, 30, TimeUnit.SECONDS);
1198+
}, 30, TimeUnit.SECONDS));
11991199

12001200
deleteIndex(rolledIndex);
12011201

@@ -1237,16 +1237,17 @@ public void testUpdateRolloverLifecycleDateStepRetriesWhenRolloverInfoIsMissing(
12371237
"}");
12381238
client().performRequest(moveToStepRequest);
12391239

1240-
waitUntil(() -> {
1240+
assertTrue("ILM did not start retrying the update-rollover-lifecycle-date step", waitUntil(() -> {
12411241
try {
12421242
Map<String, Object> explainIndexResponse = explainIndex(index);
1243-
String step = (String) explainIndexResponse.get("step");
1243+
String failedStep = (String) explainIndexResponse.get("failed_step");
12441244
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1245-
return step != null && step.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null && retryCount >= 1;
1245+
return failedStep != null && failedStep.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null
1246+
&& retryCount >= 1;
12461247
} catch (IOException e) {
12471248
return false;
12481249
}
1249-
});
1250+
}, 30, TimeUnit.SECONDS));
12501251

12511252
index(client(), index, "1", "foo", "bar");
12521253
Request refreshIndex = new Request("POST", "/" + index + "/_refresh");
@@ -1432,16 +1433,17 @@ public void testRetryableInitializationStep() throws Exception {
14321433
assertOK(client().performRequest(startReq));
14331434

14341435
// Wait until an error has occurred.
1435-
waitUntil(() -> {
1436+
assertTrue("ILM did not start retrying the init step", waitUntil(() -> {
14361437
try {
14371438
Map<String, Object> explainIndexResponse = explainIndex(index);
1438-
String step = (String) explainIndexResponse.get("step");
1439+
String failedStep = (String) explainIndexResponse.get("failed_step");
14391440
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1440-
return step != null && step.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null && retryCount >= 1;
1441+
return failedStep != null && failedStep.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null
1442+
&& retryCount >= 1;
14411443
} catch (IOException e) {
14421444
return false;
14431445
}
1444-
}, 30, TimeUnit.SECONDS);
1446+
}, 30, TimeUnit.SECONDS));
14451447

14461448
// Turn origination date parsing back off
14471449
updateIndexSettings(index, Settings.builder()

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.xpack.core.ilm.ErrorStep;
2525
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
2626
import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
27+
import org.elasticsearch.xpack.core.ilm.InitializePolicyException;
2728
import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
2829
import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
2930
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
@@ -133,8 +134,16 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste
133134
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
134135
causeXContentBuilder.endObject();
135136
LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
136-
Step.StepKey currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
137-
"unable to move to an error step where there is no current step, state: " + currentState);
137+
Step.StepKey currentStep;
138+
// if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
139+
// as we haven't yet initialised the policy, so we'll manually set the current step to be the "initialize policy" step so we can
140+
// record the error (and later retry the init policy step)
141+
if (cause instanceof InitializePolicyException) {
142+
currentStep = InitializePolicyContextStep.KEY;
143+
} else {
144+
currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
145+
"unable to move to an error step where there is no current step, state: " + currentState);
146+
}
138147
LifecycleExecutionState nextStepState = updateExecutionStateToStep(policyMetadata, currentState,
139148
new Step.StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);
140149

0 commit comments

Comments
 (0)