Skip to content

Commit bd3a70d

Browse files
authored
ILM fix the init step to actually be retryable (#52076) (#52375)
We marked the `init` ILM step as retryable but our test used `waitUntil` without an assert so we didn’t catch the fact that we were not actually able to retry this step as our ILM state didn’t contain any information about the policy execution (as we were in the process of initialising it). This commit manually sets the current step to `init` when we’re moving the ilm policy into the ERROR step (this enables us to successfully move to the error step and later retry the step) * ShrunkenIndexCheckStep: Use correct logger (cherry picked from commit f78d4b3) Signed-off-by: Andrei Dan <[email protected]>
1 parent 0656a33 commit bd3a70d

File tree

6 files changed

+75
-40
lines changed

6 files changed

+75
-40
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,26 @@ public ClusterState performAction(Index index, ClusterState clusterState) {
3838
return clusterState;
3939
}
4040

41-
LifecycleExecutionState lifecycleState = LifecycleExecutionState
42-
.fromIndexMetadata(indexMetaData);
43-
44-
if (lifecycleState.getLifecycleDate() != null) {
45-
return clusterState;
46-
}
47-
4841
IndexMetaData.Builder indexMetadataBuilder = IndexMetaData.builder(indexMetaData);
49-
if (shouldParseIndexName(indexMetaData.getSettings())) {
50-
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
51-
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
52-
.settings(Settings.builder()
53-
.put(indexMetaData.getSettings())
54-
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
55-
.build()
56-
);
42+
LifecycleExecutionState lifecycleState;
43+
try {
44+
lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
45+
if (lifecycleState.getLifecycleDate() != null) {
46+
return clusterState;
47+
}
48+
49+
if (shouldParseIndexName(indexMetaData.getSettings())) {
50+
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
51+
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
52+
.settings(Settings.builder()
53+
.put(indexMetaData.getSettings())
54+
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
55+
.build()
56+
);
57+
}
58+
} catch (Exception e) {
59+
String policy = indexMetaData.getSettings().get(LifecycleSettings.LIFECYCLE_NAME);
60+
throw new InitializePolicyException(policy, index.getName(), e);
5761
}
5862

5963
ClusterState.Builder newClusterStateBuilder = ClusterState.builder(clusterState);
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.core.ilm;
7+
8+
import org.elasticsearch.ElasticsearchException;
9+
10+
import java.util.Locale;
11+
12+
/**
13+
* Exception thrown when a problem is encountered while initialising an ILM policy for an index.
14+
*/
15+
public class InitializePolicyException extends ElasticsearchException {
16+
17+
public InitializePolicyException(String policy, String index, Throwable cause) {
18+
super(String.format(Locale.ROOT, "unable to initialize policy [%s] for index [%s]", policy, index), cause);
19+
}
20+
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
*/
2626
public class ShrunkenIndexCheckStep extends ClusterStateWaitStep {
2727
public static final String NAME = "is-shrunken-index";
28-
private static final Logger logger = LogManager.getLogger(InitializePolicyContextStep.class);
28+
private static final Logger logger = LogManager.getLogger(ShrunkenIndexCheckStep.class);
2929
private String shrunkIndexPrefix;
3030

3131
public ShrunkenIndexCheckStep(StepKey key, StepKey nextStepKey, String shrunkIndexPrefix) {

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ public InitializePolicyContextStep mutateInstance(InitializePolicyContextStep in
3333
StepKey nextKey = instance.getNextStepKey();
3434

3535
switch (between(0, 1)) {
36-
case 0:
37-
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
38-
break;
39-
case 1:
40-
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
41-
break;
42-
default:
43-
throw new AssertionError("Illegal randomisation branch");
36+
case 0:
37+
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
38+
break;
39+
case 1:
40+
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
41+
break;
42+
default:
43+
throw new AssertionError("Illegal randomisation branch");
4444
}
4545

4646
return new InitializePolicyContextStep(key, nextKey);

x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1185,26 +1185,26 @@ public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Except
11851185
// {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy}
11861186
// increases the wait time between calls exponentially, we might miss the window where the policy is on
11871187
// {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful.
1188-
waitUntil(() -> {
1188+
assertTrue(waitUntil(() -> {
11891189
try {
11901190
return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200;
11911191
} catch (IOException e) {
11921192
return false;
11931193
}
1194-
}, 30, TimeUnit.SECONDS);
1194+
}, 30, TimeUnit.SECONDS));
11951195

11961196
// Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being
11971197
// retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step)
1198-
waitUntil(() -> {
1198+
assertTrue("ILM did not start retrying the attempt-rollover step", waitUntil(() -> {
11991199
try {
12001200
Map<String, Object> explainIndexResponse = explainIndex(index);
1201-
String step = (String) explainIndexResponse.get("step");
1201+
String failedStep = (String) explainIndexResponse.get("failed_step");
12021202
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1203-
return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
1203+
return failedStep != null && failedStep.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
12041204
} catch (IOException e) {
12051205
return false;
12061206
}
1207-
}, 30, TimeUnit.SECONDS);
1207+
}, 30, TimeUnit.SECONDS));
12081208

12091209
deleteIndex(rolledIndex);
12101210

@@ -1246,16 +1246,17 @@ public void testUpdateRolloverLifecycleDateStepRetriesWhenRolloverInfoIsMissing(
12461246
"}");
12471247
client().performRequest(moveToStepRequest);
12481248

1249-
waitUntil(() -> {
1249+
assertTrue("ILM did not start retrying the update-rollover-lifecycle-date step", waitUntil(() -> {
12501250
try {
12511251
Map<String, Object> explainIndexResponse = explainIndex(index);
1252-
String step = (String) explainIndexResponse.get("step");
1252+
String failedStep = (String) explainIndexResponse.get("failed_step");
12531253
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1254-
return step != null && step.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null && retryCount >= 1;
1254+
return failedStep != null && failedStep.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null
1255+
&& retryCount >= 1;
12551256
} catch (IOException e) {
12561257
return false;
12571258
}
1258-
});
1259+
}, 30, TimeUnit.SECONDS));
12591260

12601261
index(client(), index, "1", "foo", "bar");
12611262
Request refreshIndex = new Request("POST", "/" + index + "/_refresh");
@@ -1441,16 +1442,17 @@ public void testRetryableInitializationStep() throws Exception {
14411442
assertOK(client().performRequest(startReq));
14421443

14431444
// Wait until an error has occurred.
1444-
waitUntil(() -> {
1445+
assertTrue("ILM did not start retrying the init step", waitUntil(() -> {
14451446
try {
14461447
Map<String, Object> explainIndexResponse = explainIndex(index);
1447-
String step = (String) explainIndexResponse.get("step");
1448+
String failedStep = (String) explainIndexResponse.get("failed_step");
14481449
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1449-
return step != null && step.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null && retryCount >= 1;
1450+
return failedStep != null && failedStep.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null
1451+
&& retryCount >= 1;
14501452
} catch (IOException e) {
14511453
return false;
14521454
}
1453-
}, 30, TimeUnit.SECONDS);
1455+
}, 30, TimeUnit.SECONDS));
14541456

14551457
// Turn origination date parsing back off
14561458
updateIndexSettings(index, Settings.builder()

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.xpack.core.ilm.ErrorStep;
2525
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
2626
import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
27+
import org.elasticsearch.xpack.core.ilm.InitializePolicyException;
2728
import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
2829
import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
2930
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
@@ -133,8 +134,16 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste
133134
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
134135
causeXContentBuilder.endObject();
135136
LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
136-
Step.StepKey currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
137-
"unable to move to an error step where there is no current step, state: " + currentState);
137+
Step.StepKey currentStep;
138+
// if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
139+
// as we haven't yet initialised the policy, so we'll manually set the current step to be the "initialize policy" step so we can
140+
// record the error (and later retry the init policy step)
141+
if (cause instanceof InitializePolicyException) {
142+
currentStep = InitializePolicyContextStep.KEY;
143+
} else {
144+
currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
145+
"unable to move to an error step where there is no current step, state: " + currentState);
146+
}
138147
LifecycleExecutionState nextStepState = updateExecutionStateToStep(policyMetadata, currentState,
139148
new Step.StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);
140149

0 commit comments

Comments
 (0)