Skip to content

Commit 8bee5f4

Browse files
authored
ILM retryable async action steps (#50522)
This adds support for retrying AsyncActionSteps: once the cluster state reflects that ILM has been moved back onto the failed step, the async action for that step is triggered again. This also marks the RolloverStep as retryable and adds an integration test in which the RolloverStep fails to execute because the rolled-over index already exists, verifying that the async RolloverStep is retried until that index is deleted.
1 parent e23e1bf commit 8bee5f4

File tree

3 files changed

+155
-63
lines changed

3 files changed

+155
-63
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/RolloverStep.java

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ public RolloverStep(StepKey key, StepKey nextStepKey, Client client) {
3030
super(key, nextStepKey, client);
3131
}
3232

33+
/**
 * Reports this rollover step as retryable.
 * NOTE(review): per the commit description this lets ILM automatically re-attempt the step after a failure;
 * the retry machinery itself lives outside this class - confirm against the caller.
 *
 * @return always {@code true}
 */
@Override
34+
public boolean isRetryable() {
35+
return true;
36+
}
37+
3338
@Override
3439
public void performAction(IndexMetaData indexMetaData, ClusterState currentClusterState,
3540
ClusterStateObserver observer, Listener listener) {

x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java

+136-63
Original file line numberDiff line numberDiff line change
@@ -940,69 +940,142 @@ public void testILMRolloverRetriesOnReadOnlyBlock() throws Exception {
940940
assertBusy(() -> assertThat(getStepKeyForIndex(firstIndex), equalTo(TerminalPolicyStep.KEY)));
941941
}
942942

943-
public void testILMRolloverOnManuallyRolledIndex() throws Exception {
944-
String originalIndex = index + "-000001";
945-
String secondIndex = index + "-000002";
946-
String thirdIndex = index + "-000003";
947-
948-
// Set up a policy with rollover
949-
createNewSingletonPolicy("hot", new RolloverAction(null, null, 2L));
950-
Request createIndexTemplate = new Request("PUT", "_template/rolling_indexes");
951-
createIndexTemplate.setJsonEntity("{" +
952-
"\"index_patterns\": [\""+ index + "-*\"], \n" +
953-
" \"settings\": {\n" +
954-
" \"number_of_shards\": 1,\n" +
955-
" \"number_of_replicas\": 0,\n" +
956-
" \"index.lifecycle.name\": \"" + policy+ "\", \n" +
957-
" \"index.lifecycle.rollover_alias\": \"alias\"\n" +
958-
" }\n" +
959-
"}");
960-
client().performRequest(createIndexTemplate);
961-
962-
createIndexWithSettings(
963-
originalIndex,
964-
Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
965-
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0),
966-
true
967-
);
968-
969-
// Index a document
970-
index(client(), originalIndex, "1", "foo", "bar");
971-
Request refreshOriginalIndex = new Request("POST", "/" + originalIndex + "/_refresh");
972-
client().performRequest(refreshOriginalIndex);
973-
974-
// Manual rollover
975-
Request rolloverRequest = new Request("POST", "/alias/_rollover");
976-
rolloverRequest.setJsonEntity("{\n" +
977-
" \"conditions\": {\n" +
978-
" \"max_docs\": \"1\"\n" +
979-
" }\n" +
980-
"}"
981-
);
982-
client().performRequest(rolloverRequest);
983-
assertBusy(() -> assertTrue(indexExists(secondIndex)));
984-
985-
// Index another document into the original index so the ILM rollover policy condition is met
986-
index(client(), originalIndex, "2", "foo", "bar");
987-
client().performRequest(refreshOriginalIndex);
988-
989-
// Wait for the rollover policy to execute
990-
assertBusy(() -> assertThat(getStepKeyForIndex(originalIndex), equalTo(TerminalPolicyStep.KEY)));
991-
992-
// ILM should manage the second index after attempting (and skipping) rolling the original index
993-
assertBusy(() -> assertTrue((boolean) explainIndex(secondIndex).getOrDefault("managed", true)));
994-
995-
// index some documents to trigger an ILM rollover
996-
index(client(), "alias", "1", "foo", "bar");
997-
index(client(), "alias", "2", "foo", "bar");
998-
index(client(), "alias", "3", "foo", "bar");
999-
Request refreshSecondIndex = new Request("POST", "/" + secondIndex + "/_refresh");
1000-
client().performRequest(refreshSecondIndex).getStatusLine();
1001-
1002-
// ILM should rollover the second index even though it skipped the first one
1003-
assertBusy(() -> assertThat(getStepKeyForIndex(secondIndex), equalTo(TerminalPolicyStep.KEY)));
1004-
assertBusy(() -> assertTrue(indexExists(thirdIndex)));
1005-
}
943+
public void testILMRolloverOnManuallyRolledIndex() throws Exception {
944+
String originalIndex = index + "-000001";
945+
String secondIndex = index + "-000002";
946+
String thirdIndex = index + "-000003";
947+
948+
// Set up a policy with rollover
949+
createNewSingletonPolicy("hot", new RolloverAction(null, null, 2L));
950+
Request createIndexTemplate = new Request("PUT", "_template/rolling_indexes");
951+
createIndexTemplate.setJsonEntity("{" +
952+
"\"index_patterns\": [\"" + index + "-*\"], \n" +
953+
" \"settings\": {\n" +
954+
" \"number_of_shards\": 1,\n" +
955+
" \"number_of_replicas\": 0,\n" +
956+
" \"index.lifecycle.name\": \"" + policy + "\", \n" +
957+
" \"index.lifecycle.rollover_alias\": \"alias\"\n" +
958+
" }\n" +
959+
"}");
960+
client().performRequest(createIndexTemplate);
961+
962+
createIndexWithSettings(
963+
originalIndex,
964+
Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
965+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0),
966+
true
967+
);
968+
969+
// Index a document
970+
index(client(), originalIndex, "1", "foo", "bar");
971+
Request refreshOriginalIndex = new Request("POST", "/" + originalIndex + "/_refresh");
972+
client().performRequest(refreshOriginalIndex);
973+
974+
// Manual rollover
975+
Request rolloverRequest = new Request("POST", "/alias/_rollover");
976+
rolloverRequest.setJsonEntity("{\n" +
977+
" \"conditions\": {\n" +
978+
" \"max_docs\": \"1\"\n" +
979+
" }\n" +
980+
"}"
981+
);
982+
client().performRequest(rolloverRequest);
983+
assertBusy(() -> assertTrue(indexExists(secondIndex)));
984+
985+
// Index another document into the original index so the ILM rollover policy condition is met
986+
index(client(), originalIndex, "2", "foo", "bar");
987+
client().performRequest(refreshOriginalIndex);
988+
989+
// Wait for the rollover policy to execute
990+
assertBusy(() -> assertThat(getStepKeyForIndex(originalIndex), equalTo(TerminalPolicyStep.KEY)));
991+
992+
// ILM should manage the second index after attempting (and skipping) rolling the original index
993+
assertBusy(() -> assertTrue((boolean) explainIndex(secondIndex).getOrDefault("managed", true)));
994+
995+
// index some documents to trigger an ILM rollover
996+
index(client(), "alias", "1", "foo", "bar");
997+
index(client(), "alias", "2", "foo", "bar");
998+
index(client(), "alias", "3", "foo", "bar");
999+
Request refreshSecondIndex = new Request("POST", "/" + secondIndex + "/_refresh");
1000+
client().performRequest(refreshSecondIndex).getStatusLine();
1001+
1002+
// ILM should rollover the second index even though it skipped the first one
1003+
assertBusy(() -> assertThat(getStepKeyForIndex(secondIndex), equalTo(TerminalPolicyStep.KEY)));
1004+
assertBusy(() -> assertTrue(indexExists(thirdIndex)));
1005+
}
1006+
1007+
public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Exception {
1008+
String index = this.index + "-000001";
1009+
String rolledIndex = this.index + "-000002";
1010+
1011+
createNewSingletonPolicy("hot", new RolloverAction(null, TimeValue.timeValueSeconds(1), null));
1012+
1013+
// create the rolled index so the rollover of the first index fails
1014+
createIndexWithSettings(
1015+
rolledIndex,
1016+
Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
1017+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
1018+
.put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias"),
1019+
false
1020+
);
1021+
1022+
createIndexWithSettings(
1023+
index,
1024+
Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
1025+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
1026+
.put(LifecycleSettings.LIFECYCLE_NAME, policy)
1027+
.put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias"),
1028+
true
1029+
);
1030+
1031+
assertBusy(() -> assertThat((Integer) explainIndex(index).get(FAILED_STEP_RETRY_COUNT_FIELD), greaterThanOrEqualTo(1)), 30,
1032+
TimeUnit.SECONDS);
1033+
1034+
Request moveToStepRequest = new Request("POST", "_ilm/move/" + index);
1035+
moveToStepRequest.setJsonEntity("{\n" +
1036+
" \"current_step\": {\n" +
1037+
" \"phase\": \"hot\",\n" +
1038+
" \"action\": \"rollover\",\n" +
1039+
" \"name\": \"check-rollover-ready\"\n" +
1040+
" },\n" +
1041+
" \"next_step\": {\n" +
1042+
" \"phase\": \"hot\",\n" +
1043+
" \"action\": \"rollover\",\n" +
1044+
" \"name\": \"attempt-rollover\"\n" +
1045+
" }\n" +
1046+
"}");
1047+
1048+
// Using {@link #waitUntil} here as ILM moves back and forth between the {@link WaitForRolloverReadyStep} step and
1049+
// {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy}
1050+
// increases the wait time between calls exponentially, we might miss the window where the policy is on
1051+
// {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful.
1052+
waitUntil(() -> {
1053+
try {
1054+
return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200;
1055+
} catch (IOException e) {
1056+
return false;
1057+
}
1058+
}, 30, TimeUnit.SECONDS);
1059+
1060+
// Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being
1061+
// retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step)
1062+
waitUntil(() -> {
1063+
try {
1064+
Map<String, Object> explainIndexResponse = explainIndex(index);
1065+
String step = (String) explainIndexResponse.get("step");
1066+
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
1067+
return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
1068+
} catch (IOException e) {
1069+
return false;
1070+
}
1071+
}, 30, TimeUnit.SECONDS);
1072+
1073+
deleteIndex(rolledIndex);
1074+
1075+
// the rollover step should eventually succeed
1076+
assertBusy(() -> assertThat(indexExists(rolledIndex), is(true)));
1077+
assertBusy(() -> assertThat(getStepKeyForIndex(index), equalTo(TerminalPolicyStep.KEY)));
1078+
}
10061079

10071080
public void testHistoryIsWrittenWithSuccess() throws Exception {
10081081
String index = "index";

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java

+14
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,20 @@ public void onFailure(String source, Exception e) {
203203
logger.error(new ParameterizedMessage("retry execution of step [{}] for index [{}] failed",
204204
failedStep.getKey().getName(), index), e);
205205
}
206+
207+
@Override
208+
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
209+
if (oldState.equals(newState) == false) {
210+
IndexMetaData newIndexMeta = newState.metaData().index(index);
211+
Step indexMetaCurrentStep = getCurrentStep(stepRegistry, policy, newIndexMeta);
212+
StepKey stepKey = indexMetaCurrentStep.getKey();
213+
if (stepKey != null && stepKey != TerminalPolicyStep.KEY && newIndexMeta != null) {
214+
logger.trace("policy [{}] for index [{}] was moved back on the failed step for as part of an automatic " +
215+
"retry. Attempting to execute the failed step [{}] if it's an async action", policy, index, stepKey);
216+
maybeRunAsyncAction(newState, newIndexMeta, policy, stepKey);
217+
}
218+
}
219+
}
206220
});
207221
} else {
208222
logger.debug("policy [{}] for index [{}] on an error step after a terminal error, skipping execution", policy, index);

0 commit comments

Comments
 (0)