Skip to content

ILM fix the init step to actually be retryable #52076

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 13, 2020
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,26 @@ public ClusterState performAction(Index index, ClusterState clusterState) {
return clusterState;
}

LifecycleExecutionState lifecycleState = LifecycleExecutionState
.fromIndexMetadata(indexMetaData);

if (lifecycleState.getLifecycleDate() != null) {
return clusterState;
}

IndexMetaData.Builder indexMetadataBuilder = IndexMetaData.builder(indexMetaData);
if (shouldParseIndexName(indexMetaData.getSettings())) {
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
.settings(Settings.builder()
.put(indexMetaData.getSettings())
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
.build()
);
LifecycleExecutionState lifecycleState;
try {
lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
if (lifecycleState.getLifecycleDate() != null) {
return clusterState;
}

if (shouldParseIndexName(indexMetaData.getSettings())) {
long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName());
indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1)
.settings(Settings.builder()
.put(indexMetaData.getSettings())
.put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate)
.build()
);
}
} catch (Exception e) {
String policy = indexMetaData.getSettings().get(LifecycleSettings.LIFECYCLE_NAME);
throw new InitializePolicyException(policy, index.getName(), e);
}

ClusterState.Builder newClusterStateBuilder = ClusterState.builder(clusterState);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ilm;

import org.elasticsearch.ElasticsearchException;

import java.util.Locale;

/**
* Exception thrown when a problem is encountered while initialising an ILM policy for an index.
*/
public class InitializePolicyException extends ElasticsearchException {

public InitializePolicyException(String policy, String index, Throwable cause) {
super(String.format(Locale.ROOT, "unable to initialize policy [%s] for index [%s]", policy, index), cause);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
*/
public class ShrunkenIndexCheckStep extends ClusterStateWaitStep {
public static final String NAME = "is-shrunken-index";
private static final Logger logger = LogManager.getLogger(InitializePolicyContextStep.class);
private static final Logger logger = LogManager.getLogger(ShrunkenIndexCheckStep.class);
private String shrunkIndexPrefix;

public ShrunkenIndexCheckStep(StepKey key, StepKey nextStepKey, String shrunkIndexPrefix) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ public InitializePolicyContextStep mutateInstance(InitializePolicyContextStep in
StepKey nextKey = instance.getNextStepKey();

switch (between(0, 1)) {
case 0:
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
case 1:
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
default:
throw new AssertionError("Illegal randomisation branch");
case 0:
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
case 1:
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
break;
default:
throw new AssertionError("Illegal randomisation branch");
}

return new InitializePolicyContextStep(key, nextKey);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1176,26 +1176,26 @@ public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Except
// {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy}
// increases the wait time between calls exponentially, we might miss the window where the policy is on
// {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful.
waitUntil(() -> {
assertTrue(waitUntil(() -> {
try {
return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200;
} catch (IOException e) {
return false;
}
}, 30, TimeUnit.SECONDS);
}, 30, TimeUnit.SECONDS));

// Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being
// retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step)
waitUntil(() -> {
assertTrue("ILM did not start retrying the attempt-rollover step", waitUntil(() -> {
try {
Map<String, Object> explainIndexResponse = explainIndex(index);
String step = (String) explainIndexResponse.get("step");
String failedStep = (String) explainIndexResponse.get("failed_step");
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
return failedStep != null && failedStep.equals("attempt-rollover") && retryCount != null && retryCount >= 1;
} catch (IOException e) {
return false;
}
}, 30, TimeUnit.SECONDS);
}, 30, TimeUnit.SECONDS));

deleteIndex(rolledIndex);

Expand Down Expand Up @@ -1237,16 +1237,17 @@ public void testUpdateRolloverLifecycleDateStepRetriesWhenRolloverInfoIsMissing(
"}");
client().performRequest(moveToStepRequest);

waitUntil(() -> {
assertTrue("ILM did not start retrying the update-rollover-lifecycle-date step", waitUntil(() -> {
try {
Map<String, Object> explainIndexResponse = explainIndex(index);
String step = (String) explainIndexResponse.get("step");
String failedStep = (String) explainIndexResponse.get("failed_step");
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
return step != null && step.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null && retryCount >= 1;
return failedStep != null && failedStep.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null
&& retryCount >= 1;
} catch (IOException e) {
return false;
}
});
}, 30, TimeUnit.SECONDS));

index(client(), index, "1", "foo", "bar");
Request refreshIndex = new Request("POST", "/" + index + "/_refresh");
Expand Down Expand Up @@ -1432,16 +1433,17 @@ public void testRetryableInitializationStep() throws Exception {
assertOK(client().performRequest(startReq));

// Wait until an error has occurred.
waitUntil(() -> {
assertTrue("ILM did not start retrying the init step", waitUntil(() -> {
try {
Map<String, Object> explainIndexResponse = explainIndex(index);
String step = (String) explainIndexResponse.get("step");
String failedStep = (String) explainIndexResponse.get("failed_step");
Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD);
return step != null && step.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null && retryCount >= 1;
return failedStep != null && failedStep.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null
&& retryCount >= 1;
} catch (IOException e) {
return false;
}
}, 30, TimeUnit.SECONDS);
}, 30, TimeUnit.SECONDS));

// Turn origination date parsing back off
updateIndexSettings(index, Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.elasticsearch.xpack.core.ilm.ErrorStep;
import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
import org.elasticsearch.xpack.core.ilm.InitializePolicyException;
import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
Expand Down Expand Up @@ -133,8 +134,16 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste
ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
causeXContentBuilder.endObject();
LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
Step.StepKey currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
"unable to move to an error step where there is no current step, state: " + currentState);
Step.StepKey currentStep;
// if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
// as we haven't yet initialised the policy, so we'll manually set the current step to be the "initialize policy" step so we can
// record the error (and later retry the init policy step)
if (cause instanceof InitializePolicyException) {
currentStep = InitializePolicyContextStep.KEY;
} else {
currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState),
"unable to move to an error step where there is no current step, state: " + currentState);
}
LifecycleExecutionState nextStepState = updateExecutionStateToStep(policyMetadata, currentState,
new Step.StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);

Expand Down