
Commit 7f52ac4

ILM ClusterStateWaitThresholdBreachTests cycles due to shrunk-shards-allocated (elastic#75695)
ClusterStateWaitThresholdBreachTests is meant to simulate a shrink action failure that lasts past the configured threshold, such that ILM rewinds, deletes the attempted shrunk index, and retries (successfully the second time). We used to simulate this failure by configuring a shrink action with a number of shards higher than the index's number of shards. We're now adding a step that validates against this misconfiguration, so we needed a new way to integration-test this shrink action cycle. This change makes the test use a high number of replicas for the managed index, blocking it in the `shrunk-shards-allocated` step instead of failing in the `shrink` step as before.

(cherry picked from commit 3b2973d)

Signed-off-by: Andrei Dan <[email protected]>

# Conflicts:
#	x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/ClusterStateWaitThresholdBreachTests.java
1 parent: bcace7d · commit: 7f52ac4
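For readers skimming the diff below, here is a minimal, hypothetical Java sketch (not part of this commit) of the block/unblock pattern the reworked test relies on. It assumes an ESIntegTestCase-style integration test with a small cluster, so 42 replicas can never all be allocated and ILM keeps waiting in the `shrunk-shards-allocated` step until the replica count is dropped to 0. The class and method names are illustrative only; the constants are the same ones the diff imports.

    // Hypothetical sketch, not part of the commit: the settings that block and later
    // unblock the `shrunk-shards-allocated` wait on a small test cluster.
    import org.elasticsearch.cluster.metadata.IndexMetadata;
    import org.elasticsearch.common.settings.Settings;

    class ShrunkShardsAllocatedBlockSketch {

        // Settings for the managed index: 42 replicas cannot all be allocated on a
        // small test cluster, and the shrunk index inherits the replica count, so
        // ILM waits in `shrunk-shards-allocated` instead of completing the shrink.
        static Settings blockingIndexSettings(int numShards) {
            return Settings.builder()
                .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, numShards)
                .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 42)
                .build();
        }

        // Settings update applied to the shrunk index later on: dropping the replica
        // count to 0 lets every shard allocate, so the wait step can complete.
        static Settings unblockingSettingsUpdate() {
            return Settings.builder()
                .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
                .build();
        }
    }

In the test itself, the unblocking settings are applied to the second-cycle shrink index via client().admin().indices().prepareUpdateSettings(...), as the diff below shows.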

File tree

1 file changed (+20, −45 lines)

x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/ClusterStateWaitThresholdBreachTests.java

Lines changed: 20 additions & 45 deletions
@@ -8,7 +8,6 @@
 package org.elasticsearch.xpack.ilm;
 
 import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
-import org.elasticsearch.action.admin.indices.shrink.ResizeRequest;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.Settings;
@@ -40,11 +39,11 @@
 import java.util.concurrent.TimeUnit;
 import java.util.function.LongSupplier;
 
+import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
@@ -104,55 +103,33 @@ public void testWaitInShrunkShardsAllocatedExceedsThreshold() throws Exception {
         internalCluster().startDataOnlyNode();
 
         int numShards = 2;
-        {
-            Phase warmPhase = new Phase("warm", TimeValue.ZERO, Map
-                .of(MigrateAction.NAME, new MigrateAction(false), ShrinkAction.NAME,
-                    new ShrinkAction(numShards + randomIntBetween(1, numShards), null))
-            );
-            LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("warm", warmPhase));
-            PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy);
-            assertAcked(client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get());
-        }
-
+        Phase warmPhase = new Phase("warm", TimeValue.ZERO, Map.of(MigrateAction.NAME, new MigrateAction(false), ShrinkAction.NAME,
+            new ShrinkAction(1, null)));
+        LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("warm", warmPhase));
+        PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy);
+        assertAcked(client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get());
+
+        // we're configuring a very high number of replicas. this will make the shrunk index unable to allocate successfully, so ILM
+        // will wait in the `shrunk-shards-allocated` step (we don't wait for the original index to be GREEN before)
         Settings settings = Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, numShards)
-            .put(SETTING_NUMBER_OF_REPLICAS, 0).put(LifecycleSettings.LIFECYCLE_NAME, policy)
+            .put(SETTING_NUMBER_OF_REPLICAS, 42).put(LifecycleSettings.LIFECYCLE_NAME, policy)
            // configuring the threshold to the minimum value
            .put(LifecycleSettings.LIFECYCLE_STEP_WAIT_TIME_THRESHOLD, "1h")
            .build();
         CreateIndexResponse res = client().admin().indices().prepareCreate(managedIndex).setSettings(settings).get();
         assertTrue(res.isAcknowledged());
 
         String[] firstAttemptShrinkIndexName = new String[1];
-        // ILM will retry the shrink step because the number of shards to shrink to is gt the current number of shards
         assertBusy(() -> {
             ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
             ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
                 explainRequest).get();
 
             IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
-            assertThat(indexLifecycleExplainResponse.getFailedStepRetryCount(), greaterThanOrEqualTo(1));
-
             firstAttemptShrinkIndexName[0] = indexLifecycleExplainResponse.getShrinkIndexName();
             assertThat(firstAttemptShrinkIndexName[0], is(notNullValue()));
         }, 30, TimeUnit.SECONDS);
 
-
-        // we're manually shrinking the index but configuring a very high number of replicas and waiting for all active shards
-        // this will make the shrunk index unable to allocate successfully, so ILM will wait in the `shrunk-shards-allocated` step
-        ResizeRequest resizeRequest = new ResizeRequest(firstAttemptShrinkIndexName[0], managedIndex);
-        Settings.Builder builder = Settings.builder();
-        // a very high number of replicas, coupled with an `all` wait for active shards configuration will block the shrink action in the
-        // `shrunk-shards-allocated` step.
-        builder.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 42)
-            .put("index.write.wait_for_active_shards", "all")
-            .put(LifecycleSettings.LIFECYCLE_NAME, policy)
-            .put(IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_SETTING.getKey() + "_id", (String) null)
-            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1);
-        Settings relevantTargetSettings = builder.build();
-        resizeRequest.getTargetIndexRequest().settings(relevantTargetSettings);
-        client().admin().indices().resizeIndex(resizeRequest).get();
-        ensureYellow(firstAttemptShrinkIndexName[0]);
-
         // let's check ILM for the managed index is waiting in the `shrunk-shards-allocated` step
         assertBusy(() -> {
             ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
@@ -197,25 +174,23 @@ public void testWaitInShrunkShardsAllocatedExceedsThreshold() throws Exception {
         // the shrink index generated in the first attempt must've been deleted!
         assertBusy(() -> assertFalse(indexExists(firstAttemptShrinkIndexName[0])));
 
-        // at this point, the managed index is looping in the `shrink` step as the action is trying to shrink to a higher number of
-        // shards than the source index has. we'll update the policy to shrink to 1 shard and this should unblock the policy and it
-        // should successfully shrink the managed index to the second cycle shrink index name
-        {
-            Phase warmPhase = new Phase("warm", TimeValue.ZERO, Map.of(MigrateAction.NAME,
-                new MigrateAction(false), ShrinkAction.NAME, new ShrinkAction(1, null))
-            );
-            LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("warm", warmPhase));
-            PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy);
-            assertAcked(client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get());
-        }
-
         assertBusy(() -> assertTrue(indexExists(secondCycleShrinkIndexName[0])), 30, TimeUnit.SECONDS);
+
+        // at this point, the second shrink attempt was executed and the managed index is looping in the `shrunk-shards-allocated` step,
+        // waiting for the huge number of replicas of the shrunk index to allocate. this will never happen, so let's unblock this
+        // situation and allow the shrink to complete by reducing the number of replicas of the shrunk index to 0
+        Settings.Builder zeroReplicasSetting = Settings.builder().put(INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0);
+        assertAcked(
+            client().admin().indices().prepareUpdateSettings(secondCycleShrinkIndexName[0]).setSettings(zeroReplicasSetting)
+        );
+
         assertBusy(() -> {
             ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(secondCycleShrinkIndexName[0]);
             ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
                 explainRequest).get();
             IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses()
                 .get(secondCycleShrinkIndexName[0]);
+            assertThat(indexLifecycleExplainResponse.getPhase(), equalTo("warm"));
             assertThat(indexLifecycleExplainResponse.getStep(), equalTo(PhaseCompleteStep.NAME));
         }, 30, TimeUnit.SECONDS);
     }
