Skip to content

Commit d7a1eaa

Browse files
authored
ILM: allow check-migration step to continue if tier setting unset (#62636)
This allows the `check-migration` step to move past the allocation check if the tier routing settings are manually unset. This helps a user unblock ILM in case a tier is removed (e.g. if the warm tier is decommissioned, this will allow users to resume the ILM policies stuck in `check-migration` waiting for the warm nodes to become available and the managed index to allocate; the index can then allocate on the other available tiers).
1 parent e48eab9 commit d7a1eaa

File tree

3 files changed

+160
-9
lines changed

3 files changed

+160
-9
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.cluster.node.DiscoveryNode;
1414
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
1515
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
16+
import org.elasticsearch.common.Strings;
1617
import org.elasticsearch.common.settings.ClusterSettings;
1718
import org.elasticsearch.common.settings.Setting;
1819
import org.elasticsearch.common.settings.Settings;
@@ -26,8 +27,9 @@
2627
import java.util.Set;
2728

2829
import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_ROLE;
29-
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING;
30+
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER_SETTING;
3031
import static org.elasticsearch.xpack.core.ilm.AllocationRoutedStep.getPendingAllocations;
32+
import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo;
3133

3234
/**
3335
* Checks whether all shards have been correctly routed in response to updating the allocation rules for an index in order
@@ -71,11 +73,23 @@ public Result isConditionMet(Index index, ClusterState clusterState) {
7173
logger.debug("[{}] lifecycle action for index [{}] executed but index no longer exists", getKey().getAction(), index.getName());
7274
return new Result(false, null);
7375
}
74-
String destinationTier = INDEX_ROUTING_INCLUDE_SETTING.get(idxMeta.getSettings());
76+
String destinationTier = INDEX_ROUTING_PREFER_SETTING.get(idxMeta.getSettings());
7577
if (ActiveShardCount.ALL.enoughShardsActive(clusterState, index.getName()) == false) {
76-
logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active",
78+
if (Strings.isEmpty(destinationTier)) {
79+
logger.debug("[{}] lifecycle action for index [{}] cannot make progress because not all shards are active",
80+
getKey().getAction(), index.getName());
81+
} else {
82+
logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active",
7783
getKey().getAction(), index.getName(), destinationTier);
78-
return new Result(false, AllocationInfo.waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas()));
84+
}
85+
return new Result(false, waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas()));
86+
}
87+
88+
if (Strings.isEmpty(destinationTier)) {
89+
logger.debug("index [{}] has no data tier routing setting configured and all its shards are active. considering the [{}] " +
90+
"step condition met and continuing to the next step", index.getName(), getKey().getName());
91+
// the user removed the tier routing setting and all the shards are active, so we'll carry on
92+
return new Result(true, null);
7993
}
8094

8195
int allocationPendingAllShards = getPendingAllocations(index, ALLOCATION_DECIDERS, clusterState);

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
import java.util.Set;
3131

3232
import static java.util.Collections.emptyMap;
33-
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING;
33+
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER;
3434
import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo;
3535
import static org.hamcrest.Matchers.is;
3636
import static org.hamcrest.Matchers.nullValue;
@@ -95,7 +95,7 @@ public void testExecuteWithUnassignedShard() {
9595

9696
public void testExecuteWithPendingShards() {
9797
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
98-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
98+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
9999
.numberOfShards(1).numberOfReplicas(0).build();
100100
Index index = indexMetadata.getIndex();
101101
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -122,7 +122,7 @@ public void testExecuteWithPendingShards() {
122122

123123
public void testExecuteWithPendingShardsAndTargetRoleNotPresentInCluster() {
124124
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
125-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
125+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
126126
.numberOfShards(1).numberOfReplicas(0).build();
127127
Index index = indexMetadata.getIndex();
128128
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -159,7 +159,7 @@ public void testExecuteIndexMissing() {
159159

160160
public void testExecuteIsComplete() {
161161
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
162-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
162+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
163163
.numberOfShards(1).numberOfReplicas(0).build();
164164
Index index = indexMetadata.getIndex();
165165
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -181,7 +181,7 @@ public void testExecuteIsComplete() {
181181

182182
public void testExecuteWithGenericDataNodes() {
183183
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
184-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
184+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
185185
.numberOfShards(1).numberOfReplicas(0).build();
186186
Index index = indexMetadata.getIndex();
187187
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -200,6 +200,52 @@ public void testExecuteWithGenericDataNodes() {
200200
assertThat(result.getInfomationContext(), is(nullValue()));
201201
}
202202

203+
public void testExecuteForIndexWithoutTierRoutingInformationWaitsForReplicasToBeActive() {
204+
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
205+
.settings(settings(Version.CURRENT))
206+
.numberOfShards(1).numberOfReplicas(1).build();
207+
Index index = indexMetadata.getIndex();
208+
{
209+
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
210+
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node1", true, ShardRoutingState.STARTED))
211+
.addReplica();
212+
213+
ClusterState clusterState =
214+
ClusterState.builder(ClusterState.EMPTY_STATE).metadata(Metadata.builder().put(indexMetadata, true).build())
215+
.nodes(DiscoveryNodes.builder()
216+
.add(newNode("node1", Collections.singleton(DataTier.DATA_HOT_NODE_ROLE)))
217+
)
218+
.routingTable(RoutingTable.builder().add(indexRoutingTable).build())
219+
.build();
220+
DataTierMigrationRoutedStep step = createRandomInstance();
221+
Result expectedResult = new Result(false, waitingForActiveShardsAllocationInfo(1));
222+
223+
Result result = step.isConditionMet(index, clusterState);
224+
assertThat(result.isComplete(), is(false));
225+
assertThat(result.getInfomationContext(), is(expectedResult.getInfomationContext()));
226+
}
227+
228+
{
229+
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
230+
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node1", true, ShardRoutingState.STARTED))
231+
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node2", false, ShardRoutingState.STARTED));
232+
233+
ClusterState clusterState =
234+
ClusterState.builder(ClusterState.EMPTY_STATE).metadata(Metadata.builder().put(indexMetadata, true).build())
235+
.nodes(DiscoveryNodes.builder()
236+
.add(newNode("node1", Collections.singleton(DataTier.DATA_HOT_NODE_ROLE)))
237+
.add(newNode("node2", Collections.singleton(DataTier.DATA_WARM_NODE_ROLE)))
238+
)
239+
.routingTable(RoutingTable.builder().add(indexRoutingTable).build())
240+
.build();
241+
DataTierMigrationRoutedStep step = createRandomInstance();
242+
243+
Result result = step.isConditionMet(index, clusterState);
244+
assertThat(result.isComplete(), is(true));
245+
assertThat(result.getInfomationContext(), is(nullValue()));
246+
}
247+
}
248+
203249
private DiscoveryNode newNode(String nodeId, Set<DiscoveryNodeRole> roles) {
204250
return new DiscoveryNode(nodeId, buildNewFakeTransportAddress(), emptyMap(), roles, Version.CURRENT);
205251
}

x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,16 @@
66

77
package org.elasticsearch.xpack.ilm;
88

9+
import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest;
10+
import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainResponse;
911
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
12+
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
13+
import org.elasticsearch.cluster.routing.ShardRoutingState;
1014
import org.elasticsearch.common.settings.Settings;
1115
import org.elasticsearch.common.unit.TimeValue;
1216
import org.elasticsearch.plugins.Plugin;
1317
import org.elasticsearch.test.ESIntegTestCase;
18+
import org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider;
1419
import org.elasticsearch.xpack.core.DataTier;
1520
import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
1621
import org.elasticsearch.xpack.core.XPackSettings;
@@ -30,6 +35,7 @@
3035
import java.util.Collections;
3136
import java.util.Locale;
3237
import java.util.Map;
38+
import java.util.concurrent.TimeUnit;
3339

3440
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
3541
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
@@ -138,4 +144,89 @@ public void testIndexDataTierMigration() throws Exception {
138144
assertThat(indexLifecycleExplainResponse.getStep(), is("complete"));
139145
});
140146
}
147+
148+
public void testUserOptsOutOfTierMigration() throws Exception {
149+
internalCluster().startMasterOnlyNodes(1, Settings.EMPTY);
150+
logger.info("starting hot data node");
151+
internalCluster().startNode(hotNode(Settings.EMPTY));
152+
153+
Phase hotPhase = new Phase("hot", TimeValue.ZERO, Collections.emptyMap());
154+
Phase warmPhase = new Phase("warm", TimeValue.ZERO, Collections.emptyMap());
155+
Phase coldPhase = new Phase("cold", TimeValue.ZERO, Collections.emptyMap());
156+
LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("hot", hotPhase, "warm", warmPhase, "cold", coldPhase));
157+
PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy);
158+
PutLifecycleAction.Response putLifecycleResponse = client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get();
159+
assertAcked(putLifecycleResponse);
160+
161+
Settings settings = Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1)
162+
.put(SETTING_NUMBER_OF_REPLICAS, 1).put(LifecycleSettings.LIFECYCLE_NAME, policy).build();
163+
CreateIndexResponse res = client().admin().indices().prepareCreate(managedIndex).setSettings(settings).get();
164+
assertTrue(res.isAcknowledged());
165+
166+
assertBusy(() -> {
167+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
168+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
169+
explainRequest).get();
170+
171+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
172+
assertThat(indexLifecycleExplainResponse.getPhase(), is("warm"));
173+
assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME));
174+
});
175+
176+
Settings removeTierRoutingSetting = Settings.builder().putNull(DataTierAllocationDecider.INDEX_ROUTING_PREFER).build();
177+
UpdateSettingsRequest updateSettingsRequest = new UpdateSettingsRequest(managedIndex).settings(removeTierRoutingSetting);
178+
assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet());
179+
180+
assertBusy(() -> {
181+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
182+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
183+
explainRequest).get();
184+
185+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
186+
assertThat(indexLifecycleExplainResponse.getPhase(), is("warm"));
187+
assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME));
188+
assertReplicaIsUnassigned();
189+
}, 30, TimeUnit.SECONDS);
190+
191+
internalCluster().startNode(coldNode(Settings.EMPTY));
192+
193+
// the index should successfully allocate
194+
ensureGreen(managedIndex);
195+
196+
// the index is successfully allocated but the migrate action from the cold phase re-configured the tier migration setting to the
197+
// cold tier so ILM is stuck in `check-migration` in the cold phase this time
198+
// we have 2 options to resume the ILM execution:
199+
// 1. start another cold node so both the primary and replica can relocate to the cold nodes
200+
// 2. remove the tier routing setting from the index again (we're doing this below)
201+
assertBusy(() -> {
202+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
203+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
204+
explainRequest).get();
205+
206+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
207+
assertThat(indexLifecycleExplainResponse.getPhase(), is("cold"));
208+
assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME));
209+
});
210+
211+
// remove the tier routing setting again
212+
assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet());
213+
214+
// wait for lifecycle to complete in the cold phase
215+
assertBusy(() -> {
216+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
217+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
218+
explainRequest).get();
219+
220+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
221+
assertThat(indexLifecycleExplainResponse.getPhase(), is("cold"));
222+
assertThat(indexLifecycleExplainResponse.getStep(), is("complete"));
223+
}, 30, TimeUnit.SECONDS);
224+
}
225+
226+
private void assertReplicaIsUnassigned() {
227+
ClusterAllocationExplainRequest explainReplicaShard =
228+
new ClusterAllocationExplainRequest().setIndex(managedIndex).setPrimary(false).setShard(0);
229+
ClusterAllocationExplainResponse response = client().admin().cluster().allocationExplain(explainReplicaShard).actionGet();
230+
assertThat(response.getExplanation().getShardState(), is(ShardRoutingState.UNASSIGNED));
231+
}
141232
}

0 commit comments

Comments
 (0)