Skip to content

Commit 79d0c4e

Browse files
authored
ILM: allow check-migration step to continue if tier setting unset (#62636) (#62724)
This allows the `check-migration` step to move past the allocation check if the tier routing settings are manually unset. This helps a user unblock ILM in case a tier is removed (i.e. if the warm tier is decommissioned, this will allow users to resume the ILM policies stuck in `check-migration` waiting for the warm nodes to become available and the managed index to allocate; once the setting is unset, the index can allocate on the other available tiers). (cherry picked from commit d7a1eaa) Signed-off-by: Andrei Dan <[email protected]>
1 parent ee835ee commit 79d0c4e

File tree

3 files changed

+162
-9
lines changed

3 files changed

+162
-9
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java

+18-4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.cluster.node.DiscoveryNode;
1414
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
1515
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
16+
import org.elasticsearch.common.Strings;
1617
import org.elasticsearch.common.settings.ClusterSettings;
1718
import org.elasticsearch.common.settings.Setting;
1819
import org.elasticsearch.common.settings.Settings;
@@ -25,8 +26,9 @@
2526
import java.util.Set;
2627

2728
import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_ROLE;
28-
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING;
29+
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER_SETTING;
2930
import static org.elasticsearch.xpack.core.ilm.AllocationRoutedStep.getPendingAllocations;
31+
import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo;
3032

3133
/**
3234
* Checks whether all shards have been correctly routed in response to updating the allocation rules for an index in order
@@ -70,11 +72,23 @@ public Result isConditionMet(Index index, ClusterState clusterState) {
7072
logger.debug("[{}] lifecycle action for index [{}] executed but index no longer exists", getKey().getAction(), index.getName());
7173
return new Result(false, null);
7274
}
73-
String destinationTier = INDEX_ROUTING_INCLUDE_SETTING.get(idxMeta.getSettings());
75+
String destinationTier = INDEX_ROUTING_PREFER_SETTING.get(idxMeta.getSettings());
7476
if (ActiveShardCount.ALL.enoughShardsActive(clusterState, index.getName()) == false) {
75-
logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active",
77+
if (Strings.isEmpty(destinationTier)) {
78+
logger.debug("[{}] lifecycle action for index [{}] cannot make progress because not all shards are active",
79+
getKey().getAction(), index.getName());
80+
} else {
81+
logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active",
7682
getKey().getAction(), index.getName(), destinationTier);
77-
return new Result(false, AllocationInfo.waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas()));
83+
}
84+
return new Result(false, waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas()));
85+
}
86+
87+
if (Strings.isEmpty(destinationTier)) {
88+
logger.debug("index [{}] has no data tier routing setting configured and all its shards are active. considering the [{}] " +
89+
"step condition met and continuing to the next step", index.getName(), getKey().getName());
90+
// the user removed the tier routing setting and all the shards are active so we'll carry on
91+
return new Result(true, null);
7892
}
7993

8094
int allocationPendingAllShards = getPendingAllocations(index, ALLOCATION_DECIDERS, clusterState);

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java

+51-5
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
import java.util.Set;
3131

3232
import static java.util.Collections.emptyMap;
33-
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING;
33+
import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER;
3434
import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo;
3535
import static org.hamcrest.Matchers.is;
3636
import static org.hamcrest.Matchers.nullValue;
@@ -95,7 +95,7 @@ public void testExecuteWithUnassignedShard() {
9595

9696
public void testExecuteWithPendingShards() {
9797
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
98-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
98+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
9999
.numberOfShards(1).numberOfReplicas(0).build();
100100
Index index = indexMetadata.getIndex();
101101
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -122,7 +122,7 @@ public void testExecuteWithPendingShards() {
122122

123123
public void testExecuteWithPendingShardsAndTargetRoleNotPresentInCluster() {
124124
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
125-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
125+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
126126
.numberOfShards(1).numberOfReplicas(0).build();
127127
Index index = indexMetadata.getIndex();
128128
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -159,7 +159,7 @@ public void testExecuteIndexMissing() {
159159

160160
public void testExecuteIsComplete() {
161161
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
162-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
162+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
163163
.numberOfShards(1).numberOfReplicas(0).build();
164164
Index index = indexMetadata.getIndex();
165165
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -181,7 +181,7 @@ public void testExecuteIsComplete() {
181181

182182
public void testExecuteWithGenericDataNodes() {
183183
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
184-
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM))
184+
.settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM))
185185
.numberOfShards(1).numberOfReplicas(0).build();
186186
Index index = indexMetadata.getIndex();
187187
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
@@ -200,6 +200,52 @@ public void testExecuteWithGenericDataNodes() {
200200
assertThat(result.getInfomationContext(), is(nullValue()));
201201
}
202202

203+
public void testExecuteForIndexWithoutTierRoutingInformationWaitsForReplicasToBeActive() {
204+
IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10))
205+
.settings(settings(Version.CURRENT))
206+
.numberOfShards(1).numberOfReplicas(1).build();
207+
Index index = indexMetadata.getIndex();
208+
{
209+
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
210+
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node1", true, ShardRoutingState.STARTED))
211+
.addReplica();
212+
213+
ClusterState clusterState =
214+
ClusterState.builder(ClusterState.EMPTY_STATE).metadata(Metadata.builder().put(indexMetadata, true).build())
215+
.nodes(DiscoveryNodes.builder()
216+
.add(newNode("node1", Collections.singleton(DataTier.DATA_HOT_NODE_ROLE)))
217+
)
218+
.routingTable(RoutingTable.builder().add(indexRoutingTable).build())
219+
.build();
220+
DataTierMigrationRoutedStep step = createRandomInstance();
221+
Result expectedResult = new Result(false, waitingForActiveShardsAllocationInfo(1));
222+
223+
Result result = step.isConditionMet(index, clusterState);
224+
assertThat(result.isComplete(), is(false));
225+
assertThat(result.getInfomationContext(), is(expectedResult.getInfomationContext()));
226+
}
227+
228+
{
229+
IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index)
230+
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node1", true, ShardRoutingState.STARTED))
231+
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node2", false, ShardRoutingState.STARTED));
232+
233+
ClusterState clusterState =
234+
ClusterState.builder(ClusterState.EMPTY_STATE).metadata(Metadata.builder().put(indexMetadata, true).build())
235+
.nodes(DiscoveryNodes.builder()
236+
.add(newNode("node1", Collections.singleton(DataTier.DATA_HOT_NODE_ROLE)))
237+
.add(newNode("node2", Collections.singleton(DataTier.DATA_WARM_NODE_ROLE)))
238+
)
239+
.routingTable(RoutingTable.builder().add(indexRoutingTable).build())
240+
.build();
241+
DataTierMigrationRoutedStep step = createRandomInstance();
242+
243+
Result result = step.isConditionMet(index, clusterState);
244+
assertThat(result.isComplete(), is(true));
245+
assertThat(result.getInfomationContext(), is(nullValue()));
246+
}
247+
}
248+
203249
private DiscoveryNode newNode(String nodeId, Set<DiscoveryNodeRole> roles) {
204250
return new DiscoveryNode(nodeId, buildNewFakeTransportAddress(), emptyMap(), roles, Version.CURRENT);
205251
}

x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java

+93
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,16 @@
66

77
package org.elasticsearch.xpack.ilm;
88

9+
import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest;
10+
import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainResponse;
911
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
12+
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
13+
import org.elasticsearch.cluster.routing.ShardRoutingState;
1014
import org.elasticsearch.common.settings.Settings;
1115
import org.elasticsearch.common.unit.TimeValue;
1216
import org.elasticsearch.plugins.Plugin;
1317
import org.elasticsearch.test.ESIntegTestCase;
18+
import org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider;
1419
import org.elasticsearch.xpack.core.DataTier;
1520
import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
1621
import org.elasticsearch.xpack.core.XPackSettings;
@@ -29,6 +34,7 @@
2934
import java.util.Collection;
3035
import java.util.Collections;
3136
import java.util.Locale;
37+
import java.util.concurrent.TimeUnit;
3238

3339
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
3440
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
@@ -153,4 +159,91 @@ public void testIndexDataTierMigration() throws Exception {
153159
assertThat(indexLifecycleExplainResponse.getStep(), is("complete"));
154160
});
155161
}
162+
163+
public void testUserOptsOutOfTierMigration() throws Exception {
164+
internalCluster().startMasterOnlyNodes(1, Settings.EMPTY);
165+
logger.info("starting hot data node");
166+
internalCluster().startNode(hotNode(Settings.EMPTY));
167+
168+
Phase hotPhase = new Phase("hot", TimeValue.ZERO, Collections.emptyMap());
169+
Phase warmPhase = new Phase("warm", TimeValue.ZERO, Collections.emptyMap());
170+
Phase coldPhase = new Phase("cold", TimeValue.ZERO, Collections.emptyMap());
171+
LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(
172+
policy, org.elasticsearch.common.collect.Map.of("hot", hotPhase, "warm", warmPhase, "cold", coldPhase)
173+
);
174+
PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy);
175+
PutLifecycleAction.Response putLifecycleResponse = client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get();
176+
assertAcked(putLifecycleResponse);
177+
178+
Settings settings = Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1)
179+
.put(SETTING_NUMBER_OF_REPLICAS, 1).put(LifecycleSettings.LIFECYCLE_NAME, policy).build();
180+
CreateIndexResponse res = client().admin().indices().prepareCreate(managedIndex).setSettings(settings).get();
181+
assertTrue(res.isAcknowledged());
182+
183+
assertBusy(() -> {
184+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
185+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
186+
explainRequest).get();
187+
188+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
189+
assertThat(indexLifecycleExplainResponse.getPhase(), is("warm"));
190+
assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME));
191+
});
192+
193+
Settings removeTierRoutingSetting = Settings.builder().putNull(DataTierAllocationDecider.INDEX_ROUTING_PREFER).build();
194+
UpdateSettingsRequest updateSettingsRequest = new UpdateSettingsRequest(managedIndex).settings(removeTierRoutingSetting);
195+
assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet());
196+
197+
assertBusy(() -> {
198+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
199+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
200+
explainRequest).get();
201+
202+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
203+
assertThat(indexLifecycleExplainResponse.getPhase(), is("warm"));
204+
assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME));
205+
assertReplicaIsUnassigned();
206+
}, 30, TimeUnit.SECONDS);
207+
208+
internalCluster().startNode(coldNode(Settings.EMPTY));
209+
210+
// the index should successfully allocate
211+
ensureGreen(managedIndex);
212+
213+
// the index is successfully allocated but the migrate action from the cold phase re-configured the tier migration setting to the
214+
// cold tier so ILM is stuck in `check-migration` in the cold phase this time
215+
// we have 2 options to resume the ILM execution:
216+
// 1. start another cold node so both the primary and replica can relocate to the cold nodes
217+
// 2. remove the tier routing setting from the index again (we're doing this below)
218+
assertBusy(() -> {
219+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
220+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
221+
explainRequest).get();
222+
223+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
224+
assertThat(indexLifecycleExplainResponse.getPhase(), is("cold"));
225+
assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME));
226+
});
227+
228+
// remove the tier routing setting again
229+
assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet());
230+
231+
// wait for lifecycle to complete in the cold phase
232+
assertBusy(() -> {
233+
ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex);
234+
ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE,
235+
explainRequest).get();
236+
237+
IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex);
238+
assertThat(indexLifecycleExplainResponse.getPhase(), is("cold"));
239+
assertThat(indexLifecycleExplainResponse.getStep(), is("complete"));
240+
}, 30, TimeUnit.SECONDS);
241+
}
242+
243+
private void assertReplicaIsUnassigned() {
244+
ClusterAllocationExplainRequest explainReplicaShard =
245+
new ClusterAllocationExplainRequest().setIndex(managedIndex).setPrimary(false).setShard(0);
246+
ClusterAllocationExplainResponse response = client().admin().cluster().allocationExplain(explainReplicaShard).actionGet();
247+
assertThat(response.getExplanation().getShardState(), is(ShardRoutingState.UNASSIGNED));
248+
}
156249
}

0 commit comments

Comments
 (0)