Skip to content

Commit 1a8f227

Browse files
authored
Improve awareness allocation explanation (#69371)
Users sometimes struggle with unassigned shards due to the awareness decider, especially if the decider is using more zones than expected (e.g. because of a typo in an attribute value, or a stale forced-awareness config). From the existing explanation message it is hard to see why there are more zones than expected, or even to tell that this is the problem at all. This commit adds the names of the zones that the allocation decider is using to the explanation message, so that users can see for themselves whether that is the source of the problem, and rewords the message slightly to make it easier to understand.
1 parent 2ba3e92 commit 1a8f227

File tree

2 files changed

+117
-8
lines changed

2 files changed

+117
-8
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
import java.util.List;
2424
import java.util.Map;
2525
import java.util.function.Function;
26+
import java.util.stream.StreamSupport;
2627

2728
import static java.util.Collections.emptyList;
29+
import static java.util.stream.Collectors.toList;
2830
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;
2931

3032
/**
@@ -186,25 +188,44 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout
186188
final int currentNodeCount = shardPerAttribute.get(node.node().getAttributes().get(awarenessAttribute));
187189
final int maximumNodeCount = (shardCount + numberOfAttributes - 1) / numberOfAttributes; // ceil(shardCount/numberOfAttributes)
188190
if (currentNodeCount > maximumNodeCount) {
189-
return debug ? debugNoTooManyCopies(shardCount, awarenessAttribute, numberOfAttributes, currentNodeCount, maximumNodeCount)
191+
return debug ? debugNoTooManyCopies(
192+
shardCount,
193+
awarenessAttribute,
194+
node.node().getAttributes().get(awarenessAttribute),
195+
numberOfAttributes,
196+
StreamSupport.stream(nodesPerAttribute.keys().spliterator(), false).map(c -> c.value).sorted().collect(toList()),
197+
fullValues == null ? null : fullValues.stream().sorted().collect(toList()),
198+
currentNodeCount,
199+
maximumNodeCount)
190200
: Decision.NO;
191201
}
192202
}
193203

194204
return YES_ALL_MET;
195205
}
196206

197-
private static Decision debugNoTooManyCopies(int shardCount, String awarenessAttribute, int numberOfAttributes, int currentNodeCount,
198-
int maximumNodeCount) {
207+
private static Decision debugNoTooManyCopies(
208+
int shardCount,
209+
String attributeName,
210+
String attributeValue,
211+
int numberOfAttributes,
212+
List<String> realAttributes,
213+
List<String> forcedAttributes,
214+
int currentNodeCount,
215+
int maximumNodeCount) {
199216
return Decision.single(Decision.Type.NO, NAME,
200-
"there are too many copies of the shard allocated to nodes with attribute [%s], there are [%d] total configured " +
201-
"shard copies for this shard id and [%d] total attribute values, expected the allocated shard count per " +
202-
"attribute [%d] to be less than or equal to the upper bound of the required number of shards per attribute [%d]",
203-
awarenessAttribute,
217+
"there are [%d] copies of this shard and [%d] values for attribute [%s] (%s from nodes in the cluster and %s) so there " +
218+
"may be at most [%d] copies of this shard allocated to nodes with each value, but (including this copy) there " +
219+
"would be [%d] copies allocated to nodes with [node.attr.%s: %s]",
204220
shardCount,
205221
numberOfAttributes,
222+
attributeName,
223+
realAttributes,
224+
forcedAttributes == null ? "no forced awareness" : forcedAttributes + " from forced awareness",
225+
maximumNodeCount,
206226
currentNodeCount,
207-
maximumNodeCount);
227+
attributeName,
228+
attributeValue);
208229
}
209230

210231
private static Decision debugNoMissingAttribute(String awarenessAttribute, List<String> awarenessAttributes) {

server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,27 @@
1717
import org.elasticsearch.cluster.metadata.IndexMetadata;
1818
import org.elasticsearch.cluster.metadata.Metadata;
1919
import org.elasticsearch.cluster.node.DiscoveryNodes;
20+
import org.elasticsearch.cluster.routing.RoutingNode;
2021
import org.elasticsearch.cluster.routing.RoutingTable;
2122
import org.elasticsearch.cluster.routing.ShardRouting;
2223
import org.elasticsearch.cluster.routing.ShardRoutingState;
2324
import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands;
2425
import org.elasticsearch.cluster.routing.allocation.command.CancelAllocationCommand;
2526
import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand;
27+
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
2628
import org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
2729
import org.elasticsearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
30+
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
31+
import org.elasticsearch.common.settings.ClusterSettings;
2832
import org.elasticsearch.common.settings.Settings;
2933

3034
import java.util.HashMap;
3135
import java.util.Map;
36+
import java.util.function.UnaryOperator;
37+
import java.util.stream.StreamSupport;
3238

3339
import static java.util.Collections.emptyMap;
40+
import static java.util.Collections.singletonList;
3441
import static java.util.Collections.singletonMap;
3542
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
3643
import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
@@ -937,4 +944,85 @@ public void testNodesWithoutAttributeAreIgnored() {
937944
assertTrue(clusterState.getRoutingNodes().node("X-0").isEmpty());
938945
}
939946

947+
public void testExplanation() {
948+
testExplanation(Settings.builder()
949+
.put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone"),
950+
UnaryOperator.identity(),
951+
"there are [5] copies of this shard and [2] values for attribute [zone] ([a, b] from nodes in the cluster and " +
952+
"no forced awareness) so there may be at most [3] copies of this shard allocated to nodes with each " +
953+
"value, but (including this copy) there would be [4] copies allocated to nodes with [node.attr.zone: a]");
954+
}
955+
956+
public void testExplanationWithMissingAttribute() {
957+
testExplanation(Settings.builder()
958+
.put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone"),
959+
n -> n.add(newNode("X-0", emptyMap())),
960+
"there are [5] copies of this shard and [2] values for attribute [zone] ([a, b] from nodes in the cluster and " +
961+
"no forced awareness) so there may be at most [3] copies of this shard allocated to nodes with each " +
962+
"value, but (including this copy) there would be [4] copies allocated to nodes with [node.attr.zone: a]");
963+
}
964+
965+
public void testExplanationWithForcedAttributes() {
966+
testExplanation(Settings.builder()
967+
.put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone")
968+
.put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.getKey() + "zone.values",
969+
"b,c"),
970+
UnaryOperator.identity(),
971+
"there are [5] copies of this shard and [3] values for attribute [zone] ([a, b] from nodes in the cluster and " +
972+
"[b, c] from forced awareness) so there may be at most [2] copies of this shard allocated to nodes with each " +
973+
"value, but (including this copy) there would be [3] copies allocated to nodes with [node.attr.zone: a]");
974+
}
975+
976+
private void testExplanation(
977+
Settings.Builder settingsBuilder,
978+
UnaryOperator<DiscoveryNodes.Builder> nodesOperator,
979+
String expectedMessage) {
980+
final Settings settings = settingsBuilder
981+
.build();
982+
983+
final AllocationService strategy = createAllocationService(settings);
984+
985+
final Metadata metadata = Metadata.builder()
986+
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(4))
987+
.build();
988+
989+
final ClusterState clusterState = applyStartedShardsUntilNoChange(
990+
ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(Settings.EMPTY))
991+
.metadata(metadata)
992+
.routingTable(RoutingTable.builder()
993+
.addAsNew(metadata.index("test"))
994+
.build())
995+
.nodes(nodesOperator.apply(DiscoveryNodes.builder()
996+
.add(newNode("A-0", singletonMap("zone", "a")))
997+
.add(newNode("A-1", singletonMap("zone", "a")))
998+
.add(newNode("A-2", singletonMap("zone", "a")))
999+
.add(newNode("A-3", singletonMap("zone", "a")))
1000+
.add(newNode("A-4", singletonMap("zone", "a")))
1001+
.add(newNode("B-0", singletonMap("zone", "b"))))
1002+
).build(), strategy);
1003+
1004+
final ShardRouting unassignedShard = clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0);
1005+
1006+
final AwarenessAllocationDecider decider = new AwarenessAllocationDecider(
1007+
settings,
1008+
new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));
1009+
1010+
final RoutingNode emptyNode = StreamSupport.stream(clusterState.getRoutingNodes().spliterator(), false)
1011+
.filter(RoutingNode::isEmpty).findFirst().orElseThrow(AssertionError::new);
1012+
1013+
final RoutingAllocation routingAllocation = new RoutingAllocation(
1014+
new AllocationDeciders(singletonList(decider)),
1015+
clusterState.getRoutingNodes(),
1016+
clusterState,
1017+
null,
1018+
null,
1019+
0L);
1020+
routingAllocation.debugDecision(true);
1021+
1022+
final Decision decision = decider.canAllocate(unassignedShard, emptyNode, routingAllocation);
1023+
assertThat(decision.type(), equalTo(Decision.Type.NO));
1024+
assertThat(decision.label(), equalTo("awareness"));
1025+
assertThat(decision.getExplanation(), equalTo(expectedMessage));
1026+
}
1027+
9401028
}

0 commit comments

Comments
 (0)