Skip to content

Commit bb3ea99

Browse files
authored
Skip zone/host awareness with auto-expand replicas (#69334)
Today if an index is set to `auto_expand_replicas: N-all` then we will try and create a shard copy on every node that matches the applicable allocation filters. This conflicts with shard allocation awareness and the same-host allocation decider if there is an uneven distribution of nodes across zones or hosts, since these deciders prevent shard copies from being allocated unevenly and may therefore leave some unassigned shards. The point of these two deciders is to improve resilience given a limited number of shard copies but there is no need for this behaviour when the number of shard copies is not limited, so this commit suppresses them in that case. Closes #54151 Closes #2869
1 parent f27da75 commit bb3ea99

File tree

8 files changed

+142
-20
lines changed

8 files changed

+142
-20
lines changed

docs/reference/index-modules.asciidoc

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,16 +139,19 @@ specific index module:
139139
The number of replicas each primary shard has. Defaults to 1.
140140

141141
`index.auto_expand_replicas`::
142-
143-
Auto-expand the number of replicas based on the number of data nodes in the cluster.
144-
Set to a dash delimited lower and upper bound (e.g. `0-5`) or use `all`
145-
for the upper bound (e.g. `0-all`). Defaults to `false` (i.e. disabled).
146-
Note that the auto-expanded number of replicas only takes
147-
<<shard-allocation-filtering,allocation filtering>> rules into account, but ignores
148-
any other allocation rules such as <<shard-allocation-awareness,shard allocation awareness>>
149-
and <<allocation-total-shards,total shards per node>>, and this can lead to the
150-
cluster health becoming `YELLOW` if the applicable rules prevent all the replicas
151-
from being allocated.
142+
Auto-expand the number of replicas based on the number of data nodes in the
143+
cluster. Set to a dash delimited lower and upper bound (e.g. `0-5`) or use `all`
144+
for the upper bound (e.g. `0-all`). Defaults to `false` (i.e. disabled). Note
145+
that the auto-expanded number of replicas only takes
146+
<<shard-allocation-filtering,allocation filtering>> rules into account, but
147+
ignores other allocation rules such as <<allocation-total-shards,total shards
148+
per node>>, and this can lead to the cluster health becoming `YELLOW` if the
149+
applicable rules prevent all the replicas from being allocated.
150+
+
151+
If the upper bound is `all` then <<shard-allocation-awareness,shard allocation
152+
awareness>> and
153+
<<cluster-routing-allocation-same-shard-host,`cluster.routing.allocation.same_shard.host`>>
154+
are ignored for this index.
152155

153156
`index.search.idle.after`::
154157
How long a shard can not receive a search or get request until it's considered

docs/reference/modules/cluster/shards_allocation.asciidoc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ one of the active allocation ids in the cluster state.
4545
These should be fast so more initial primary recoveries can happen in
4646
parallel on the same node. Defaults to `4`.
4747

48-
48+
[[cluster-routing-allocation-same-shard-host]]
4949
`cluster.routing.allocation.same_shard.host`::
5050
(<<dynamic-cluster-setting,Dynamic>>)
5151
Allows to perform a check to prevent allocation of multiple instances of

server/src/main/java/org/elasticsearch/cluster/metadata/AutoExpandReplicas.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ int getMaxReplicas(int numDataNodes) {
9191
return Math.min(maxReplicas, numDataNodes-1);
9292
}
9393

94+
public boolean expandToAllNodes() {
95+
return maxReplicas == Integer.MAX_VALUE;
96+
}
97+
9498
private OptionalInt getDesiredNumberOfReplicas(IndexMetadata indexMetadata, RoutingAllocation allocation) {
9599
if (enabled) {
96100
int numMatchingDataNodes = 0;

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@
88

99
package org.elasticsearch.cluster.routing.allocation.decider;
1010

11-
import java.util.HashMap;
12-
import java.util.List;
13-
import java.util.Map;
14-
import java.util.function.Function;
15-
1611
import com.carrotsearch.hppc.ObjectIntHashMap;
1712
import org.elasticsearch.cluster.metadata.IndexMetadata;
1813
import org.elasticsearch.cluster.routing.RoutingNode;
@@ -24,7 +19,13 @@
2419
import org.elasticsearch.common.settings.Setting.Property;
2520
import org.elasticsearch.common.settings.Settings;
2621

22+
import java.util.HashMap;
23+
import java.util.List;
24+
import java.util.Map;
25+
import java.util.function.Function;
26+
2727
import static java.util.Collections.emptyList;
28+
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;
2829

2930
/**
3031
* This {@link AllocationDecider} controls shard allocation based on
@@ -118,6 +119,9 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl
118119
"allocation awareness is not enabled, set cluster setting ["
119120
+ CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey() + "] to enable it");
120121

122+
private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(Decision.Type.YES, NAME,
123+
"allocation awareness is ignored, this index is set to auto-expand to all nodes");
124+
121125
private static final Decision YES_ALL_MET =
122126
Decision.single(Decision.Type.YES, NAME, "node meets all awareness attribute requirements");
123127

@@ -128,6 +132,11 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout
128132

129133
final boolean debug = allocation.debugDecision();
130134
IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index());
135+
136+
if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(indexMetadata.getSettings()).expandToAllNodes()) {
137+
return YES_AUTO_EXPAND_ALL;
138+
}
139+
131140
int shardCount = indexMetadata.getNumberOfReplicas() + 1; // 1 for primary
132141
for (String awarenessAttribute : awarenessAttributes) {
133142
// the node the shard exists on must be associated with an awareness attribute

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SameShardAllocationDecider.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import org.elasticsearch.common.settings.Setting.Property;
1818
import org.elasticsearch.common.settings.Settings;
1919

20+
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;
21+
2022
/**
2123
* An allocation decider that prevents multiple instances of the same shard to
2224
* be allocated on the same {@code node}.
@@ -58,6 +60,9 @@ private void setSameHost(boolean sameHost) {
5860
private static final Decision YES_NONE_HOLD_COPY =
5961
Decision.single(Decision.Type.YES, NAME, "none of the nodes on this host hold a copy of this shard");
6062

63+
private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(Decision.Type.YES, NAME,
64+
"same-host allocation is ignored, this index is set to auto-expand to all nodes");
65+
6166
@Override
6267
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
6368
Iterable<ShardRouting> assignedShards = allocation.routingNodes().assignedShards(shardRouting.shardId());
@@ -66,6 +71,10 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
6671
// if its already a NO decision looking at the node, or we aren't configured to look at the host, return the decision
6772
return decision;
6873
}
74+
if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(
75+
allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()).expandToAllNodes()) {
76+
return YES_AUTO_EXPAND_ALL;
77+
}
6978
if (node.node() != null) {
7079
for (RoutingNode checkNode : allocation.routingNodes()) {
7180
if (checkNode.node() == null) {

server/src/test/java/org/elasticsearch/cluster/metadata/AutoExpandReplicasTests.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,19 @@ public void testParseSettings() {
5050
assertEquals(0, autoExpandReplicas.getMinReplicas());
5151
assertEquals(5, autoExpandReplicas.getMaxReplicas(8));
5252
assertEquals(2, autoExpandReplicas.getMaxReplicas(3));
53+
assertFalse(autoExpandReplicas.expandToAllNodes());
5354

5455
autoExpandReplicas = AutoExpandReplicas.SETTING.get(Settings.builder().put("index.auto_expand_replicas", "0-all").build());
5556
assertEquals(0, autoExpandReplicas.getMinReplicas());
5657
assertEquals(5, autoExpandReplicas.getMaxReplicas(6));
5758
assertEquals(2, autoExpandReplicas.getMaxReplicas(3));
59+
assertTrue(autoExpandReplicas.expandToAllNodes());
5860

5961
autoExpandReplicas = AutoExpandReplicas.SETTING.get(Settings.builder().put("index.auto_expand_replicas", "1-all").build());
6062
assertEquals(1, autoExpandReplicas.getMinReplicas());
6163
assertEquals(5, autoExpandReplicas.getMaxReplicas(6));
6264
assertEquals(2, autoExpandReplicas.getMaxReplicas(3));
65+
assertTrue(autoExpandReplicas.expandToAllNodes());
6366

6467
}
6568

server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.apache.logging.log4j.LogManager;
1212
import org.apache.logging.log4j.Logger;
1313
import org.elasticsearch.Version;
14+
import org.elasticsearch.cluster.ClusterName;
1415
import org.elasticsearch.cluster.ClusterState;
1516
import org.elasticsearch.cluster.ESAllocationTestCase;
1617
import org.elasticsearch.cluster.metadata.IndexMetadata;
@@ -34,6 +35,7 @@
3435
import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
3536
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
3637
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
38+
import static org.hamcrest.Matchers.empty;
3739
import static org.hamcrest.Matchers.equalTo;
3840
import static org.hamcrest.Matchers.greaterThan;
3941
import static org.hamcrest.Matchers.sameInstance;
@@ -770,8 +772,13 @@ public void testUnassignedShardsWithUnbalancedZones() {
770772

771773
logger.info("Building initial routing table for 'testUnassignedShardsWithUnbalancedZones'");
772774

775+
final Settings.Builder indexSettings = settings(Version.CURRENT);
776+
if (randomBoolean()) {
777+
indexSettings.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-4");
778+
}
779+
773780
Metadata metadata = Metadata.builder()
774-
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(4))
781+
.put(IndexMetadata.builder("test").settings(indexSettings).numberOfShards(1).numberOfReplicas(4))
775782
.build();
776783

777784
RoutingTable initialRoutingTable = RoutingTable.builder()
@@ -865,4 +872,36 @@ public void testMultipleAwarenessAttributes() {
865872
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2));
866873
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0));
867874
}
875+
876+
public void testDisabledByAutoExpandReplicas() {
877+
final Settings settings = Settings.builder()
878+
.put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone")
879+
.build();
880+
881+
final AllocationService strategy = createAllocationService(settings);
882+
883+
final Metadata metadata = Metadata.builder()
884+
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)
885+
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
886+
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 99)
887+
.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all")))
888+
.build();
889+
890+
final ClusterState clusterState = applyStartedShardsUntilNoChange(
891+
ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(Settings.EMPTY))
892+
.metadata(metadata)
893+
.routingTable(RoutingTable.builder()
894+
.addAsNew(metadata.index("test"))
895+
.build())
896+
.nodes(DiscoveryNodes.builder()
897+
.add(newNode("A-0", singletonMap("zone", "a")))
898+
.add(newNode("A-1", singletonMap("zone", "a")))
899+
.add(newNode("A-2", singletonMap("zone", "a")))
900+
.add(newNode("A-3", singletonMap("zone", "a")))
901+
.add(newNode("A-4", singletonMap("zone", "a")))
902+
.add(newNode("B-0", singletonMap("zone", "b")))
903+
).build(), strategy);
904+
905+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED), empty());
906+
}
868907
}

server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import org.elasticsearch.Version;
1414
import org.elasticsearch.action.support.replication.ClusterStateCreationUtils;
1515
import org.elasticsearch.cluster.ClusterInfo;
16-
import org.elasticsearch.cluster.ClusterName;
1716
import org.elasticsearch.cluster.ClusterState;
1817
import org.elasticsearch.cluster.ESAllocationTestCase;
1918
import org.elasticsearch.cluster.metadata.IndexMetadata;
@@ -37,8 +36,11 @@
3736
import java.util.Collections;
3837

3938
import static java.util.Collections.emptyMap;
39+
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
4040
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
41+
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
4142
import static org.elasticsearch.cluster.routing.allocation.RoutingNodesUtils.numberOfShardsOfType;
43+
import static org.hamcrest.Matchers.empty;
4244
import static org.hamcrest.Matchers.equalTo;
4345

4446
public class SameShardRoutingTests extends ESAllocationTestCase {
@@ -48,14 +50,19 @@ public void testSameHost() {
4850
AllocationService strategy = createAllocationService(
4951
Settings.builder().put(SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING.getKey(), true).build());
5052

53+
final Settings.Builder indexSettings = settings(Version.CURRENT);
54+
if (randomBoolean()) {
55+
indexSettings.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-1");
56+
}
57+
5158
Metadata metadata = Metadata.builder()
52-
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(1))
59+
.put(IndexMetadata.builder("test").settings(indexSettings).numberOfShards(2).numberOfReplicas(1))
5360
.build();
5461

5562
RoutingTable routingTable = RoutingTable.builder()
5663
.addAsNew(metadata.index("test"))
5764
.build();
58-
ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata)
65+
ClusterState clusterState = ClusterState.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata)
5966
.routingTable(routingTable).build();
6067

6168
logger.info("--> adding two nodes with the same host");
@@ -88,6 +95,54 @@ public void testSameHost() {
8895
}
8996
}
9097

98+
public void testSameHostCheckDisabledByAutoExpandReplicas() {
99+
final AllocationService strategy = createAllocationService(
100+
Settings.builder().put(SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING.getKey(), true).build());
101+
102+
final Metadata metadata = Metadata.builder()
103+
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)
104+
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
105+
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 99)
106+
.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all")))
107+
.build();
108+
109+
final DiscoveryNode node1 = new DiscoveryNode(
110+
"node1",
111+
"node1",
112+
"node1",
113+
"test1",
114+
"test1",
115+
buildNewFakeTransportAddress(),
116+
emptyMap(),
117+
MASTER_DATA_ROLES,
118+
Version.CURRENT);
119+
120+
121+
final DiscoveryNode node2 = new DiscoveryNode(
122+
"node2",
123+
"node2",
124+
"node2",
125+
"test1",
126+
"test1",
127+
buildNewFakeTransportAddress(),
128+
emptyMap(),
129+
MASTER_DATA_ROLES,
130+
Version.CURRENT);
131+
132+
final ClusterState clusterState = applyStartedShardsUntilNoChange(ClusterState
133+
.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
134+
.metadata(metadata)
135+
.routingTable(RoutingTable.builder()
136+
.addAsNew(metadata.index("test"))
137+
.build())
138+
.nodes(
139+
DiscoveryNodes.builder()
140+
.add(node1)
141+
.add(node2)).build(), strategy);
142+
143+
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED), empty());
144+
}
145+
91146
public void testForceAllocatePrimaryOnSameNodeNotAllowed() {
92147
SameShardAllocationDecider decider = new SameShardAllocationDecider(
93148
Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));

0 commit comments

Comments
 (0)