Skip to content

Commit eadcb5f

Browse files
authored
Fix size of rolling-upgrade bootstrap config (elastic#38031)
Zen2 nodes will bootstrap themselves once they believe there to be no remaining Zen1 master-eligible nodes in the cluster, as long as minimum_master_nodes is satisfied. Today the bootstrap configuration comprises just the IDs of the known master-eligible nodes, and this might be too small to be safe. For instance, if there are 5 master-eligible nodes (so that minimum_master_nodes is 3) then the bootstrap configuration could comprise just 3 nodes, of which 2 form a quorum, and such a 2-node quorum need not intersect other quorums that might arise, leading to a split-brain. This commit fixes this by expanding the bootstrap configuration so that its quorums satisfy minimum_master_nodes, by adding some of the IDs of the other master-eligible nodes in the last-published cluster state.
1 parent e0d5de3 commit eadcb5f

File tree

2 files changed

+90
-4
lines changed

2 files changed

+90
-4
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/DiscoveryUpgradeService.java

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747
import org.elasticsearch.transport.TransportService;
4848

4949
import java.io.IOException;
50+
import java.util.HashSet;
51+
import java.util.Iterator;
5052
import java.util.Optional;
5153
import java.util.Set;
5254
import java.util.function.BooleanSupplier;
@@ -130,7 +132,11 @@ public void activate(Optional<DiscoveryNode> lastKnownLeader, ClusterState lastA
130132
: lastAcceptedClusterState.getMinimumMasterNodesOnPublishingMaster();
131133

132134
assert joiningRound == null : joiningRound;
133-
joiningRound = new JoiningRound(enableUnsafeBootstrappingOnUpgrade && lastKnownLeader.isPresent(), minimumMasterNodes);
135+
final Set<String> knownMasterNodeIds = new HashSet<>();
136+
lastAcceptedClusterState.nodes().getMasterNodes().forEach(c -> knownMasterNodeIds.add(c.key));
137+
138+
joiningRound
139+
= new JoiningRound(enableUnsafeBootstrappingOnUpgrade && lastKnownLeader.isPresent(), minimumMasterNodes, knownMasterNodeIds);
134140
joiningRound.scheduleNextAttempt();
135141
}
136142

@@ -168,10 +174,12 @@ void countDown() {
168174
private class JoiningRound {
169175
private final boolean upgrading;
170176
private final int minimumMasterNodes;
177+
private final Set<String> knownMasterNodeIds;
171178

172-
JoiningRound(boolean upgrading, int minimumMasterNodes) {
179+
JoiningRound(boolean upgrading, int minimumMasterNodes, Set<String> knownMasterNodeIds) {
173180
this.upgrading = upgrading;
174181
this.minimumMasterNodes = minimumMasterNodes;
182+
this.knownMasterNodeIds = knownMasterNodeIds;
175183
}
176184

177185
private boolean isRunning() {
@@ -210,8 +218,20 @@ public void run() {
210218
// no Zen1 nodes found, but the last-known master was a Zen1 node, so this is a rolling upgrade
211219
transportService.getThreadPool().generic().execute(() -> {
212220
try {
213-
initialConfigurationConsumer.accept(new VotingConfiguration(discoveryNodes.stream()
214-
.map(DiscoveryNode::getId).collect(Collectors.toSet())));
221+
Set<String> nodeIds = new HashSet<>();
222+
discoveryNodes.forEach(n -> nodeIds.add(n.getId()));
223+
224+
final Iterator<String> knownNodeIdIterator = knownMasterNodeIds.iterator();
225+
while (nodeIds.size() < 2 * minimumMasterNodes - 1 && knownNodeIdIterator.hasNext()) {
226+
nodeIds.add(knownNodeIdIterator.next());
227+
}
228+
229+
final VotingConfiguration votingConfiguration = new VotingConfiguration(nodeIds);
230+
assert votingConfiguration.hasQuorum(
231+
discoveryNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toList()));
232+
assert 2 * minimumMasterNodes - 2 <= nodeIds.size() : nodeIds + " too small for " + minimumMasterNodes;
233+
234+
initialConfigurationConsumer.accept(votingConfiguration);
215235
} catch (Exception e) {
216236
logger.debug("exception during bootstrapping upgrade, retrying", e);
217237
} finally {

server/src/test/java/org/elasticsearch/cluster/coordination/Zen1IT.java

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
2323
import org.elasticsearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsAction;
2424
import org.elasticsearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsRequest;
25+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
2526
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder;
2627
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
2728
import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
29+
import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
2830
import org.elasticsearch.client.Client;
2931
import org.elasticsearch.client.Requests;
3032
import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -34,24 +36,34 @@
3436
import org.elasticsearch.common.Priority;
3537
import org.elasticsearch.common.settings.Settings;
3638
import org.elasticsearch.common.unit.TimeValue;
39+
import org.elasticsearch.discovery.Discovery;
3740
import org.elasticsearch.discovery.zen.ElectMasterService;
3841
import org.elasticsearch.env.NodeEnvironment;
3942
import org.elasticsearch.gateway.MetaStateService;
43+
import org.elasticsearch.plugins.Plugin;
4044
import org.elasticsearch.test.ESIntegTestCase;
4145
import org.elasticsearch.test.InternalTestCluster.RestartCallback;
4246
import org.elasticsearch.test.discovery.TestZenDiscovery;
47+
import org.elasticsearch.test.transport.MockTransportService;
48+
import org.elasticsearch.transport.TransportService;
4349

50+
import java.util.Collection;
51+
import java.util.Collections;
4452
import java.util.List;
4553
import java.util.stream.Collectors;
4654
import java.util.stream.IntStream;
4755
import java.util.stream.StreamSupport;
4856

4957
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
5058
import static org.elasticsearch.cluster.coordination.Coordinator.ZEN1_BWC_TERM;
59+
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_ACTION_NAME;
60+
import static org.elasticsearch.cluster.coordination.JoinHelper.START_JOIN_ACTION_NAME;
61+
import static org.elasticsearch.cluster.coordination.PublicationTransportHandler.PUBLISH_STATE_ACTION_NAME;
5162
import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING;
5263
import static org.elasticsearch.cluster.routing.allocation.decider.FilterAllocationDecider.CLUSTER_ROUTING_EXCLUDE_GROUP_SETTING;
5364
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
5465
import static org.elasticsearch.test.InternalTestCluster.REMOVED_MINIMUM_MASTER_NODES;
66+
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5567
import static org.hamcrest.Matchers.equalTo;
5668
import static org.hamcrest.Matchers.is;
5769

@@ -67,6 +79,10 @@ public class Zen1IT extends ESIntegTestCase {
6779
.put(TestZenDiscovery.USE_ZEN2.getKey(), true)
6880
.build();
6981

82+
/**
 * Returns the plugin classes for the test nodes: a single-element list containing
 * {@code MockTransportService.TestPlugin}.
 * NOTE(review): presumably this overrides the test-framework hook of the same name and is what allows
 * TransportService instances to be cast to MockTransportService elsewhere in this class; confirm, and
 * consider adding {@code @Override} if so.
 */
protected Collection<Class<? extends Plugin>> nodePlugins() {
83+
return Collections.singletonList(MockTransportService.TestPlugin.class);
84+
}
85+
7086
public void testZen2NodesJoiningZen1Cluster() {
7187
internalCluster().startNodes(randomIntBetween(1, 3), ZEN1_SETTINGS);
7288
internalCluster().startNodes(randomIntBetween(1, 3), ZEN2_SETTINGS);
@@ -79,6 +95,56 @@ public void testZen1NodesJoiningZen2Cluster() {
7995
createIndex("test");
8096
}
8197

98+
/**
 * Starts five nodes (indices 0 and 1 with ZEN1_SETTINGS, indices 2 to 4 with ZEN2_SETTINGS) and then installs
 * transport rules that block sends in both directions between nodes {0,1} and nodes {3,4}, and additionally block the
 * publish-state, follower-check and start-join actions from nodes 3 and 4 to node 2.
 *
 * The test then, in order:
 *  - waits for a stable 3-node cluster as seen from node 0,
 *  - creates the index "test" via node 0 and waits for green health,
 *  - waits until at least one of nodes 3 and 4 reports that its initial configuration is set,
 *  - clears all transport rules, waits for a stable 5-node cluster as seen from node 0, and checks that the
 *    "test" index reaches green health again without timing out.
 */
public void testMixedClusterDisruption() throws Exception {
99+
final List<String> nodes = internalCluster().startNodes(IntStream.range(0, 5)
100+
.mapToObj(i -> i < 2 ? ZEN1_SETTINGS : ZEN2_SETTINGS).toArray(Settings[]::new));
101+
102+
final List<MockTransportService> transportServices = nodes.stream()
103+
.map(n -> (MockTransportService) internalCluster().getInstance(TransportService.class, n)).collect(Collectors.toList());
104+
105+
logger.info("--> disrupting communications");
106+
107+
// The idea here is to make some of the Zen2 nodes believe the Zen1 nodes have gone away by introducing a network partition, so that
108+
// they bootstrap themselves, but keep the Zen1 side of the cluster alive.
109+
110+
// Set up a bridged network partition with the Zen1 nodes {0,1} on one side, Zen2 nodes {3,4} on the other, and node {2} in both
111+
transportServices.get(0).addFailToSendNoConnectRule(transportServices.get(3));
112+
transportServices.get(0).addFailToSendNoConnectRule(transportServices.get(4));
113+
transportServices.get(1).addFailToSendNoConnectRule(transportServices.get(3));
114+
transportServices.get(1).addFailToSendNoConnectRule(transportServices.get(4));
115+
transportServices.get(3).addFailToSendNoConnectRule(transportServices.get(0));
116+
transportServices.get(3).addFailToSendNoConnectRule(transportServices.get(1));
117+
transportServices.get(4).addFailToSendNoConnectRule(transportServices.get(0));
118+
transportServices.get(4).addFailToSendNoConnectRule(transportServices.get(1));
119+
120+
// Nodes 3 and 4 will bootstrap, but we want to keep node 2 as part of the Zen1 cluster, so prevent any messages that might switch
121+
// its allegiance
122+
transportServices.get(3).addFailToSendNoConnectRule(transportServices.get(2),
123+
PUBLISH_STATE_ACTION_NAME, FOLLOWER_CHECK_ACTION_NAME, START_JOIN_ACTION_NAME);
124+
transportServices.get(4).addFailToSendNoConnectRule(transportServices.get(2),
125+
PUBLISH_STATE_ACTION_NAME, FOLLOWER_CHECK_ACTION_NAME, START_JOIN_ACTION_NAME);
126+
127+
logger.info("--> waiting for disconnected nodes to be removed");
128+
ensureStableCluster(3, nodes.get(0));
129+
130+
logger.info("--> creating index on Zen1 side");
131+
assertAcked(client(nodes.get(0)).admin().indices().create(new CreateIndexRequest("test")).get());
132+
assertFalse(client(nodes.get(0)).admin().cluster().health(new ClusterHealthRequest("test")
133+
.waitForGreenStatus()).get().isTimedOut());
134+
135+
logger.info("--> waiting for disconnected nodes to bootstrap themselves");
136+
assertBusy(() -> assertTrue(IntStream.range(3, 5)
137+
.mapToObj(n -> (Coordinator) internalCluster().getInstance(Discovery.class, nodes.get(n)))
138+
.anyMatch(Coordinator::isInitialConfigurationSet)));
139+
140+
logger.info("--> clearing disruption and waiting for cluster to reform");
141+
transportServices.forEach(MockTransportService::clearAllRules);
142+
143+
ensureStableCluster(5, nodes.get(0));
144+
assertFalse(client(nodes.get(0)).admin().cluster().health(new ClusterHealthRequest("test")
145+
.waitForGreenStatus()).get().isTimedOut());
146+
}
147+
82148
public void testMixedClusterFormation() throws Exception {
83149
final int zen1NodeCount = randomIntBetween(1, 3);
84150
final int zen2NodeCount = randomIntBetween(zen1NodeCount == 1 ? 2 : 1, 3);

0 commit comments

Comments
 (0)