Skip to content

Commit 2e064e0

Browse files
committed
Allow election of nodes outside voting config (#43243)
Today we suppress election attempts on master-eligible nodes that are not in the voting configuration. In fact this restriction is not necessary: any master-eligible node can safely become master as long as it has a fresh enough cluster state and can gather a quorum of votes. Moreover, this restriction is sometimes undesirable: there may be a reason why we do not want any of the nodes in the voting configuration to become master. The restriction nevertheless exists for the following reason. If you want to shut the master down then you might first exclude it from the voting configuration. When this exclusion succeeds you might reasonably expect that a new master has been elected, since a voting config exclusion is almost always a step towards shutting the node down. If we simply allowed nodes outside the voting configuration to be the master then the excluded node would continue to be master, which is confusing. This commit adjusts the logic to allow master-eligible nodes to attempt an election even if they are not in the voting configuration. If such a master is successfully elected then it adds itself to the voting configuration. This commit also adjusts the logic that causes master nodes to abdicate when they are excluded from the voting configuration, to avoid the confusion described above. Relates #37712, #37802.
1 parent 5a9c483 commit 2e064e0

File tree

4 files changed

+98
-21
lines changed

4 files changed

+98
-21
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

+18-14
Original file line numberDiff line numberDiff line change
@@ -380,9 +380,8 @@ private void startElection() {
380380
// The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
381381
// to check our mode again here.
382382
if (mode == Mode.CANDIDATE) {
383-
if (electionQuorumContainsLocalNode(getLastAcceptedState()) == false) {
384-
logger.trace("skip election as local node is not part of election quorum: {}",
385-
getLastAcceptedState().coordinationMetaData());
383+
if (localNodeMayWinElection(getLastAcceptedState()) == false) {
384+
logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetaData());
386385
return;
387386
}
388387

@@ -415,16 +414,17 @@ private void abdicateTo(DiscoveryNode newMaster) {
415414
becomeCandidate("after abdicating to " + newMaster);
416415
}
417416

418-
private static boolean electionQuorumContainsLocalNode(ClusterState lastAcceptedState) {
417+
private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) {
419418
final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode();
420419
assert localNode != null;
421-
return electionQuorumContains(lastAcceptedState, localNode);
420+
return nodeMayWinElection(lastAcceptedState, localNode);
422421
}
423422

424-
private static boolean electionQuorumContains(ClusterState lastAcceptedState, DiscoveryNode node) {
423+
private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) {
425424
final String nodeId = node.getId();
426425
return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId)
427-
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId);
426+
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId)
427+
|| lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId));
428428
}
429429

430430
private Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
@@ -867,8 +867,8 @@ public boolean setInitialConfiguration(final VotingConfiguration votingConfigura
867867
metaDataBuilder.coordinationMetaData(coordinationMetaData);
868868

869869
coordinationState.get().setInitialState(ClusterState.builder(currentState).metaData(metaDataBuilder).build());
870-
assert electionQuorumContainsLocalNode(getLastAcceptedState()) :
871-
"initial state does not have local node in its election quorum: " + getLastAcceptedState().coordinationMetaData();
870+
assert localNodeMayWinElection(getLastAcceptedState()) :
871+
"initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetaData();
872872
preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version
873873
startElectionScheduler();
874874
return true;
@@ -1164,8 +1164,8 @@ public void run() {
11641164
if (mode == Mode.CANDIDATE) {
11651165
final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState();
11661166

1167-
if (electionQuorumContainsLocalNode(lastAcceptedState) == false) {
1168-
logger.trace("skip prevoting as local node is not part of election quorum: {}",
1167+
if (localNodeMayWinElection(lastAcceptedState) == false) {
1168+
logger.trace("skip prevoting as local node may not win election: {}",
11691169
lastAcceptedState.coordinationMetaData());
11701170
return;
11711171
}
@@ -1329,16 +1329,20 @@ public void onSuccess(String source) {
13291329
updateMaxTermSeen(getCurrentTerm());
13301330

13311331
if (mode == Mode.LEADER) {
1332+
// if necessary, abdicate to another node or improve the voting configuration
1333+
boolean attemptReconfiguration = true;
13321334
final ClusterState state = getLastAcceptedState(); // committed state
1333-
if (electionQuorumContainsLocalNode(state) == false) {
1335+
if (localNodeMayWinElection(state) == false) {
13341336
final List<DiscoveryNode> masterCandidates = completedNodes().stream()
13351337
.filter(DiscoveryNode::isMasterNode)
1336-
.filter(node -> electionQuorumContains(state, node))
1338+
.filter(node -> nodeMayWinElection(state, node))
13371339
.collect(Collectors.toList());
13381340
if (masterCandidates.isEmpty() == false) {
13391341
abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size())));
1342+
attemptReconfiguration = false;
13401343
}
1341-
} else {
1344+
}
1345+
if (attemptReconfiguration) {
13421346
scheduleReconfigurationIfNeeded();
13431347
}
13441348
}

server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,11 @@ static class VotingConfigNode implements Comparable<VotingConfigNode> {
150150

151151
@Override
152152
public int compareTo(VotingConfigNode other) {
153+
// prefer current master
154+
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
155+
if (currentMasterComp != 0) {
156+
return currentMasterComp;
157+
}
153158
// prefer nodes that are live
154159
final int liveComp = Boolean.compare(other.live, live);
155160
if (liveComp != 0) {
@@ -160,11 +165,6 @@ public int compareTo(VotingConfigNode other) {
160165
if (inCurrentConfigComp != 0) {
161166
return inCurrentConfigComp;
162167
}
163-
// prefer current master
164-
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
165-
if (currentMasterComp != 0) {
166-
return currentMasterComp;
167-
}
168168
// tiebreak by node id to have stable ordering
169169
return id.compareTo(other.id);
170170
}

server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public void testReconfigurationExamples() {
5353

5454
check(nodes("a"), conf("a"), true, conf("a"));
5555
check(nodes("a", "b"), conf("a"), true, conf("a"));
56-
check(nodes("a", "b"), conf("b"), true, conf("b"));
56+
check(nodes("a", "b"), conf("b"), true, conf("a"));
5757
check(nodes("a", "b"), conf("a", "c"), true, conf("a"));
5858
check(nodes("a", "b"), conf("a", "b"), true, conf("a"));
5959
check(nodes("a", "b"), conf("a", "b", "e"), true, conf("a", "b", "e"));

server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java

+74-1
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,38 @@
1818
*/
1919
package org.elasticsearch.cluster.coordination;
2020

21+
import org.elasticsearch.ElasticsearchException;
2122
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction;
2223
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
24+
import org.elasticsearch.cluster.ClusterState;
25+
import org.elasticsearch.cluster.node.DiscoveryNode;
2326
import org.elasticsearch.common.Priority;
27+
import org.elasticsearch.plugins.Plugin;
2428
import org.elasticsearch.test.ESIntegTestCase;
29+
import org.elasticsearch.test.transport.MockTransportService;
30+
import org.elasticsearch.transport.TransportService;
2531

32+
import java.util.Collection;
33+
import java.util.Collections;
34+
import java.util.List;
35+
import java.util.Set;
2636
import java.util.concurrent.ExecutionException;
2737

28-
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
38+
import static org.hamcrest.Matchers.equalTo;
39+
import static org.hamcrest.Matchers.hasItem;
40+
import static org.hamcrest.Matchers.hasSize;
41+
import static org.hamcrest.Matchers.nullValue;
42+
43+
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
2944
public class VotingConfigurationIT extends ESIntegTestCase {
3045

46+
@Override
47+
protected Collection<Class<? extends Plugin>> nodePlugins() {
48+
return Collections.singletonList(MockTransportService.TestPlugin.class);
49+
}
50+
3151
public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionException, InterruptedException {
52+
internalCluster().setBootstrapMasterNodeIndex(0);
3253
internalCluster().startNodes(2);
3354
final String originalMaster = internalCluster().getMasterName();
3455

@@ -38,4 +59,56 @@ public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionExcept
3859
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
3960
assertNotEquals(originalMaster, internalCluster().getMasterName());
4061
}
62+
63+
public void testElectsNodeNotInVotingConfiguration() throws Exception {
64+
internalCluster().setBootstrapMasterNodeIndex(0);
65+
final List<String> nodeNames = internalCluster().startNodes(4);
66+
67+
// a 4-node cluster settles on a 3-node configuration; we then prevent the nodes in the configuration from winning an election
68+
// by failing at the pre-voting stage, so that the extra node must be elected instead when the master shuts down. This extra node
69+
// should then add itself into the voting configuration.
70+
71+
assertFalse(internalCluster().client().admin().cluster().prepareHealth()
72+
.setWaitForNodes("4").setWaitForEvents(Priority.LANGUID).get().isTimedOut());
73+
74+
String excludedNodeName = null;
75+
final ClusterState clusterState
76+
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
77+
final Set<String> votingConfiguration = clusterState.getLastCommittedConfiguration().getNodeIds();
78+
assertThat(votingConfiguration, hasSize(3));
79+
assertThat(clusterState.nodes().getSize(), equalTo(4));
80+
assertThat(votingConfiguration, hasItem(clusterState.nodes().getMasterNodeId()));
81+
for (DiscoveryNode discoveryNode : clusterState.nodes()) {
82+
if (votingConfiguration.contains(discoveryNode.getId()) == false) {
83+
assertThat(excludedNodeName, nullValue());
84+
excludedNodeName = discoveryNode.getName();
85+
}
86+
}
87+
88+
for (final String sender : nodeNames) {
89+
if (sender.equals(excludedNodeName)) {
90+
continue;
91+
}
92+
final MockTransportService senderTransportService
93+
= (MockTransportService) internalCluster().getInstance(TransportService.class, sender);
94+
for (final String receiver : nodeNames) {
95+
senderTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, receiver),
96+
(connection, requestId, action, request, options) -> {
97+
if (action.equals(PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)) {
98+
throw new ElasticsearchException("rejected");
99+
}
100+
connection.sendRequest(requestId, action, request, options);
101+
});
102+
}
103+
}
104+
105+
internalCluster().stopCurrentMasterNode();
106+
assertFalse(internalCluster().client().admin().cluster().prepareHealth()
107+
.setWaitForNodes("3").setWaitForEvents(Priority.LANGUID).get().isTimedOut());
108+
109+
final ClusterState newClusterState
110+
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
111+
assertThat(newClusterState.nodes().getMasterNode().getName(), equalTo(excludedNodeName));
112+
assertThat(newClusterState.getLastCommittedConfiguration().getNodeIds(), hasItem(newClusterState.nodes().getMasterNodeId()));
113+
}
41114
}

0 commit comments

Comments
 (0)