Skip to content

Commit f324394

Browse files
authored
Describe STALE_STATE_CONFIG in ClusterFormationFailureHelper (#53878)
We mark cluster states persisted on master-ineligible nodes as potentially stale using the voting configuration `{STALE_STATE_CONFIG}`, which prevents these nodes from being elected as master if they are restarted as master-eligible. Today the `ClusterFormationFailureHelper` does not handle this special voting configuration differently, leading to the mysterious message `an election requires a node with id [STALE_STATE_CONFIG]` if the election does not succeed. This commit adds a special-case description for this situation to better explain why this node cannot win an election. Closes #53734
1 parent 3cf3f60 commit f324394

File tree

3 files changed

+25
-2
lines changed

3 files changed

+25
-2
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.common.transport.TransportAddress;
3131
import org.elasticsearch.common.unit.TimeValue;
3232
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
33+
import org.elasticsearch.gateway.GatewayMetaState;
3334
import org.elasticsearch.threadpool.ThreadPool;
3435
import org.elasticsearch.threadpool.ThreadPool.Names;
3536

@@ -206,7 +207,12 @@ private String describeQuorum(VotingConfiguration votingConfiguration) {
206207
assert requiredNodes <= realNodeIds.size() : nodeIds;
207208

208209
if (nodeIds.size() == 1) {
209-
return "a node with id " + realNodeIds;
210+
if (nodeIds.contains(GatewayMetaState.STALE_STATE_CONFIG_NODE_ID)) {
211+
return "one or more nodes that have already participated as master-eligible nodes in the cluster but this node was " +
212+
"not master-eligible the last time it joined the cluster";
213+
} else {
214+
return "a node with id " + realNodeIds;
215+
}
210216
} else if (nodeIds.size() == 2) {
211217
return "two nodes with ids " + realNodeIds;
212218
} else {

server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java

+8-1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,13 @@
7878
*/
7979
public class GatewayMetaState implements Closeable {
8080

81+
/**
82+
* Fake node ID for a voting configuration written by a master-ineligible data node to indicate that its on-disk state is potentially
83+
* stale (since it is written asynchronously after application, rather than before acceptance). This node ID means that if the node is
84+
* restarted as a master-eligible node then it does not win any elections until it has received a fresh cluster state.
85+
*/
86+
public static final String STALE_STATE_CONFIG_NODE_ID = "STALE_STATE_CONFIG";
87+
8188
// Set by calling start()
8289
private final SetOnce<PersistedState> persistedState = new SetOnce<>();
8390

@@ -360,7 +367,7 @@ protected void doRun() {
360367
}
361368

362369
static final CoordinationMetaData.VotingConfiguration staleStateConfiguration =
363-
new CoordinationMetaData.VotingConfiguration(Collections.singleton("STALE_STATE_CONFIG"));
370+
new CoordinationMetaData.VotingConfiguration(Collections.singleton(STALE_STATE_CONFIG_NODE_ID));
364371

365372
static ClusterState resetVotingConfiguration(ClusterState clusterState) {
366373
CoordinationMetaData newCoordinationMetaData = CoordinationMetaData.builder(clusterState.coordinationMetaData())

server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java

+10
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.elasticsearch.cluster.node.DiscoveryNodes;
3030
import org.elasticsearch.common.settings.Settings;
3131
import org.elasticsearch.common.transport.TransportAddress;
32+
import org.elasticsearch.gateway.GatewayMetaState;
3233
import org.elasticsearch.test.ESTestCase;
3334

3435
import java.util.Arrays;
@@ -395,5 +396,14 @@ public void testDescriptionAfterBootstrapping() {
395396
"have discovered [] which is not a quorum; " +
396397
"discovery will continue using [] from hosts providers and [" + otherMasterNode + ", " + localNode +
397398
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0")));
399+
400+
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(),
401+
emptyList(), 0L, electionStrategy).getDescription(),
402+
is("master not discovered or elected yet, an election requires one or more nodes that have already participated as " +
403+
"master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " +
404+
"have discovered [] which is not a quorum; " +
405+
"discovery will continue using [] from hosts providers and [" + localNode +
406+
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
407+
398408
}
399409
}

0 commit comments

Comments
 (0)