From bc4c5d4b8507cb143b66d96da5d1580d103c79f3 Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 20 Mar 2020 15:31:45 +0000 Subject: [PATCH 1/2] Describe STALE_STATE_CONFIG in ClusterFormationFH We mark cluster states persisted on master-ineligible nodes as potentially-stale using the voting configuration `{STALE_STATE_CONFIG}` which prevents these nodes from being elected as master if they are restarted as master-eligible. Today we do not handle this special voting configuration differently in the `ClusterFormationFailureHelper`, leading to a mysterious message `an election requires a node with id [STALE_STATE_CONFIG]` if the election does not succeed. This commit adds a special case description for this situation to explain better why this node cannot win an election. Closes #53734 --- .../coordination/ClusterFormationFailureHelper.java | 8 +++++++- .../org/elasticsearch/gateway/GatewayMetaState.java | 9 ++++++++- .../ClusterFormationFailureHelperTests.java | 10 ++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java index c249458dd23cf..481c3ae458e95 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java @@ -30,6 +30,7 @@ import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.AbstractRunnable; +import org.elasticsearch.gateway.GatewayMetaState; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.threadpool.ThreadPool.Names; @@ -206,7 +207,12 @@ private String describeQuorum(VotingConfiguration votingConfiguration) { assert requiredNodes <= realNodeIds.size() : nodeIds; if 
(nodeIds.size() == 1) { - return "a node with id " + realNodeIds; + if (nodeIds.contains(GatewayMetaState.STALE_STATE_CONFIG_NODE_ID)) { + return "an election requires one or more nodes that have already participated as master-eligible nodes in the " + + "cluster, but this node was not master-eligible the last time it joined the cluster"; + } else { + return "a node with id " + realNodeIds; + } } else if (nodeIds.size() == 2) { return "two nodes with ids " + realNodeIds; } else { diff --git a/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java b/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java index 89e0ff5c9fb66..62b3eebccab38 100644 --- a/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java +++ b/server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java @@ -78,6 +78,13 @@ */ public class GatewayMetaState implements Closeable { + /** + * Fake node ID for a voting configuration written by a master-ineligible data node to indicate that its on-disk state is potentially + * stale (since it is written asynchronously after application, rather than before acceptance). This node ID means that if the node is + * restarted as a master-eligible node then it does not win any elections until it has received a fresh cluster state. 
+ */ + public static final String STALE_STATE_CONFIG_NODE_ID = "STALE_STATE_CONFIG"; + // Set by calling start() private final SetOnce persistedState = new SetOnce<>(); @@ -360,7 +367,7 @@ protected void doRun() { } static final CoordinationMetaData.VotingConfiguration staleStateConfiguration = - new CoordinationMetaData.VotingConfiguration(Collections.singleton("STALE_STATE_CONFIG")); + new CoordinationMetaData.VotingConfiguration(Collections.singleton(STALE_STATE_CONFIG_NODE_ID)); static ClusterState resetVotingConfiguration(ClusterState clusterState) { CoordinationMetaData newCoordinationMetaData = CoordinationMetaData.builder(clusterState.coordinationMetaData()) diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java index 850a1ec0b7abd..e72cda66612d9 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java @@ -29,6 +29,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.transport.TransportAddress; +import org.elasticsearch.gateway.GatewayMetaState; import org.elasticsearch.test.ESTestCase; import java.util.Arrays; @@ -395,5 +396,14 @@ public void testDescriptionAfterBootstrapping() { "have discovered [] which is not a quorum; " + "discovery will continue using [] from hosts providers and [" + otherMasterNode + ", " + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"))); + + assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(), + emptyList(), 0L, electionStrategy).getDescription(), + is("master not discovered or elected yet, an election 
requires one or more nodes that have already participated as " + + "master-eligible nodes in the cluster, but this node was not master-eligible the last time it joined the cluster, " + + "have discovered [] which is not a quorum; " + + "discovery will continue using [] from hosts providers and [" + localNode + + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0")); + } } From 04191954778ca88aa5276fc643a96dfdc9026d5c Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 20 Mar 2020 15:48:41 +0000 Subject: [PATCH 2/2] Passing tests are always nice --- .../cluster/coordination/ClusterFormationFailureHelper.java | 4 ++-- .../coordination/ClusterFormationFailureHelperTests.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java index 481c3ae458e95..e68af52eaed38 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java @@ -208,8 +208,8 @@ private String describeQuorum(VotingConfiguration votingConfiguration) { if (nodeIds.size() == 1) { if (nodeIds.contains(GatewayMetaState.STALE_STATE_CONFIG_NODE_ID)) { - return "an election requires one or more nodes that have already participated as master-eligible nodes in the " + - "cluster, but this node was not master-eligible the last time it joined the cluster"; + return "one or more nodes that have already participated as master-eligible nodes in the cluster but this node was " + + "not master-eligible the last time it joined the cluster"; } else { return "a node with id " + realNodeIds; } diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java 
b/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java index e72cda66612d9..756cf6668b5ba 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java @@ -400,7 +400,7 @@ public void testDescriptionAfterBootstrapping() { assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(), emptyList(), 0L, electionStrategy).getDescription(), is("master not discovered or elected yet, an election requires one or more nodes that have already participated as " + - "master-eligible nodes in the cluster, but this node was not master-eligible the last time it joined the cluster, " + + "master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " + "have discovered [] which is not a quorum; " + "discovery will continue using [] from hosts providers and [" + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));