Commit f1b4358

Treat lagging nodes as faulty (elastic#51)
Today, if a node detects it is lagging behind the master it falls back to CANDIDATE, but then receives a heartbeat from the master and goes back to FOLLOWER. If it is lagging because it missed a cluster state update for some reason, it will never recover, because there is no impetus to publish a further update. This change fixes this, crudely, by making lagging nodes consider themselves faulty, so that they are eventually kicked out of the cluster and later rejoin, which resolves the lag. Fixes elastic#43, but will eventually be superseded by elastic#53.
1 parent 44c55e8 commit f1b4358
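
As a rough illustration of the mechanism described above (using simplified, hypothetical names rather than the real Legislator, transport and ConsensusState types), a lagging follower records the version it was stuck at and rejects heartbeats until its committed state moves past it:

// Simplified sketch only; the real logic lives in Legislator below and is driven by the
// zen2 lag detector, heartbeats and cluster-state commits.
class LaggingFollowerSketch {

    // Set by the lag detector: the committed version we were stuck at when lag was detected.
    // Zero means "not currently considered lagging".
    private long laggingUntilCommittedVersionExceeds;

    // Our latest locally committed cluster-state version, or -1 if we have none.
    private long lastCommittedVersion = -1L;

    // Lag detector: the leader is still ahead of us after the publish timeout.
    void onLagDetected(long localVersionAtTimeout) {
        laggingUntilCommittedVersionExceeds = localVersionAtTimeout;
    }

    // Heartbeat handler: rejecting heartbeats makes the leader's fault detection treat us as
    // faulty, so it eventually removes us from the cluster; rejoining delivers the missing state.
    void handleHeartbeat() {
        if (laggingUntilCommittedVersionExceeds > 0
            && lastCommittedVersion <= laggingUntilCommittedVersionExceeds) {
            throw new IllegalStateException("heartbeat rejected: lagging at version ["
                + laggingUntilCommittedVersionExceeds + "]");
        }
        // otherwise carry on as a follower of the heartbeat's sender
    }

    // Commit handler: once a newer state is committed (e.g. after rejoining), the check above
    // no longer matches and heartbeats are accepted again.
    void onCommit(long committedVersion) {
        lastCommittedVersion = committedVersion;
    }
}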

File tree: 2 files changed, +129 -67 lines

server/src/main/java/org/elasticsearch/discovery/zen2/Legislator.java

Lines changed: 45 additions & 53 deletions
@@ -55,6 +55,7 @@
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportException;
 import org.elasticsearch.transport.TransportResponse;
+import org.elasticsearch.transport.TransportResponse.Empty;
 import org.elasticsearch.transport.TransportResponseHandler;
 
 import java.io.IOException;
@@ -136,6 +137,7 @@ public class Legislator extends AbstractComponent {
     private final Map<DiscoveryNode, MembershipAction.JoinCallback> joinRequestAccumulator = new HashMap<>();
 
     private Optional<Publication> currentPublication = Optional.empty();
+    private long laggingUntilCommittedVersionExceeds;
 
     public Legislator(Settings settings, ConsensusState.PersistedState persistedState,
                       Transport transport, MasterService masterService, DiscoveryNode localNode, LongSupplier currentTimeSupplier,
@@ -279,35 +281,16 @@ private void startSeekingJoins() {
     private Join joinLeaderInTerm(DiscoveryNode sourceNode, long term) {
         logger.debug("joinLeaderInTerm: from [{}] with term {}", sourceNode, term);
         Join join = consensusState.handleStartJoin(sourceNode, term);
+        lastJoin = Optional.of(join);
         if (mode != Mode.CANDIDATE) {
             becomeCandidate("joinLeaderInTerm");
         }
         return join;
     }
 
     public void handleStartJoin(DiscoveryNode sourceNode, StartJoinRequest startJoinRequest) {
-        Join join = joinLeaderInTerm(sourceNode, startJoinRequest.getTerm());
-
-        transport.sendJoin(sourceNode, join, new TransportResponseHandler<TransportResponse.Empty>() {
-            @Override
-            public void handleResponse(TransportResponse.Empty response) {
-                logger.debug("SendJoinResponseHandler: successfully joined {}", sourceNode);
-            }
-
-            @Override
-            public void handleException(TransportException exp) {
-                if (exp.getRootCause() instanceof ConsensusMessageRejectedException) {
-                    logger.debug("SendJoinResponseHandler: [{}] failed: {}", sourceNode, exp.getRootCause().getMessage());
-                } else {
-                    logger.debug(() -> new ParameterizedMessage("SendJoinResponseHandler: [{}] failed", sourceNode), exp);
-                }
-            }
-
-            @Override
-            public String executor() {
-                return ThreadPool.Names.SAME;
-            }
-        });
+        final Join join = joinLeaderInTerm(sourceNode, startJoinRequest.getTerm());
+        sendJoin(sourceNode, join);
     }
 
     private Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
@@ -899,11 +882,7 @@ public LegislatorPublishResponse handlePublishRequest(DiscoveryNode sourceNode,
             DiscoveryNodes.builder(publishRequest.getAcceptedState().nodes()).localNodeId(getLocalNode().getId()).build()).build();
         publishRequest = new PublishRequest(clusterState);
 
-        final Optional<Join> optionalJoin = ensureTermAtLeast(sourceNode, publishRequest.getAcceptedState().term());
-
-        if (optionalJoin.isPresent()) {
-            lastJoin = optionalJoin;
-        }
+        ensureTermAtLeast(sourceNode, publishRequest.getAcceptedState().term());
 
         logger.trace("handlePublishRequest: handling [{}] from [{}]", publishRequest, sourceNode);
 
@@ -931,36 +910,46 @@ public HeartbeatResponse handleHeartbeatRequest(DiscoveryNode sourceNode, Heartb
         }
 
         ensureTermAtLeast(sourceNode, heartbeatRequest.getTerm()).ifPresent(join -> {
-            logger.debug("handleHeartbeatRequest: sending join [{}] for term [{}] to {}",
-                join, heartbeatRequest.getTerm(), sourceNode);
-
-            transport.sendJoin(sourceNode, join, new TransportResponseHandler<TransportResponse.Empty>() {
-                @Override
-                public void handleResponse(TransportResponse.Empty response) {
-                    logger.debug("SendJoinResponseHandler: successfully joined {}", sourceNode);
-                }
-
-                @Override
-                public void handleException(TransportException exp) {
-                    if (exp.getRootCause() instanceof ConsensusMessageRejectedException) {
-                        logger.debug("SendJoinResponseHandler: [{}] failed: {}", sourceNode, exp.getRootCause().getMessage());
-                    } else {
-                        logger.debug(() -> new ParameterizedMessage("SendJoinResponseHandler: [{}] failed", sourceNode), exp);
-                    }
-                }
-
-                @Override
-                public String executor() {
-                    return ThreadPool.Names.SAME;
-                }
-            });
+            logger.debug("handleHeartbeatRequest: sending join [{}] for term [{}] to {}", join, heartbeatRequest.getTerm(), sourceNode);
+            sendJoin(sourceNode, join);
         });
 
+        if (laggingUntilCommittedVersionExceeds > 0
+            && (lastCommittedState.isPresent() == false || lastCommittedState.get().version() <= laggingUntilCommittedVersionExceeds)) {
+            logger.debug("handleHeartbeatRequest: rejecting [{}] from [{}] due to lag at version [{}]",
+                heartbeatRequest, sourceNode, laggingUntilCommittedVersionExceeds);
+            throw new ConsensusMessageRejectedException("HeartbeatRequest rejected: lagging at version [{}]",
+                laggingUntilCommittedVersionExceeds);
+        }
+
         becomeFollower("handleHeartbeatRequest", sourceNode);
 
         return new HeartbeatResponse(consensusState.getLastAcceptedVersion(), consensusState.getCurrentTerm());
     }
 
+    private void sendJoin(DiscoveryNode sourceNode, Join join) {
+        transport.sendJoin(sourceNode, join, new TransportResponseHandler<Empty>() {
+            @Override
+            public void handleResponse(Empty response) {
+                logger.debug("SendJoinResponseHandler: successfully joined {}", sourceNode);
+            }
+
+            @Override
+            public void handleException(TransportException exp) {
+                if (exp.getRootCause() instanceof ConsensusMessageRejectedException) {
+                    logger.debug("SendJoinResponseHandler: [{}] failed: {}", sourceNode, exp.getRootCause().getMessage());
+                } else {
+                    logger.debug(() -> new ParameterizedMessage("SendJoinResponseHandler: [{}] failed", sourceNode), exp);
                }
+            }
+
+            @Override
+            public String executor() {
+                return ThreadPool.Names.SAME;
+            }
+        });
+    }
+
     public void handleApplyCommit(DiscoveryNode sourceNode, ApplyCommit applyCommit) {
         logger.trace("handleApplyCommit: applying {} from [{}]", applyCommit, sourceNode);
         consensusState.handleCommit(applyCommit);
@@ -1004,6 +993,9 @@ public OfferJoin handleSeekJoins(DiscoveryNode sender, SeekJoins seekJoins) {
                 consensusState.getCurrentTerm(), seekJoins, sender, newTerm);
             sendStartJoin(new StartJoinRequest(newTerm));
             throw new ConsensusMessageRejectedException("I'm still a leader");
+
+            // TODO what about a node that sent a join to a different node in our term? Is it now stuck until the next term?
+
        } else {
             // TODO: remove this once we have a discovery layer. If a node finds an active master node during discovery,
             // it will try to join that one, and not start seeking joins.
@@ -1464,15 +1456,15 @@ public void handleResponse(LeaderCheckResponse leaderCheckResponse) {
 
        final long leaderVersion = leaderCheckResponse.getVersion();
        long localVersion = getLastCommittedState().map(ClusterState::getVersion).orElse(-1L);
-       if (leaderVersion > localVersion) {
+       if (leaderVersion > localVersion && running) {
            logger.trace("LeaderCheck.handleResponse: heartbeat for version {} > local version {}, starting lag detector",
                leaderVersion, localVersion);
            futureExecutor.schedule(publishTimeout, "LeaderCheck#lagDetection", () -> {
                long localVersion2 = getLastCommittedState().map(ClusterState::getVersion).orElse(-1L);
-               if (leaderVersion > localVersion2) {
+               if (leaderVersion > localVersion2 && running) {
                    logger.debug("LeaderCheck.handleResponse: lag detected: local version {} < leader version {} after {}",
                        localVersion2, leaderVersion, publishTimeout);
-                   becomeCandidate("LeaderCheck.handleResponse");
+                   laggingUntilCommittedVersionExceeds = localVersion2;
                }
            });
        }
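
For orientation, the lag detector at the end of the hunk above can be read as the following standalone sketch. This is illustrative only: ScheduledExecutorService stands in for the Legislator's futureExecutor, publishTimeoutMillis for CONSENSUS_PUBLISH_TIMEOUT_SETTING, and the class and method names are hypothetical rather than part of the real API.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.function.LongSupplier;

// Illustrative sketch of the lag detector, not the Legislator API itself.
class LagDetectorSketch {

    private final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    private final LongSupplier localCommittedVersion; // last locally committed version, -1 if none
    private final long publishTimeoutMillis;          // how long a publication is given to arrive
    private volatile long laggingUntilCommittedVersionExceeds;
    private volatile boolean running = true;

    LagDetectorSketch(LongSupplier localCommittedVersion, long publishTimeoutMillis) {
        this.localCommittedVersion = localCommittedVersion;
        this.publishTimeoutMillis = publishTimeoutMillis;
    }

    // Called with the version the leader reported in its leader-check response.
    void onLeaderCheckResponse(long leaderVersion) {
        if (leaderVersion > localCommittedVersion.getAsLong() && running) {
            // Give the in-flight publication a chance to arrive, then re-check.
            scheduler.schedule(() -> {
                long localVersion = localCommittedVersion.getAsLong();
                if (leaderVersion > localVersion && running) {
                    // Still behind: remember the lag so that heartbeats are rejected until we
                    // commit a state newer than this version.
                    laggingUntilCommittedVersionExceeds = localVersion;
                }
            }, publishTimeoutMillis, TimeUnit.MILLISECONDS);
        }
    }

    boolean isLagging() {
        return laggingUntilCommittedVersionExceeds > 0
            && localCommittedVersion.getAsLong() <= laggingUntilCommittedVersionExceeds;
    }
}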

server/src/test/java/org/elasticsearch/discovery/zen2/LegislatorTests.java

Lines changed: 84 additions & 14 deletions
@@ -75,6 +75,7 @@
 import static org.elasticsearch.discovery.zen2.Legislator.CONSENSUS_HEARTBEAT_TIMEOUT_SETTING;
 import static org.elasticsearch.discovery.zen2.Legislator.CONSENSUS_LEADER_CHECK_RETRY_COUNT_SETTING;
 import static org.elasticsearch.discovery.zen2.Legislator.CONSENSUS_MIN_DELAY_SETTING;
+import static org.elasticsearch.discovery.zen2.Legislator.CONSENSUS_PUBLISH_TIMEOUT_SETTING;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.core.Is.is;
@@ -407,6 +408,57 @@ public void testFastRemovalWhenFollowerDropsConnections() {
         cluster.assertUniqueLeaderAndExpectedModes();
     }
 
+    @TestLogging("org.elasticsearch.discovery.zen2:TRACE")
+    public void testLagDetectionCausesRejoin() {
+        Cluster cluster = new Cluster(3);
+        cluster.runRandomly(true);
+        cluster.stabilise();
+        ClusterNode leader = cluster.getAnyLeader();
+
+        final VotingConfiguration allNodes = new VotingConfiguration(
+            cluster.clusterNodes.stream().map(cn -> cn.localNode.getId()).collect(Collectors.toSet()));
+
+        // TODO: have the following automatically done as part of a reconfiguration subsystem
+        if (leader.legislator.hasElectionQuorum(allNodes) == false) {
+            logger.info("--> leader does not have a join quorum for the new configuration, abdicating to self");
+            // abdicate to self to acquire all join votes
+            leader.legislator.abdicateTo(leader.localNode);
+
+            cluster.stabilise();
+            leader = cluster.getAnyLeader();
+        }
+
+        logger.info("--> start of reconfiguration to make all nodes into voting nodes");
+
+        leader.handleClientValue(ConsensusStateTests.nextStateWithConfig(leader.legislator.getLastAcceptedState(), allNodes));
+        cluster.deliverNextMessageUntilQuiescent();
+
+        logger.info("--> end of reconfiguration to make all nodes into voting nodes");
+
+        for (final ClusterNode clusterNode : cluster.clusterNodes) {
+            if (clusterNode != leader) {
+                logger.info("--> disconnecting {}", clusterNode.getLocalNode());
+                clusterNode.isConnected = false;
+                break;
+            }
+        }
+
+        leader.handleClientValue(ConsensusStateTests.nextStateWithValue(leader.legislator.getLastAcceptedState(), randomLong()));
+        cluster.deliverNextMessageUntilQuiescent();
+
+        for (final ClusterNode clusterNode : cluster.clusterNodes) {
+            logger.info("--> reconnecting {}", clusterNode.getLocalNode());
+            clusterNode.isConnected = true;
+        }
+
+        cluster.stabilise();
+
+        // Furthermore the first one to wake up causes an election to complete successfully, because we run to quiescence
+        // before waking any other nodes up. Therefore the cluster has a unique leader and all connected nodes are FOLLOWERs.
+        cluster.assertConsistentStates();
+        cluster.assertUniqueLeaderAndExpectedModes();
+    }
+
     class Cluster {
         private final List<ClusterNode> clusterNodes;
         private final List<InFlightMessage> inFlightMessages = new ArrayList<>();
@@ -416,20 +468,38 @@ class Cluster {
         private static final long RANDOM_MODE_DELAY_VARIABILITY = 10000L;
         private long masterServicesTaskId = 0L;
 
-        // How long to wait? The worst case is that a leader just committed a value to all the other nodes, and then
-        // dropped off the network, which would mean that all the other nodes must detect its failure. It takes
-        // CONSENSUS_LEADER_CHECK_RETRY_COUNT_SETTING consecutive leader checks to fail before a follower becomes a
-        // candidates, and with an unresponsive leader each leader check takes up to
-        // CONSENSUS_HEARTBEAT_DELAY_SETTING + CONSENSUS_HEARTBEAT_TIMEOUT_SETTING. After all the retries have
-        // failed, nodes wake up, become candidates, and wait for up to 2 * CONSENSUS_MIN_DELAY_SETTING before
-        // attempting an election. The first election is expected to succeed, however, because we run to quiescence
-        // before waking any other nodes up.
-        private final long DEFAULT_STABILISATION_TIME =
-            (CONSENSUS_HEARTBEAT_DELAY_SETTING.get(Settings.EMPTY).millis() +
-                CONSENSUS_HEARTBEAT_TIMEOUT_SETTING.get(Settings.EMPTY).millis()) *
-                CONSENSUS_LEADER_CHECK_RETRY_COUNT_SETTING.get(Settings.EMPTY) +
-                2 * CONSENSUS_MIN_DELAY_SETTING.get(Settings.EMPTY).millis() +
-                RANDOM_MODE_DELAY_VARIABILITY + DEFAULT_DELAY_VARIABILITY;
+        // How long does it take for the cluster to stabilise?
+
+        // Each heartbeat takes at most this long:
+        private final long DEFAULT_MAX_HEARTBEAT_TIME
+            = CONSENSUS_HEARTBEAT_DELAY_SETTING.get(Settings.EMPTY).millis()
+            + CONSENSUS_HEARTBEAT_TIMEOUT_SETTING.get(Settings.EMPTY).millis()
+            + 2 * DEFAULT_DELAY_VARIABILITY;
+        // Multiple heartbeat failures are needed before the leader's failure is detected:
+        private final long DEFAULT_MAX_FAILURE_DETECTION_TIME
+            = DEFAULT_MAX_HEARTBEAT_TIME * CONSENSUS_LEADER_CHECK_RETRY_COUNT_SETTING.get(Settings.EMPTY);
+        // When stabilising, there are no election collisions, because we run to quiescence before waking any other nodes up.
+        // Therefore elections takes this long:
+        private final long DEFAULT_ELECTION_TIME
+            = 2 * (CONSENSUS_MIN_DELAY_SETTING.get(Settings.EMPTY).millis() + DEFAULT_DELAY_VARIABILITY);
+        // Lag detection takes this long to notice that a follower is lagging the leader:
+        private final long DEFAULT_LAG_DETECTION_TIME
+            = CONSENSUS_HEARTBEAT_DELAY_SETTING.get(Settings.EMPTY).millis() // before the heartbeat that hears about the lag
+            + CONSENSUS_PUBLISH_TIMEOUT_SETTING.get(Settings.EMPTY).millis() // waiting for the lag
+            + 2 * DEFAULT_DELAY_VARIABILITY;
+
+        // Worst cases for stabilisation:
+        //
+        // 1. Just before stabilisation there was a leader which committed a value and then dropped off the network. All nodes must first
+        //    detect its failure and then elect a new one.
+        //
+        // 2. Just before stabilisation the leader published a value which wasn't received by one of the followers. The follower's lag
+        //    detection must notice this omission, then the master must notice the node is rejecting heartbeats and remove it from the cluster,
+        //    then the follower must notice it is missing from the cluster and rejoin.
+        //
+        private final long DEFAULT_STABILISATION_TIME = RANDOM_MODE_DELAY_VARIABILITY
+            + Math.max(DEFAULT_MAX_FAILURE_DETECTION_TIME + DEFAULT_ELECTION_TIME, // case 1
+                DEFAULT_LAG_DETECTION_TIME + DEFAULT_MAX_FAILURE_DETECTION_TIME + DEFAULT_MAX_HEARTBEAT_TIME); // case 2
 
         Cluster(int nodeCount) {
             clusterNodes = new ArrayList<>(nodeCount);
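
Summarising the arithmetic of the new stabilisation bound, the constants above combine as follows. This is a hypothetical helper that merely restates the expressions in the diff; the parameters stand for the corresponding CONSENSUS_* settings and delay variabilities, whose default values are not shown here.

// Hypothetical helper restating DEFAULT_STABILISATION_TIME from the test above; not part of the commit.
final class StabilisationTimeSketch {

    static long stabilisationTime(long heartbeatDelayMillis, long heartbeatTimeoutMillis, long publishTimeoutMillis,
                                  long minDelayMillis, long leaderCheckRetryCount,
                                  long delayVariability, long randomModeDelayVariability) {
        // Each heartbeat takes at most this long:
        long maxHeartbeatTime = heartbeatDelayMillis + heartbeatTimeoutMillis + 2 * delayVariability;
        // Multiple heartbeat failures are needed before the leader's failure is detected:
        long maxFailureDetectionTime = maxHeartbeatTime * leaderCheckRetryCount;
        // An uncontested election takes this long:
        long electionTime = 2 * (minDelayMillis + delayVariability);
        // Noticing that a follower is lagging takes this long:
        long lagDetectionTime = heartbeatDelayMillis + publishTimeoutMillis + 2 * delayVariability;

        return randomModeDelayVariability
            + Math.max(maxFailureDetectionTime + electionTime,                   // case 1: dead leader
                lagDetectionTime + maxFailureDetectionTime + maxHeartbeatTime);  // case 2: lagging follower
    }

    private StabilisationTimeSketch() {}
}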
