| 1 | +/* |
| 2 | + * Licensed to Elasticsearch under one or more contributor |
| 3 | + * license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright |
| 5 | + * ownership. Elasticsearch licenses this file to you under |
| 6 | + * the Apache License, Version 2.0 (the "License"); you may |
| 7 | + * not use this file except in compliance with the License. |
| 8 | + * You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +package org.elasticsearch.cluster.coordination; |
| 21 | + |
| 22 | +import org.apache.logging.log4j.message.ParameterizedMessage; |
| 23 | +import org.elasticsearch.cluster.coordination.Coordinator.Mode; |
| 24 | +import org.elasticsearch.cluster.node.DiscoveryNode; |
| 25 | +import org.elasticsearch.cluster.node.DiscoveryNodes; |
| 26 | +import org.elasticsearch.common.component.AbstractComponent; |
| 27 | +import org.elasticsearch.common.io.stream.StreamInput; |
| 28 | +import org.elasticsearch.common.io.stream.StreamOutput; |
| 29 | +import org.elasticsearch.common.settings.Setting; |
| 30 | +import org.elasticsearch.common.settings.Settings; |
| 31 | +import org.elasticsearch.common.unit.TimeValue; |
| 32 | +import org.elasticsearch.common.util.concurrent.AbstractRunnable; |
| 33 | +import org.elasticsearch.threadpool.ThreadPool.Names; |
| 34 | +import org.elasticsearch.transport.ConnectTransportException; |
| 35 | +import org.elasticsearch.transport.TransportChannel; |
| 36 | +import org.elasticsearch.transport.TransportException; |
| 37 | +import org.elasticsearch.transport.TransportRequest; |
| 38 | +import org.elasticsearch.transport.TransportRequestOptions; |
| 39 | +import org.elasticsearch.transport.TransportRequestOptions.Type; |
| 40 | +import org.elasticsearch.transport.TransportResponse.Empty; |
| 41 | +import org.elasticsearch.transport.TransportResponseHandler; |
| 42 | +import org.elasticsearch.transport.TransportService; |
| 43 | + |
| 44 | +import java.io.IOException; |
| 45 | +import java.util.HashSet; |
| 46 | +import java.util.Map; |
| 47 | +import java.util.Objects; |
| 48 | +import java.util.Set; |
| 49 | +import java.util.function.Consumer; |
| 50 | +import java.util.function.Predicate; |
| 51 | + |
| 52 | +import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap; |
| 53 | + |
| 54 | +/** |
| 55 | + * The FollowersChecker allows a leader to check that its followers are still connected and healthy. On deciding that a follower has |
| 56 | + * failed, the leader will remove it from the cluster. We are fairly lenient, possibly allowing multiple checks to fail before |
| 57 | + * considering a follower to be faulty, so that a brief network partition or a long GC pause does not trigger the removal of a node |
| 58 | + * and the consequent shard reallocation. |
| 59 | + */ |
| 60 | +public class FollowersChecker extends AbstractComponent { |
| 61 | + |
| 62 | + public static final String FOLLOWER_CHECK_ACTION_NAME = "internal:coordination/fault_detection/follower_check"; |
| 63 | + |
| 64 | + // the time between checks sent to each node |
| 65 | + public static final Setting<TimeValue> FOLLOWER_CHECK_INTERVAL_SETTING = |
| 66 | + Setting.timeSetting("cluster.fault_detection.follower_check.interval", |
| 67 | + TimeValue.timeValueMillis(1000), TimeValue.timeValueMillis(100), Setting.Property.NodeScope); |
| 68 | + |
| 69 | + // the timeout for each check sent to each node |
| 70 | + public static final Setting<TimeValue> FOLLOWER_CHECK_TIMEOUT_SETTING = |
| 71 | + Setting.timeSetting("cluster.fault_detection.follower_check.timeout", |
| 72 | + TimeValue.timeValueMillis(30000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope); |
| 73 | + |
| 74 | + // the number of consecutive checks that must fail before the follower is considered faulty. |
| 75 | + public static final Setting<Integer> FOLLOWER_CHECK_RETRY_COUNT_SETTING = |
| 76 | + Setting.intSetting("cluster.fault_detection.follower_check.retry_count", 3, 1, Setting.Property.NodeScope); |
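| | + // With the defaults above (1s interval, 30s timeout, 3 retries) an unresponsive follower is detected after roughly |
| | + // retry_count * (timeout + interval), i.e. a little over 90 seconds of silence, whereas a follower whose connection has |
| | + + // dropped fails its next check with a ConnectTransportException and is detected without further retries. |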
| 77 | + |
| 78 | + private final TimeValue followerCheckInterval; |
| 79 | + private final TimeValue followerCheckTimeout; |
| 80 | + private final int followerCheckRetryCount; |
| 81 | + private final Consumer<DiscoveryNode> onNodeFailure; |
| 82 | + private final Consumer<FollowerCheckRequest> handleRequestAndUpdateState; |
| 83 | + |
| 84 | + private final Object mutex = new Object(); // protects writes to this state; read access does not need sync |
| 85 | + private final Map<DiscoveryNode, FollowerChecker> followerCheckers = newConcurrentMap(); |
| 86 | + private final Set<DiscoveryNode> faultyNodes = new HashSet<>(); |
| 87 | + |
| 88 | + private final TransportService transportService; |
| 89 | + |
| 90 | + private volatile FastResponseState fastResponseState; |
| 91 | + |
| 92 | + public FollowersChecker(Settings settings, TransportService transportService, |
| 93 | + Consumer<FollowerCheckRequest> handleRequestAndUpdateState, |
| 94 | + Consumer<DiscoveryNode> onNodeFailure) { |
| 95 | + super(settings); |
| 96 | + this.transportService = transportService; |
| 97 | + this.handleRequestAndUpdateState = handleRequestAndUpdateState; |
| 98 | + this.onNodeFailure = onNodeFailure; |
| 99 | + |
| 100 | + followerCheckInterval = FOLLOWER_CHECK_INTERVAL_SETTING.get(settings); |
| 101 | + followerCheckTimeout = FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings); |
| 102 | + followerCheckRetryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings); |
| 103 | + |
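| | + // start out as a candidate in term 0 so that incoming follower checks take the slow path until |
| | + // updateFastResponseState is called with the real term and mode |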
| 104 | + updateFastResponseState(0, Mode.CANDIDATE); |
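| | + // follower checks are handled on the transport thread (Names.SAME): the fast path in handleFollowerCheck is a single |
| | + // volatile read and response, and anything more expensive is forked to the generic threadpool there |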
| 105 | + transportService.registerRequestHandler(FOLLOWER_CHECK_ACTION_NAME, Names.SAME, FollowerCheckRequest::new, |
| 106 | + (request, transportChannel, task) -> handleFollowerCheck(request, transportChannel)); |
| 107 | + } |
| 108 | + |
| 109 | + /** |
| 110 | + * Update the set of known nodes, starting to check any newly-added ones and ceasing to check any that are no longer known. |
| 111 | + */ |
| 112 | + public void setCurrentNodes(DiscoveryNodes discoveryNodes) { |
| 113 | + synchronized (mutex) { |
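| | + // forget nodes that are no longer in the cluster, then start a checker for each newly-added node |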
| 114 | + final Predicate<DiscoveryNode> isUnknownNode = n -> discoveryNodes.nodeExists(n) == false; |
| 115 | + followerCheckers.keySet().removeIf(isUnknownNode); |
| 116 | + faultyNodes.removeIf(isUnknownNode); |
| 117 | + |
| 118 | + for (final DiscoveryNode discoveryNode : discoveryNodes) { |
| 119 | + if (discoveryNode.equals(discoveryNodes.getLocalNode()) == false |
| 120 | + && followerCheckers.containsKey(discoveryNode) == false |
| 121 | + && faultyNodes.contains(discoveryNode) == false) { |
| 122 | + |
| 123 | + final FollowerChecker followerChecker = new FollowerChecker(discoveryNode); |
| 124 | + followerCheckers.put(discoveryNode, followerChecker); |
| 125 | + followerChecker.start(); |
| 126 | + } |
| 127 | + } |
| 128 | + } |
| 129 | + } |
| 130 | + |
| 131 | + /** |
| 132 | + * The system is normally in a state in which every follower remains a follower of a stable leader in a single term for an extended |
| 133 | + * period of time, so the response to every follower check is the same. We handle this common case with a single volatile read |
| 134 | + * entirely on the network thread; only if that fast path does not apply do we do any work in the background. Callers keep the |
| 135 | + * fast path up to date by invoking this method whenever the local term or mode changes. |
| 136 | + */ |
| 137 | + public void updateFastResponseState(final long term, final Mode mode) { |
| 138 | + fastResponseState = new FastResponseState(term, mode); |
| 139 | + } |
| 140 | + |
| 141 | + private void handleFollowerCheck(FollowerCheckRequest request, TransportChannel transportChannel) throws IOException { |
| 142 | + FastResponseState responder = this.fastResponseState; |
| 143 | + |
| 144 | + if (responder.mode == Mode.FOLLOWER && responder.term == request.term) { |
| 145 | + // TODO trigger a term bump if we voted for a different leader in this term |
| 146 | + logger.trace("responding to {} on fast path", request); |
| 147 | + transportChannel.sendResponse(Empty.INSTANCE); |
| 148 | + return; |
| 149 | + } |
| 150 | + |
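| | + // not a follower of this leader in this term: reject checks from a leader whose term is stale, and hand anything else |
| | + // (a newer term, or this term before we have become a follower) off to the full handler below |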
| 151 | + if (request.term < responder.term) { |
| 152 | + throw new CoordinationStateRejectedException("rejecting " + request + " since local state is " + this); |
| 153 | + } |
| 154 | + |
| 155 | + transportService.getThreadPool().generic().execute(new AbstractRunnable() { |
| 156 | + @Override |
| 157 | + protected void doRun() throws IOException { |
| 158 | + logger.trace("responding to {} on slow path", request); |
| 159 | + try { |
| 160 | + handleRequestAndUpdateState.accept(request); |
| 161 | + } catch (Exception e) { |
| 162 | + transportChannel.sendResponse(e); |
| 163 | + return; |
| 164 | + } |
| 165 | + transportChannel.sendResponse(Empty.INSTANCE); |
| 166 | + } |
| 167 | + |
| 168 | + @Override |
| 169 | + public void onFailure(Exception e) { |
| 170 | + logger.debug(new ParameterizedMessage("exception while responding to {}", request), e); |
| 171 | + } |
| 172 | + |
| 173 | + @Override |
| 174 | + public String toString() { |
| 175 | + return "slow path response to " + request; |
| 176 | + } |
| 177 | + }); |
| 178 | + } |
| 179 | + |
| 180 | + // TODO in the PoC a faulty node was considered non-faulty again if it sent us a PeersRequest: |
| 181 | + // - node disconnects, detected faulty, removal is enqueued |
| 182 | + // - node reconnects, pings us, finds we are master, requests to join, all before removal is applied |
| 183 | + // - join is processed before removal, but we do not publish to known-faulty nodes so the joining node does not receive this publication |
| 184 | + // - it doesn't start its leader checker since it receives nothing to cause it to become a follower |
| 185 | + // Apparently this meant that it remained a candidate for too long, leading to a test failure. At the time this logic was added, we did |
| 186 | + // not have gossip-based discovery, which would (I think) have retried this joining process a short time later. It's therefore |
| 187 | + // possible that this is no longer required, so it's omitted here until we can be sure whether it's necessary. |
| 188 | + |
| 189 | + /** |
| 190 | + * @return nodes in the current cluster state which have failed their follower checks. |
| 191 | + */ |
| 192 | + public Set<DiscoveryNode> getFaultyNodes() { |
| 193 | + synchronized (mutex) { |
| 194 | + return new HashSet<>(this.faultyNodes); |
| 195 | + } |
| 196 | + } |
| 197 | + |
| 198 | + @Override |
| 199 | + public String toString() { |
| 200 | + return "FollowersChecker{" + |
| 201 | + "followerCheckInterval=" + followerCheckInterval + |
| 202 | + ", followerCheckTimeout=" + followerCheckTimeout + |
| 203 | + ", followerCheckRetryCount=" + followerCheckRetryCount + |
| 204 | + ", followerCheckers=" + followerCheckers + |
| 205 | + ", faultyNodes=" + faultyNodes + |
| 206 | + ", fastResponseState=" + fastResponseState + |
| 207 | + '}'; |
| 208 | + } |
| 209 | + |
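| | + // an immutable (term, mode) snapshot, so that a single volatile read of fastResponseState observes a consistent pair |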
| 210 | + static class FastResponseState { |
| 211 | + final long term; |
| 212 | + final Mode mode; |
| 213 | + |
| 214 | + FastResponseState(final long term, final Mode mode) { |
| 215 | + this.term = term; |
| 216 | + this.mode = mode; |
| 217 | + } |
| 218 | + |
| 219 | + @Override |
| 220 | + public String toString() { |
| 221 | + return "FastResponseState{" + |
| 222 | + "term=" + term + |
| 223 | + ", mode=" + mode + |
| 224 | + '}'; |
| 225 | + } |
| 226 | + } |
| 227 | + |
| 228 | + /** |
| 229 | + * A checker for an individual follower. |
| 230 | + */ |
| 231 | + private class FollowerChecker { |
| 232 | + private final DiscoveryNode discoveryNode; |
| 233 | + private int failureCountSinceLastSuccess; |
| 234 | + |
| 235 | + FollowerChecker(DiscoveryNode discoveryNode) { |
| 236 | + this.discoveryNode = discoveryNode; |
| 237 | + } |
| 238 | + |
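| | + // a checker is only "running" while it is still the checker registered for its node; removing that map entry is how |
| | + // checks are cancelled |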
| 239 | + private boolean running() { |
| 240 | + return this == followerCheckers.get(discoveryNode); |
| 241 | + } |
| 242 | + |
| 243 | + void start() { |
| 244 | + assert running(); |
| 245 | + handleWakeUp(); |
| 246 | + } |
| 247 | + |
| 248 | + private void handleWakeUp() { |
| 249 | + if (running() == false) { |
| 250 | + logger.trace("handleWakeUp: not running"); |
| 251 | + return; |
| 252 | + } |
| 253 | + |
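| | + // each check carries our current term so that the receiving node can verify it is following this leader in this term |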
| 254 | + final FollowerCheckRequest request = new FollowerCheckRequest(fastResponseState.term); |
| 255 | + logger.trace("handleWakeUp: checking {} with {}", discoveryNode, request); |
| 256 | + transportService.sendRequest(discoveryNode, FOLLOWER_CHECK_ACTION_NAME, request, |
| 257 | + TransportRequestOptions.builder().withTimeout(followerCheckTimeout).withType(Type.PING).build(), |
| 258 | + new TransportResponseHandler<Empty>() { |
| 259 | + @Override |
| 260 | + public void handleResponse(Empty response) { |
| 261 | + if (running() == false) { |
| 262 | + logger.trace("{} no longer running", FollowerChecker.this); |
| 263 | + return; |
| 264 | + } |
| 265 | + |
| 266 | + failureCountSinceLastSuccess = 0; |
| 267 | + logger.trace("{} check successful", FollowerChecker.this); |
| 268 | + scheduleNextWakeUp(); |
| 269 | + } |
| 270 | + |
| 271 | + @Override |
| 272 | + public void handleException(TransportException exp) { |
| 273 | + if (running() == false) { |
| 274 | + logger.debug(new ParameterizedMessage("{} no longer running", FollowerChecker.this), exp); |
| 275 | + return; |
| 276 | + } |
| 277 | + |
| 278 | + failureCountSinceLastSuccess++; |
| 279 | + |
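| | + // three possible outcomes: the retry budget is exhausted or the connection has dropped, in which case the node is |
| | + // marked as faulty below, or else this was a one-off failure and we simply retry after the usual interval |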
| 280 | + if (failureCountSinceLastSuccess >= followerCheckRetryCount) { |
| 281 | + logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp); |
| 282 | + } else if (exp instanceof ConnectTransportException |
| 283 | + || exp.getCause() instanceof ConnectTransportException) { |
| 284 | + logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp); |
| 285 | + } else { |
| 286 | + logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp); |
| 287 | + scheduleNextWakeUp(); |
| 288 | + return; |
| 289 | + } |
| 290 | + |
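| | + // mark the node as faulty and deregister this checker under the mutex, then notify the coordinator outside the |
| | + // mutex so it can remove the node from the cluster |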
| 291 | + transportService.getThreadPool().generic().execute(new Runnable() { |
| 292 | + @Override |
| 293 | + public void run() { |
| 294 | + synchronized (mutex) { |
| 295 | + if (running() == false) { |
| 296 | + logger.debug("{} no longer running, not marking faulty", FollowerChecker.this); |
| 297 | + return; |
| 298 | + } |
| 299 | + faultyNodes.add(discoveryNode); |
| 300 | + followerCheckers.remove(discoveryNode); |
| 301 | + } |
| 302 | + onNodeFailure.accept(discoveryNode); |
| 303 | + } |
| 304 | + |
| 305 | + @Override |
| 306 | + public String toString() { |
| 307 | + return "detected failure of " + discoveryNode; |
| 308 | + } |
| 309 | + }); |
| 310 | + } |
| 311 | + |
| 313 | + @Override |
| 314 | + public String executor() { |
| 315 | + return Names.SAME; |
| 316 | + } |
| 317 | + }); |
| 318 | + } |
| 319 | + |
| 320 | + private void scheduleNextWakeUp() { |
| 321 | + transportService.getThreadPool().schedule(followerCheckInterval, Names.SAME, new Runnable() { |
| 322 | + @Override |
| 323 | + public void run() { |
| 324 | + handleWakeUp(); |
| 325 | + } |
| 326 | + |
| 327 | + @Override |
| 328 | + public String toString() { |
| 329 | + return FollowerChecker.this + "::handleWakeUp"; |
| 330 | + } |
| 331 | + }); |
| 332 | + } |
| 333 | + |
| 334 | + @Override |
| 335 | + public String toString() { |
| 336 | + return "FollowerChecker{" + |
| 337 | + "discoveryNode=" + discoveryNode + |
| 338 | + ", failureCountSinceLastSuccess=" + failureCountSinceLastSuccess + |
| 339 | + ", [" + FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey() + "]=" + followerCheckRetryCount + |
| 340 | + '}'; |
| 341 | + } |
| 342 | + } |
| 343 | + |
| 344 | + public static class FollowerCheckRequest extends TransportRequest { |
| 345 | + |
| 346 | + private final long term; |
| 347 | + |
| 348 | + public long getTerm() { |
| 349 | + return term; |
| 350 | + } |
| 351 | + |
| 352 | + public FollowerCheckRequest(final long term) { |
| 353 | + this.term = term; |
| 354 | + } |
| 355 | + |
| 356 | + public FollowerCheckRequest(final StreamInput in) throws IOException { |
| 357 | + super(in); |
| 358 | + term = in.readLong(); |
| 359 | + } |
| 360 | + |
| 361 | + @Override |
| 362 | + public void writeTo(final StreamOutput out) throws IOException { |
| 363 | + super.writeTo(out); |
| 364 | + out.writeLong(term); |
| 365 | + } |
| 366 | + |
| 367 | + @Override |
| 368 | + public boolean equals(Object o) { |
| 369 | + if (this == o) return true; |
| 370 | + if (o == null || getClass() != o.getClass()) return false; |
| 371 | + FollowerCheckRequest that = (FollowerCheckRequest) o; |
| 372 | + return term == that.term; |
| 373 | + } |
| 374 | + |
| 375 | + @Override |
| 376 | + public String toString() { |
| 377 | + return "FollowerCheckRequest{" + |
| 378 | + "term=" + term + |
| 379 | + '}'; |
| 380 | + } |
| 381 | + |
| 382 | + @Override |
| 383 | + public int hashCode() { |
| 384 | + return Objects.hash(term); |
| 385 | + } |
| 386 | + } |
| 387 | +} |