Skip to content

Commit f44c28b

Browse files
authored
Deprecate and ignore join timeout (#60872)
There is no point in timing out a join attempt any more once a cluster is entirely in 7.x. Timing out and retrying with the same master is pointless, and an in-flight join attempt to one master no longer blocks attempts to join other masters. This commit deprecates this unnecessary setting and removes its effect from the joining process. Relates #60873 which removes this setting in master.
1 parent 235e5ed commit f44c28b

File tree

5 files changed

+19
-30
lines changed

5 files changed

+19
-30
lines changed

docs/reference/modules/discovery/discovery-settings.asciidoc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ compatibility. Support for the old name will be removed in a future version.
4343
Specifies whether {es} should form a multiple-node cluster. By default, {es}
4444
discovers other nodes when forming a cluster and allows other nodes to join
4545
the cluster later. If `discovery.type` is set to `single-node`, {es} forms a
46-
single-node cluster and suppresses the timeouts set by
47-
`cluster.publish.timeout` and `cluster.join.timeout`. For more information
48-
about when you might use this setting, see <<single-node-discovery>>.
46+
single-node cluster and suppresses the timeout set by
47+
`cluster.publish.timeout`. For more information about when you might use
48+
this setting, see <<single-node-discovery>>.
4949

5050
`cluster.initial_master_nodes`::
5151

@@ -196,9 +196,9 @@ or may become unstable or intolerant of certain failures.
196196

197197
`cluster.join.timeout`::
198198

199-
Sets how long a node will wait after sending a request to join a cluster
200-
before it considers the request to have failed and retries, unless
201-
`discovery.type` is set to `single-node`. Defaults to `60s`.
199+
deprecated[7.10, Has no effect in 7.x clusters] Sets how long a node will
200+
wait after sending a request to join a version 6.8 master before it
201+
considers the request to have failed and retries. Defaults to `60s`.
202202

203203
`cluster.max_voting_config_exclusions`::
204204

docs/reference/setup/add-nodes.asciidoc

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,7 @@ to the voting configuration if it is appropriate to do so.
6767

6868
During master election or when joining an existing formed cluster, a node
6969
sends a join request to the master in order to be officially added to the
70-
cluster. You can use the `cluster.join.timeout` setting to configure how long a
71-
node waits after sending a request to join a cluster. Its default value is `30s`.
72-
See <<modules-discovery-settings>>.
70+
cluster.
7371

7472
[discrete]
7573
[[modules-discovery-removing-nodes]]

server/src/internalClusterTest/java/org/elasticsearch/action/support/master/IndexingMasterFailoverIT.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,11 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
5252
public void testMasterFailoverDuringIndexingWithMappingChanges() throws Throwable {
5353
logger.info("--> start 4 nodes, 3 master, 1 data");
5454

55-
final Settings sharedSettings = Settings.builder()
56-
.put("cluster.join.timeout", "10s") // still long to induce failures but not too long so test won't time out
57-
.build();
58-
5955
internalCluster().setBootstrapMasterNodeIndex(2);
6056

61-
internalCluster().startMasterOnlyNodes(3, sharedSettings);
57+
internalCluster().startMasterOnlyNodes(3, Settings.EMPTY);
6258

63-
String dataNode = internalCluster().startDataOnlyNode(sharedSettings);
59+
String dataNode = internalCluster().startDataOnlyNode(Settings.EMPTY);
6460

6561
logger.info("--> wait for all nodes to join the cluster");
6662
ensureStableCluster(4);

server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import org.elasticsearch.cluster.routing.RerouteService;
3535
import org.elasticsearch.cluster.routing.allocation.AllocationService;
3636
import org.elasticsearch.cluster.service.MasterService;
37-
import org.elasticsearch.common.Nullable;
3837
import org.elasticsearch.common.Priority;
3938
import org.elasticsearch.common.collect.Tuple;
4039
import org.elasticsearch.common.io.stream.StreamInput;
@@ -43,7 +42,6 @@
4342
import org.elasticsearch.common.unit.TimeValue;
4443
import org.elasticsearch.discovery.zen.MembershipAction;
4544
import org.elasticsearch.discovery.zen.ZenDiscovery;
46-
import org.elasticsearch.discovery.DiscoveryModule;
4745
import org.elasticsearch.monitor.NodeHealthService;
4846
import org.elasticsearch.monitor.StatusInfo;
4947
import org.elasticsearch.threadpool.ThreadPool;
@@ -83,22 +81,21 @@ public class JoinHelper {
8381
public static final String VALIDATE_JOIN_ACTION_NAME = "internal:cluster/coordination/join/validate";
8482
public static final String START_JOIN_ACTION_NAME = "internal:cluster/coordination/start_join";
8583

86-
// the timeout for each join attempt
84+
// the timeout for Zen1 join attempts
8785
public static final Setting<TimeValue> JOIN_TIMEOUT_SETTING =
8886
Setting.timeSetting("cluster.join.timeout",
89-
TimeValue.timeValueMillis(60000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
87+
TimeValue.timeValueMillis(60000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope, Setting.Property.Deprecated);
9088

9189
private final MasterService masterService;
9290
private final TransportService transportService;
9391
private final JoinTaskExecutor joinTaskExecutor;
9492

95-
@Nullable // if using single-node discovery
96-
private final TimeValue joinTimeout;
93+
private final TimeValue joinTimeout; // only used for Zen1 joining
9794
private final NodeHealthService nodeHealthService;
9895

9996
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());
10097

101-
private AtomicReference<FailedJoinAttempt> lastFailedJoinAttempt = new AtomicReference<>();
98+
private final AtomicReference<FailedJoinAttempt> lastFailedJoinAttempt = new AtomicReference<>();
10299

103100
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService,
104101
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier,
@@ -108,7 +105,7 @@ public class JoinHelper {
108105
this.masterService = masterService;
109106
this.transportService = transportService;
110107
this.nodeHealthService = nodeHealthService;
111-
this.joinTimeout = DiscoveryModule.isSingleNodeDiscovery(settings) ? null : JOIN_TIMEOUT_SETTING.get(settings);
108+
this.joinTimeout = JOIN_TIMEOUT_SETTING.get(settings);
112109
this.joinTaskExecutor = new JoinTaskExecutor(settings, allocationService, logger, rerouteService) {
113110

114111
@Override
@@ -286,15 +283,17 @@ public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join>
286283
logger.debug("attempting to join {} with {}", destination, joinRequest);
287284
final String actionName;
288285
final TransportRequest transportRequest;
286+
final TransportRequestOptions transportRequestOptions;
289287
if (Coordinator.isZen1Node(destination)) {
290288
actionName = MembershipAction.DISCOVERY_JOIN_ACTION_NAME;
291289
transportRequest = new MembershipAction.JoinRequest(transportService.getLocalNode());
290+
transportRequestOptions = TransportRequestOptions.builder().withTimeout(joinTimeout).build();
292291
} else {
293292
actionName = JOIN_ACTION_NAME;
294293
transportRequest = joinRequest;
294+
transportRequestOptions = TransportRequestOptions.EMPTY;
295295
}
296-
transportService.sendRequest(destination, actionName, transportRequest,
297-
TransportRequestOptions.builder().withTimeout(joinTimeout).build(),
296+
transportService.sendRequest(destination, actionName, transportRequest, transportRequestOptions,
298297
new TransportResponseHandler<Empty>() {
299298
@Override
300299
public Empty read(StreamInput in) {
@@ -363,9 +362,7 @@ public void sendValidateJoinRequest(DiscoveryNode node, ClusterState state, Acti
363362
} else {
364363
actionName = VALIDATE_JOIN_ACTION_NAME;
365364
}
366-
transportService.sendRequest(node, actionName,
367-
new ValidateJoinRequest(state),
368-
TransportRequestOptions.builder().withTimeout(joinTimeout).build(),
365+
transportService.sendRequest(node, actionName, new ValidateJoinRequest(state),
369366
new ActionListenerResponseHandler<>(listener, i -> Empty.INSTANCE, ThreadPool.Names.GENERIC));
370367
}
371368

server/src/test/java/org/elasticsearch/discovery/AbstractDisruptionTestCase.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import org.elasticsearch.cluster.block.ClusterBlockLevel;
2525
import org.elasticsearch.cluster.coordination.Coordinator;
2626
import org.elasticsearch.cluster.coordination.FollowersChecker;
27-
import org.elasticsearch.cluster.coordination.JoinHelper;
2827
import org.elasticsearch.cluster.coordination.LeaderChecker;
2928
import org.elasticsearch.cluster.node.DiscoveryNodes;
3029
import org.elasticsearch.common.Nullable;
@@ -126,7 +125,6 @@ List<String> startCluster(int numberOfNodes) {
126125
.put(LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
127126
.put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "5s") // for hitting simulated network failures quickly
128127
.put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
129-
.put(JoinHelper.JOIN_TIMEOUT_SETTING.getKey(), "10s") // still long to induce failures but not too long so test won't time out
130128
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "5s") // <-- for hitting simulated network failures quickly
131129
.put(TransportSettings.CONNECT_TIMEOUT.getKey(), "10s") // Network delay disruption waits for the min between this
132130
// value and the time of disruption and does not recover immediately

0 commit comments

Comments
 (0)