Skip to content

Commit d0da025

Browse files
committed
Extend default probe connect/handshake timeouts
Today the discovery phase has a short 1-second timeout for handshaking with a remote node after connecting, which allows it to quickly move on and retry in the case of connecting to something that doesn't respond straight away (e.g. it isn't an Elasticsearch node). This short timeout was necessary when the component was first developed because each connection attempt would block a thread. Since elastic#42636 the connection attempt is now nonblocking so we can apply a more relaxed timeout. If transport security is enabled then our handshake timeout applies to the TLS handshake followed by the Elasticsearch handshake. If the TLS handshake alone takes over a second then the whole handshake times out with a `ConnectTransportException`, but this does not tell us which of the two individual handshakes took so long. TLS handshakes have their own 10-second timeout, which if reached yields a `SslHandshakeTimeoutException` that allows us to distinguish a problem at the TLS level from one at the Elasticsearch level. Therefore this commit extends the discovery probe timeouts.
1 parent 03334b9 commit d0da025

File tree

3 files changed

+14
-7
lines changed

3 files changed

+14
-7
lines changed

docs/reference/modules/discovery/discovery-settings.asciidoc

+2-2
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,12 @@ Defaults to `1s`.
7373
`discovery.probe.connect_timeout`::
7474
(<<static-cluster-setting,Static>>)
7575
Sets how long to wait when attempting to connect to each address. Defaults to
76-
`3s`.
76+
`30s`.
7777

7878
`discovery.probe.handshake_timeout`::
7979
(<<static-cluster-setting,Static>>)
8080
Sets how long to wait when attempting to identify the remote node via a
81-
handshake. Defaults to `1s`.
81+
handshake. Defaults to `30s`.
8282

8383
`discovery.request_peers_timeout`::
8484
(<<static-cluster-setting,Static>>)

server/src/main/java/org/elasticsearch/discovery/HandshakingTransportAddressConnector.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ public class HandshakingTransportAddressConnector implements TransportAddressCon
5151
// connection timeout for probes
5252
public static final Setting<TimeValue> PROBE_CONNECT_TIMEOUT_SETTING =
5353
Setting.timeSetting("discovery.probe.connect_timeout",
54-
TimeValue.timeValueMillis(3000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
54+
TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
5555
// handshake timeout for probes
5656
public static final Setting<TimeValue> PROBE_HANDSHAKE_TIMEOUT_SETTING =
5757
Setting.timeSetting("discovery.probe.handshake_timeout",
58-
TimeValue.timeValueMillis(1000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
58+
TimeValue.timeValueSeconds(30), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
5959

6060
private final TransportService transportService;
6161
private final TimeValue probeConnectTimeout;

server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java

+10-3
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.elasticsearch.common.logging.Loggers;
3434
import org.elasticsearch.common.settings.Settings;
3535
import org.elasticsearch.common.transport.TransportAddress;
36+
import org.elasticsearch.common.unit.TimeValue;
3637
import org.elasticsearch.test.ESTestCase;
3738
import org.elasticsearch.test.MockLogAppender;
3839
import org.elasticsearch.test.junit.annotations.TestLogging;
@@ -53,6 +54,7 @@
5354
import static java.util.Collections.emptyMap;
5455
import static java.util.Collections.emptySet;
5556
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
57+
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING;
5658
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING;
5759
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
5860
import static org.hamcrest.Matchers.equalTo;
@@ -79,6 +81,7 @@ public void startServices() {
7981
final Settings settings = Settings.builder()
8082
.put(NODE_NAME_SETTING.getKey(), "node")
8183
.put(CLUSTER_NAME_SETTING.getKey(), "local-cluster")
84+
.put(PROBE_HANDSHAKE_TIMEOUT_SETTING.getKey(), "1s") // shorter than default for the sake of test speed
8285
.build();
8386
threadPool = new TestThreadPool("node", settings);
8487

@@ -211,6 +214,11 @@ public void testDoesNotConnectToDifferentCluster() throws InterruptedException {
211214
failureListener.assertFailure();
212215
}
213216

217+
public void testTimeoutDefaults() {
218+
assertThat(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30)));
219+
assertThat(PROBE_CONNECT_TIMEOUT_SETTING.get(Settings.EMPTY), equalTo(TimeValue.timeValueSeconds(30)));
220+
}
221+
214222
public void testHandshakeTimesOut() throws InterruptedException {
215223
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
216224
discoveryAddress = getDiscoveryAddress();
@@ -219,15 +227,14 @@ public void testHandshakeTimesOut() throws InterruptedException {
219227

220228
FailureListener failureListener = new FailureListener();
221229
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
222-
Thread.sleep(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY).millis());
223230
failureListener.assertFailure();
224231
}
225232

226233
private TransportAddress getDiscoveryAddress() {
227234
return randomBoolean() ? remoteNode.getAddress() : buildNewFakeTransportAddress();
228235
}
229236

230-
private class FailureListener implements ActionListener<DiscoveryNode> {
237+
private static class FailureListener implements ActionListener<DiscoveryNode> {
231238
final CountDownLatch completionLatch = new CountDownLatch(1);
232239

233240
@Override
@@ -241,7 +248,7 @@ public void onFailure(Exception e) {
241248
}
242249

243250
void assertFailure() throws InterruptedException {
244-
assertTrue(completionLatch.await(30, TimeUnit.SECONDS));
251+
assertTrue(completionLatch.await(15, TimeUnit.SECONDS));
245252
}
246253
}
247254
}

0 commit comments

Comments
 (0)