Skip to content

Commit 7857109

Browse files
committed
Log when probe succeeds but full connection fails (elastic#51304)
It is permitted for nodes to accept transport connections at addresses other than their publish address, which allows a good deal of flexibility when configuring discovery. However, it is not unusual for users to misconfigure nodes to pick a publish address which is inaccessible to other nodes. We see this happen a lot if the nodes are on different networks separated by a proxy, or if the nodes are running in Docker with the wrong kind of network config. In this case we offer no useful feedback to the user unless they enable TRACE-level logs. It's particularly tricky to diagnose because if we test connectivity between the nodes (using their discovery addresses) then all will appear well. This commit adds a WARN-level log if this kind of misconfiguration is detected: the probe connection has succeeded (to indicate that we are really talking to a healthy Elasticsearch node) but the followup connection attempt fails. It also tidies up some loose ends in `HandshakingTransportAddressConnector`, removing some TODOs that need not be completed, and registering its accidentally-unregistered timeout settings.
1 parent 84664e8 commit 7857109

File tree

3 files changed

+87
-12
lines changed

3 files changed

+87
-12
lines changed

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

+3
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
import org.elasticsearch.common.util.concurrent.ThreadContext;
7070
import org.elasticsearch.discovery.DiscoveryModule;
7171
import org.elasticsearch.discovery.DiscoverySettings;
72+
import org.elasticsearch.discovery.HandshakingTransportAddressConnector;
7273
import org.elasticsearch.discovery.PeerFinder;
7374
import org.elasticsearch.discovery.SeedHostsResolver;
7475
import org.elasticsearch.discovery.SettingsBasedSeedHostsProvider;
@@ -529,6 +530,8 @@ public void apply(Settings value, Settings current, Settings previous) {
529530
ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING,
530531
ClusterBootstrapService.UNCONFIGURED_BOOTSTRAP_TIMEOUT_SETTING,
531532
LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING,
533+
HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING,
534+
HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING,
532535
DiscoveryUpgradeService.BWC_PING_TIMEOUT_SETTING,
533536
DiscoveryUpgradeService.ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING)));
534537

server/src/main/java/org/elasticsearch/discovery/HandshakingTransportAddressConnector.java

+12-4
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ public void connectToRemoteMasterNode(TransportAddress transportAddress, ActionL
7474

7575
@Override
7676
protected void doRun() {
77-
// TODO if transportService is already connected to this address then skip the handshaking
77+
// We could skip this if the transportService were already connected to the given address, but the savings would be minimal
78+
// so we open a new connection anyway.
7879

7980
final DiscoveryNode targetNode = new DiscoveryNode("", transportAddress.toString(),
8081
UUIDs.randomBase64UUID(Randomness.get()), // generated deterministically for reproducible tests
@@ -101,21 +102,28 @@ protected void innerOnResponse(DiscoveryNode remoteNode) {
101102
IOUtils.closeWhileHandlingException(connection);
102103

103104
if (remoteNode.equals(transportService.getLocalNode())) {
104-
// TODO cache this result for some time? forever?
105105
listener.onFailure(new ConnectTransportException(remoteNode, "local node found"));
106106
} else if (remoteNode.isMasterNode() == false) {
107-
// TODO cache this result for some time?
108107
listener.onFailure(new ConnectTransportException(remoteNode, "non-master-eligible node found"));
109108
} else {
110109
transportService.connectToNode(remoteNode, new ActionListener<Void>() {
111110
@Override
112111
public void onResponse(Void ignored) {
113-
logger.trace("[{}] full connection successful: {}", thisConnectionAttempt, remoteNode);
112+
logger.trace("[{}] completed full connection with [{}]",
113+
thisConnectionAttempt, remoteNode);
114114
listener.onResponse(remoteNode);
115115
}
116116

117117
@Override
118118
public void onFailure(Exception e) {
119+
// we opened a connection and successfully performed a handshake, so we're definitely
120+
// talking to a master-eligible node with a matching cluster name and a good version,
121+
// but the attempt to open a full connection to its publish address failed; a common
122+
// reason is that the remote node is listening on 0.0.0.0 but has made an inappropriate
123+
// choice for its publish address.
124+
logger.warn(new ParameterizedMessage(
125+
"[{}] completed handshake with [{}] but followup connection failed",
126+
thisConnectionAttempt, remoteNode), e);
119127
listener.onFailure(e);
120128
}
121129
});

server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java

+72-8
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,27 @@
1919

2020
package org.elasticsearch.discovery;
2121

22+
import org.apache.logging.log4j.Level;
23+
import org.apache.logging.log4j.LogManager;
24+
import org.apache.logging.log4j.Logger;
2225
import org.apache.lucene.util.SetOnce;
26+
import org.elasticsearch.ElasticsearchException;
2327
import org.elasticsearch.Version;
2428
import org.elasticsearch.action.ActionListener;
2529
import org.elasticsearch.cluster.ClusterName;
2630
import org.elasticsearch.cluster.node.DiscoveryNode;
31+
import org.elasticsearch.common.Nullable;
32+
import org.elasticsearch.common.logging.Loggers;
2733
import org.elasticsearch.common.settings.Settings;
34+
import org.elasticsearch.common.transport.TransportAddress;
2835
import org.elasticsearch.test.ESTestCase;
36+
import org.elasticsearch.test.MockLogAppender;
37+
import org.elasticsearch.test.junit.annotations.TestLogging;
2938
import org.elasticsearch.test.transport.MockTransport;
3039
import org.elasticsearch.threadpool.TestThreadPool;
3140
import org.elasticsearch.threadpool.ThreadPool;
41+
import org.elasticsearch.transport.ConnectTransportException;
42+
import org.elasticsearch.transport.TransportException;
3243
import org.elasticsearch.transport.TransportRequest;
3344
import org.elasticsearch.transport.TransportService;
3445
import org.elasticsearch.transport.TransportService.HandshakeResponse;
@@ -44,17 +55,22 @@
4455
import static org.elasticsearch.discovery.HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING;
4556
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
4657
import static org.hamcrest.Matchers.equalTo;
58+
import static org.hamcrest.Matchers.notNullValue;
59+
import static org.hamcrest.Matchers.oneOf;
4760

4861
public class HandshakingTransportAddressConnectorTests extends ESTestCase {
4962

5063
private DiscoveryNode remoteNode;
64+
private TransportAddress discoveryAddress;
5165
private TransportService transportService;
5266
private ThreadPool threadPool;
5367
private String remoteClusterName;
5468
private HandshakingTransportAddressConnector handshakingTransportAddressConnector;
5569
private DiscoveryNode localNode;
5670

5771
private boolean dropHandshake;
72+
@Nullable // unless we want the full connection to fail
73+
private TransportException fullConnectionFailure;
5874

5975
@Before
6076
public void startServices() {
@@ -66,17 +82,24 @@ public void startServices() {
6682
threadPool = new TestThreadPool("node", settings);
6783

6884
remoteNode = null;
85+
discoveryAddress = null;
6986
remoteClusterName = null;
7087
dropHandshake = false;
88+
fullConnectionFailure = null;
7189

7290
final MockTransport mockTransport = new MockTransport() {
7391
@Override
7492
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
7593
super.onSendRequest(requestId, action, request, node);
7694
assertThat(action, equalTo(TransportService.HANDSHAKE_ACTION_NAME));
77-
assertEquals(remoteNode.getAddress(), node.getAddress());
95+
assertThat(discoveryAddress, notNullValue());
96+
assertThat(node.getAddress(), oneOf(discoveryAddress, remoteNode.getAddress()));
7897
if (dropHandshake == false) {
79-
handleResponse(requestId, new HandshakeResponse(remoteNode, new ClusterName(remoteClusterName), Version.CURRENT));
98+
if (fullConnectionFailure != null && node.getAddress().equals(remoteNode.getAddress())) {
99+
handleError(requestId, fullConnectionFailure);
100+
} else {
101+
handleResponse(requestId, new HandshakeResponse(remoteNode, new ClusterName(remoteClusterName), Version.CURRENT));
102+
}
80103
}
81104
}
82105
};
@@ -91,7 +114,7 @@ protected void onSendRequest(long requestId, String action, TransportRequest req
91114
}
92115

93116
@After
94-
public void stopServices() throws InterruptedException {
117+
public void stopServices() {
95118
transportService.stop();
96119
terminate(threadPool);
97120
}
@@ -102,8 +125,9 @@ public void testConnectsToMasterNode() throws InterruptedException {
102125

103126
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
104127
remoteClusterName = "local-cluster";
128+
discoveryAddress = getDiscoveryAddress();
105129

106-
handshakingTransportAddressConnector.connectToRemoteMasterNode(remoteNode.getAddress(), new ActionListener<DiscoveryNode>() {
130+
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, new ActionListener<DiscoveryNode>() {
107131
@Override
108132
public void onResponse(DiscoveryNode discoveryNode) {
109133
receivedNode.set(discoveryNode);
@@ -120,44 +144,84 @@ public void onFailure(Exception e) {
120144
assertEquals(remoteNode, receivedNode.get());
121145
}
122146

147+
@TestLogging(reason="ensure logging happens", value="org.elasticsearch.discovery.HandshakingTransportAddressConnector:INFO")
148+
public void testLogsFullConnectionFailureAfterSuccessfulHandshake() throws Exception {
149+
150+
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
151+
remoteClusterName = "local-cluster";
152+
discoveryAddress = buildNewFakeTransportAddress();
153+
154+
fullConnectionFailure = new ConnectTransportException(remoteNode, "simulated", new ElasticsearchException("root cause"));
155+
156+
FailureListener failureListener = new FailureListener();
157+
158+
MockLogAppender mockAppender = new MockLogAppender();
159+
mockAppender.start();
160+
mockAppender.addExpectation(
161+
new MockLogAppender.SeenEventExpectation(
162+
"message",
163+
HandshakingTransportAddressConnector.class.getCanonicalName(),
164+
Level.WARN,
165+
"*completed handshake with [*] but followup connection failed*"));
166+
Logger targetLogger = LogManager.getLogger(HandshakingTransportAddressConnector.class);
167+
Loggers.addAppender(targetLogger, mockAppender);
168+
169+
try {
170+
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
171+
failureListener.assertFailure();
172+
mockAppender.assertAllExpectationsMatched();
173+
} finally {
174+
Loggers.removeAppender(targetLogger, mockAppender);
175+
mockAppender.stop();
176+
}
177+
}
178+
123179
public void testDoesNotConnectToNonMasterNode() throws InterruptedException {
124180
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
181+
discoveryAddress = getDiscoveryAddress();
125182
remoteClusterName = "local-cluster";
126183

127184
FailureListener failureListener = new FailureListener();
128-
handshakingTransportAddressConnector.connectToRemoteMasterNode(remoteNode.getAddress(), failureListener);
185+
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
129186
failureListener.assertFailure();
130187
}
131188

132189
public void testDoesNotConnectToLocalNode() throws Exception {
133190
remoteNode = localNode;
191+
discoveryAddress = getDiscoveryAddress();
134192
remoteClusterName = "local-cluster";
135193

136194
FailureListener failureListener = new FailureListener();
137-
handshakingTransportAddressConnector.connectToRemoteMasterNode(remoteNode.getAddress(), failureListener);
195+
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
138196
failureListener.assertFailure();
139197
}
140198

141199
public void testDoesNotConnectToDifferentCluster() throws InterruptedException {
142200
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
201+
discoveryAddress = getDiscoveryAddress();
143202
remoteClusterName = "another-cluster";
144203

145204
FailureListener failureListener = new FailureListener();
146-
handshakingTransportAddressConnector.connectToRemoteMasterNode(remoteNode.getAddress(), failureListener);
205+
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
147206
failureListener.assertFailure();
148207
}
149208

150209
public void testHandshakeTimesOut() throws InterruptedException {
151210
remoteNode = new DiscoveryNode("remote-node", buildNewFakeTransportAddress(), Version.CURRENT);
211+
discoveryAddress = getDiscoveryAddress();
152212
remoteClusterName = "local-cluster";
153213
dropHandshake = true;
154214

155215
FailureListener failureListener = new FailureListener();
156-
handshakingTransportAddressConnector.connectToRemoteMasterNode(remoteNode.getAddress(), failureListener);
216+
handshakingTransportAddressConnector.connectToRemoteMasterNode(discoveryAddress, failureListener);
157217
Thread.sleep(PROBE_HANDSHAKE_TIMEOUT_SETTING.get(Settings.EMPTY).millis());
158218
failureListener.assertFailure();
159219
}
160220

221+
private TransportAddress getDiscoveryAddress() {
222+
return randomBoolean() ? remoteNode.getAddress() : buildNewFakeTransportAddress();
223+
}
224+
161225
private class FailureListener implements ActionListener<DiscoveryNode> {
162226
final CountDownLatch completionLatch = new CountDownLatch(1);
163227

0 commit comments

Comments
 (0)