Skip to content

Commit 5eb1259

Browse files
authored
Validate build hash in handshake (#65601)
There is no guarantee of wire compatibility between nodes running different builds of the same version, but today we do not validate whether two communicating nodes are compatible or not. This results in confusing failures that look like serialization bugs, and it usually takes nontrivial effort to determine that the failure is in fact due to the user running incompatible builds. This commit adds the build hash to the transport service handshake and validates that matching versions have matching build hashes. Closes #65249 Backport of #65732
1 parent ea7d8d1 commit 5eb1259

10 files changed

+280
-33
lines changed

server/src/main/java/org/elasticsearch/transport/TransportService.java

Lines changed: 117 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.logging.log4j.LogManager;
2323
import org.apache.logging.log4j.Logger;
2424
import org.apache.logging.log4j.message.ParameterizedMessage;
25+
import org.elasticsearch.Build;
2526
import org.elasticsearch.Version;
2627
import org.elasticsearch.action.ActionListener;
2728
import org.elasticsearch.action.ActionListenerResponseHandler;
@@ -37,6 +38,7 @@
3738
import org.elasticsearch.common.io.stream.StreamOutput;
3839
import org.elasticsearch.common.io.stream.Writeable;
3940
import org.elasticsearch.common.lease.Releasable;
41+
import org.elasticsearch.common.logging.DeprecationLogger;
4042
import org.elasticsearch.common.logging.Loggers;
4143
import org.elasticsearch.common.regex.Regex;
4244
import org.elasticsearch.common.settings.ClusterSettings;
@@ -73,10 +75,27 @@
7375
import java.util.function.Predicate;
7476
import java.util.function.Supplier;
7577

76-
public class TransportService extends AbstractLifecycleComponent implements ReportingService<TransportInfo>, TransportMessageListener,
77-
TransportConnectionListener {
78+
public class TransportService extends AbstractLifecycleComponent
79+
implements ReportingService<TransportInfo>, TransportMessageListener, TransportConnectionListener {
80+
7881
private static final Logger logger = LogManager.getLogger(TransportService.class);
7982

83+
private static final String PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY = "es.unsafely_permit_handshake_from_incompatible_builds";
84+
private static final boolean PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS;
85+
86+
static {
87+
final String value = System.getProperty(PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY);
88+
if (value == null) {
89+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS = false;
90+
} else if (Boolean.parseBoolean(value)) {
91+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS = true;
92+
} else {
93+
throw new IllegalArgumentException("invalid value [" + value + "] for system property ["
94+
+ PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY + "]");
95+
}
96+
}
97+
98+
8099
public static final String DIRECT_RESPONSE_PROFILE = ".direct";
81100
public static final String HANDSHAKE_ACTION_NAME = "internal:transport/handshake";
82101

@@ -115,6 +134,7 @@ protected boolean removeEldestEntry(Map.Entry eldest) {
115134
private final RemoteClusterService remoteClusterService;
116135

117136
private final boolean validateConnections;
137+
private final boolean requireCompatibleBuild;
118138

119139
/** if set will call requests sent to this id to shortcut and executed locally */
120140
volatile DiscoveryNode localNode = null;
@@ -160,9 +180,15 @@ public TransportService(Settings settings, Transport transport, ThreadPool threa
160180
public TransportService(Settings settings, Transport transport, ThreadPool threadPool, TransportInterceptor transportInterceptor,
161181
Function<BoundTransportAddress, DiscoveryNode> localNodeFactory, @Nullable ClusterSettings clusterSettings,
162182
Set<String> taskHeaders, ConnectionManager connectionManager) {
183+
184+
final boolean isTransportClient = TransportClient.CLIENT_TYPE.equals(settings.get(Client.CLIENT_TYPE_SETTING_S.getKey()));
185+
186+
// If we are a transport client then we skip the check that the remote node has a compatible build hash
187+
this.requireCompatibleBuild = isTransportClient == false;
188+
163189
// The only time we do not want to validate node connections is when this is a transport client using the simple node sampler
164-
this.validateConnections = TransportClient.CLIENT_TYPE.equals(settings.get(Client.CLIENT_TYPE_SETTING_S.getKey())) == false ||
165-
TransportClient.CLIENT_TRANSPORT_SNIFF.get(settings);
190+
this.validateConnections = isTransportClient == false || TransportClient.CLIENT_TRANSPORT_SNIFF.get(settings);
191+
166192
this.transport = transport;
167193
transport.setSlowLogThreshold(TransportSettings.SLOW_OPERATION_THRESHOLD_SETTING.get(settings));
168194
this.threadPool = threadPool;
@@ -192,7 +218,14 @@ public TransportService(Settings settings, Transport transport, ThreadPool threa
192218
false, false,
193219
HandshakeRequest::new,
194220
(request, channel, task) -> channel.sendResponse(
195-
new HandshakeResponse(localNode, clusterName, localNode.getVersion())));
221+
new HandshakeResponse(localNode.getVersion(), Build.CURRENT.hash(), localNode, clusterName)));
222+
223+
if (PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS) {
224+
logger.warn("transport handshakes from incompatible builds are unsafely permitted on this node; remove system property [" +
225+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY + "] to resolve this warning");
226+
DeprecationLogger.getLogger(TransportService.class).deprecate("permit_handshake_from_incompatible_builds",
227+
"system property [" + PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY + "] is deprecated and should be removed");
228+
}
196229
}
197230

198231
public RemoteClusterService getRemoteClusterService() {
@@ -481,7 +514,7 @@ public void onFailure(Exception e) {
481514
listener.onFailure(e);
482515
}
483516
}
484-
, HandshakeResponse::new, ThreadPool.Names.GENERIC
517+
, in -> new HandshakeResponse(in, requireCompatibleBuild), ThreadPool.Names.GENERIC
485518
));
486519
}
487520

@@ -503,28 +536,89 @@ private HandshakeRequest() {
503536
}
504537

505538
public static class HandshakeResponse extends TransportResponse {
539+
540+
private static final Version BUILD_HASH_HANDSHAKE_VERSION = Version.V_7_11_0;
541+
542+
private final Version version;
543+
544+
@Nullable // if version < BUILD_HASH_HANDSHAKE_VERSION
545+
private final String buildHash;
546+
506547
private final DiscoveryNode discoveryNode;
548+
507549
private final ClusterName clusterName;
508-
private final Version version;
509550

510-
public HandshakeResponse(DiscoveryNode discoveryNode, ClusterName clusterName, Version version) {
511-
this.discoveryNode = discoveryNode;
512-
this.version = version;
513-
this.clusterName = clusterName;
551+
public HandshakeResponse(Version version, String buildHash, DiscoveryNode discoveryNode, ClusterName clusterName) {
552+
this.buildHash = Objects.requireNonNull(buildHash);
553+
this.discoveryNode = Objects.requireNonNull(discoveryNode);
554+
this.version = Objects.requireNonNull(version);
555+
this.clusterName = Objects.requireNonNull(clusterName);
514556
}
515557

516-
public HandshakeResponse(StreamInput in) throws IOException {
558+
public HandshakeResponse(StreamInput in, boolean requireCompatibleBuild) throws IOException {
517559
super(in);
518-
discoveryNode = in.readOptionalWriteable(DiscoveryNode::new);
519-
clusterName = new ClusterName(in);
520-
version = Version.readVersion(in);
560+
if (in.getVersion().onOrAfter(BUILD_HASH_HANDSHAKE_VERSION)) {
561+
// the first two fields need only VInts and raw (ASCII) characters, so we cross our fingers and hope that they appear
562+
// on the wire as we expect them to even if this turns out to be an incompatible build
563+
version = Version.readVersion(in);
564+
buildHash = in.readString();
565+
566+
try {
567+
// If the remote node is incompatible then make an effort to identify it anyway, so we can mention it in the exception
568+
// message, but recognise that this may fail
569+
discoveryNode = new DiscoveryNode(in);
570+
} catch (Exception e) {
571+
if (isIncompatibleBuild(version, buildHash, requireCompatibleBuild)) {
572+
throw new IllegalArgumentException("unidentifiable remote node is build [" + buildHash +
573+
"] of version [" + version + "] but this node is build [" + Build.CURRENT.hash() +
574+
"] of version [" + Version.CURRENT + "] which has an incompatible wire format", e);
575+
} else {
576+
throw e;
577+
}
578+
}
579+
580+
if (isIncompatibleBuild(version, buildHash, requireCompatibleBuild)) {
581+
if (PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS) {
582+
logger.warn("remote node [{}] is build [{}] of version [{}] but this node is build [{}] of version [{}] " +
583+
"which may not be compatible; remove system property [{}] to resolve this warning",
584+
discoveryNode, buildHash, version, Build.CURRENT.hash(), Version.CURRENT,
585+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY);
586+
} else {
587+
throw new IllegalArgumentException("remote node [" + discoveryNode + "] is build [" + buildHash +
588+
"] of version [" + version + "] but this node is build [" + Build.CURRENT.hash() +
589+
"] of version [" + Version.CURRENT + "] which has an incompatible wire format");
590+
}
591+
}
592+
593+
clusterName = new ClusterName(in);
594+
} else {
595+
discoveryNode = in.readOptionalWriteable(DiscoveryNode::new);
596+
clusterName = new ClusterName(in);
597+
version = Version.readVersion(in);
598+
buildHash = null;
599+
}
521600
}
522601

523602
@Override
524603
public void writeTo(StreamOutput out) throws IOException {
525-
out.writeOptionalWriteable(discoveryNode);
526-
clusterName.writeTo(out);
527-
Version.writeVersion(version, out);
604+
if (out.getVersion().onOrAfter(BUILD_HASH_HANDSHAKE_VERSION)) {
605+
Version.writeVersion(version, out);
606+
out.writeString(buildHash);
607+
discoveryNode.writeTo(out);
608+
clusterName.writeTo(out);
609+
} else {
610+
out.writeOptionalWriteable(discoveryNode);
611+
clusterName.writeTo(out);
612+
Version.writeVersion(version, out);
613+
}
614+
}
615+
616+
public Version getVersion() {
617+
return version;
618+
}
619+
620+
public String getBuildHash() {
621+
return buildHash;
528622
}
529623

530624
public DiscoveryNode getDiscoveryNode() {
@@ -534,6 +628,10 @@ public DiscoveryNode getDiscoveryNode() {
534628
public ClusterName getClusterName() {
535629
return clusterName;
536630
}
631+
632+
private static boolean isIncompatibleBuild(Version version, String buildHash, boolean requireCompatibleBuild) {
633+
return requireCompatibleBuild && version == Version.CURRENT && Build.CURRENT.hash().equals(buildHash) == false;
634+
}
537635
}
538636

539637
public void disconnectFromNode(DiscoveryNode node) {
@@ -1353,4 +1451,5 @@ public void onResponseReceived(long requestId, Transport.ResponseContext holder)
13531451
}
13541452
}
13551453
}
1454+
13561455
}

server/src/test/java/org/elasticsearch/client/transport/FailAndRetryMockTransport.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.client.transport;
2121

22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.action.ActionListener;
2425
import org.elasticsearch.action.admin.cluster.node.liveness.LivenessResponse;
@@ -102,7 +103,11 @@ public void sendRequest(long requestId, String action, TransportRequest request,
102103
} else if (TransportService.HANDSHAKE_ACTION_NAME.equals(action)) {
103104
TransportResponseHandler transportResponseHandler = responseHandlers.onResponseReceived(requestId, listener);
104105
Version version = node.getVersion();
105-
transportResponseHandler.handleResponse(new TransportService.HandshakeResponse(node, clusterName, version));
106+
transportResponseHandler.handleResponse(new TransportService.HandshakeResponse(
107+
version,
108+
Build.CURRENT.hash(),
109+
node,
110+
clusterName));
106111

107112
} else {
108113
throw new UnsupportedOperationException("Mock transport does not understand action " + action);

server/src/test/java/org/elasticsearch/client/transport/TransportClientHeadersTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.client.transport;
2121

22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.action.ActionType;
2425
import org.elasticsearch.action.admin.cluster.node.liveness.LivenessResponse;
@@ -180,7 +181,11 @@ public <T extends TransportResponse> void sendRequest(Transport.Connection conne
180181
clusterStateLatch.countDown();
181182
} else if (TransportService.HANDSHAKE_ACTION_NAME .equals(action)) {
182183
((TransportResponseHandler<TransportService.HandshakeResponse>) handler).handleResponse(
183-
new TransportService.HandshakeResponse(connection.getNode(), clusterName, connection.getNode().getVersion()));
184+
new TransportService.HandshakeResponse(
185+
connection.getNode().getVersion(),
186+
Build.CURRENT.hash(),
187+
connection.getNode(),
188+
clusterName));
184189
} else {
185190
handler.handleException(new TransportException("", new InternalException(action)));
186191
}

server/src/test/java/org/elasticsearch/cluster/NodeConnectionsServiceTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.logging.log4j.Level;
2323
import org.apache.logging.log4j.LogManager;
24+
import org.elasticsearch.Build;
2425
import org.elasticsearch.ElasticsearchTimeoutException;
2526
import org.elasticsearch.Version;
2627
import org.elasticsearch.action.ActionListener;
@@ -479,7 +480,7 @@ private TestTransportService(Transport transport, ThreadPool threadPool) {
479480
@Override
480481
public void handshake(Transport.Connection connection, TimeValue timeout, Predicate<ClusterName> clusterNamePredicate,
481482
ActionListener<HandshakeResponse> listener) {
482-
listener.onResponse(new HandshakeResponse(connection.getNode(), new ClusterName(""), Version.CURRENT));
483+
listener.onResponse(new HandshakeResponse(Version.CURRENT, Build.CURRENT.hash(), connection.getNode(), new ClusterName("")));
483484
}
484485

485486
@Override

server/src/test/java/org/elasticsearch/cluster/coordination/FollowersCheckerTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.elasticsearch.cluster.coordination;
2020

21+
import org.elasticsearch.Build;
2122
import org.elasticsearch.ElasticsearchException;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.cluster.ClusterName;
@@ -226,7 +227,11 @@ public void testFailsNodeThatDisconnects() {
226227
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
227228
assertFalse(node.equals(localNode));
228229
if (action.equals(HANDSHAKE_ACTION_NAME)) {
229-
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
230+
handleResponse(requestId, new TransportService.HandshakeResponse(
231+
Version.CURRENT,
232+
Build.CURRENT.hash(),
233+
node,
234+
ClusterName.DEFAULT));
230235
return;
231236
}
232237
deterministicTaskQueue.scheduleNow(new Runnable() {

server/src/test/java/org/elasticsearch/cluster/coordination/LeaderCheckerTests.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.cluster.coordination;
2121

22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.ElasticsearchException;
2324
import org.elasticsearch.Version;
2425
import org.elasticsearch.cluster.ClusterName;
@@ -221,7 +222,11 @@ public void testFollowerFailsImmediatelyOnDisconnection() {
221222
@Override
222223
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
223224
if (action.equals(HANDSHAKE_ACTION_NAME)) {
224-
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
225+
handleResponse(requestId, new TransportService.HandshakeResponse(
226+
Version.CURRENT,
227+
Build.CURRENT.hash(),
228+
node,
229+
ClusterName.DEFAULT));
225230
return;
226231
}
227232
assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));
@@ -330,7 +335,11 @@ public void testFollowerFailsImmediatelyOnHealthCheckFailure() {
330335
@Override
331336
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
332337
if (action.equals(HANDSHAKE_ACTION_NAME)) {
333-
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
338+
handleResponse(requestId, new TransportService.HandshakeResponse(
339+
Version.CURRENT,
340+
Build.CURRENT.hash(),
341+
node,
342+
ClusterName.DEFAULT));
334343
return;
335344
}
336345
assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));

server/src/test/java/org/elasticsearch/cluster/coordination/NodeJoinTests.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.cluster.coordination;
2020

2121
import org.apache.logging.log4j.message.ParameterizedMessage;
22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.action.ActionListener;
2425
import org.elasticsearch.cluster.ClusterName;
@@ -162,8 +163,12 @@ private void setupMasterServiceAndCoordinator(long term, ClusterState initialSta
162163
@Override
163164
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode destination) {
164165
if (action.equals(HANDSHAKE_ACTION_NAME)) {
165-
handleResponse(requestId, new TransportService.HandshakeResponse(destination, initialState.getClusterName(),
166-
destination.getVersion()));
166+
handleResponse(requestId, new TransportService.HandshakeResponse(
167+
destination.getVersion(),
168+
Build.CURRENT.hash(),
169+
destination,
170+
initialState.getClusterName()
171+
));
167172
} else if (action.equals(JoinHelper.VALIDATE_JOIN_ACTION_NAME)) {
168173
handleResponse(requestId, new TransportResponse.Empty());
169174
} else {

server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.logging.log4j.LogManager;
2424
import org.apache.logging.log4j.Logger;
2525
import org.apache.lucene.util.SetOnce;
26+
import org.elasticsearch.Build;
2627
import org.elasticsearch.ElasticsearchException;
2728
import org.elasticsearch.Version;
2829
import org.elasticsearch.action.ActionListener;
@@ -98,7 +99,11 @@ protected void onSendRequest(long requestId, String action, TransportRequest req
9899
if (fullConnectionFailure != null && node.getAddress().equals(remoteNode.getAddress())) {
99100
handleError(requestId, fullConnectionFailure);
100101
} else {
101-
handleResponse(requestId, new HandshakeResponse(remoteNode, new ClusterName(remoteClusterName), Version.CURRENT));
102+
handleResponse(requestId, new HandshakeResponse(
103+
Version.CURRENT,
104+
Build.CURRENT.hash(),
105+
remoteNode,
106+
new ClusterName(remoteClusterName)));
102107
}
103108
}
104109
}

0 commit comments

Comments
 (0)