Skip to content

Commit aba2f3e

Browse files
authored
Validate build hash in handshake (#65732)
There is no guarantee of wire compatibility between nodes running different builds of the same version, but today we do not validate whether two communicating nodes are compatible or not. This results in confusing failures that look like serialization bugs, and it usually takes nontrivial effort to determine that the failure is in fact due to the user running incompatible builds. This commit adds the build hash to the transport service handshake and validates that matching versions have matching build hashes. Closes #65249
1 parent 6f323ad commit aba2f3e

File tree

8 files changed

+246
-29
lines changed

8 files changed

+246
-29
lines changed

server/src/main/java/org/elasticsearch/transport/TransportService.java

Lines changed: 108 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.logging.log4j.LogManager;
2323
import org.apache.logging.log4j.Logger;
2424
import org.apache.logging.log4j.message.ParameterizedMessage;
25+
import org.elasticsearch.Build;
2526
import org.elasticsearch.Version;
2627
import org.elasticsearch.action.ActionListener;
2728
import org.elasticsearch.action.ActionListenerResponseHandler;
@@ -34,6 +35,7 @@
3435
import org.elasticsearch.common.io.stream.StreamOutput;
3536
import org.elasticsearch.common.io.stream.Writeable;
3637
import org.elasticsearch.common.lease.Releasable;
38+
import org.elasticsearch.common.logging.DeprecationLogger;
3739
import org.elasticsearch.common.logging.Loggers;
3840
import org.elasticsearch.common.regex.Regex;
3941
import org.elasticsearch.common.settings.ClusterSettings;
@@ -68,10 +70,27 @@
6870
import java.util.function.Predicate;
6971
import java.util.function.Supplier;
7072

71-
public class TransportService extends AbstractLifecycleComponent implements ReportingService<TransportInfo>, TransportMessageListener,
72-
TransportConnectionListener {
73+
public class TransportService extends AbstractLifecycleComponent
74+
implements ReportingService<TransportInfo>, TransportMessageListener, TransportConnectionListener {
75+
7376
private static final Logger logger = LogManager.getLogger(TransportService.class);
7477

78+
private static final String PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY = "es.unsafely_permit_handshake_from_incompatible_builds";
79+
private static final boolean PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS;
80+
81+
static {
82+
final String value = System.getProperty(PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY);
83+
if (value == null) {
84+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS = false;
85+
} else if (Boolean.parseBoolean(value)) {
86+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS = true;
87+
} else {
88+
throw new IllegalArgumentException("invalid value [" + value + "] for system property ["
89+
+ PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY + "]");
90+
}
91+
}
92+
93+
7594
public static final String DIRECT_RESPONSE_PROFILE = ".direct";
7695
public static final String HANDSHAKE_ACTION_NAME = "internal:transport/handshake";
7796

@@ -182,7 +201,14 @@ public TransportService(Settings settings, Transport transport, ThreadPool threa
182201
false, false,
183202
HandshakeRequest::new,
184203
(request, channel, task) -> channel.sendResponse(
185-
new HandshakeResponse(localNode, clusterName, localNode.getVersion())));
204+
new HandshakeResponse(localNode.getVersion(), Build.CURRENT.hash(), localNode, clusterName)));
205+
206+
if (PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS) {
207+
logger.warn("transport handshakes from incompatible builds are unsafely permitted on this node; remove system property [" +
208+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY + "] to resolve this warning");
209+
DeprecationLogger.getLogger(TransportService.class).deprecate("permit_handshake_from_incompatible_builds",
210+
"system property [" + PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY + "] is deprecated and should be removed");
211+
}
186212
}
187213

188214
public RemoteClusterService getRemoteClusterService() {
@@ -440,8 +466,8 @@ public void onResponse(HandshakeResponse response) {
440466
public void onFailure(Exception e) {
441467
listener.onFailure(e);
442468
}
443-
}
444-
, HandshakeResponse::new, ThreadPool.Names.GENERIC
469+
},
470+
HandshakeResponse::new, ThreadPool.Names.GENERIC
445471
));
446472
}
447473

@@ -463,28 +489,89 @@ private HandshakeRequest() {
463489
}
464490

465491
public static class HandshakeResponse extends TransportResponse {
492+
493+
private static final Version BUILD_HASH_HANDSHAKE_VERSION = Version.V_8_0_0;
494+
495+
private final Version version;
496+
497+
@Nullable // if version < BUILD_HASH_HANDSHAKE_VERSION
498+
private final String buildHash;
499+
466500
private final DiscoveryNode discoveryNode;
501+
467502
private final ClusterName clusterName;
468-
private final Version version;
469503

470-
public HandshakeResponse(DiscoveryNode discoveryNode, ClusterName clusterName, Version version) {
471-
this.discoveryNode = discoveryNode;
472-
this.version = version;
473-
this.clusterName = clusterName;
504+
public HandshakeResponse(Version version, String buildHash, DiscoveryNode discoveryNode, ClusterName clusterName) {
505+
this.buildHash = Objects.requireNonNull(buildHash);
506+
this.discoveryNode = Objects.requireNonNull(discoveryNode);
507+
this.version = Objects.requireNonNull(version);
508+
this.clusterName = Objects.requireNonNull(clusterName);
474509
}
475510

476511
public HandshakeResponse(StreamInput in) throws IOException {
477512
super(in);
478-
discoveryNode = in.readOptionalWriteable(DiscoveryNode::new);
479-
clusterName = new ClusterName(in);
480-
version = Version.readVersion(in);
513+
if (in.getVersion().onOrAfter(BUILD_HASH_HANDSHAKE_VERSION)) {
514+
// the first two fields need only VInts and raw (ASCII) characters, so we cross our fingers and hope that they appear
515+
// on the wire as we expect them to even if this turns out to be an incompatible build
516+
version = Version.readVersion(in);
517+
buildHash = in.readString();
518+
519+
try {
520+
// If the remote node is incompatible then make an effort to identify it anyway, so we can mention it in the exception
521+
// message, but recognise that this may fail
522+
discoveryNode = new DiscoveryNode(in);
523+
} catch (Exception e) {
524+
if (isIncompatibleBuild(version, buildHash)) {
525+
throw new IllegalArgumentException("unidentifiable remote node is build [" + buildHash +
526+
"] of version [" + version + "] but this node is build [" + Build.CURRENT.hash() +
527+
"] of version [" + Version.CURRENT + "] which has an incompatible wire format", e);
528+
} else {
529+
throw e;
530+
}
531+
}
532+
533+
if (isIncompatibleBuild(version, buildHash)) {
534+
if (PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS) {
535+
logger.warn("remote node [{}] is build [{}] of version [{}] but this node is build [{}] of version [{}] " +
536+
"which may not be compatible; remove system property [{}] to resolve this warning",
537+
discoveryNode, buildHash, version, Build.CURRENT.hash(), Version.CURRENT,
538+
PERMIT_HANDSHAKES_FROM_INCOMPATIBLE_BUILDS_KEY);
539+
} else {
540+
throw new IllegalArgumentException("remote node [" + discoveryNode + "] is build [" + buildHash +
541+
"] of version [" + version + "] but this node is build [" + Build.CURRENT.hash() +
542+
"] of version [" + Version.CURRENT + "] which has an incompatible wire format");
543+
}
544+
}
545+
546+
clusterName = new ClusterName(in);
547+
} else {
548+
discoveryNode = in.readOptionalWriteable(DiscoveryNode::new);
549+
clusterName = new ClusterName(in);
550+
version = Version.readVersion(in);
551+
buildHash = null;
552+
}
481553
}
482554

483555
@Override
484556
public void writeTo(StreamOutput out) throws IOException {
485-
out.writeOptionalWriteable(discoveryNode);
486-
clusterName.writeTo(out);
487-
Version.writeVersion(version, out);
557+
if (out.getVersion().onOrAfter(BUILD_HASH_HANDSHAKE_VERSION)) {
558+
Version.writeVersion(version, out);
559+
out.writeString(buildHash);
560+
discoveryNode.writeTo(out);
561+
clusterName.writeTo(out);
562+
} else {
563+
out.writeOptionalWriteable(discoveryNode);
564+
clusterName.writeTo(out);
565+
Version.writeVersion(version, out);
566+
}
567+
}
568+
569+
public Version getVersion() {
570+
return version;
571+
}
572+
573+
public String getBuildHash() {
574+
return buildHash;
488575
}
489576

490577
public DiscoveryNode getDiscoveryNode() {
@@ -494,6 +581,10 @@ public DiscoveryNode getDiscoveryNode() {
494581
public ClusterName getClusterName() {
495582
return clusterName;
496583
}
584+
585+
private static boolean isIncompatibleBuild(Version version, String buildHash) {
586+
return version == Version.CURRENT && Build.CURRENT.hash().equals(buildHash) == false;
587+
}
497588
}
498589

499590
public void disconnectFromNode(DiscoveryNode node) {
@@ -1293,4 +1384,5 @@ public void onResponseReceived(long requestId, Transport.ResponseContext holder)
12931384
}
12941385
}
12951386
}
1387+
12961388
}

server/src/test/java/org/elasticsearch/cluster/NodeConnectionsServiceTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.logging.log4j.Level;
2323
import org.apache.logging.log4j.LogManager;
24+
import org.elasticsearch.Build;
2425
import org.elasticsearch.ElasticsearchTimeoutException;
2526
import org.elasticsearch.Version;
2627
import org.elasticsearch.action.ActionListener;
@@ -479,7 +480,7 @@ private TestTransportService(Transport transport, ThreadPool threadPool) {
479480
@Override
480481
public void handshake(Transport.Connection connection, TimeValue timeout, Predicate<ClusterName> clusterNamePredicate,
481482
ActionListener<HandshakeResponse> listener) {
482-
listener.onResponse(new HandshakeResponse(connection.getNode(), new ClusterName(""), Version.CURRENT));
483+
listener.onResponse(new HandshakeResponse(Version.CURRENT, Build.CURRENT.hash(), connection.getNode(), new ClusterName("")));
483484
}
484485

485486
@Override

server/src/test/java/org/elasticsearch/cluster/coordination/FollowersCheckerTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.elasticsearch.cluster.coordination;
2020

21+
import org.elasticsearch.Build;
2122
import org.elasticsearch.ElasticsearchException;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.cluster.ClusterName;
@@ -227,7 +228,11 @@ public void testFailsNodeThatDisconnects() {
227228
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
228229
assertFalse(node.equals(localNode));
229230
if (action.equals(HANDSHAKE_ACTION_NAME)) {
230-
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
231+
handleResponse(requestId, new TransportService.HandshakeResponse(
232+
Version.CURRENT,
233+
Build.CURRENT.hash(),
234+
node,
235+
ClusterName.DEFAULT));
231236
return;
232237
}
233238
deterministicTaskQueue.scheduleNow(new Runnable() {

server/src/test/java/org/elasticsearch/cluster/coordination/LeaderCheckerTests.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.cluster.coordination;
2121

22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.ElasticsearchException;
2324
import org.elasticsearch.Version;
2425
import org.elasticsearch.cluster.ClusterName;
@@ -222,7 +223,11 @@ public void testFollowerFailsImmediatelyOnDisconnection() {
222223
@Override
223224
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
224225
if (action.equals(HANDSHAKE_ACTION_NAME)) {
225-
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
226+
handleResponse(requestId, new TransportService.HandshakeResponse(
227+
Version.CURRENT,
228+
Build.CURRENT.hash(),
229+
node,
230+
ClusterName.DEFAULT));
226231
return;
227232
}
228233
assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));
@@ -332,7 +337,11 @@ public void testFollowerFailsImmediatelyOnHealthCheckFailure() {
332337
@Override
333338
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
334339
if (action.equals(HANDSHAKE_ACTION_NAME)) {
335-
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
340+
handleResponse(requestId, new TransportService.HandshakeResponse(
341+
Version.CURRENT,
342+
Build.CURRENT.hash(),
343+
node,
344+
ClusterName.DEFAULT));
336345
return;
337346
}
338347
assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));

server/src/test/java/org/elasticsearch/cluster/coordination/NodeJoinTests.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.cluster.coordination;
2020

2121
import org.apache.logging.log4j.message.ParameterizedMessage;
22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.action.ActionListener;
2425
import org.elasticsearch.cluster.ClusterName;
@@ -161,8 +162,12 @@ private void setupMasterServiceAndCoordinator(long term, ClusterState initialSta
161162
@Override
162163
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode destination) {
163164
if (action.equals(HANDSHAKE_ACTION_NAME)) {
164-
handleResponse(requestId, new TransportService.HandshakeResponse(destination, initialState.getClusterName(),
165-
destination.getVersion()));
165+
handleResponse(requestId, new TransportService.HandshakeResponse(
166+
destination.getVersion(),
167+
Build.CURRENT.hash(),
168+
destination,
169+
initialState.getClusterName()
170+
));
166171
} else if (action.equals(JoinHelper.VALIDATE_JOIN_ACTION_NAME)) {
167172
handleResponse(requestId, new TransportResponse.Empty());
168173
} else {

server/src/test/java/org/elasticsearch/discovery/HandshakingTransportAddressConnectorTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.logging.log4j.LogManager;
2424
import org.apache.logging.log4j.Logger;
2525
import org.apache.lucene.util.SetOnce;
26+
import org.elasticsearch.Build;
2627
import org.elasticsearch.ElasticsearchException;
2728
import org.elasticsearch.Version;
2829
import org.elasticsearch.action.ActionListener;
@@ -98,7 +99,11 @@ protected void onSendRequest(long requestId, String action, TransportRequest req
9899
if (fullConnectionFailure != null && node.getAddress().equals(remoteNode.getAddress())) {
99100
handleError(requestId, fullConnectionFailure);
100101
} else {
101-
handleResponse(requestId, new HandshakeResponse(remoteNode, new ClusterName(remoteClusterName), Version.CURRENT));
102+
handleResponse(requestId, new HandshakeResponse(
103+
Version.CURRENT,
104+
Build.CURRENT.hash(),
105+
remoteNode,
106+
new ClusterName(remoteClusterName)));
102107
}
103108
}
104109
}

server/src/test/java/org/elasticsearch/transport/TransportServiceDeserializationFailureTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.transport;
2121

22+
import org.elasticsearch.Build;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.action.support.PlainActionFuture;
2425
import org.elasticsearch.cluster.ClusterName;
@@ -58,7 +59,11 @@ public void testDeserializationFailureLogIdentifiesListener() {
5859
@Override
5960
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
6061
if (action.equals(TransportService.HANDSHAKE_ACTION_NAME)) {
61-
handleResponse(requestId, new TransportService.HandshakeResponse(otherNode, new ClusterName(""), Version.CURRENT));
62+
handleResponse(requestId, new TransportService.HandshakeResponse(
63+
Version.CURRENT,
64+
Build.CURRENT.hash(),
65+
otherNode,
66+
new ClusterName("")));
6267
}
6368
}
6469
};

0 commit comments

Comments
 (0)