Skip to content

Commit 1fea51c

Browse files
committed
Use a dedicated ConnectionManger for RemoteClusterConnection (#32988)
This change introduces a dedicated ConnectionManager for every RemoteClusterConnection such that there is not state shared with the TransportService internal ConnectionManager. All connections to a remote cluster are isolated from the TransportService but still uses the TransportService and it's internal properties like the Transport, tracing and internal listener actions on disconnects etc. This allows a remote cluster connection to have a different lifecycle than a local cluster connection, also local discovery code doesn't get notified if there is a disconnect on from a remote cluster and each connection can use it's own dedicated connection profile which allows to have a reduced set of connections per cluster without conflicting with the local cluster. Closes #31835
1 parent deedec7 commit 1fea51c

File tree

11 files changed

+167
-130
lines changed

11 files changed

+167
-130
lines changed

server/src/main/java/org/elasticsearch/transport/ConnectionManager.java

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import java.util.concurrent.CopyOnWriteArrayList;
4545
import java.util.concurrent.CountDownLatch;
4646
import java.util.concurrent.TimeUnit;
47+
import java.util.concurrent.atomic.AtomicBoolean;
4748
import java.util.concurrent.locks.ReadWriteLock;
4849
import java.util.concurrent.locks.ReentrantReadWriteLock;
4950

@@ -62,6 +63,7 @@ public class ConnectionManager implements Closeable {
6263
private final TimeValue pingSchedule;
6364
private final ConnectionProfile defaultProfile;
6465
private final Lifecycle lifecycle = new Lifecycle();
66+
private final AtomicBoolean closed = new AtomicBoolean(false);
6567
private final ReadWriteLock closeLock = new ReentrantReadWriteLock();
6668
private final DelegatingNodeConnectionListener connectionListener = new DelegatingNodeConnectionListener();
6769

@@ -83,7 +85,9 @@ public ConnectionManager(Settings settings, Transport transport, ThreadPool thre
8385
}
8486

8587
public void addListener(TransportConnectionListener listener) {
86-
this.connectionListener.listeners.add(listener);
88+
if (connectionListener.listeners.contains(listener) == false) {
89+
this.connectionListener.listeners.add(listener);
90+
}
8791
}
8892

8993
public void removeListener(TransportConnectionListener listener) {
@@ -186,45 +190,50 @@ public void disconnectFromNode(DiscoveryNode node) {
186190
}
187191
}
188192

189-
public int connectedNodeCount() {
193+
/**
194+
* Returns the number of nodes this manager is connected to.
195+
*/
196+
public int size() {
190197
return connectedNodes.size();
191198
}
192199

193200
@Override
194201
public void close() {
195-
lifecycle.moveToStopped();
196-
CountDownLatch latch = new CountDownLatch(1);
202+
if (closed.compareAndSet(false, true)) {
203+
lifecycle.moveToStopped();
204+
CountDownLatch latch = new CountDownLatch(1);
197205

198-
// TODO: Consider moving all read/write lock (in Transport and this class) to the TransportService
199-
threadPool.generic().execute(() -> {
200-
closeLock.writeLock().lock();
201-
try {
202-
// we are holding a write lock so nobody modifies the connectedNodes / openConnections map - it's safe to first close
203-
// all instances and then clear them maps
204-
Iterator<Map.Entry<DiscoveryNode, Transport.Connection>> iterator = connectedNodes.entrySet().iterator();
205-
while (iterator.hasNext()) {
206-
Map.Entry<DiscoveryNode, Transport.Connection> next = iterator.next();
207-
try {
208-
IOUtils.closeWhileHandlingException(next.getValue());
209-
} finally {
210-
iterator.remove();
206+
// TODO: Consider moving all read/write lock (in Transport and this class) to the TransportService
207+
threadPool.generic().execute(() -> {
208+
closeLock.writeLock().lock();
209+
try {
210+
// we are holding a write lock so nobody modifies the connectedNodes / openConnections map - it's safe to first close
211+
// all instances and then clear them maps
212+
Iterator<Map.Entry<DiscoveryNode, Transport.Connection>> iterator = connectedNodes.entrySet().iterator();
213+
while (iterator.hasNext()) {
214+
Map.Entry<DiscoveryNode, Transport.Connection> next = iterator.next();
215+
try {
216+
IOUtils.closeWhileHandlingException(next.getValue());
217+
} finally {
218+
iterator.remove();
219+
}
211220
}
221+
} finally {
222+
closeLock.writeLock().unlock();
223+
latch.countDown();
212224
}
213-
} finally {
214-
closeLock.writeLock().unlock();
215-
latch.countDown();
216-
}
217-
});
225+
});
218226

219-
try {
220227
try {
221-
latch.await(30, TimeUnit.SECONDS);
222-
} catch (InterruptedException e) {
223-
Thread.currentThread().interrupt();
224-
// ignore
228+
try {
229+
latch.await(30, TimeUnit.SECONDS);
230+
} catch (InterruptedException e) {
231+
Thread.currentThread().interrupt();
232+
// ignore
233+
}
234+
} finally {
235+
lifecycle.moveToClosed();
225236
}
226-
} finally {
227-
lifecycle.moveToClosed();
228237
}
229238
}
230239

server/src/main/java/org/elasticsearch/transport/RemoteClusterConnection.java

Lines changed: 42 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.elasticsearch.cluster.node.DiscoveryNode;
4040
import org.elasticsearch.cluster.node.DiscoveryNodes;
4141
import org.elasticsearch.common.component.AbstractComponent;
42+
import org.elasticsearch.common.io.stream.StreamInput;
4243
import org.elasticsearch.common.settings.Settings;
4344
import org.elasticsearch.common.transport.TransportAddress;
4445
import org.elasticsearch.common.util.CancellableThreads;
@@ -84,30 +85,32 @@
8485
final class RemoteClusterConnection extends AbstractComponent implements TransportConnectionListener, Closeable {
8586

8687
private final TransportService transportService;
88+
private final ConnectionManager connectionManager;
8789
private final ConnectionProfile remoteProfile;
8890
private final ConnectedNodes connectedNodes;
8991
private final String clusterAlias;
9092
private final int maxNumRemoteConnections;
9193
private final Predicate<DiscoveryNode> nodePredicate;
94+
private final ThreadPool threadPool;
9295
private volatile List<Supplier<DiscoveryNode>> seedNodes;
9396
private volatile boolean skipUnavailable;
9497
private final ConnectHandler connectHandler;
9598
private SetOnce<ClusterName> remoteClusterName = new SetOnce<>();
96-
private final ClusterName localClusterName;
9799

98100
/**
99101
* Creates a new {@link RemoteClusterConnection}
100102
* @param settings the nodes settings object
101103
* @param clusterAlias the configured alias of the cluster to connect to
102104
* @param seedNodes a list of seed nodes to discover eligible nodes from
103105
* @param transportService the local nodes transport service
106+
* @param connectionManager the connection manager to use for this remote connection
104107
* @param maxNumRemoteConnections the maximum number of connections to the remote cluster
105108
* @param nodePredicate a predicate to filter eligible remote nodes to connect to
106109
*/
107110
RemoteClusterConnection(Settings settings, String clusterAlias, List<Supplier<DiscoveryNode>> seedNodes,
108-
TransportService transportService, int maxNumRemoteConnections, Predicate<DiscoveryNode> nodePredicate) {
111+
TransportService transportService, ConnectionManager connectionManager, int maxNumRemoteConnections,
112+
Predicate<DiscoveryNode> nodePredicate) {
109113
super(settings);
110-
this.localClusterName = ClusterName.CLUSTER_NAME_SETTING.get(settings);
111114
this.transportService = transportService;
112115
this.maxNumRemoteConnections = maxNumRemoteConnections;
113116
this.nodePredicate = nodePredicate;
@@ -126,7 +129,11 @@ final class RemoteClusterConnection extends AbstractComponent implements Transpo
126129
this.skipUnavailable = RemoteClusterService.REMOTE_CLUSTER_SKIP_UNAVAILABLE
127130
.getConcreteSettingForNamespace(clusterAlias).get(settings);
128131
this.connectHandler = new ConnectHandler();
129-
transportService.addConnectionListener(this);
132+
this.threadPool = transportService.threadPool;
133+
this.connectionManager = connectionManager;
134+
connectionManager.addListener(this);
135+
// we register the transport service here as a listener to make sure we notify handlers on disconnect etc.
136+
connectionManager.addListener(transportService);
130137
}
131138

132139
/**
@@ -187,8 +194,9 @@ public void ensureConnected(ActionListener<Void> voidActionListener) {
187194

188195
private void fetchShardsInternal(ClusterSearchShardsRequest searchShardsRequest,
189196
final ActionListener<ClusterSearchShardsResponse> listener) {
190-
final DiscoveryNode node = connectedNodes.getAny();
191-
transportService.sendRequest(node, ClusterSearchShardsAction.NAME, searchShardsRequest,
197+
final DiscoveryNode node = getAnyConnectedNode();
198+
Transport.Connection connection = connectionManager.getConnection(node);
199+
transportService.sendRequest(connection, ClusterSearchShardsAction.NAME, searchShardsRequest, TransportRequestOptions.EMPTY,
192200
new TransportResponseHandler<ClusterSearchShardsResponse>() {
193201

194202
@Override
@@ -223,12 +231,16 @@ void collectNodes(ActionListener<Function<String, DiscoveryNode>> listener) {
223231
request.clear();
224232
request.nodes(true);
225233
request.local(true); // run this on the node that gets the request it's as good as any other
226-
final DiscoveryNode node = connectedNodes.getAny();
227-
transportService.sendRequest(node, ClusterStateAction.NAME, request, TransportRequestOptions.EMPTY,
234+
final DiscoveryNode node = getAnyConnectedNode();
235+
Transport.Connection connection = connectionManager.getConnection(node);
236+
transportService.sendRequest(connection, ClusterStateAction.NAME, request, TransportRequestOptions.EMPTY,
228237
new TransportResponseHandler<ClusterStateResponse>() {
238+
229239
@Override
230-
public ClusterStateResponse newInstance() {
231-
return new ClusterStateResponse();
240+
public ClusterStateResponse read(StreamInput in) throws IOException {
241+
ClusterStateResponse response = new ClusterStateResponse();
242+
response.readFrom(in);
243+
return response;
232244
}
233245

234246
@Override
@@ -265,11 +277,11 @@ public String executor() {
265277
* If such node is not connected, the returned connection will be a proxy connection that redirects to it.
266278
*/
267279
Transport.Connection getConnection(DiscoveryNode remoteClusterNode) {
268-
if (transportService.nodeConnected(remoteClusterNode)) {
269-
return transportService.getConnection(remoteClusterNode);
280+
if (connectionManager.nodeConnected(remoteClusterNode)) {
281+
return connectionManager.getConnection(remoteClusterNode);
270282
}
271-
DiscoveryNode discoveryNode = connectedNodes.getAny();
272-
Transport.Connection connection = transportService.getConnection(discoveryNode);
283+
DiscoveryNode discoveryNode = getAnyConnectedNode();
284+
Transport.Connection connection = connectionManager.getConnection(discoveryNode);
273285
return new ProxyConnection(connection, remoteClusterNode);
274286
}
275287

@@ -321,33 +333,18 @@ public Version getVersion() {
321333
}
322334

323335
Transport.Connection getConnection() {
324-
return transportService.getConnection(getAnyConnectedNode());
336+
return connectionManager.getConnection(getAnyConnectedNode());
325337
}
326338

327339
@Override
328340
public void close() throws IOException {
329-
connectHandler.close();
341+
IOUtils.close(connectHandler, connectionManager);
330342
}
331343

332344
public boolean isClosed() {
333345
return connectHandler.isClosed();
334346
}
335347

336-
private ConnectionProfile getRemoteProfile(ClusterName name) {
337-
// we can only compare the cluster name to make a decision if we should use a remote profile
338-
// we can't use a cluster UUID here since we could be connecting to that remote cluster before
339-
// the remote node has joined its cluster and have a cluster UUID. The fact that we just lose a
340-
// rather smallish optimization on the connection layer under certain situations where remote clusters
341-
// have the same name as the local one is minor here.
342-
// the alternative here is to complicate the remote infrastructure to also wait until we formed a cluster,
343-
// gained a cluster UUID and then start connecting etc. we rather use this simplification in order to maintain simplicity
344-
if (this.localClusterName.equals(name)) {
345-
return null;
346-
} else {
347-
return remoteProfile;
348-
}
349-
}
350-
351348
/**
352349
* The connect handler manages node discovery and the actual connect to the remote cluster.
353350
* There is at most one connect job running at any time. If such a connect job is triggered
@@ -391,7 +388,7 @@ private void connect(ActionListener<Void> connectListener, boolean forceRun) {
391388
final boolean runConnect;
392389
final Collection<ActionListener<Void>> toNotify;
393390
final ActionListener<Void> listener = connectListener == null ? null :
394-
ContextPreservingActionListener.wrapPreservingContext(connectListener, transportService.getThreadPool().getThreadContext());
391+
ContextPreservingActionListener.wrapPreservingContext(connectListener, threadPool.getThreadContext());
395392
synchronized (queue) {
396393
if (listener != null && queue.offer(listener) == false) {
397394
listener.onFailure(new RejectedExecutionException("connect queue is full"));
@@ -419,7 +416,6 @@ private void connect(ActionListener<Void> connectListener, boolean forceRun) {
419416
}
420417

421418
private void forkConnect(final Collection<ActionListener<Void>> toNotify) {
422-
ThreadPool threadPool = transportService.getThreadPool();
423419
ExecutorService executor = threadPool.executor(ThreadPool.Names.MANAGEMENT);
424420
executor.submit(new AbstractRunnable() {
425421
@Override
@@ -456,13 +452,13 @@ protected void doRun() {
456452
maybeConnect();
457453
}
458454
});
459-
collectRemoteNodes(seedNodes.iterator(), transportService, listener);
455+
collectRemoteNodes(seedNodes.iterator(), transportService, connectionManager, listener);
460456
}
461457
});
462458
}
463459

464460
private void collectRemoteNodes(Iterator<Supplier<DiscoveryNode>> seedNodes,
465-
final TransportService transportService, ActionListener<Void> listener) {
461+
final TransportService transportService, final ConnectionManager manager, ActionListener<Void> listener) {
466462
if (Thread.currentThread().isInterrupted()) {
467463
listener.onFailure(new InterruptedException("remote connect thread got interrupted"));
468464
}
@@ -471,7 +467,7 @@ private void collectRemoteNodes(Iterator<Supplier<DiscoveryNode>> seedNodes,
471467
cancellableThreads.executeIO(() -> {
472468
final DiscoveryNode seedNode = seedNodes.next().get();
473469
final TransportService.HandshakeResponse handshakeResponse;
474-
Transport.Connection connection = transportService.openConnection(seedNode,
470+
Transport.Connection connection = manager.openConnection(seedNode,
475471
ConnectionProfile.buildSingleChannelProfile(TransportRequestOptions.Type.REG, null, null));
476472
boolean success = false;
477473
try {
@@ -486,7 +482,7 @@ private void collectRemoteNodes(Iterator<Supplier<DiscoveryNode>> seedNodes,
486482

487483
final DiscoveryNode handshakeNode = handshakeResponse.getDiscoveryNode();
488484
if (nodePredicate.test(handshakeNode) && connectedNodes.size() < maxNumRemoteConnections) {
489-
transportService.connectToNode(handshakeNode, getRemoteProfile(handshakeResponse.getClusterName()));
485+
manager.connectToNode(handshakeNode, remoteProfile, transportService.connectionValidator(handshakeNode));
490486
if (remoteClusterName.get() == null) {
491487
assert handshakeResponse.getClusterName().value() != null;
492488
remoteClusterName.set(handshakeResponse.getClusterName());
@@ -528,7 +524,7 @@ private void collectRemoteNodes(Iterator<Supplier<DiscoveryNode>> seedNodes,
528524
// ISE if we fail the handshake with an version incompatible node
529525
if (seedNodes.hasNext()) {
530526
logger.debug(() -> new ParameterizedMessage("fetching nodes from external cluster {} failed", clusterAlias), ex);
531-
collectRemoteNodes(seedNodes, transportService, listener);
527+
collectRemoteNodes(seedNodes, transportService, manager, listener);
532528
} else {
533529
listener.onFailure(ex);
534530
}
@@ -556,7 +552,6 @@ final boolean isClosed() {
556552
/* This class handles the _state response from the remote cluster when sniffing nodes to connect to */
557553
private class SniffClusterStateResponseHandler implements TransportResponseHandler<ClusterStateResponse> {
558554

559-
private final TransportService transportService;
560555
private final Transport.Connection connection;
561556
private final ActionListener<Void> listener;
562557
private final Iterator<Supplier<DiscoveryNode>> seedNodes;
@@ -565,7 +560,6 @@ private class SniffClusterStateResponseHandler implements TransportResponseHandl
565560
SniffClusterStateResponseHandler(TransportService transportService, Transport.Connection connection,
566561
ActionListener<Void> listener, Iterator<Supplier<DiscoveryNode>> seedNodes,
567562
CancellableThreads cancellableThreads) {
568-
this.transportService = transportService;
569563
this.connection = connection;
570564
this.listener = listener;
571565
this.seedNodes = seedNodes;
@@ -596,8 +590,8 @@ public void handleResponse(ClusterStateResponse response) {
596590
for (DiscoveryNode node : nodesIter) {
597591
if (nodePredicate.test(node) && connectedNodes.size() < maxNumRemoteConnections) {
598592
try {
599-
transportService.connectToNode(node, getRemoteProfile(remoteClusterName.get())); // noop if node is
600-
// connected
593+
connectionManager.connectToNode(node, remoteProfile,
594+
transportService.connectionValidator(node)); // noop if node is connected
601595
connectedNodes.add(node);
602596
} catch (ConnectTransportException | IllegalStateException ex) {
603597
// ISE if we fail the handshake with an version incompatible node
@@ -613,7 +607,7 @@ public void handleResponse(ClusterStateResponse response) {
613607
listener.onFailure(ex); // we got canceled - fail the listener and step out
614608
} catch (Exception ex) {
615609
logger.warn(() -> new ParameterizedMessage("fetching nodes from external cluster {} failed", clusterAlias), ex);
616-
collectRemoteNodes(seedNodes, transportService, listener);
610+
collectRemoteNodes(seedNodes, transportService, connectionManager, listener);
617611
}
618612
}
619613

@@ -624,7 +618,7 @@ public void handleException(TransportException exp) {
624618
IOUtils.closeWhileHandlingException(connection);
625619
} finally {
626620
// once the connection is closed lets try the next node
627-
collectRemoteNodes(seedNodes, transportService, listener);
621+
collectRemoteNodes(seedNodes, transportService, connectionManager, listener);
628622
}
629623
}
630624

@@ -781,4 +775,8 @@ private synchronized void ensureIteratorAvailable() {
781775
}
782776
}
783777
}
778+
779+
ConnectionManager getConnectionManager() {
780+
return connectionManager;
781+
}
784782
}

0 commit comments

Comments
 (0)