Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Only connect to new nodes on new cluster state #39629

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
Expand Down Expand Up @@ -450,7 +451,7 @@ private void applyChanges(UpdateTask task, ClusterState previousClusterState, Cl
}

logger.trace("connecting to nodes of cluster state with version {}", newClusterState.version());
nodeConnectionsService.connectToNodes(newClusterState.nodes());
connectToNodesAndWait(newClusterState);

// nothing to do until we actually recover from the gateway or any other block indicates we need to disable persistency
if (clusterChangedEvent.state().blocks().disableStatePersistence() == false && clusterChangedEvent.metaDataChanged()) {
Expand All @@ -470,6 +471,18 @@ private void applyChanges(UpdateTask task, ClusterState previousClusterState, Cl
callClusterStateListeners(clusterChangedEvent);
}

/**
 * Asks the node connections service to connect to the nodes of the given cluster state and then blocks
 * the calling thread until the completion callback handed to {@code connectToNodes} has run.
 *
 * If the wait is interrupted, the interruption is logged at debug level, the thread's interrupt flag is
 * restored, and the method returns without waiting any further.
 *
 * @param newClusterState the cluster state whose nodes should be connected to
 */
protected void connectToNodesAndWait(ClusterState newClusterState) {
    // Waiting on an ActionFuture is not permitted on the cluster applier thread, yet this thread is meant
    // to block here, so a plain CountDownLatch is used as the completion signal instead.
    final CountDownLatch connectionsDone = new CountDownLatch(1);
    final Runnable onCompletion = connectionsDone::countDown;
    nodeConnectionsService.connectToNodes(newClusterState.nodes(), onCompletion);
    try {
        connectionsDone.await();
    } catch (InterruptedException interrupted) {
        logger.debug("interrupted while connecting to nodes, continuing", interrupted);
        // re-assert the interrupt so code further up the stack can still observe it
        Thread.currentThread().interrupt();
    }
}

private void callClusterStateAppliers(ClusterChangedEvent clusterChangedEvent) {
clusterStateAppliers.forEach(applier -> {
logger.trace("calling [{}] with change to version [{}]", applier, clusterChangedEvent.state().version());
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNode.Role;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterApplierService;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Nullable;
Expand Down Expand Up @@ -1754,12 +1753,7 @@ protected Optional<DisruptableMockTransport> getDisruptableMockTransport(Transpo
clusterService = new ClusterService(settings, clusterSettings, masterService, clusterApplierService);
clusterService.setNodeConnectionsService(
new NodeConnectionsService(clusterService.getSettings(), deterministicTaskQueue.getThreadPool(this::onNode),
transportService) {
@Override
public void connectToNodes(DiscoveryNodes discoveryNodes) {
// override this method as it does blocking calls
}
});
transportService));
final Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators =
Collections.singletonList((dn, cs) -> extraJoinValidators.forEach(validator -> validator.accept(dn, cs)));
coordinator = new Coordinator("test_node", settings, clusterSettings, transportService, writableRegistry(),
Expand Down Expand Up @@ -2148,6 +2142,10 @@ public void onNewClusterState(String source, Supplier<ClusterState> clusterState
}
}

/**
 * Test override that deliberately does nothing: no connection attempt is started and the calling
 * thread returns immediately instead of waiting.
 *
 * @param newClusterState ignored by this override
 */
@Override
protected void connectToNodesAndWait(ClusterState newClusterState) {
// don't do anything, and don't block
}
}

private static DiscoveryNode createDiscoveryNode(int nodeIndex, boolean masterEligible) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,13 @@
package org.elasticsearch.cluster.service;

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateObserver;
import org.elasticsearch.cluster.LocalNodeMasterListener;
import org.elasticsearch.cluster.NodeConnectionsService;
import org.elasticsearch.cluster.block.ClusterBlocks;
import org.elasticsearch.cluster.coordination.NoMasterBlockService;
import org.elasticsearch.cluster.metadata.MetaData;
Expand Down Expand Up @@ -54,6 +53,7 @@

import static java.util.Collections.emptyMap;
import static java.util.Collections.emptySet;
import static org.elasticsearch.test.ClusterServiceUtils.createNoOpNodeConnectionsService;
import static org.elasticsearch.test.ClusterServiceUtils.setState;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.is;
Expand Down Expand Up @@ -88,23 +88,13 @@ public void tearDown() throws Exception {
super.tearDown();
}

TimedClusterApplierService createTimedClusterService(boolean makeMaster) {
private TimedClusterApplierService createTimedClusterService(boolean makeMaster) {
DiscoveryNode localNode = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(),
emptySet(), Version.CURRENT);
TimedClusterApplierService timedClusterApplierService = new TimedClusterApplierService(Settings.builder().put("cluster.name",
"ClusterApplierServiceTests").build(), new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS),
threadPool);
timedClusterApplierService.setNodeConnectionsService(new NodeConnectionsService(Settings.EMPTY, null, null) {
@Override
public void connectToNodes(DiscoveryNodes discoveryNodes) {
// skip
}

@Override
public void disconnectFromNodesExcept(DiscoveryNodes nodesToKeep) {
// skip
}
});
timedClusterApplierService.setNodeConnectionsService(createNoOpNodeConnectionsService());
timedClusterApplierService.setInitialState(ClusterState.builder(new ClusterName("ClusterApplierServiceTests"))
.nodes(DiscoveryNodes.builder()
.add(localNode)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,11 @@ private final class TestClusterNode {
protected PrioritizedEsThreadPoolExecutor createThreadPoolExecutor() {
return new MockSinglePrioritizingExecutor(node.getName(), deterministicTaskQueue);
}

/**
 * Test override that deliberately does nothing: no connection attempt is started and the calling
 * thread returns immediately instead of waiting.
 *
 * @param newClusterState ignored by this override
 */
@Override
protected void connectToNodesAndWait(ClusterState newClusterState) {
// don't do anything, and don't block
}
});
mockTransport = new DisruptableMockTransport(node, logger) {
@Override
Expand Down Expand Up @@ -992,23 +997,7 @@ public void start(ClusterState initialState) {
coordinator.start();
masterService.start();
clusterService.getClusterApplierService().setNodeConnectionsService(
new NodeConnectionsService(clusterService.getSettings(), threadPool, transportService) {
@Override
public void connectToNodes(DiscoveryNodes discoveryNodes) {
// override this method as it does blocking calls
boolean callSuper = true;
for (final DiscoveryNode node : discoveryNodes) {
try {
transportService.connectToNode(node);
} catch (Exception e) {
callSuper = false;
}
}
if (callSuper) {
super.connectToNodes(discoveryNodes);
}
}
});
new NodeConnectionsService(clusterService.getSettings(), threadPool, transportService));
clusterService.getClusterApplierService().start();
indicesService.start();
indicesClusterStateService.start();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,7 @@ public static ClusterService createClusterService(ThreadPool threadPool, Discove
.put("cluster.name", "ClusterServiceTests")
.build();
ClusterService clusterService = new ClusterService(settings, clusterSettings, threadPool);
clusterService.setNodeConnectionsService(new NodeConnectionsService(Settings.EMPTY, null, null) {
@Override
public void connectToNodes(DiscoveryNodes discoveryNodes) {
// skip
}

@Override
public void disconnectFromNodesExcept(DiscoveryNodes nodesToKeep) {
// skip
}
});
clusterService.setNodeConnectionsService(createNoOpNodeConnectionsService());
ClusterState initialClusterState = ClusterState.builder(new ClusterName(ClusterServiceUtils.class.getSimpleName()))
.nodes(DiscoveryNodes.builder()
.add(localNode)
Expand All @@ -162,6 +152,21 @@ public void disconnectFromNodesExcept(DiscoveryNodes nodesToKeep) {
return clusterService;
}

/**
 * Creates a {@link NodeConnectionsService} stub for tests whose connect and disconnect operations are no-ops.
 *
 * {@code connectToNodes} attempts no connections and runs the supplied completion callback synchronously
 * before returning; {@code disconnectFromNodesExcept} does nothing at all.
 *
 * @return a new no-op {@link NodeConnectionsService}
 */
public static NodeConnectionsService createNoOpNodeConnectionsService() {
    final NodeConnectionsService noOpService = new NodeConnectionsService(Settings.EMPTY, null, null) {
        @Override
        public void disconnectFromNodesExcept(DiscoveryNodes nodesToKeep) {
            // nothing was ever connected, so there is nothing to disconnect
        }

        @Override
        public void connectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletion) {
            // no connections are attempted; report completion right away so a waiting caller can proceed
            onCompletion.run();
        }
    };
    return noOpService;
}

public static ClusterStatePublisher createClusterStatePublisher(ClusterApplier clusterApplier) {
return (event, publishListener, ackListener) ->
clusterApplier.onNewClusterState("mock_publish_to_self[" + event.source() + "]", () -> event.state(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.function.BiConsumer;

import static org.junit.Assert.assertFalse;
Expand All @@ -49,7 +50,7 @@
*/
public class NetworkDisruption implements ServiceDisruptionScheme {

private final Logger logger = LogManager.getLogger(NetworkDisruption.class);
private static final Logger logger = LogManager.getLogger(NetworkDisruption.class);

private final DisruptedLinks disruptedLinks;
private final NetworkLinkDisruptionType networkLinkDisruptionType;
Expand Down Expand Up @@ -103,9 +104,17 @@ public void ensureHealthy(InternalTestCluster cluster) {
* handy to be able to ensure this happens faster
*/
public static void ensureFullyConnectedCluster(InternalTestCluster cluster) {
for (String node: cluster.getNodeNames()) {
final String[] nodeNames = cluster.getNodeNames();
final CountDownLatch countDownLatch = new CountDownLatch(nodeNames.length);
for (String node : nodeNames) {
ClusterState stateOnNode = cluster.getInstance(ClusterService.class, node).state();
cluster.getInstance(NodeConnectionsService.class, node).connectToNodes(stateOnNode.nodes());
cluster.getInstance(NodeConnectionsService.class, node).reconnectToNodes(stateOnNode.nodes(), countDownLatch::countDown);
}

try {
countDownLatch.await();
} catch (InterruptedException e) {
throw new AssertionError(e);
}
}

Expand Down