Skip to content

Commit 93dc894

Browse files
authored
Strengthen validateClusterFormed check (#49248)
Strengthens the validateClusterFormed check that is used by the test infrastructure to make sure that nodes are properly connected and know about each other. Is used in situations where the cluster is scaled up and down, and where there previously was a network disruption that has been healed. Closes #49243
1 parent fcf570f commit 93dc894

File tree

2 files changed

+25
-26
lines changed

2 files changed

+25
-26
lines changed

test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java

+24-19
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,9 @@
150150
import static org.hamcrest.Matchers.equalTo;
151151
import static org.hamcrest.Matchers.greaterThan;
152152
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
153-
import static org.junit.Assert.assertEquals;
154153
import static org.hamcrest.Matchers.not;
155154
import static org.hamcrest.Matchers.nullValue;
155+
import static org.junit.Assert.assertEquals;
156156
import static org.junit.Assert.assertFalse;
157157
import static org.junit.Assert.assertThat;
158158
import static org.junit.Assert.assertTrue;
@@ -1092,29 +1092,35 @@ private synchronized void reset(boolean wipeData) throws IOException {
10921092

10931093
/** ensure a cluster is formed with all published nodes. */
10941094
public synchronized void validateClusterFormed() {
1095-
String name = randomFrom(random, getNodeNames());
1096-
validateClusterFormed(name);
1097-
}
1098-
1099-
/** ensure a cluster is formed with all published nodes, but do so by using the client of the specified node */
1100-
private synchronized void validateClusterFormed(String viaNode) {
1101-
Set<DiscoveryNode> expectedNodes = new HashSet<>();
1095+
final Set<DiscoveryNode> expectedNodes = new HashSet<>();
11021096
for (NodeAndClient nodeAndClient : nodes.values()) {
11031097
expectedNodes.add(getInstanceFromNode(ClusterService.class, nodeAndClient.node()).localNode());
11041098
}
1105-
logger.trace("validating cluster formed via [{}], expecting {}", viaNode, expectedNodes);
1106-
final Client client = client(viaNode);
1099+
logger.trace("validating cluster formed, expecting {}", expectedNodes);
1100+
11071101
try {
11081102
assertBusy(() -> {
1109-
DiscoveryNodes discoveryNodes = client.admin().cluster().prepareState().get().getState().nodes();
1110-
assertEquals(expectedNodes.size(), discoveryNodes.getSize());
1111-
for (DiscoveryNode expectedNode : expectedNodes) {
1112-
assertTrue("Expected node to exist: " + expectedNode, discoveryNodes.nodeExists(expectedNode));
1113-
}
1103+
final List<ClusterState> states = nodes.values().stream()
1104+
.map(node -> getInstanceFromNode(ClusterService.class, node.node()))
1105+
.map(ClusterService::state)
1106+
.collect(Collectors.toList());
1107+
final String debugString = ", expected nodes: " + expectedNodes + " and actual cluster states " + states;
1108+
// all nodes have a master
1109+
assertTrue("Missing master" + debugString, states.stream().allMatch(cs -> cs.nodes().getMasterNodeId() != null));
1110+
// all nodes have the same master (in same term)
1111+
assertEquals("Not all masters in same term" + debugString, 1,
1112+
states.stream().mapToLong(ClusterState::term).distinct().count());
1113+
// all nodes know about all other nodes
1114+
states.forEach(cs -> {
1115+
DiscoveryNodes discoveryNodes = cs.nodes();
1116+
assertEquals("Node size mismatch" + debugString, expectedNodes.size(), discoveryNodes.getSize());
1117+
for (DiscoveryNode expectedNode : expectedNodes) {
1118+
assertTrue("Expected node to exist: " + expectedNode + debugString, discoveryNodes.nodeExists(expectedNode));
1119+
}
1120+
});
11141121
}, 30, TimeUnit.SECONDS);
11151122
} catch (AssertionError ae) {
1116-
throw new IllegalStateException("cluster failed to form with expected nodes " + expectedNodes + " and actual nodes " +
1117-
client.admin().cluster().prepareState().get().getState().nodes());
1123+
throw new IllegalStateException("cluster failed to form", ae);
11181124
} catch (Exception e) {
11191125
throw new IllegalStateException(e);
11201126
}
@@ -1663,8 +1669,7 @@ private void restartNode(NodeAndClient nodeAndClient, RestartCallback callback)
16631669

16641670
if (callback.validateClusterForming() || excludedNodeIds.isEmpty() == false) {
16651671
// we have to validate cluster size to ensure that the restarted node has rejoined the cluster if it was master-eligible;
1666-
// we have to do this via the node that was just restarted as it may be that the master didn't yet process the fact that it left
1667-
validateClusterFormed(nodeAndClient.name);
1672+
validateClusterFormed();
16681673
}
16691674
}
16701675

test/framework/src/main/java/org/elasticsearch/test/disruption/NetworkDisruption.java

+1-7
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
package org.elasticsearch.test.disruption;
2121

2222
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
23-
2423
import org.apache.logging.log4j.LogManager;
2524
import org.apache.logging.log4j.Logger;
2625
import org.elasticsearch.cluster.ClusterState;
@@ -41,8 +40,6 @@
4140
import java.util.concurrent.CountDownLatch;
4241
import java.util.function.BiConsumer;
4342

44-
import static org.junit.Assert.assertFalse;
45-
4643
/**
4744
* Network disruptions are modeled using two components:
4845
* 1) the {@link DisruptedLinks} represents the links in the network that are to be disrupted
@@ -119,10 +116,7 @@ public static void ensureFullyConnectedCluster(InternalTestCluster cluster) {
119116
}
120117

121118
protected void ensureNodeCount(InternalTestCluster cluster) {
122-
assertFalse("cluster failed to form after disruption was healed", cluster.client().admin().cluster().prepareHealth()
123-
.setWaitForNodes(String.valueOf(cluster.size()))
124-
.setWaitForNoRelocatingShards(true)
125-
.get().isTimedOut());
119+
cluster.validateClusterFormed();
126120
}
127121

128122
@Override

0 commit comments

Comments
 (0)