Commit 8b82170
Switch more tests to zen2 (#36367)
1. CCR tests work without any changes.
2. `testDanglingIndices` requires changes to the source code (TODO added).
3. `testIndexDeletionWhenNodeRejoins` is changed because it uses just two nodes, so the restarting node has to be added to the voting config exclusions on restart.
4. `testCorruptTranslogTruncationOfReplica` now starts a dedicated master node, because otherwise the cluster does not re-form when both data nodes are stopped and only one is started back.
5. `testResolvePath` needs a TEST-scoped cluster, because all nodes are stopped at the end of the test and the checks required by a SUITE-scoped cluster cannot be performed.
6. `SnapshotDisruptionIT`: without changes, the test fails because Zen2 retries snapshot creation as soon as the network partition heals. This results in a race between creating the snapshot and the test cleanup logic (deleting the index). Zen1, on the other hand, also schedules a retry, but only some time after the partition heals, so the cleanup logic runs later and the test passes. A check that the snapshot is eventually created is added to the end of the test.
1 parent 13b1f19 commit 8b82170
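The fix described in point 6 is a poll-until-true assertion: keep re-running the snapshot check until the master's retry has gone through or a timeout expires. A minimal standalone sketch of that pattern follows; `eventually` here is a hypothetical simplification of the `assertBusy` helper the test actually uses, and the two-second delay stands in for the master's retry after the partition heals.

import java.util.concurrent.TimeUnit;

public final class EventualAssertion {

    // Simplified stand-in for the test framework's assertBusy: re-run the
    // assertion until it stops throwing, or rethrow once the timeout expires.
    static void eventually(Runnable assertion, long timeout, TimeUnit unit) throws InterruptedException {
        final long deadline = System.nanoTime() + unit.toNanos(timeout);
        AssertionError last;
        do {
            try {
                assertion.run();
                return; // assertion passed: the snapshot showed up
            } catch (AssertionError e) {
                last = e;
            }
            Thread.sleep(100); // give the master time to retry snapshot creation
        } while (System.nanoTime() < deadline);
        throw last;
    }

    public static void main(String[] args) throws InterruptedException {
        final long start = System.nanoTime();
        // Stand-in for assertSnapshotExists("test-repo", "test-snap-2"): here the
        // "snapshot" only becomes visible two seconds after the partition "heals".
        eventually(() -> {
            if (System.nanoTime() - start < TimeUnit.SECONDS.toNanos(2)) {
                throw new AssertionError("snapshot not created yet");
            }
        }, 1, TimeUnit.MINUTES);
        System.out.println("snapshot eventually exists");
    }
}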

File tree: 5 files changed, +91 -79 lines changed

server/src/test/java/org/elasticsearch/discovery/SnapshotDisruptionIT.java (18 additions & 10 deletions)

@@ -68,7 +68,6 @@ protected Settings nodeSettings(int nodeOrdinal) {
         return Settings.builder().put(super.nodeSettings(nodeOrdinal))
             .put(AbstractDisruptionTestCase.DEFAULT_SETTINGS)
             .put(TestZenDiscovery.USE_MOCK_PINGS.getKey(), false)
-            .put(TestZenDiscovery.USE_ZEN2.getKey(), false) // requires more work
             .put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "30s")
             .build();
     }
@@ -133,7 +132,7 @@ public void clusterChanged(ClusterChangedEvent event) {
 
         logger.info("--> wait until the snapshot is done");
         assertBusy(() -> {
-            SnapshotsInProgress snapshots = dataNodeClient().admin().cluster().prepareState().setLocal(true).get().getState()
+            SnapshotsInProgress snapshots = dataNodeClient().admin().cluster().prepareState().setLocal(false).get().getState()
                 .custom(SnapshotsInProgress.TYPE);
             if (snapshots != null && snapshots.entries().size() > 0) {
                 logger.info("Current snapshot state [{}]", snapshots.entries().get(0).state());
@@ -146,15 +145,9 @@ public void clusterChanged(ClusterChangedEvent event) {
         logger.info("--> verify that snapshot was successful or no longer exist");
         assertBusy(() -> {
             try {
-                GetSnapshotsResponse snapshotsStatusResponse = dataNodeClient().admin().cluster().prepareGetSnapshots("test-repo")
-                    .setSnapshots("test-snap-2").get();
-                SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
-                assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
-                assertEquals(snapshotInfo.totalShards(), snapshotInfo.successfulShards());
-                assertEquals(0, snapshotInfo.failedShards());
-                logger.info("--> done verifying");
+                assertSnapshotExists("test-repo", "test-snap-2");
             } catch (SnapshotMissingException exception) {
-                logger.info("--> snapshot doesn't exist");
+                logger.info("--> done verifying, snapshot doesn't exist");
             }
         }, 1, TimeUnit.MINUTES);
 
@@ -172,6 +165,21 @@ public void clusterChanged(ClusterChangedEvent event) {
             cause = cause.getCause();
             assertThat(cause, instanceOf(FailedToCommitClusterStateException.class));
         }
+
+        logger.info("--> verify that snapshot eventually will be created due to retries");
+        assertBusy(() -> {
+            assertSnapshotExists("test-repo", "test-snap-2");
+        }, 1, TimeUnit.MINUTES);
+    }
+
+    private void assertSnapshotExists(String repository, String snapshot) {
+        GetSnapshotsResponse snapshotsStatusResponse = dataNodeClient().admin().cluster().prepareGetSnapshots(repository)
+            .setSnapshots(snapshot).get();
+        SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
+        assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
+        assertEquals(snapshotInfo.totalShards(), snapshotInfo.successfulShards());
+        assertEquals(0, snapshotInfo.failedShards());
+        logger.info("--> done verifying, snapshot exists");
     }
 
     private void createRandomIndex(String idxName) throws InterruptedException {

server/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java (7 additions & 4 deletions)

@@ -276,10 +276,14 @@ public void testTwoNodesSingleDoc() throws Exception {
     }
 
     public void testDanglingIndices() throws Exception {
+        /* TODO This test does not work with Zen2, because once the master node loses its cluster state during restart
+           it will start with term = 1, which is the same term the data node has. The data node won't accept a cluster state from the master
+           after the restart, because the term is the same but the cluster state version is greater on the data node.
+           Consider adding the term to JoinRequest, so that the master node can bump its term if its current term is less than JoinRequest#term.
+         */
         logger.info("--> starting two nodes");
 
         final String node_1 = internalCluster().startNodes(2,
-            //TODO fails wih Zen2
             Settings.builder().put(TestZenDiscovery.USE_ZEN2.getKey(), false).build()).get(0);
 
         logger.info("--> indexing a simple document");
@@ -333,9 +337,7 @@ public void testIndexDeletionWhenNodeRejoins() throws Exception {
         final List<String> nodes;
         logger.info("--> starting a cluster with " + numNodes + " nodes");
         nodes = internalCluster().startNodes(numNodes,
-            Settings.builder().put(IndexGraveyard.SETTING_MAX_TOMBSTONES.getKey(), randomIntBetween(10, 100))
-                //TODO fails with Zen2
-                .put(TestZenDiscovery.USE_ZEN2.getKey(), false).build());
+            Settings.builder().put(IndexGraveyard.SETTING_MAX_TOMBSTONES.getKey(), randomIntBetween(10, 100)).build());
         logger.info("--> create an index");
         createIndex(indexName);
 
@@ -355,6 +357,7 @@ public Settings onNodeStopped(final String nodeName) throws Exception {
             final Client client = client(otherNode);
             client.admin().indices().prepareDelete(indexName).execute().actionGet();
             assertFalse(client.admin().indices().prepareExists(indexName).execute().actionGet().isExists());
+            logger.info("--> index deleted");
             return super.onNodeStopped(nodeName);
         }
     });
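The TODO in the diff above reduces to a publish-acceptance rule: a node accepts a cluster state only if the term is higher, or the term is equal and the version is strictly newer. The following standalone sketch is a hypothetical condensation of that rule (not the actual CoordinationState code), showing why the restarted master, back at term 1 with a low version, is rejected by the data node:

public final class PublishAcceptance {

    // Hypothetical condensation of the acceptance rule described in the TODO:
    // accept a published cluster state only if its term is higher, or its term
    // is equal and its version is strictly newer.
    static boolean accepts(long localTerm, long localVersion, long publishTerm, long publishVersion) {
        if (publishTerm > localTerm) {
            return true; // a higher term always wins
        }
        return publishTerm == localTerm && publishVersion > localVersion;
    }

    public static void main(String[] args) {
        // Data node kept its state: term 1, version 42.
        // Master lost its state on restart: back to term 1, version 1.
        System.out.println(accepts(1, 42, 1, 1)); // false -> publication rejected, as in the TODO
        // If the master could bump its term from JoinRequest#term, it would publish at term 2:
        System.out.println(accepts(1, 42, 2, 1)); // true  -> cluster state accepted
    }
}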

server/src/test/java/org/elasticsearch/index/shard/RemoveCorruptedShardDataCommandIT.java (7 additions & 16 deletions)

@@ -70,7 +70,6 @@
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.test.InternalSettingsPlugin;
 import org.elasticsearch.test.InternalTestCluster;
-import org.elasticsearch.test.discovery.TestZenDiscovery;
 import org.elasticsearch.test.engine.MockEngineSupport;
 import org.elasticsearch.test.transport.MockTransportService;
 
@@ -99,16 +98,9 @@
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.startsWith;
 
-@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.SUITE, numDataNodes = 0)
+@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
 public class RemoveCorruptedShardDataCommandIT extends ESIntegTestCase {
 
-    @Override
-    protected Settings nodeSettings(int nodeOrdinal) {
-        return Settings.builder().put(super.nodeSettings(nodeOrdinal))
-            .put(TestZenDiscovery.USE_ZEN2.getKey(), false) // no state persistence yet
-            .build();
-    }
-
     @Override
     protected Collection<Class<? extends Plugin>> nodePlugins() {
         return Arrays.asList(MockTransportService.TestPlugin.class, MockEngineFactoryPlugin.class, InternalSettingsPlugin.class);
@@ -260,7 +252,7 @@ public Settings onNodeStopped(String nodeName) throws Exception {
     }
 
     public void testCorruptTranslogTruncation() throws Exception {
-        internalCluster().startNodes(2, Settings.EMPTY);
+        internalCluster().startNodes(2);
 
         final String node1 = internalCluster().getNodeNames()[0];
         final String node2 = internalCluster().getNodeNames()[1];
@@ -436,10 +428,10 @@ public Settings onNodeStopped(String nodeName) throws Exception {
     }
 
     public void testCorruptTranslogTruncationOfReplica() throws Exception {
-        internalCluster().startNodes(2, Settings.EMPTY);
+        internalCluster().startMasterOnlyNode();
 
-        final String node1 = internalCluster().getNodeNames()[0];
-        final String node2 = internalCluster().getNodeNames()[1];
+        final String node1 = internalCluster().startDataOnlyNode();
+        final String node2 = internalCluster().startDataOnlyNode();
         logger.info("--> nodes name: {}, {}", node1, node2);
 
         final String indexName = "test";
@@ -481,12 +473,11 @@ public void testCorruptTranslogTruncationOfReplica() throws Exception {
         final ShardId shardId = new ShardId(resolveIndex(indexName), 0);
         final Set<Path> translogDirs = getDirs(node2, shardId, ShardPath.TRANSLOG_FOLDER_NAME);
 
-        // stop the cluster nodes. we don't use full restart so the node start up order will be the same
-        // and shard roles will be maintained
+        // stop data nodes. After the restart the 1st node will be primary and the 2nd node will be replica
         internalCluster().stopRandomDataNode();
         internalCluster().stopRandomDataNode();
 
-        // Corrupt the translog file(s)
+        // Corrupt the translog file(s) on the replica
         logger.info("--> corrupting translog");
         TestTranslog.corruptRandomTranslogFile(logger, random(), translogDirs);

test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java (59 additions & 48 deletions)

@@ -1641,35 +1641,7 @@ private synchronized void stopNodesAndClient(NodeAndClient nodeAndClient) throws IOException {
     }
 
     private synchronized void stopNodesAndClients(Collection<NodeAndClient> nodeAndClients) throws IOException {
-        final Set<String> excludedNodeIds = new HashSet<>();
-
-        if (autoManageMinMasterNodes && nodeAndClients.size() > 0) {
-
-            final long currentMasters = nodes.values().stream().filter(NodeAndClient::isMasterEligible).count();
-            final long stoppingMasters = nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).count();
-
-            assert stoppingMasters <= currentMasters : currentMasters + " < " + stoppingMasters;
-            if (stoppingMasters != currentMasters && stoppingMasters > 0) {
-                // If stopping few enough master-nodes that there's still a majority left, there is no need to withdraw their votes first.
-                // However, we do not yet have a way to be sure there's a majority left, because the voting configuration may not yet have
-                // been updated when the previous nodes shut down, so we must always explicitly withdraw votes.
-                // TODO add cluster health API to check that voting configuration is optimal so this isn't always needed
-                nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).map(NodeAndClient::getName).forEach(excludedNodeIds::add);
-                assert excludedNodeIds.size() == stoppingMasters;
-
-                logger.info("adding voting config exclusions {} prior to shutdown", excludedNodeIds);
-                try {
-                    client().execute(AddVotingConfigExclusionsAction.INSTANCE,
-                        new AddVotingConfigExclusionsRequest(excludedNodeIds.toArray(new String[0]))).get();
-                } catch (InterruptedException | ExecutionException e) {
-                    throw new AssertionError("unexpected", e);
-                }
-            }
-
-            if (stoppingMasters > 0) {
-                updateMinMasterNodes(getMasterNodesCount() - Math.toIntExact(stoppingMasters));
-            }
-        }
+        final Set<String> excludedNodeIds = excludeMasters(nodeAndClients);
 
         for (NodeAndClient nodeAndClient: nodeAndClients) {
             removeDisruptionSchemeFromNode(nodeAndClient);
@@ -1678,14 +1650,7 @@ private synchronized void stopNodesAndClients(Collection<NodeAndClient> nodeAndClients) throws IOException {
             nodeAndClient.close();
         }
 
-        if (excludedNodeIds.isEmpty() == false) {
-            logger.info("removing voting config exclusions for {} after shutdown", excludedNodeIds);
-            try {
-                client().execute(ClearVotingConfigExclusionsAction.INSTANCE, new ClearVotingConfigExclusionsRequest()).get();
-            } catch (InterruptedException | ExecutionException e) {
-                throw new AssertionError("unexpected", e);
-            }
-        }
+        removeExclusions(excludedNodeIds);
     }
 
     /**
@@ -1751,31 +1716,78 @@ public synchronized void rollingRestart(RestartCallback callback) throws Exception {
 
     private void restartNode(NodeAndClient nodeAndClient, RestartCallback callback) throws Exception {
         logger.info("Restarting node [{}] ", nodeAndClient.name);
+
         if (activeDisruptionScheme != null) {
             activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
         }
-        final int masterNodesCount = getMasterNodesCount();
-        // special case to allow stopping one node in a two node cluster and keep it functional
-        final boolean updateMinMaster = nodeAndClient.isMasterEligible() && masterNodesCount == 2 && autoManageMinMasterNodes;
-        if (updateMinMaster) {
-            updateMinMasterNodes(masterNodesCount - 1);
-        }
+
+        Set<String> excludedNodeIds = excludeMasters(Collections.singleton(nodeAndClient));
+
         final Settings newSettings = nodeAndClient.closeForRestart(callback,
-            autoManageMinMasterNodes ? getMinMasterNodes(masterNodesCount) : -1);
+            autoManageMinMasterNodes ? getMinMasterNodes(getMasterNodesCount()) : -1);
+
+        removeExclusions(excludedNodeIds);
+
         nodeAndClient.recreateNode(newSettings, () -> rebuildUnicastHostFiles(emptyList()));
         nodeAndClient.startNode();
         if (activeDisruptionScheme != null) {
             activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
         }
-        if (callback.validateClusterForming() || updateMinMaster) {
+
+        if (callback.validateClusterForming() || excludedNodeIds.isEmpty() == false) {
             // we have to validate cluster size if updateMinMaster == true, because we need the
             // second node to join in order to increment min_master_nodes back to 2.
             // we also have to do via the node that was just restarted as it may be that the master didn't yet process
             // the fact it left
             validateClusterFormed(nodeAndClient.name);
         }
-        if (updateMinMaster) {
-            updateMinMasterNodes(masterNodesCount);
+
+        if (excludedNodeIds.isEmpty() == false) {
+            updateMinMasterNodes(getMasterNodesCount());
+        }
+    }
+
+    private Set<String> excludeMasters(Collection<NodeAndClient> nodeAndClients) {
+        final Set<String> excludedNodeIds = new HashSet<>();
+        if (autoManageMinMasterNodes && nodeAndClients.size() > 0) {
+
+            final long currentMasters = nodes.values().stream().filter(NodeAndClient::isMasterEligible).count();
+            final long stoppingMasters = nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).count();
+
+            assert stoppingMasters <= currentMasters : currentMasters + " < " + stoppingMasters;
+            if (stoppingMasters != currentMasters && stoppingMasters > 0) {
+                // If stopping few enough master-nodes that there's still a majority left, there is no need to withdraw their votes first.
+                // However, we do not yet have a way to be sure there's a majority left, because the voting configuration may not yet have
+                // been updated when the previous nodes shut down, so we must always explicitly withdraw votes.
+                // TODO add cluster health API to check that voting configuration is optimal so this isn't always needed
+                nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).map(NodeAndClient::getName).forEach(excludedNodeIds::add);
+                assert excludedNodeIds.size() == stoppingMasters;
+
+                logger.info("adding voting config exclusions {} prior to restart/shutdown", excludedNodeIds);
+                try {
+                    client().execute(AddVotingConfigExclusionsAction.INSTANCE,
+                        new AddVotingConfigExclusionsRequest(excludedNodeIds.toArray(new String[0]))).get();
+                } catch (InterruptedException | ExecutionException e) {
+                    throw new AssertionError("unexpected", e);
+                }
+            }
+
+            if (stoppingMasters > 0) {
+                updateMinMasterNodes(getMasterNodesCount() - Math.toIntExact(stoppingMasters));
+            }
+        }
+        return excludedNodeIds;
+    }
+
+    private void removeExclusions(Set<String> excludedNodeIds) {
+        if (excludedNodeIds.isEmpty() == false) {
+            logger.info("removing voting config exclusions for {} after restart/shutdown", excludedNodeIds);
+            try {
+                Client client = getRandomNodeAndClient(node -> excludedNodeIds.contains(node.name) == false).client(random);
+                client.execute(ClearVotingConfigExclusionsAction.INSTANCE, new ClearVotingConfigExclusionsRequest()).get();
+            } catch (InterruptedException | ExecutionException e) {
+                throw new AssertionError("unexpected", e);
+            }
         }
     }
 
@@ -1833,7 +1845,6 @@ public synchronized void fullRestart(RestartCallback callback) throws Exception {
         }
     }
 
-
    /**
     * Returns the name of the current master node in the cluster.
     */
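The two extracted helpers give restartNode the same exclude/clear lifecycle that stopNodesAndClients already used: withdraw a master-eligible node's vote before stopping it, then clear the exclusions through a client bound to a surviving node. A condensed usage sketch of that lifecycle follows; the action and request classes are the real ones from the diff, while `VotingExclusionLifecycle` and its parameters are hypothetical scaffolding:

import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
import org.elasticsearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsAction;
import org.elasticsearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsRequest;
import org.elasticsearch.client.Client;

final class VotingExclusionLifecycle {

    // Withdraw the vote of a master-eligible node before stopping it, then
    // clear the exclusion once the node is back. clientOnSurvivingNode must be
    // bound to a node that keeps running (what removeExclusions picks via
    // getRandomNodeAndClient in the diff above).
    static void withVoteWithdrawn(Client clientOnSurvivingNode, String nodeName, Runnable stopAndRestart) throws Exception {
        clientOnSurvivingNode.execute(AddVotingConfigExclusionsAction.INSTANCE,
            new AddVotingConfigExclusionsRequest(new String[] { nodeName })).get();
        try {
            stopAndRestart.run(); // stop or restart nodeName here
        } finally {
            clientOnSurvivingNode.execute(ClearVotingConfigExclusionsAction.INSTANCE,
                new ClearVotingConfigExclusionsRequest()).get();
        }
    }
}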

x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/CcrIntegTestCase.java (0 additions & 1 deletion)

@@ -188,7 +188,6 @@ private NodeConfigurationSource createNodeConfigurationSource() {
         builder.put(IndicesStore.INDICES_STORE_DELETE_SHARD_TIMEOUT.getKey(), new TimeValue(1, TimeUnit.SECONDS));
         builder.putList(DISCOVERY_ZEN_PING_UNICAST_HOSTS_SETTING.getKey()); // empty list disables a port scan for other nodes
         builder.putList(DISCOVERY_HOSTS_PROVIDER_SETTING.getKey(), "file");
-        builder.put(TestZenDiscovery.USE_ZEN2.getKey(), false); // some tests do full cluster restarts
         builder.put(NetworkModule.TRANSPORT_TYPE_KEY, getTestTransportType());
         builder.put(XPackSettings.SECURITY_ENABLED.getKey(), false);
         builder.put(XPackSettings.MONITORING_ENABLED.getKey(), false);
