Skip to content

Commit f15be14

Browse files
authored
Fix CloseWhileRelocatingShardsIT (elastic#38728)
1 parent cb8afbb commit f15be14

File tree

1 file changed

+56
-25
lines changed

1 file changed

+56
-25
lines changed

server/src/test/java/org/elasticsearch/indices/state/CloseWhileRelocatingShardsIT.java

+56-25
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
*/
1919
package org.elasticsearch.indices.state;
2020

21+
import org.apache.logging.log4j.message.ParameterizedMessage;
2122
import org.elasticsearch.action.admin.cluster.reroute.ClusterRerouteRequest;
2223
import org.elasticsearch.action.support.master.AcknowledgedResponse;
24+
import org.elasticsearch.cluster.ClusterState;
2325
import org.elasticsearch.cluster.node.DiscoveryNode;
2426
import org.elasticsearch.cluster.routing.IndexRoutingTable;
2527
import org.elasticsearch.cluster.routing.ShardRouting;
@@ -32,13 +34,14 @@
3234
import org.elasticsearch.cluster.service.ClusterService;
3335
import org.elasticsearch.common.settings.Settings;
3436
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
37+
import org.elasticsearch.index.shard.ShardId;
3538
import org.elasticsearch.indices.recovery.PeerRecoverySourceService;
3639
import org.elasticsearch.indices.recovery.StartRecoveryRequest;
3740
import org.elasticsearch.plugins.Plugin;
3841
import org.elasticsearch.test.BackgroundIndexer;
3942
import org.elasticsearch.test.ESIntegTestCase;
40-
import org.elasticsearch.test.junit.annotations.TestLogging;
4143
import org.elasticsearch.test.transport.MockTransportService;
44+
import org.elasticsearch.test.transport.StubbableTransport;
4245
import org.elasticsearch.transport.TransportService;
4346

4447
import java.util.ArrayList;
@@ -57,6 +60,7 @@
5760
import static org.elasticsearch.indices.state.CloseIndexIT.assertIndexIsOpened;
5861
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5962
import static org.hamcrest.Matchers.greaterThan;
63+
import static org.hamcrest.Matchers.hasSize;
6064

6165
@ESIntegTestCase.ClusterScope(minNumDataNodes = 2)
6266
public class CloseWhileRelocatingShardsIT extends ESIntegTestCase {
@@ -68,9 +72,11 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
6872

6973
@Override
7074
protected Settings nodeSettings(int nodeOrdinal) {
75+
final int maxRecoveries = Integer.MAX_VALUE;
7176
return Settings.builder()
7277
.put(super.nodeSettings(nodeOrdinal))
73-
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), Integer.MAX_VALUE)
78+
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), maxRecoveries)
79+
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), maxRecoveries)
7480
.put(ConcurrentRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_REBALANCE_SETTING.getKey(), -1)
7581
.build();
7682
}
@@ -80,7 +86,6 @@ protected int maximumNumberOfShards() {
8086
return 3;
8187
}
8288

83-
@TestLogging("org.elasticsearch.cluster.metadata.MetaDataIndexStateService:DEBUG,org.elasticsearch.action.admin.indices.close:DEBUG")
8489
public void testCloseWhileRelocatingShards() throws Exception {
8590
final String[] indices = new String[randomIntBetween(3, 5)];
8691
final Map<String, Long> docsPerIndex = new HashMap<>();
@@ -119,21 +124,19 @@ public void testCloseWhileRelocatingShards() throws Exception {
119124

120125
final String targetNode = internalCluster().startDataOnlyNode();
121126
ensureClusterSizeConsistency(); // wait for the master to finish processing join.
122-
final MockTransportService targetTransportService =
123-
(MockTransportService) internalCluster().getInstance(TransportService.class, targetNode);
124127

125-
final Set<String> acknowledgedCloses = ConcurrentCollections.newConcurrentSet();
126128
try {
127129
final ClusterService clusterService = internalCluster().getInstance(ClusterService.class, internalCluster().getMasterName());
130+
final ClusterState state = clusterService.state();
128131
final CountDownLatch latch = new CountDownLatch(indices.length);
129-
final CountDownLatch release = new CountDownLatch(1);
132+
final CountDownLatch release = new CountDownLatch(indices.length);
130133

131134
// relocate one shard for every index to be closed
132135
final AllocationCommands commands = new AllocationCommands();
133136
for (final String index : indices) {
134137
final NumShards numShards = getNumShards(index);
135138
final int shardId = numShards.numPrimaries == 1 ? 0 : randomIntBetween(0, numShards.numPrimaries - 1);
136-
final IndexRoutingTable indexRoutingTable = clusterService.state().routingTable().index(index);
139+
final IndexRoutingTable indexRoutingTable = state.routingTable().index(index);
137140

138141
final ShardRouting primary = indexRoutingTable.shard(shardId).primaryShard();
139142
assertTrue(primary.started());
@@ -146,24 +149,49 @@ public void testCloseWhileRelocatingShards() throws Exception {
146149
currentNodeId = replica.currentNodeId();
147150
}
148151
}
152+
commands.add(new MoveAllocationCommand(index, shardId, state.nodes().resolveNode(currentNodeId).getName(), targetNode));
153+
}
154+
155+
// Build the list of shards for which recoveries will be blocked
156+
final Set<ShardId> blockedShards = commands.commands().stream()
157+
.map(c -> (MoveAllocationCommand) c)
158+
.map(c -> new ShardId(clusterService.state().metaData().index(c.index()).getIndex(), c.shardId()))
159+
.collect(Collectors.toSet());
160+
assertThat(blockedShards, hasSize(indices.length));
161+
162+
final Set<String> acknowledgedCloses = ConcurrentCollections.newConcurrentSet();
163+
final Set<String> interruptedRecoveries = ConcurrentCollections.newConcurrentSet();
149164

150-
final DiscoveryNode sourceNode = clusterService.state().nodes().resolveNode(primary.currentNodeId());
151-
targetTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, sourceNode.getName()),
152-
(connection, requestId, action, request, options) -> {
153-
if (PeerRecoverySourceService.Actions.START_RECOVERY.equals(action)) {
154-
logger.debug("blocking recovery of shard {}", ((StartRecoveryRequest) request).shardId());
155-
latch.countDown();
156-
try {
157-
release.await();
158-
logger.debug("releasing recovery of shard {}", ((StartRecoveryRequest) request).shardId());
159-
} catch (InterruptedException e) {
160-
throw new AssertionError(e);
161-
}
162-
}
163-
connection.sendRequest(requestId, action, request, options);
165+
// Create a SendRequestBehavior that will block outgoing start recovery request
166+
final StubbableTransport.SendRequestBehavior sendBehavior = (connection, requestId, action, request, options) -> {
167+
if (PeerRecoverySourceService.Actions.START_RECOVERY.equals(action)) {
168+
final StartRecoveryRequest startRecoveryRequest = ((StartRecoveryRequest) request);
169+
if (blockedShards.contains(startRecoveryRequest.shardId())) {
170+
logger.debug("blocking recovery of shard {}", startRecoveryRequest.shardId());
171+
latch.countDown();
172+
try {
173+
release.await();
174+
logger.debug("releasing recovery of shard {}", startRecoveryRequest.shardId());
175+
} catch (final InterruptedException e) {
176+
logger.warn(() -> new ParameterizedMessage("exception when releasing recovery of shard {}",
177+
startRecoveryRequest.shardId()), e);
178+
interruptedRecoveries.add(startRecoveryRequest.shardId().getIndexName());
179+
Thread.currentThread().interrupt();
180+
return;
164181
}
165-
);
166-
commands.add(new MoveAllocationCommand(index, shardId, currentNodeId, targetNode));
182+
}
183+
}
184+
connection.sendRequest(requestId, action, request, options);
185+
};
186+
187+
final MockTransportService targetTransportService =
188+
(MockTransportService) internalCluster().getInstance(TransportService.class, targetNode);
189+
190+
for (DiscoveryNode node : state.getNodes()) {
191+
if (node.isDataNode() && node.getName().equals(targetNode) == false) {
192+
final TransportService sourceTransportService = internalCluster().getInstance(TransportService.class, node.getName());
193+
targetTransportService.addSendBehavior(sourceTransportService, sendBehavior);
194+
}
167195
}
168196

169197
assertAcked(client().admin().cluster().reroute(new ClusterRerouteRequest().commands(commands)).get());
@@ -222,12 +250,15 @@ public void testCloseWhileRelocatingShards() throws Exception {
222250

223251
targetTransportService.clearAllRules();
224252

253+
// If a shard recovery has been interrupted, we expect its index to be closed
254+
interruptedRecoveries.forEach(CloseIndexIT::assertIndexIsClosed);
255+
225256
assertThat("Consider that the test failed if no indices were successfully closed", acknowledgedCloses.size(), greaterThan(0));
226257
assertAcked(client().admin().indices().prepareOpen("index-*"));
227258
ensureGreen(indices);
228259

229260
for (String index : acknowledgedCloses) {
230-
long docsCount = client().prepareSearch(index).setSize(0).get().getHits().getTotalHits().value;
261+
long docsCount = client().prepareSearch(index).setSize(0).setTrackTotalHits(true).get().getHits().getTotalHits().value;
231262
assertEquals("Expected " + docsPerIndex.get(index) + " docs in index " + index + " but got " + docsCount
232263
+ " (close acknowledged=" + acknowledgedCloses.contains(index) + ")", (long) docsPerIndex.get(index), docsCount);
233264
}

0 commit comments

Comments
 (0)