Skip to content

Commit 5e3ae47

Browse files
committed
TEST: Retry synced-flush if ongoing ops on primary (#30978)
When the last indexing operation is completed, we will fire a global checkpoint sync. Since a global checkpoint sync request is a replication request, it will acquire an index shard permit on the primary when executing. If this happens at the same time while we are issuing the synced-flush, the synced-flush request will fail as it thinks there are in-flight operations. We can avoid such situation by retrying another synced-flush if the current request fails due to ongoing operations on the primary. Closes #29392
1 parent c52cf4b commit 5e3ae47

File tree

2 files changed

+26
-28
lines changed

2 files changed

+26
-28
lines changed

server/src/test/java/org/elasticsearch/indices/flush/FlushIT.java

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,13 @@
4646
import org.elasticsearch.index.shard.ShardId;
4747
import org.elasticsearch.indices.IndicesService;
4848
import org.elasticsearch.test.ESIntegTestCase;
49-
import org.elasticsearch.test.junit.annotations.TestLogging;
5049

5150
import java.io.IOException;
5251
import java.util.Arrays;
5352
import java.util.List;
5453
import java.util.Map;
5554
import java.util.concurrent.CopyOnWriteArrayList;
5655
import java.util.concurrent.CountDownLatch;
57-
import java.util.concurrent.ExecutionException;
5856
import java.util.concurrent.atomic.AtomicBoolean;
5957
import java.util.concurrent.atomic.AtomicInteger;
6058
import java.util.stream.Collectors;
@@ -102,7 +100,7 @@ public void onFailure(Exception e) {
102100
}
103101
}
104102

105-
public void testSyncedFlush() throws ExecutionException, InterruptedException, IOException {
103+
public void testSyncedFlush() throws Exception {
106104
internalCluster().ensureAtLeastNumDataNodes(2);
107105
prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)).get();
108106
ensureGreen();
@@ -245,13 +243,6 @@ private void indexDoc(Engine engine, String id) throws IOException {
245243
assertThat(indexResult.getFailure(), nullValue());
246244
}
247245

248-
private String syncedFlushDescription(ShardsSyncedFlushResult result) {
249-
return result.shardResponses().entrySet().stream()
250-
.map(e -> "Shard [" + e.getKey() + "], result [" + e.getValue() + "]")
251-
.collect(Collectors.joining(","));
252-
}
253-
254-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/29392")
255246
public void testSyncedFlushSkipOutOfSyncReplicas() throws Exception {
256247
internalCluster().ensureAtLeastNumDataNodes(between(2, 3));
257248
final int numberOfReplicas = internalCluster().numDataNodes() - 1;
@@ -277,7 +268,6 @@ public void testSyncedFlushSkipOutOfSyncReplicas() throws Exception {
277268
indexDoc(IndexShardTestCase.getEngine(outOfSyncReplica), "extra_" + i);
278269
}
279270
final ShardsSyncedFlushResult partialResult = SyncedFlushUtil.attemptSyncedFlush(internalCluster(), shardId);
280-
logger.info("Partial seal: {}", syncedFlushDescription(partialResult));
281271
assertThat(partialResult.totalShards(), equalTo(numberOfReplicas + 1));
282272
assertThat(partialResult.successfulShards(), equalTo(numberOfReplicas));
283273
assertThat(partialResult.shardResponses().get(outOfSyncReplica.routingEntry()).failureReason, equalTo(
@@ -293,8 +283,6 @@ public void testSyncedFlushSkipOutOfSyncReplicas() throws Exception {
293283
assertThat(fullResult.successfulShards(), equalTo(numberOfReplicas + 1));
294284
}
295285

296-
@TestLogging("_root:DEBUG")
297-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/29392")
298286
public void testDoNotRenewSyncedFlushWhenAllSealed() throws Exception {
299287
internalCluster().ensureAtLeastNumDataNodes(between(2, 3));
300288
final int numberOfReplicas = internalCluster().numDataNodes() - 1;
@@ -311,11 +299,9 @@ public void testDoNotRenewSyncedFlushWhenAllSealed() throws Exception {
311299
index("test", "doc", Integer.toString(i));
312300
}
313301
final ShardsSyncedFlushResult firstSeal = SyncedFlushUtil.attemptSyncedFlush(internalCluster(), shardId);
314-
logger.info("First seal: {}", syncedFlushDescription(firstSeal));
315302
assertThat(firstSeal.successfulShards(), equalTo(numberOfReplicas + 1));
316303
// Do not renew synced-flush
317304
final ShardsSyncedFlushResult secondSeal = SyncedFlushUtil.attemptSyncedFlush(internalCluster(), shardId);
318-
logger.info("Second seal: {}", syncedFlushDescription(secondSeal));
319305
assertThat(secondSeal.successfulShards(), equalTo(numberOfReplicas + 1));
320306
assertThat(secondSeal.syncId(), equalTo(firstSeal.syncId()));
321307
// Shards were updated, renew synced flush.
@@ -324,7 +310,6 @@ public void testDoNotRenewSyncedFlushWhenAllSealed() throws Exception {
324310
index("test", "doc", Integer.toString(i));
325311
}
326312
final ShardsSyncedFlushResult thirdSeal = SyncedFlushUtil.attemptSyncedFlush(internalCluster(), shardId);
327-
logger.info("Third seal: {}", syncedFlushDescription(thirdSeal));
328313
assertThat(thirdSeal.successfulShards(), equalTo(numberOfReplicas + 1));
329314
assertThat(thirdSeal.syncId(), not(equalTo(firstSeal.syncId())));
330315
// Manually remove or change sync-id, renew synced flush.
@@ -340,7 +325,6 @@ public void testDoNotRenewSyncedFlushWhenAllSealed() throws Exception {
340325
assertThat(shard.commitStats().syncId(), nullValue());
341326
}
342327
final ShardsSyncedFlushResult forthSeal = SyncedFlushUtil.attemptSyncedFlush(internalCluster(), shardId);
343-
logger.info("Forth seal: {}", syncedFlushDescription(forthSeal));
344328
assertThat(forthSeal.successfulShards(), equalTo(numberOfReplicas + 1));
345329
assertThat(forthSeal.syncId(), not(equalTo(thirdSeal.syncId())));
346330
}

server/src/test/java/org/elasticsearch/indices/flush/SyncedFlushUtil.java

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@
2222
import org.elasticsearch.action.ActionListener;
2323
import org.elasticsearch.cluster.ClusterState;
2424
import org.elasticsearch.cluster.routing.ShardRouting;
25-
import org.elasticsearch.index.engine.Engine;
2625
import org.elasticsearch.index.shard.ShardId;
2726
import org.elasticsearch.test.InternalTestCluster;
2827

2928
import java.util.List;
3029
import java.util.Map;
3130
import java.util.concurrent.CountDownLatch;
31+
import java.util.concurrent.atomic.AtomicReference;
32+
33+
import static org.elasticsearch.test.ESTestCase.assertBusy;
3234

3335
/** Utils for SyncedFlush */
3436
public class SyncedFlushUtil {
@@ -40,19 +42,31 @@ private SyncedFlushUtil() {
4042
/**
4143
* Blocking version of {@link SyncedFlushService#attemptSyncedFlush(ShardId, ActionListener)}
4244
*/
43-
public static ShardsSyncedFlushResult attemptSyncedFlush(InternalTestCluster cluster, ShardId shardId) {
45+
public static ShardsSyncedFlushResult attemptSyncedFlush(InternalTestCluster cluster, ShardId shardId) throws Exception {
46+
/*
47+
* When the last indexing operation is completed, we will fire a global checkpoint sync.
48+
* Since a global checkpoint sync request is a replication request, it will acquire an index
49+
* shard permit on the primary when executing. If this happens at the same time while we are
50+
* issuing the synced-flush, the synced-flush request will fail as it thinks there are
51+
* in-flight operations. We can avoid such situation by continuing issuing another synced-flush
52+
* if the synced-flush failed due to the ongoing operations on the primary.
53+
*/
4454
SyncedFlushService service = cluster.getInstance(SyncedFlushService.class);
45-
LatchedListener<ShardsSyncedFlushResult> listener = new LatchedListener();
46-
service.attemptSyncedFlush(shardId, listener);
47-
try {
55+
AtomicReference<LatchedListener<ShardsSyncedFlushResult>> listenerHolder = new AtomicReference<>();
56+
assertBusy(() -> {
57+
LatchedListener<ShardsSyncedFlushResult> listener = new LatchedListener<>();
58+
listenerHolder.set(listener);
59+
service.attemptSyncedFlush(shardId, listener);
4860
listener.latch.await();
49-
} catch (InterruptedException e) {
50-
Thread.currentThread().interrupt();
61+
if (listener.result != null && listener.result.failureReason() != null
62+
&& listener.result.failureReason().contains("ongoing operations on primary")) {
63+
throw new AssertionError(listener.result.failureReason()); // cause the assert busy to retry
64+
}
65+
});
66+
if (listenerHolder.get().error != null) {
67+
throw ExceptionsHelper.convertToElastic(listenerHolder.get().error);
5168
}
52-
if (listener.error != null) {
53-
throw ExceptionsHelper.convertToElastic(listener.error);
54-
}
55-
return listener.result;
69+
return listenerHolder.get().result;
5670
}
5771

5872
public static final class LatchedListener<T> implements ActionListener<T> {

0 commit comments

Comments
 (0)