Skip to content

Commit e569cf8

Browse files
committed
Address failing CCR retention lease test
This test fails rarely, but it is flaky in its current form. The problem here is that we lack a guarantee that the retention leases have been synced to all shard copies. We need to sleep long enough to ensure that this occurs, and then we can sample the retention leases, possibly sleep again (we usually will not have to, since the first sleep will have been long enough to allow a sync and a renewal to happen, if one was going to happen), and then sample the retention leases again for comparison. Closes #39331
1 parent e4e96b8 commit e569cf8

File tree

1 file changed

+20
-2
lines changed

1 file changed

+20
-2
lines changed

x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/CcrRetentionLeaseIT.java

+20-2
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,21 @@ public void testRetentionLeasesAreNotBeingRenewedAfterRecoveryCompletes() throws
301301
final RestoreInfo restoreInfo = future.actionGet();
302302
final long start = System.nanoTime();
303303

304+
/*
305+
* We want to ensure that the retention leases have been synced to all shard copies, as otherwise they might sync between the two
306+
* times that we sample the retention leases, which would cause our check to fail.
307+
*/
308+
final TimeValue syncIntervalSetting = IndexService.RETENTION_LEASE_SYNC_INTERVAL_SETTING.get(
309+
leaderClient()
310+
.admin()
311+
.indices()
312+
.prepareGetSettings(leaderIndex)
313+
.get()
314+
.getIndexToSettings()
315+
.get(leaderIndex));
316+
final long syncEnd = System.nanoTime();
317+
Thread.sleep(Math.max(0, randomIntBetween(2, 4) * syncIntervalSetting.millis() - TimeUnit.NANOSECONDS.toMillis(syncEnd - start)));
318+
304319
final ClusterStateResponse leaderIndexClusterState =
305320
leaderClient().admin().cluster().prepareState().clear().setMetaData(true).setIndices(leaderIndex).get();
306321
final String leaderUUID = leaderIndexClusterState.getState().metaData().index(leaderIndex).getIndexUUID();
@@ -347,8 +362,8 @@ public void testRetentionLeasesAreNotBeingRenewedAfterRecoveryCompletes() throws
347362
.getIndexToSettings()
348363
.get(followerIndex));
349364

350-
final long end = System.nanoTime();
351-
Thread.sleep(Math.max(0, randomIntBetween(2, 4) * renewIntervalSetting.millis() - TimeUnit.NANOSECONDS.toMillis(end - start)));
365+
final long renewEnd = System.nanoTime();
366+
Thread.sleep(Math.max(0, randomIntBetween(2, 4) * renewIntervalSetting.millis() - TimeUnit.NANOSECONDS.toMillis(renewEnd - start)));
352367

353368
// now ensure that the retention leases are the same
354369
assertBusy(() -> {
@@ -358,6 +373,9 @@ public void testRetentionLeasesAreNotBeingRenewedAfterRecoveryCompletes() throws
358373
assertThat(stats.getShards(), arrayWithSize(numberOfShards * (1 + numberOfReplicas)));
359374
final List<ShardStats> shardsStats = getShardsStats(stats);
360375
for (int i = 0; i < numberOfShards * (1 + numberOfReplicas); i++) {
376+
if (shardsStats.get(i).getShardRouting().primary() == false) {
377+
continue;
378+
}
361379
final RetentionLeases currentRetentionLeases = shardsStats.get(i).getRetentionLeaseStats().retentionLeases();
362380
assertThat(currentRetentionLeases.leases(), hasSize(1));
363381
final ClusterStateResponse followerIndexClusterState =

0 commit comments

Comments
 (0)