Skip to content

Commit 703d6c2

Browse files
authored
Do not release snapshot file download permit during recovery retries (#79438)
Relates #79316 Backport of #79409
1 parent c6e5483 commit 703d6c2

File tree

2 files changed

+73
-3
lines changed

2 files changed

+73
-3
lines changed

server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/SnapshotBasedIndexRecoveryIT.java

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
import java.util.Map;
7474
import java.util.concurrent.CountDownLatch;
7575
import java.util.concurrent.TimeUnit;
76+
import java.util.concurrent.atomic.AtomicBoolean;
7677
import java.util.concurrent.atomic.AtomicInteger;
7778
import java.util.concurrent.atomic.AtomicLong;
7879
import java.util.concurrent.atomic.AtomicReference;
@@ -1009,6 +1010,70 @@ public void testRecoveryReEstablishKeepsTheGrantedSnapshotFileDownloadPermit() t
10091010
});
10101011
}
10111012

1013+
public void testRecoveryRetryKeepsTheGrantedSnapshotFileDownloadPermit() throws Exception {
1014+
executeRecoveryWithSnapshotFileDownloadThrottled((indices,
1015+
sourceNode,
1016+
targetNode,
1017+
targetMockTransportService,
1018+
recoverySnapshotFileRequests,
1019+
awaitForRecoverSnapshotFileRequestReceived,
1020+
respondToRecoverSnapshotFile) -> {
1021+
MockTransportService sourceMockTransportService =
1022+
(MockTransportService) internalCluster().getInstance(TransportService.class, sourceNode);
1023+
1024+
CountDownLatch startRecoveryRetryReceived = new CountDownLatch(1);
1025+
AtomicBoolean delayRecoveryExceptionSent = new AtomicBoolean();
1026+
sourceMockTransportService.addRequestHandlingBehavior(PeerRecoverySourceService.Actions.START_RECOVERY,
1027+
(handler, request, channel, task) -> {
1028+
if (delayRecoveryExceptionSent.compareAndSet(false, true)) {
1029+
channel.sendResponse(new DelayRecoveryException("delay"));
1030+
} else {
1031+
startRecoveryRetryReceived.countDown();
1032+
handler.messageReceived(request, channel, task);
1033+
}
1034+
});
1035+
1036+
String indexRecoveredFromSnapshot1 = indices.get(0);
1037+
assertAcked(
1038+
client().admin().indices().prepareUpdateSettings(indexRecoveredFromSnapshot1)
1039+
.setSettings(Settings.builder()
1040+
.put("index.routing.allocation.require._name", targetNode)).get()
1041+
);
1042+
1043+
startRecoveryRetryReceived.await();
1044+
sourceMockTransportService.clearAllRules();
1045+
awaitForRecoverSnapshotFileRequestReceived.run();
1046+
1047+
String indexRecoveredFromPeer = indices.get(1);
1048+
assertAcked(
1049+
client().admin().indices().prepareUpdateSettings(indexRecoveredFromPeer)
1050+
.setSettings(Settings.builder()
1051+
.put("index.routing.allocation.require._name", targetNode)).get()
1052+
);
1053+
1054+
ensureGreen(indexRecoveredFromPeer);
1055+
assertPeerRecoveryDidNotUseSnapshots(indexRecoveredFromPeer, sourceNode, targetNode);
1056+
1057+
respondToRecoverSnapshotFile.run();
1058+
1059+
ensureGreen(indexRecoveredFromSnapshot1);
1060+
assertPeerRecoveryUsedSnapshots(indexRecoveredFromSnapshot1, sourceNode, targetNode);
1061+
1062+
targetMockTransportService.clearAllRules();
1063+
1064+
final String indexRecoveredFromSnapshot2 = indices.get(2);
1065+
assertAcked(
1066+
client().admin().indices().prepareUpdateSettings(indexRecoveredFromSnapshot2)
1067+
.setSettings(Settings.builder()
1068+
.put("index.routing.allocation.require._name", targetNode)).get()
1069+
);
1070+
1071+
ensureGreen(indexRecoveredFromSnapshot2);
1072+
assertPeerRecoveryUsedSnapshots(indexRecoveredFromSnapshot2, sourceNode, targetNode);
1073+
});
1074+
}
1075+
1076+
10121077
private void executeRecoveryWithSnapshotFileDownloadThrottled(SnapshotBasedRecoveryThrottlingTestCase testCase) throws Exception {
10131078
updateSetting(INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS.getKey(), "1");
10141079
updateSetting(INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE.getKey(), "1");

server/src/main/java/org/elasticsearch/indices/recovery/RecoveryTarget.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,6 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
6868
private final IndexShard indexShard;
6969
private final DiscoveryNode sourceNode;
7070
private final SnapshotFilesProvider snapshotFilesProvider;
71-
@Nullable // if we're not downloading files from snapshots in this recovery
72-
private final Releasable snapshotFileDownloadsPermit;
7371
private final MultiFileWriter multiFileWriter;
7472
private final RecoveryRequestTracker requestTracker = new RecoveryRequestTracker();
7573
private final Store store;
@@ -84,6 +82,9 @@ public class RecoveryTarget extends AbstractRefCounted implements RecoveryTarget
8482

8583
private volatile boolean recoveryMonitorEnabled = true;
8684

85+
@Nullable // if we're not downloading files from snapshots in this recovery or we're retrying
86+
private volatile Releasable snapshotFileDownloadsPermit;
87+
8788
// latch that can be used to blockingly wait for RecoveryTarget to be closed
8889
private final CountDownLatch closedLatch = new CountDownLatch(1);
8990

@@ -126,7 +127,11 @@ public RecoveryTarget(IndexShard indexShard,
126127
* @return a copy of this recovery target
127128
*/
128129
public RecoveryTarget retryCopy() {
129-
return new RecoveryTarget(indexShard, sourceNode, snapshotFilesProvider, snapshotFileDownloadsPermit, listener);
130+
// If we're retrying we should remove the reference from this instance as the underlying resources
131+
// get released after the retry copy is created
132+
Releasable snapshotFileDownloadsPermitCopy = snapshotFileDownloadsPermit;
133+
snapshotFileDownloadsPermit = null;
134+
return new RecoveryTarget(indexShard, sourceNode, snapshotFilesProvider, snapshotFileDownloadsPermitCopy, listener);
130135
}
131136

132137
@Nullable

0 commit comments

Comments
 (0)