Skip to content

Commit 2ec1483

Browse files
authored
Create peer-recovery retention leases (#43190)
This creates a peer-recovery retention lease for every shard during recovery, ensuring that the replication group retains history for future peer recoveries. It also ensures that leases for active shard copies do not expire, and leases for inactive shard copies expire immediately if the shard is fully-allocated. Relates #41536
1 parent 7293926 commit 2ec1483

File tree

26 files changed

+712
-193
lines changed

26 files changed

+712
-193
lines changed

client/rest-high-level/src/test/java/org/elasticsearch/client/CCRIT.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import org.elasticsearch.common.xcontent.XContentHelper;
5555
import org.elasticsearch.common.xcontent.XContentType;
5656
import org.elasticsearch.common.xcontent.json.JsonXContent;
57+
import org.elasticsearch.index.seqno.ReplicationTracker;
5758
import org.elasticsearch.test.rest.yaml.ObjectPath;
5859
import org.junit.Before;
5960

@@ -260,7 +261,9 @@ public void testForgetFollower() throws IOException {
260261
final Map<?, ?> shardStatsAsMap = (Map<?, ?>) shardStats.get(0);
261262
final Map<?, ?> retentionLeasesStats = (Map<?, ?>) shardStatsAsMap.get("retention_leases");
262263
final List<?> leases = (List<?>) retentionLeasesStats.get("leases");
263-
assertThat(leases, empty());
264+
for (final Object lease : leases) {
265+
assertThat(((Map<?, ?>) lease).get("source"), equalTo(ReplicationTracker.PEER_RECOVERY_RETENTION_LEASE_SOURCE));
266+
}
264267
}
265268
}
266269

server/src/main/java/org/elasticsearch/index/seqno/ReplicationTracker.java

+130-13
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.Version;
2525
import org.elasticsearch.action.ActionListener;
2626
import org.elasticsearch.action.support.replication.ReplicationResponse;
27+
import org.elasticsearch.cluster.metadata.IndexMetaData;
2728
import org.elasticsearch.cluster.routing.AllocationId;
2829
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
2930
import org.elasticsearch.cluster.routing.ShardRouting;
@@ -217,10 +218,22 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
217218
// the primary calculates the non-expired retention leases and syncs them to replicas
218219
final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
219220
final long retentionLeaseMillis = indexSettings.getRetentionLeaseMillis();
221+
final Set<String> leaseIdsForCurrentPeers
222+
= routingTable.assignedShards().stream().map(ReplicationTracker::getPeerRecoveryRetentionLeaseId).collect(Collectors.toSet());
220223
final Map<Boolean, List<RetentionLease>> partitionByExpiration = retentionLeases
221224
.leases()
222225
.stream()
223-
.collect(Collectors.groupingBy(lease -> currentTimeMillis - lease.timestamp() > retentionLeaseMillis));
226+
.collect(Collectors.groupingBy(lease -> {
227+
if (lease.source().equals(PEER_RECOVERY_RETENTION_LEASE_SOURCE)) {
228+
if (leaseIdsForCurrentPeers.contains(lease.id())) {
229+
return false;
230+
}
231+
if (routingTable.allShardsStarted()) {
232+
return true;
233+
}
234+
}
235+
return currentTimeMillis - lease.timestamp() > retentionLeaseMillis;
236+
}));
224237
final Collection<RetentionLease> expiredLeases = partitionByExpiration.get(true);
225238
if (expiredLeases == null) {
226239
// early out as no retention leases have expired
@@ -242,7 +255,7 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
242255
* @param source the source of the retention lease
243256
* @param listener the callback when the retention lease is successfully added and synced to replicas
244257
* @return the new retention lease
245-
* @throws IllegalArgumentException if the specified retention lease already exists
258+
* @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists
246259
*/
247260
public RetentionLease addRetentionLease(
248261
final String id,
@@ -253,30 +266,46 @@ public RetentionLease addRetentionLease(
253266
final RetentionLease retentionLease;
254267
final RetentionLeases currentRetentionLeases;
255268
synchronized (this) {
256-
assert primaryMode;
257-
if (retentionLeases.contains(id)) {
258-
throw new RetentionLeaseAlreadyExistsException(id);
259-
}
260-
retentionLease = new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source);
261-
logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases);
262-
retentionLeases = new RetentionLeases(
263-
operationPrimaryTerm,
264-
retentionLeases.version() + 1,
265-
Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList()));
269+
retentionLease = innerAddRetentionLease(id, retainingSequenceNumber, source);
266270
currentRetentionLeases = retentionLeases;
267271
}
268272
onSyncRetentionLeases.accept(currentRetentionLeases, listener);
269273
return retentionLease;
270274
}
271275

276+
/**
277+
* Adds a new retention lease, but does not synchronise it with the rest of the replication group.
278+
*
279+
* @param id the identifier of the retention lease
280+
* @param retainingSequenceNumber the retaining sequence number
281+
* @param source the source of the retention lease
282+
* @return the new retention lease
283+
* @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists
284+
*/
285+
private RetentionLease innerAddRetentionLease(String id, long retainingSequenceNumber, String source) {
286+
assert Thread.holdsLock(this);
287+
assert primaryMode : id + "/" + retainingSequenceNumber + "/" + source;
288+
if (retentionLeases.contains(id)) {
289+
throw new RetentionLeaseAlreadyExistsException(id);
290+
}
291+
final RetentionLease retentionLease
292+
= new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source);
293+
logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases);
294+
retentionLeases = new RetentionLeases(
295+
operationPrimaryTerm,
296+
retentionLeases.version() + 1,
297+
Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList()));
298+
return retentionLease;
299+
}
300+
272301
/**
273302
* Renews an existing retention lease.
274303
*
275304
* @param id the identifier of the retention lease
276305
* @param retainingSequenceNumber the retaining sequence number
277306
* @param source the source of the retention lease
278307
* @return the renewed retention lease
279-
* @throws IllegalArgumentException if the specified retention lease does not exist
308+
* @throws RetentionLeaseNotFoundException if the specified retention lease does not exist
280309
*/
281310
public synchronized RetentionLease renewRetentionLease(final String id, final long retainingSequenceNumber, final String source) {
282311
assert primaryMode;
@@ -390,6 +419,51 @@ public boolean assertRetentionLeasesPersisted(final Path path) throws IOExceptio
390419
return true;
391420
}
392421

422+
423+
/**
424+
* Retention leases for peer recovery have source {@link ReplicationTracker#PEER_RECOVERY_RETENTION_LEASE_SOURCE}, a lease ID
425+
* containing the persistent node ID calculated by {@link ReplicationTracker#getPeerRecoveryRetentionLeaseId}, and retain operations
426+
* with sequence numbers strictly greater than the given global checkpoint.
427+
*/
428+
public void addPeerRecoveryRetentionLease(String nodeId, long globalCheckpoint, ActionListener<ReplicationResponse> listener) {
429+
addRetentionLease(getPeerRecoveryRetentionLeaseId(nodeId), globalCheckpoint + 1, PEER_RECOVERY_RETENTION_LEASE_SOURCE, listener);
430+
}
431+
432+
/**
433+
* Source for peer recovery retention leases; see {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
434+
*/
435+
public static final String PEER_RECOVERY_RETENTION_LEASE_SOURCE = "peer recovery";
436+
437+
/**
438+
* Id for a peer recovery retention lease for the given node. See {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
439+
*/
440+
static String getPeerRecoveryRetentionLeaseId(String nodeId) {
441+
return "peer_recovery/" + nodeId;
442+
}
443+
444+
/**
445+
* Id for a peer recovery retention lease for the given {@link ShardRouting}.
446+
* See {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
447+
*/
448+
public static String getPeerRecoveryRetentionLeaseId(ShardRouting shardRouting) {
449+
return getPeerRecoveryRetentionLeaseId(shardRouting.currentNodeId());
450+
}
451+
452+
/**
453+
* Advance the peer-recovery retention lease for all tracked shard copies, for use in tests until advancing these leases is done
454+
* properly. TODO remove this.
455+
*/
456+
public synchronized void advancePeerRecoveryRetentionLeasesToGlobalCheckpoints() {
457+
assert primaryMode;
458+
for (ShardRouting shardRouting : routingTable) {
459+
if (shardRouting.assignedToNode()) {
460+
final CheckpointState checkpointState = checkpoints.get(shardRouting.allocationId().getId());
461+
renewRetentionLease(getPeerRecoveryRetentionLeaseId(shardRouting), checkpointState.globalCheckpoint + 1,
462+
PEER_RECOVERY_RETENTION_LEASE_SOURCE);
463+
}
464+
}
465+
}
466+
393467
public static class CheckpointState implements Writeable {
394468

395469
/**
@@ -616,6 +690,23 @@ private boolean invariant() {
616690
assert checkpoints.get(aId) != null : "aId [" + aId + "] is pending in sync but isn't tracked";
617691
}
618692

693+
if (primaryMode
694+
&& indexSettings.isSoftDeleteEnabled()
695+
&& indexSettings.getIndexMetaData().getState() == IndexMetaData.State.OPEN
696+
&& indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) {
697+
// all tracked shard copies have a corresponding peer-recovery retention lease
698+
for (final ShardRouting shardRouting : routingTable.assignedShards()) {
699+
if (checkpoints.get(shardRouting.allocationId().getId()).tracked) {
700+
assert retentionLeases.contains(getPeerRecoveryRetentionLeaseId(shardRouting))
701+
: "no retention lease for tracked shard [" + shardRouting + "] in " + retentionLeases;
702+
assert PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals(
703+
retentionLeases.get(getPeerRecoveryRetentionLeaseId(shardRouting)).source())
704+
: "incorrect source [" + retentionLeases.get(getPeerRecoveryRetentionLeaseId(shardRouting)).source()
705+
+ "] for [" + shardRouting + "] in " + retentionLeases;
706+
}
707+
}
708+
}
709+
619710
return true;
620711
}
621712

@@ -669,6 +760,7 @@ public ReplicationTracker(
669760
this.pendingInSync = new HashSet<>();
670761
this.routingTable = null;
671762
this.replicationGroup = null;
763+
assert Version.V_EMPTY.equals(indexSettings.getIndexVersionCreated()) == false;
672764
assert invariant();
673765
}
674766

@@ -772,6 +864,31 @@ public synchronized void activatePrimaryMode(final long localCheckpoint) {
772864
primaryMode = true;
773865
updateLocalCheckpoint(shardAllocationId, checkpoints.get(shardAllocationId), localCheckpoint);
774866
updateGlobalCheckpointOnPrimary();
867+
868+
if (indexSettings.isSoftDeleteEnabled()) {
869+
final ShardRouting primaryShard = routingTable.primaryShard();
870+
final String leaseId = getPeerRecoveryRetentionLeaseId(primaryShard);
871+
if (retentionLeases.get(leaseId) == null) {
872+
/*
873+
* We might have got here here via a rolling upgrade from an older version that doesn't create peer recovery retention
874+
* leases for every shard copy, but in this case we do not expect any leases to exist.
875+
*/
876+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) {
877+
// We are starting up the whole replication group from scratch: if we were not (i.e. this is a replica promotion) then
878+
// this copy must already be in-sync and active and therefore holds a retention lease for itself.
879+
assert routingTable.activeShards().equals(Collections.singletonList(primaryShard)) : routingTable.activeShards();
880+
assert primaryShard.allocationId().getId().equals(shardAllocationId)
881+
: routingTable.activeShards() + " vs " + shardAllocationId;
882+
assert replicationGroup.getReplicationTargets().equals(Collections.singletonList(primaryShard));
883+
884+
// Safe to call innerAddRetentionLease() without a subsequent sync since there are no other members of this replication
885+
// group.
886+
innerAddRetentionLease(leaseId, Math.max(0L, checkpoints.get(shardAllocationId).globalCheckpoint + 1),
887+
PEER_RECOVERY_RETENTION_LEASE_SOURCE);
888+
}
889+
}
890+
}
891+
775892
assert invariant();
776893
}
777894

server/src/main/java/org/elasticsearch/index/seqno/RetentionLeaseSyncer.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ public interface RetentionLeaseSyncer {
4444
RetentionLeaseSyncer EMPTY = new RetentionLeaseSyncer() {
4545
@Override
4646
public void sync(final ShardId shardId, final RetentionLeases retentionLeases, final ActionListener<ReplicationResponse> listener) {
47-
47+
listener.onResponse(new ReplicationResponse());
4848
}
4949

5050
@Override

server/src/main/java/org/elasticsearch/index/seqno/RetentionLeases.java

+1-10
Original file line numberDiff line numberDiff line change
@@ -274,14 +274,5 @@ private static Map<String, RetentionLease> toMap(final Collection<RetentionLease
274274
LinkedHashMap::new));
275275
}
276276

277-
/**
278-
* A utility method to convert a retention lease collection to a map from retention lease ID to retention lease.
279-
*
280-
* @param retentionLeases the retention lease collection
281-
* @return the map from retention lease ID to retention lease
282-
*/
283-
static Map<String, RetentionLease> toMap(final RetentionLeases retentionLeases) {
284-
return retentionLeases.leases;
285-
}
286-
287277
}
278+

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

+15
Original file line numberDiff line numberDiff line change
@@ -2415,6 +2415,21 @@ public boolean isRelocatedPrimary() {
24152415
return replicationTracker.isRelocated();
24162416
}
24172417

2418+
public void addPeerRecoveryRetentionLease(String nodeId, long globalCheckpoint, ActionListener<ReplicationResponse> listener) {
2419+
assert assertPrimaryMode();
2420+
replicationTracker.addPeerRecoveryRetentionLease(nodeId, globalCheckpoint, listener);
2421+
}
2422+
2423+
/**
2424+
* Test-only method to advance the all shards' peer-recovery retention leases to their tracked global checkpoints so that operations
2425+
* can be discarded. TODO Remove this when retention leases are advanced by other mechanisms.
2426+
*/
2427+
public void advancePeerRecoveryRetentionLeasesToGlobalCheckpoints() {
2428+
assert assertPrimaryMode();
2429+
replicationTracker.advancePeerRecoveryRetentionLeasesToGlobalCheckpoints();
2430+
syncRetentionLeases();
2431+
}
2432+
24182433
class ShardEventListener implements Engine.EventListener {
24192434
private final CopyOnWriteArrayList<Consumer<ShardFailure>> delegates = new CopyOnWriteArrayList<>();
24202435

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java

+26-3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import org.elasticsearch.ExceptionsHelper;
3333
import org.elasticsearch.action.ActionListener;
3434
import org.elasticsearch.action.StepListener;
35+
import org.elasticsearch.action.support.replication.ReplicationResponse;
36+
import org.elasticsearch.cluster.metadata.IndexMetaData;
3537
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
3638
import org.elasticsearch.cluster.routing.ShardRouting;
3739
import org.elasticsearch.common.CheckedSupplier;
@@ -49,6 +51,7 @@
4951
import org.elasticsearch.index.engine.Engine;
5052
import org.elasticsearch.index.engine.RecoveryEngineException;
5153
import org.elasticsearch.index.seqno.LocalCheckpointTracker;
54+
import org.elasticsearch.index.seqno.RetentionLeaseAlreadyExistsException;
5255
import org.elasticsearch.index.seqno.RetentionLeases;
5356
import org.elasticsearch.index.seqno.SequenceNumbers;
5457
import org.elasticsearch.index.shard.IndexShard;
@@ -188,10 +191,30 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
188191
}
189192
assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
190193

194+
final StepListener<ReplicationResponse> establishRetentionLeaseStep = new StepListener<>();
195+
if (shard.indexSettings().isSoftDeleteEnabled()
196+
&& shard.indexSettings().getIndexMetaData().getState() != IndexMetaData.State.CLOSE) {
197+
runUnderPrimaryPermit(() -> {
198+
try {
199+
// conservative estimate of the GCP for creating the lease. TODO use the actual GCP once it's appropriate to do so
200+
final long globalCheckpoint = startingSeqNo - 1;
201+
// blindly create the lease. TODO integrate this with the recovery process
202+
shard.addPeerRecoveryRetentionLease(request.targetNode().getId(), globalCheckpoint, establishRetentionLeaseStep);
203+
} catch (RetentionLeaseAlreadyExistsException e) {
204+
logger.debug("peer-recovery retention lease already exists", e);
205+
establishRetentionLeaseStep.onResponse(null);
206+
}
207+
}, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
208+
} else {
209+
establishRetentionLeaseStep.onResponse(null);
210+
}
211+
191212
final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
192-
// For a sequence based recovery, the target can keep its local translog
193-
prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
194-
shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep);
213+
establishRetentionLeaseStep.whenComplete(r -> {
214+
// For a sequence based recovery, the target can keep its local translog
215+
prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
216+
shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep);
217+
}, onFailure);
195218
final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
196219
prepareEngineStep.whenComplete(prepareEngineTime -> {
197220
/*

0 commit comments

Comments
 (0)