Skip to content

Commit 2995d4a

Browse files
committed
Sequence number based replica allocation (#46959)
With this change, shard allocation prefers allocating replicas on a node that already has a copy of the shard that is as close as possible to the primary, so that it is as cheap as possible to bring the new replica in sync with the primary. Furthermore, if we find a copy that is identical to the primary then we cancel an ongoing recovery, because the new copy which is identical to the primary needs no work to recover as a replica. With this improvement, we no longer need to perform a synced flush before performing a rolling upgrade or full cluster restart. Closes #46318
1 parent 4f06225 commit 2995d4a

File tree

7 files changed

+508
-108
lines changed

7 files changed

+508
-108
lines changed

server/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java

+101-75
Large diffs are not rendered by default.

server/src/main/java/org/elasticsearch/index/seqno/ReplicationTracker.java

+10-1
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ public void removePeerRecoveryRetentionLease(String nodeId, ActionListener<Repli
529529
/**
530530
* Id for a peer recovery retention lease for the given node. See {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
531531
*/
532-
static String getPeerRecoveryRetentionLeaseId(String nodeId) {
532+
public static String getPeerRecoveryRetentionLeaseId(String nodeId) {
533533
return "peer_recovery/" + nodeId;
534534
}
535535

@@ -541,6 +541,15 @@ public static String getPeerRecoveryRetentionLeaseId(ShardRouting shardRouting)
541541
return getPeerRecoveryRetentionLeaseId(shardRouting.currentNodeId());
542542
}
543543

544+
/**
545+
* Returns a list of peer recovery retention leases installed in this replication group
546+
*/
547+
public List<RetentionLease> getPeerRecoveryRetentionLeases() {
548+
return getRetentionLeases().leases().stream()
549+
.filter(lease -> PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals(lease.source()))
550+
.collect(Collectors.toList());
551+
}
552+
544553
/**
545554
* Advance the peer-recovery retention leases for all assigned shard copies to discard history below the corresponding global
546555
* checkpoint, and renew any leases that are approaching expiry.

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

+7
Original file line numberDiff line numberDiff line change
@@ -2621,6 +2621,13 @@ public void removePeerRecoveryRetentionLease(String nodeId, ActionListener<Repli
26212621
replicationTracker.removePeerRecoveryRetentionLease(nodeId, listener);
26222622
}
26232623

2624+
/**
2625+
* Returns a list of retention leases for peer recovery installed in this shard copy.
2626+
*/
2627+
public List<RetentionLease> getPeerRecoveryRetentionLeases() {
2628+
return replicationTracker.getPeerRecoveryRetentionLeases();
2629+
}
2630+
26242631
private SafeCommitInfo getSafeCommitInfo() {
26252632
final Engine engine = getEngineOrNull();
26262633
return engine == null ? SafeCommitInfo.EMPTY : engine.getSafeCommitInfo();

server/src/main/java/org/elasticsearch/indices/store/TransportNodesListShardStoreMetaData.java

+49-16
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.logging.log4j.message.ParameterizedMessage;
2323
import org.elasticsearch.ElasticsearchException;
2424
import org.elasticsearch.action.ActionListener;
25+
import org.elasticsearch.Version;
2526
import org.elasticsearch.action.ActionType;
2627
import org.elasticsearch.action.FailedNodeException;
2728
import org.elasticsearch.action.support.ActionFilters;
@@ -45,6 +46,8 @@
4546
import org.elasticsearch.gateway.AsyncShardFetch;
4647
import org.elasticsearch.index.IndexService;
4748
import org.elasticsearch.index.IndexSettings;
49+
import org.elasticsearch.index.seqno.ReplicationTracker;
50+
import org.elasticsearch.index.seqno.RetentionLease;
4851
import org.elasticsearch.index.shard.IndexShard;
4952
import org.elasticsearch.index.shard.ShardId;
5053
import org.elasticsearch.index.shard.ShardPath;
@@ -55,6 +58,7 @@
5558
import org.elasticsearch.transport.TransportService;
5659

5760
import java.io.IOException;
61+
import java.util.Collections;
5862
import java.util.Iterator;
5963
import java.util.List;
6064
import java.util.concurrent.TimeUnit;
@@ -127,15 +131,16 @@ private StoreFilesMetaData listStoreMetaData(ShardId shardId) throws IOException
127131
IndexShard indexShard = indexService.getShardOrNull(shardId.id());
128132
if (indexShard != null) {
129133
try {
130-
final StoreFilesMetaData storeFilesMetaData = new StoreFilesMetaData(shardId, indexShard.snapshotStoreMetadata());
134+
final StoreFilesMetaData storeFilesMetaData = new StoreFilesMetaData(shardId,
135+
indexShard.snapshotStoreMetadata(), indexShard.getPeerRecoveryRetentionLeases());
131136
exists = true;
132137
return storeFilesMetaData;
133138
} catch (org.apache.lucene.index.IndexNotFoundException e) {
134139
logger.trace(new ParameterizedMessage("[{}] node is missing index, responding with empty", shardId), e);
135-
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY);
140+
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
136141
} catch (IOException e) {
137142
logger.warn(new ParameterizedMessage("[{}] can't read metadata from store, responding with empty", shardId), e);
138-
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY);
143+
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
139144
}
140145
}
141146
}
@@ -150,20 +155,23 @@ private StoreFilesMetaData listStoreMetaData(ShardId shardId) throws IOException
150155
}
151156
if (metaData == null) {
152157
logger.trace("{} node doesn't have meta data for the requests index, responding with empty", shardId);
153-
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY);
158+
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
154159
}
155160
final IndexSettings indexSettings = indexService != null ? indexService.getIndexSettings() :
156161
new IndexSettings(metaData, settings);
157162
final ShardPath shardPath = ShardPath.loadShardPath(logger, nodeEnv, shardId, indexSettings);
158163
if (shardPath == null) {
159-
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY);
164+
return new StoreFilesMetaData(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
160165
}
161166
// note that this may fail if it can't get access to the shard lock. Since we check above there is an active shard, this means:
162167
// 1) a shard is being constructed, which means the master will not use a copy of this replica
163168
// 2) A shard is shutting down and has not cleared it's content within lock timeout. In this case the master may not
164169
// reuse local resources.
165-
return new StoreFilesMetaData(shardId, Store.readMetadataSnapshot(shardPath.resolveIndex(), shardId,
166-
nodeEnv::shardLock, logger));
170+
final Store.MetadataSnapshot metadataSnapshot =
171+
Store.readMetadataSnapshot(shardPath.resolveIndex(), shardId, nodeEnv::shardLock, logger);
172+
// We use peer recovery retention leases from the primary for allocating replicas. We should always have retention leases when
173+
// we refresh shard info after the primary has started. Hence, we can ignore retention leases if there is no active shard.
174+
return new StoreFilesMetaData(shardId, metadataSnapshot, Collections.emptyList());
167175
} finally {
168176
TimeValue took = new TimeValue(System.nanoTime() - startTimeNS, TimeUnit.NANOSECONDS);
169177
if (exists) {
@@ -175,17 +183,34 @@ private StoreFilesMetaData listStoreMetaData(ShardId shardId) throws IOException
175183
}
176184

177185
public static class StoreFilesMetaData implements Iterable<StoreFileMetaData>, Writeable {
178-
private ShardId shardId;
179-
Store.MetadataSnapshot metadataSnapshot;
186+
private final ShardId shardId;
187+
private final Store.MetadataSnapshot metadataSnapshot;
188+
private final List<RetentionLease> peerRecoveryRetentionLeases;
189+
190+
public StoreFilesMetaData(ShardId shardId, Store.MetadataSnapshot metadataSnapshot,
191+
List<RetentionLease> peerRecoveryRetentionLeases) {
192+
this.shardId = shardId;
193+
this.metadataSnapshot = metadataSnapshot;
194+
this.peerRecoveryRetentionLeases = peerRecoveryRetentionLeases;
195+
}
180196

181197
public StoreFilesMetaData(StreamInput in) throws IOException {
182198
this.shardId = new ShardId(in);
183199
this.metadataSnapshot = new Store.MetadataSnapshot(in);
200+
if (in.getVersion().onOrAfter(Version.V_7_5_0)) {
201+
this.peerRecoveryRetentionLeases = in.readList(RetentionLease::new);
202+
} else {
203+
this.peerRecoveryRetentionLeases = Collections.emptyList();
204+
}
184205
}
185206

186-
public StoreFilesMetaData(ShardId shardId, Store.MetadataSnapshot metadataSnapshot) {
187-
this.shardId = shardId;
188-
this.metadataSnapshot = metadataSnapshot;
207+
@Override
208+
public void writeTo(StreamOutput out) throws IOException {
209+
shardId.writeTo(out);
210+
metadataSnapshot.writeTo(out);
211+
if (out.getVersion().onOrAfter(Version.V_7_5_0)) {
212+
out.writeList(peerRecoveryRetentionLeases);
213+
}
189214
}
190215

191216
public ShardId shardId() {
@@ -209,10 +234,18 @@ public StoreFileMetaData file(String name) {
209234
return metadataSnapshot.asMap().get(name);
210235
}
211236

212-
@Override
213-
public void writeTo(StreamOutput out) throws IOException {
214-
shardId.writeTo(out);
215-
metadataSnapshot.writeTo(out);
237+
/**
238+
* Returns the retaining sequence number of the peer recovery retention lease for a given node if exists; otherwise, returns -1.
239+
*/
240+
public long getPeerRecoveryRetentionLeaseRetainingSeqNo(DiscoveryNode node) {
241+
assert node != null;
242+
final String retentionLeaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId(node.getId());
243+
return peerRecoveryRetentionLeases.stream().filter(lease -> lease.id().equals(retentionLeaseId))
244+
.mapToLong(RetentionLease::retainingSequenceNumber).findFirst().orElse(-1L);
245+
}
246+
247+
public List<RetentionLease> peerRecoveryRetentionLeases() {
248+
return peerRecoveryRetentionLeases;
216249
}
217250

218251
/**

0 commit comments

Comments
 (0)