
Commit 12ed6dc

Only retain reasonable history for peer recoveries (#45208) (#45355)
Today if a shard is not fully allocated we maintain a retention lease for a lost peer for up to 12 hours, retaining all operations that occur in that time period so that we can recover this replica using an operations-based recovery if it returns. However it is not always reasonable to perform an operations-based recovery on such a replica: if the replica is a very long way behind the rest of the replication group then it can be much quicker to perform a file-based recovery instead.

This commit introduces a notion of "reasonable" recoveries. If an operations-based recovery would involve copying only a small number of operations, but the index is large, then an operations-based recovery is reasonable; on the other hand if there are many operations to copy across and the index itself is relatively small then it makes more sense to perform a file-based recovery. We measure the size of the index by computing its number of documents (including deleted documents) in all segments belonging to the current safe commit, and compare this to the number of operations a lease is retaining below the local checkpoint of the safe commit. We consider an operations-based recovery to be reasonable iff it would involve replaying at most 10% of the documents in the index.

The mechanism for this feature is to expire peer-recovery retention leases early if they are retaining so much history that an operations-based recovery using that lease would be unreasonable.

Relates #41536
1 parent 7b0a804 commit 12ed6dc

16 files changed: +363 -41 lines
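
To make the 10% criterion in the commit message concrete, here is a hedged, stand-alone sketch of the cutoff calculation (the class and parameter names are illustrative, not identifiers from this commit; the arithmetic mirrors the getMinimumReasonableRetainedSeqNo method added to ReplicationTracker below):

// Hedged sketch: the "reasonable recovery" cutoff described in the commit message.
// ReasonablenessExample and its parameter names are illustrative only.
public final class ReasonablenessExample {

    /**
     * Lowest sequence number a peer-recovery retention lease may retain while an
     * operations-based recovery is still considered reasonable: replaying more than
     * threshold * docCount operations is treated as slower than copying the files.
     */
    static long minimumReasonableRetainedSeqNo(long safeCommitLocalCheckpoint,
                                               int safeCommitDocCount,
                                               double threshold) {
        return safeCommitLocalCheckpoint + 1 - Math.round(Math.ceil(safeCommitDocCount * threshold));
    }

    public static void main(String[] args) {
        // Safe commit holds 1_000 docs (including deletes), local checkpoint 999, default 10% threshold.
        long cutoff = minimumReasonableRetainedSeqNo(999, 1_000, 0.1);
        System.out.println(cutoff); // prints 900: a lease retaining seqno 850 (150 ops to replay)
                                    // would be expired, while one retaining 920 (80 ops) is kept.
    }
}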

server/src/main/java/org/elasticsearch/index/IndexSettings.java

+13

@@ -301,6 +301,19 @@ public final class IndexSettings {
     public static final Setting<Boolean> INDEX_SEARCH_THROTTLED = Setting.boolSetting("index.search.throttled", false,
         Property.IndexScope, Property.PrivateIndex, Property.Dynamic);
 
+    /**
+     * Determines a balance between file-based and operations-based peer recoveries. The number of operations that will be used in an
+     * operations-based peer recovery is limited to this proportion of the total number of documents in the shard (including deleted
+     * documents) on the grounds that a file-based peer recovery may copy all of the documents in the shard over to the new peer, but is
+     * significantly faster than replaying the missing operations on the peer, so once a peer falls far enough behind the primary it makes
+     * more sense to copy all the data over again instead of replaying history.
+     *
+     * Defaults to retaining history for up to 10% of the documents in the shard. This can only be changed in tests, since this setting is
+     * intentionally unregistered.
+     */
+    public static final Setting<Double> FILE_BASED_RECOVERY_THRESHOLD_SETTING
+        = Setting.doubleSetting("index.recovery.file_based_threshold", 0.1d, 0.0d, Setting.Property.IndexScope);
+
     private final Index index;
     private final Version version;
     private final Logger logger;
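
Because the setting is intentionally unregistered, it cannot be changed through the cluster or index settings APIs; a test has to supply it when building the index settings directly. A hedged sketch of what that looks like (the class name is illustrative and this helper is not part of the commit; only Settings.builder(), Setting.getKey() and Setting.get() are assumed, which are existing Elasticsearch APIs):

// Hedged sketch, not part of this commit: reading the unregistered threshold setting
// from a hand-built Settings object, as a test might do.
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;

public final class FileBasedRecoveryThresholdExample {
    public static void main(String[] args) {
        Settings settings = Settings.builder()
            .put(IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.getKey(), 0.2d) // 20% instead of the 10% default
            .build();
        double threshold = IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.get(settings);
        assert threshold == 0.2d;
    }
}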

server/src/main/java/org/elasticsearch/index/engine/CombinedDeletionPolicy.java

+36 -10

@@ -23,6 +23,7 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexDeletionPolicy;
+import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.store.Directory;
 import org.elasticsearch.index.seqno.SequenceNumbers;
 import org.elasticsearch.index.translog.Translog;
@@ -43,14 +44,15 @@
  * In particular, this policy will delete index commits whose max sequence number is at most
  * the current global checkpoint except the index commit which has the highest max sequence number among those.
  */
-public final class CombinedDeletionPolicy extends IndexDeletionPolicy {
+public class CombinedDeletionPolicy extends IndexDeletionPolicy {
     private final Logger logger;
     private final TranslogDeletionPolicy translogDeletionPolicy;
     private final SoftDeletesPolicy softDeletesPolicy;
     private final LongSupplier globalCheckpointSupplier;
     private final ObjectIntHashMap<IndexCommit> snapshottedCommits; // Number of snapshots held against each commit point.
     private volatile IndexCommit safeCommit; // the most recent safe commit point - its max_seqno at most the persisted global checkpoint.
     private volatile IndexCommit lastCommit; // the most recent commit point
+    private volatile SafeCommitInfo safeCommitInfo = SafeCommitInfo.EMPTY;
 
     CombinedDeletionPolicy(Logger logger, TranslogDeletionPolicy translogDeletionPolicy,
                            SoftDeletesPolicy softDeletesPolicy, LongSupplier globalCheckpointSupplier) {
@@ -62,7 +64,7 @@ public final class CombinedDeletionPolicy extends IndexDeletionPolicy {
     }
 
     @Override
-    public synchronized void onInit(List<? extends IndexCommit> commits) throws IOException {
+    public void onInit(List<? extends IndexCommit> commits) throws IOException {
         assert commits.isEmpty() == false : "index is opened, but we have no commits";
         onCommit(commits);
         if (safeCommit != commits.get(commits.size() - 1)) {
@@ -74,16 +76,32 @@ public synchronized void onInit(List<? extends IndexCommit> commits) throws IOEx
     }
 
     @Override
-    public synchronized void onCommit(List<? extends IndexCommit> commits) throws IOException {
-        final int keptPosition = indexOfKeptCommits(commits, globalCheckpointSupplier.getAsLong());
-        lastCommit = commits.get(commits.size() - 1);
-        safeCommit = commits.get(keptPosition);
-        for (int i = 0; i < keptPosition; i++) {
-            if (snapshottedCommits.containsKey(commits.get(i)) == false) {
-                deleteCommit(commits.get(i));
+    public void onCommit(List<? extends IndexCommit> commits) throws IOException {
+        final IndexCommit safeCommit;
+        synchronized (this) {
+            final int keptPosition = indexOfKeptCommits(commits, globalCheckpointSupplier.getAsLong());
+            this.safeCommitInfo = SafeCommitInfo.EMPTY;
+            this.lastCommit = commits.get(commits.size() - 1);
+            this.safeCommit = commits.get(keptPosition);
+            for (int i = 0; i < keptPosition; i++) {
+                if (snapshottedCommits.containsKey(commits.get(i)) == false) {
+                    deleteCommit(commits.get(i));
+                }
             }
+            updateRetentionPolicy();
+            safeCommit = this.safeCommit;
         }
-        updateRetentionPolicy();
+
+        assert Thread.holdsLock(this) == false : "should not block concurrent acquire or relesase";
+        safeCommitInfo = new SafeCommitInfo(Long.parseLong(
+            safeCommit.getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)), getDocCountOfCommit(safeCommit));
+
+        // This is protected from concurrent calls by a lock on the IndexWriter, but this assertion makes sure that we notice if that ceases
+        // to be true in future. It is not disastrous if safeCommitInfo refers to an older safeCommit, it just means that we might retain a
+        // bit more history and do a few more ops-based recoveries than we would otherwise.
+        final IndexCommit newSafeCommit = this.safeCommit;
+        assert safeCommit == newSafeCommit
+            : "onCommit called concurrently? " + safeCommit.getGeneration() + " vs " + newSafeCommit.getGeneration();
     }
 
     private void deleteCommit(IndexCommit commit) throws IOException {
@@ -109,6 +127,14 @@ private void updateRetentionPolicy() throws IOException {
             Long.parseLong(safeCommit.getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)));
     }
 
+    protected int getDocCountOfCommit(IndexCommit indexCommit) throws IOException {
+        return SegmentInfos.readCommit(indexCommit.getDirectory(), indexCommit.getSegmentsFileName()).totalMaxDoc();
+    }
+
+    SafeCommitInfo getSafeCommitInfo() {
+        return safeCommitInfo;
+    }
+
     /**
      * Captures the most recent commit point {@link #lastCommit} or the most recent safe commit point {@link #safeCommit}.
      * Index files of the capturing commit point won't be released until the commit reference is closed.

server/src/main/java/org/elasticsearch/index/engine/Engine.java

+5

@@ -1122,6 +1122,11 @@ public abstract void forceMerge(boolean flush, int maxNumSegments, boolean onlyE
      */
     public abstract IndexCommitRef acquireSafeIndexCommit() throws EngineException;
 
+    /**
+     * @return a summary of the contents of the current safe commit
+     */
+    public abstract SafeCommitInfo getSafeCommitInfo();
+
     /**
      * If the specified throwable contains a fatal error in the throwable graph, such a fatal error will be thrown. Callers should ensure
      * that there are no catch statements that would catch an error in the stack as the fatal error here should go uncaught and be handled

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

+5

@@ -2008,6 +2008,11 @@ private void releaseIndexCommit(IndexCommit snapshot) throws IOException {
         }
     }
 
+    @Override
+    public SafeCommitInfo getSafeCommitInfo() {
+        return combinedDeletionPolicy.getSafeCommitInfo();
+    }
+
     private boolean failOnTragicEvent(AlreadyClosedException ex) {
         final boolean engineFailed;
         // if we are already closed due to some tragic exception

server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java

+7

@@ -77,6 +77,7 @@ public class ReadOnlyEngine extends Engine {
     private final Lock indexWriterLock;
     private final DocsStats docsStats;
     private final RamAccountingRefreshListener refreshListener;
+    private final SafeCommitInfo safeCommitInfo;
 
     protected volatile TranslogStats translogStats;
 
@@ -120,6 +121,7 @@ public ReadOnlyEngine(EngineConfig config, SeqNoStats seqNoStats, TranslogStats
             assert translogStats != null || obtainLock : "mutiple translogs instances should not be opened at the same time";
             this.translogStats = translogStats != null ? translogStats : translogStats(config, lastCommittedSegmentInfos);
             this.indexWriterLock = indexWriterLock;
+            this.safeCommitInfo = new SafeCommitInfo(seqNoStats.getLocalCheckpoint(), lastCommittedSegmentInfos.totalMaxDoc());
             success = true;
         } finally {
             if (success == false) {
@@ -420,6 +422,11 @@ public IndexCommitRef acquireSafeIndexCommit() {
         return acquireLastIndexCommit(false);
     }
 
+    @Override
+    public SafeCommitInfo getSafeCommitInfo() {
+        return safeCommitInfo;
+    }
+
     @Override
     public void activateThrottling() {
     }
server/src/main/java/org/elasticsearch/index/engine/SafeCommitInfo.java

+37

@@ -0,0 +1,37 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.engine;
+
+import org.elasticsearch.index.seqno.SequenceNumbers;
+
+/**
+ * Information about the safe commit, for making decisions about recoveries.
+ */
+public class SafeCommitInfo {
+
+    public final long localCheckpoint;
+    public final int docCount;
+
+    public SafeCommitInfo(long localCheckpoint, int docCount) {
+        this.localCheckpoint = localCheckpoint;
+        this.docCount = docCount;
+    }
+
+    public static final SafeCommitInfo EMPTY = new SafeCommitInfo(SequenceNumbers.NO_OPS_PERFORMED, 0);
+}

server/src/main/java/org/elasticsearch/index/seqno/ReplicationTracker.java

+36 -2

@@ -37,6 +37,7 @@
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.gateway.WriteStateException;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.engine.SafeCommitInfo;
 import org.elasticsearch.index.shard.AbstractIndexShardComponent;
 import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.index.shard.ReplicationGroup;
@@ -57,6 +58,7 @@
 import java.util.function.Function;
 import java.util.function.LongConsumer;
 import java.util.function.LongSupplier;
+import java.util.function.Supplier;
 import java.util.function.ToLongFunction;
 import java.util.stream.Collectors;
 import java.util.stream.LongStream;
@@ -210,6 +212,17 @@ public class ReplicationTracker extends AbstractIndexShardComponent implements L
      */
     private boolean hasAllPeerRecoveryRetentionLeases;
 
+    /**
+     * Supplies information about the current safe commit which may be used to expire peer-recovery retention leases.
+     */
+    private final Supplier<SafeCommitInfo> safeCommitInfoSupplier;
+
+    /**
+     * Threshold for expiring peer-recovery retention leases and falling back to file-based recovery. See
+     * {@link IndexSettings#FILE_BASED_RECOVERY_THRESHOLD_SETTING}.
+     */
+    private final double fileBasedRecoveryThreshold;
+
     /**
      * Get all retention leases tracked on this shard.
      *
@@ -237,6 +250,8 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
         final long retentionLeaseMillis = indexSettings.getRetentionLeaseMillis();
         final Set<String> leaseIdsForCurrentPeers
             = routingTable.assignedShards().stream().map(ReplicationTracker::getPeerRecoveryRetentionLeaseId).collect(Collectors.toSet());
+        final boolean allShardsStarted = routingTable.allShardsStarted();
+        final long minimumReasonableRetainedSeqNo = allShardsStarted ? 0L : getMinimumReasonableRetainedSeqNo();
         final Map<Boolean, List<RetentionLease>> partitionByExpiration = retentionLeases
             .leases()
             .stream()
@@ -245,7 +260,12 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
                     if (leaseIdsForCurrentPeers.contains(lease.id())) {
                         return false;
                     }
-                    if (routingTable.allShardsStarted()) {
+                    if (allShardsStarted) {
+                        logger.trace("expiring unused [{}]", lease);
+                        return true;
+                    }
+                    if (lease.retainingSequenceNumber() < minimumReasonableRetainedSeqNo) {
+                        logger.trace("expiring unreasonable [{}] retaining history before [{}]", lease, minimumReasonableRetainedSeqNo);
                         return true;
                     }
                 }
@@ -264,6 +284,17 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
         return Tuple.tuple(true, retentionLeases);
     }
 
+    private long getMinimumReasonableRetainedSeqNo() {
+        final SafeCommitInfo safeCommitInfo = safeCommitInfoSupplier.get();
+        return safeCommitInfo.localCheckpoint + 1 - Math.round(Math.ceil(safeCommitInfo.docCount * fileBasedRecoveryThreshold));
+        // NB safeCommitInfo.docCount is a very low-level count of the docs in the index, and in particular if this shard contains nested
+        // docs then safeCommitInfo.docCount counts every child doc separately from the parent doc. However every part of a nested document
+        // has the same seqno, so we may be overestimating the cost of a file-based recovery when compared to an ops-based recovery and
+        // therefore preferring ops-based recoveries inappropriately in this case. Correctly accounting for nested docs seems difficult to
+        // do cheaply, and the circumstances in which this matters should be relatively rare, so we use this naive calculation regardless.
+        // TODO improve this measure for when nested docs are in use
+    }
+
     /**
      * Adds a new retention lease.
      *
@@ -850,7 +881,8 @@ public ReplicationTracker(
             final long globalCheckpoint,
             final LongConsumer onGlobalCheckpointUpdated,
             final LongSupplier currentTimeMillisSupplier,
-            final BiConsumer<RetentionLeases, ActionListener<ReplicationResponse>> onSyncRetentionLeases) {
+            final BiConsumer<RetentionLeases, ActionListener<ReplicationResponse>> onSyncRetentionLeases,
+            final Supplier<SafeCommitInfo> safeCommitInfoSupplier) {
         super(shardId, indexSettings);
         assert globalCheckpoint >= SequenceNumbers.UNASSIGNED_SEQ_NO : "illegal initial global checkpoint: " + globalCheckpoint;
         this.shardAllocationId = allocationId;
@@ -867,6 +899,8 @@ public ReplicationTracker(
         this.routingTable = null;
        this.replicationGroup = null;
         this.hasAllPeerRecoveryRetentionLeases = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_4_0);
+        this.fileBasedRecoveryThreshold = IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.get(indexSettings.getSettings());
+        this.safeCommitInfoSupplier = safeCommitInfoSupplier;
         assert Version.V_EMPTY.equals(indexSettings.getIndexVersionCreated()) == false;
         assert invariant();
     }
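
The lease-expiry decision added to getRetentionLeases above can be summarised by the following hedged sketch. It is a simplified, stand-alone restatement, not the production code: the real logic is embedded in the lambda shown in the diff and also handles lease timeouts and leases other than peer-recovery leases, and the names here are illustrative.

// Hedged sketch: simplified restatement of the peer-recovery lease expiry rule in the diff above.
final class LeaseExpiryExample {

    static boolean shouldExpirePeerRecoveryLease(boolean leaseBelongsToAssignedPeer,
                                                 boolean allShardsStarted,
                                                 long leaseRetainingSeqNo,
                                                 long minimumReasonableRetainedSeqNo) {
        if (leaseBelongsToAssignedPeer) {
            return false;  // the peer is still in the routing table, keep its history
        }
        if (allShardsStarted) {
            return true;   // fully allocated shard: an unused lease only retains unneeded history
        }
        // the lost peer is so far behind that a file-based recovery would be cheaper than replaying ops
        return leaseRetainingSeqNo < minimumReasonableRetainedSeqNo;
    }
}

Note that when all shards are started the cutoff is fixed at 0 in the diff above, so getMinimumReasonableRetainedSeqNo (and hence the safe-commit supplier) is not consulted at all in that case.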

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

+8 -1

@@ -92,6 +92,7 @@
 import org.elasticsearch.index.engine.EngineFactory;
 import org.elasticsearch.index.engine.ReadOnlyEngine;
 import org.elasticsearch.index.engine.RefreshFailedEngineException;
+import org.elasticsearch.index.engine.SafeCommitInfo;
 import org.elasticsearch.index.engine.Segment;
 import org.elasticsearch.index.engine.SegmentsStats;
 import org.elasticsearch.index.fielddata.FieldDataStats;
@@ -336,7 +337,8 @@ public IndexShard(
                 UNASSIGNED_SEQ_NO,
                 globalCheckpointListeners::globalCheckpointUpdated,
                 threadPool::absoluteTimeInMillis,
-                (retentionLeases, listener) -> retentionLeaseSyncer.sync(shardId, retentionLeases, listener));
+                (retentionLeases, listener) -> retentionLeaseSyncer.sync(shardId, retentionLeases, listener),
+                this::getSafeCommitInfo);
 
         // the query cache is a node-level thing, however we want the most popular filters
         // to be computed on a per-shard basis
@@ -2612,6 +2614,11 @@ public void removePeerRecoveryRetentionLease(String nodeId, ActionListener<Repli
         replicationTracker.removePeerRecoveryRetentionLease(nodeId, listener);
     }
 
+    private SafeCommitInfo getSafeCommitInfo() {
+        final Engine engine = getEngineOrNull();
+        return engine == null ? SafeCommitInfo.EMPTY : engine.getSafeCommitInfo();
+    }
+
     class ShardEventListener implements Engine.EventListener {
         private final CopyOnWriteArrayList<Consumer<ShardFailure>> delegates = new CopyOnWriteArrayList<>();
 