
Commit 12ed6dc

Only retain reasonable history for peer recoveries (#45208) (#45355)
Today if a shard is not fully allocated we maintain a retention lease for a lost peer for up to 12 hours, retaining all operations that occur in that time period so that we can recover this replica using an operations-based recovery if it returns. However it is not always reasonable to perform an operations-based recovery on such a replica: if the replica is a very long way behind the rest of the replication group then it can be much quicker to perform a file-based recovery instead.

This commit introduces a notion of "reasonable" recoveries. If an operations-based recovery would involve copying only a small number of operations, but the index is large, then an operations-based recovery is reasonable; on the other hand if there are many operations to copy across and the index itself is relatively small then it makes more sense to perform a file-based recovery. We measure the size of the index by computing its number of documents (including deleted documents) in all segments belonging to the current safe commit, and compare this to the number of operations a lease is retaining below the local checkpoint of the safe commit. We consider an operations-based recovery to be reasonable iff it would involve replaying at most 10% of the documents in the index.

The mechanism for this feature is to expire peer-recovery retention leases early if they are retaining so much history that an operations-based recovery using that lease would be unreasonable.

Relates #41536
1 parent 7b0a804 commit 12ed6dc

16 files changed: +363 -41 lines
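
To make the 10% criterion in the commit message concrete, here is a hedged, stand-alone sketch of the cutoff calculation (the class and parameter names are illustrative, not identifiers from this commit; the arithmetic mirrors the getMinimumReasonableRetainedSeqNo method added to ReplicationTracker below):

// Hedged sketch: the "reasonable recovery" cutoff described in the commit message.
// ReasonablenessExample and its parameter names are illustrative only.
public final class ReasonablenessExample {

    /**
     * Lowest sequence number a peer-recovery retention lease may retain while an
     * operations-based recovery is still considered reasonable: replaying more than
     * threshold * docCount operations is treated as slower than copying the files.
     */
    static long minimumReasonableRetainedSeqNo(long safeCommitLocalCheckpoint,
                                               int safeCommitDocCount,
                                               double threshold) {
        return safeCommitLocalCheckpoint + 1 - Math.round(Math.ceil(safeCommitDocCount * threshold));
    }

    public static void main(String[] args) {
        // Safe commit holds 1_000 docs (including deletes), local checkpoint 999, default 10% threshold.
        long cutoff = minimumReasonableRetainedSeqNo(999, 1_000, 0.1);
        System.out.println(cutoff); // prints 900: a lease retaining seqno 850 (150 ops to replay)
                                    // would be expired, while one retaining 920 (80 ops) is kept.
    }
}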

server/src/main/java/org/elasticsearch/index/IndexSettings.java

+13

@@ -301,6 +301,19 @@ public final class IndexSettings {
     public static final Setting<Boolean> INDEX_SEARCH_THROTTLED = Setting.boolSetting("index.search.throttled", false,
         Property.IndexScope, Property.PrivateIndex, Property.Dynamic);
 
+    /**
+     * Determines a balance between file-based and operations-based peer recoveries. The number of operations that will be used in an
+     * operations-based peer recovery is limited to this proportion of the total number of documents in the shard (including deleted
+     * documents) on the grounds that a file-based peer recovery may copy all of the documents in the shard over to the new peer, but is
+     * significantly faster than replaying the missing operations on the peer, so once a peer falls far enough behind the primary it makes
+     * more sense to copy all the data over again instead of replaying history.
+     *
+     * Defaults to retaining history for up to 10% of the documents in the shard. This can only be changed in tests, since this setting is
+     * intentionally unregistered.
+     */
+    public static final Setting<Double> FILE_BASED_RECOVERY_THRESHOLD_SETTING
+        = Setting.doubleSetting("index.recovery.file_based_threshold", 0.1d, 0.0d, Setting.Property.IndexScope);
+
     private final Index index;
     private final Version version;
     private final Logger logger;
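
Because the setting is intentionally unregistered, it cannot be changed through the cluster or index settings APIs; a test has to supply it when building the index settings directly. A hedged sketch of what that looks like (the class name is illustrative and this helper is not part of the commit; only Settings.builder(), Setting.getKey() and Setting.get() are assumed, which are existing Elasticsearch APIs):

// Hedged sketch, not part of this commit: reading the unregistered threshold setting
// from a hand-built Settings object, as a test might do.
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;

public final class FileBasedRecoveryThresholdExample {
    public static void main(String[] args) {
        Settings settings = Settings.builder()
            .put(IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.getKey(), 0.2d) // 20% instead of the 10% default
            .build();
        double threshold = IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.get(settings);
        assert threshold == 0.2d;
    }
}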

server/src/main/java/org/elasticsearch/index/engine/CombinedDeletionPolicy.java

+36 -10

@@ -23,6 +23,7 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexDeletionPolicy;
+import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.store.Directory;
 import org.elasticsearch.index.seqno.SequenceNumbers;
 import org.elasticsearch.index.translog.Translog;
@@ -43,14 +44,15 @@
  * In particular, this policy will delete index commits whose max sequence number is at most
  * the current global checkpoint except the index commit which has the highest max sequence number among those.
  */
-public final class CombinedDeletionPolicy extends IndexDeletionPolicy {
+public class CombinedDeletionPolicy extends IndexDeletionPolicy {
     private final Logger logger;
     private final TranslogDeletionPolicy translogDeletionPolicy;
     private final SoftDeletesPolicy softDeletesPolicy;
     private final LongSupplier globalCheckpointSupplier;
     private final ObjectIntHashMap<IndexCommit> snapshottedCommits; // Number of snapshots held against each commit point.
     private volatile IndexCommit safeCommit; // the most recent safe commit point - its max_seqno at most the persisted global checkpoint.
     private volatile IndexCommit lastCommit; // the most recent commit point
+    private volatile SafeCommitInfo safeCommitInfo = SafeCommitInfo.EMPTY;
 
     CombinedDeletionPolicy(Logger logger, TranslogDeletionPolicy translogDeletionPolicy,
                            SoftDeletesPolicy softDeletesPolicy, LongSupplier globalCheckpointSupplier) {
@@ -62,7 +64,7 @@ public final class CombinedDeletionPolicy extends IndexDeletionPolicy {
     }
 
     @Override
-    public synchronized void onInit(List<? extends IndexCommit> commits) throws IOException {
+    public void onInit(List<? extends IndexCommit> commits) throws IOException {
         assert commits.isEmpty() == false : "index is opened, but we have no commits";
         onCommit(commits);
         if (safeCommit != commits.get(commits.size() - 1)) {
@@ -74,16 +76,32 @@ public synchronized void onInit(List<? extends IndexCommit> commits) throws IOEx
     }
 
     @Override
-    public synchronized void onCommit(List<? extends IndexCommit> commits) throws IOException {
-        final int keptPosition = indexOfKeptCommits(commits, globalCheckpointSupplier.getAsLong());
-        lastCommit = commits.get(commits.size() - 1);
-        safeCommit = commits.get(keptPosition);
-        for (int i = 0; i < keptPosition; i++) {
-            if (snapshottedCommits.containsKey(commits.get(i)) == false) {
-                deleteCommit(commits.get(i));
+    public void onCommit(List<? extends IndexCommit> commits) throws IOException {
+        final IndexCommit safeCommit;
+        synchronized (this) {
+            final int keptPosition = indexOfKeptCommits(commits, globalCheckpointSupplier.getAsLong());
+            this.safeCommitInfo = SafeCommitInfo.EMPTY;
+            this.lastCommit = commits.get(commits.size() - 1);
+            this.safeCommit = commits.get(keptPosition);
+            for (int i = 0; i < keptPosition; i++) {
+                if (snapshottedCommits.containsKey(commits.get(i)) == false) {
+                    deleteCommit(commits.get(i));
+                }
             }
+            updateRetentionPolicy();
+            safeCommit = this.safeCommit;
         }
-        updateRetentionPolicy();
+
+        assert Thread.holdsLock(this) == false : "should not block concurrent acquire or relesase";
+        safeCommitInfo = new SafeCommitInfo(Long.parseLong(
+            safeCommit.getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)), getDocCountOfCommit(safeCommit));
+
+        // This is protected from concurrent calls by a lock on the IndexWriter, but this assertion makes sure that we notice if that ceases
+        // to be true in future. It is not disastrous if safeCommitInfo refers to an older safeCommit, it just means that we might retain a
+        // bit more history and do a few more ops-based recoveries than we would otherwise.
+        final IndexCommit newSafeCommit = this.safeCommit;
+        assert safeCommit == newSafeCommit
+            : "onCommit called concurrently? " + safeCommit.getGeneration() + " vs " + newSafeCommit.getGeneration();
     }
 
     private void deleteCommit(IndexCommit commit) throws IOException {
@@ -109,6 +127,14 @@ private void updateRetentionPolicy() throws IOException {
             Long.parseLong(safeCommit.getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)));
     }
 
+    protected int getDocCountOfCommit(IndexCommit indexCommit) throws IOException {
+        return SegmentInfos.readCommit(indexCommit.getDirectory(), indexCommit.getSegmentsFileName()).totalMaxDoc();
+    }
+
+    SafeCommitInfo getSafeCommitInfo() {
+        return safeCommitInfo;
+    }
+
     /**
      * Captures the most recent commit point {@link #lastCommit} or the most recent safe commit point {@link #safeCommit}.
      * Index files of the capturing commit point won't be released until the commit reference is closed.

server/src/main/java/org/elasticsearch/index/engine/Engine.java

+5

@@ -1122,6 +1122,11 @@ public abstract void forceMerge(boolean flush, int maxNumSegments, boolean onlyE
      */
     public abstract IndexCommitRef acquireSafeIndexCommit() throws EngineException;
 
+    /**
+     * @return a summary of the contents of the current safe commit
+     */
+    public abstract SafeCommitInfo getSafeCommitInfo();
+
     /**
      * If the specified throwable contains a fatal error in the throwable graph, such a fatal error will be thrown. Callers should ensure
      * that there are no catch statements that would catch an error in the stack as the fatal error here should go uncaught and be handled

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

+5

@@ -2008,6 +2008,11 @@ private void releaseIndexCommit(IndexCommit snapshot) throws IOException {
         }
     }
 
+    @Override
+    public SafeCommitInfo getSafeCommitInfo() {
+        return combinedDeletionPolicy.getSafeCommitInfo();
+    }
+
     private boolean failOnTragicEvent(AlreadyClosedException ex) {
         final boolean engineFailed;
         // if we are already closed due to some tragic exception

server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java

+7

@@ -77,6 +77,7 @@ public class ReadOnlyEngine extends Engine {
     private final Lock indexWriterLock;
     private final DocsStats docsStats;
     private final RamAccountingRefreshListener refreshListener;
+    private final SafeCommitInfo safeCommitInfo;
 
     protected volatile TranslogStats translogStats;
 
@@ -120,6 +121,7 @@ public ReadOnlyEngine(EngineConfig config, SeqNoStats seqNoStats, TranslogStats
             assert translogStats != null || obtainLock : "mutiple translogs instances should not be opened at the same time";
             this.translogStats = translogStats != null ? translogStats : translogStats(config, lastCommittedSegmentInfos);
             this.indexWriterLock = indexWriterLock;
+            this.safeCommitInfo = new SafeCommitInfo(seqNoStats.getLocalCheckpoint(), lastCommittedSegmentInfos.totalMaxDoc());
             success = true;
         } finally {
             if (success == false) {
@@ -420,6 +422,11 @@ public IndexCommitRef acquireSafeIndexCommit() {
         return acquireLastIndexCommit(false);
     }
 
+    @Override
+    public SafeCommitInfo getSafeCommitInfo() {
+        return safeCommitInfo;
+    }
+
     @Override
     public void activateThrottling() {
     }
server/src/main/java/org/elasticsearch/index/engine/SafeCommitInfo.java

+37

@@ -0,0 +1,37 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.engine;
+
+import org.elasticsearch.index.seqno.SequenceNumbers;
+
+/**
+ * Information about the safe commit, for making decisions about recoveries.
+ */
+public class SafeCommitInfo {
+
+    public final long localCheckpoint;
+    public final int docCount;
+
+    public SafeCommitInfo(long localCheckpoint, int docCount) {
+        this.localCheckpoint = localCheckpoint;
+        this.docCount = docCount;
+    }
+
+    public static final SafeCommitInfo EMPTY = new SafeCommitInfo(SequenceNumbers.NO_OPS_PERFORMED, 0);
+}

server/src/main/java/org/elasticsearch/index/seqno/ReplicationTracker.java

+36 -2

@@ -37,6 +37,7 @@
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.gateway.WriteStateException;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.engine.SafeCommitInfo;
 import org.elasticsearch.index.shard.AbstractIndexShardComponent;
 import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.index.shard.ReplicationGroup;
@@ -57,6 +58,7 @@
 import java.util.function.Function;
 import java.util.function.LongConsumer;
 import java.util.function.LongSupplier;
+import java.util.function.Supplier;
 import java.util.function.ToLongFunction;
 import java.util.stream.Collectors;
 import java.util.stream.LongStream;
@@ -210,6 +212,17 @@ public class ReplicationTracker extends AbstractIndexShardComponent implements L
      */
     private boolean hasAllPeerRecoveryRetentionLeases;
 
+    /**
+     * Supplies information about the current safe commit which may be used to expire peer-recovery retention leases.
+     */
+    private final Supplier<SafeCommitInfo> safeCommitInfoSupplier;
+
+    /**
+     * Threshold for expiring peer-recovery retention leases and falling back to file-based recovery. See
+     * {@link IndexSettings#FILE_BASED_RECOVERY_THRESHOLD_SETTING}.
+     */
+    private final double fileBasedRecoveryThreshold;
+
     /**
      * Get all retention leases tracked on this shard.
      *
@@ -237,6 +250,8 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
         final long retentionLeaseMillis = indexSettings.getRetentionLeaseMillis();
         final Set<String> leaseIdsForCurrentPeers
             = routingTable.assignedShards().stream().map(ReplicationTracker::getPeerRecoveryRetentionLeaseId).collect(Collectors.toSet());
+        final boolean allShardsStarted = routingTable.allShardsStarted();
+        final long minimumReasonableRetainedSeqNo = allShardsStarted ? 0L : getMinimumReasonableRetainedSeqNo();
         final Map<Boolean, List<RetentionLease>> partitionByExpiration = retentionLeases
             .leases()
             .stream()
@@ -245,7 +260,12 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
                     if (leaseIdsForCurrentPeers.contains(lease.id())) {
                         return false;
                     }
-                    if (routingTable.allShardsStarted()) {
+                    if (allShardsStarted) {
+                        logger.trace("expiring unused [{}]", lease);
+                        return true;
+                    }
+                    if (lease.retainingSequenceNumber() < minimumReasonableRetainedSeqNo) {
+                        logger.trace("expiring unreasonable [{}] retaining history before [{}]", lease, minimumReasonableRetainedSeqNo);
                         return true;
                     }
                 }
@@ -264,6 +284,17 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
         return Tuple.tuple(true, retentionLeases);
     }
 
+    private long getMinimumReasonableRetainedSeqNo() {
+        final SafeCommitInfo safeCommitInfo = safeCommitInfoSupplier.get();
+        return safeCommitInfo.localCheckpoint + 1 - Math.round(Math.ceil(safeCommitInfo.docCount * fileBasedRecoveryThreshold));
+        // NB safeCommitInfo.docCount is a very low-level count of the docs in the index, and in particular if this shard contains nested
+        // docs then safeCommitInfo.docCount counts every child doc separately from the parent doc. However every part of a nested document
+        // has the same seqno, so we may be overestimating the cost of a file-based recovery when compared to an ops-based recovery and
+        // therefore preferring ops-based recoveries inappropriately in this case. Correctly accounting for nested docs seems difficult to
+        // do cheaply, and the circumstances in which this matters should be relatively rare, so we use this naive calculation regardless.
+        // TODO improve this measure for when nested docs are in use
+    }
+
     /**
      * Adds a new retention lease.
      *
@@ -850,7 +881,8 @@ public ReplicationTracker(
             final long globalCheckpoint,
             final LongConsumer onGlobalCheckpointUpdated,
             final LongSupplier currentTimeMillisSupplier,
-            final BiConsumer<RetentionLeases, ActionListener<ReplicationResponse>> onSyncRetentionLeases) {
+            final BiConsumer<RetentionLeases, ActionListener<ReplicationResponse>> onSyncRetentionLeases,
+            final Supplier<SafeCommitInfo> safeCommitInfoSupplier) {
         super(shardId, indexSettings);
         assert globalCheckpoint >= SequenceNumbers.UNASSIGNED_SEQ_NO : "illegal initial global checkpoint: " + globalCheckpoint;
         this.shardAllocationId = allocationId;
@@ -867,6 +899,8 @@ public ReplicationTracker(
         this.routingTable = null;
        this.replicationGroup = null;
         this.hasAllPeerRecoveryRetentionLeases = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_4_0);
+        this.fileBasedRecoveryThreshold = IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.get(indexSettings.getSettings());
+        this.safeCommitInfoSupplier = safeCommitInfoSupplier;
         assert Version.V_EMPTY.equals(indexSettings.getIndexVersionCreated()) == false;
         assert invariant();
     }
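
The lease-expiry decision added to getRetentionLeases above can be summarised by the following hedged sketch. It is a simplified, stand-alone restatement, not the production code: the real logic is embedded in the lambda shown in the diff and also handles lease timeouts and leases other than peer-recovery leases, and the names here are illustrative.

// Hedged sketch: simplified restatement of the peer-recovery lease expiry rule in the diff above.
final class LeaseExpiryExample {

    static boolean shouldExpirePeerRecoveryLease(boolean leaseBelongsToAssignedPeer,
                                                 boolean allShardsStarted,
                                                 long leaseRetainingSeqNo,
                                                 long minimumReasonableRetainedSeqNo) {
        if (leaseBelongsToAssignedPeer) {
            return false;  // the peer is still in the routing table, keep its history
        }
        if (allShardsStarted) {
            return true;   // fully allocated shard: an unused lease only retains unneeded history
        }
        // the lost peer is so far behind that a file-based recovery would be cheaper than replaying ops
        return leaseRetainingSeqNo < minimumReasonableRetainedSeqNo;
    }
}

Note that when all shards are started the cutoff is fixed at 0 in the diff above, so getMinimumReasonableRetainedSeqNo (and hence the safe-commit supplier) is not consulted at all in that case.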

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

+8 -1

@@ -92,6 +92,7 @@
 import org.elasticsearch.index.engine.EngineFactory;
 import org.elasticsearch.index.engine.ReadOnlyEngine;
 import org.elasticsearch.index.engine.RefreshFailedEngineException;
+import org.elasticsearch.index.engine.SafeCommitInfo;
 import org.elasticsearch.index.engine.Segment;
 import org.elasticsearch.index.engine.SegmentsStats;
 import org.elasticsearch.index.fielddata.FieldDataStats;
@@ -336,7 +337,8 @@ public IndexShard(
                 UNASSIGNED_SEQ_NO,
                 globalCheckpointListeners::globalCheckpointUpdated,
                 threadPool::absoluteTimeInMillis,
-                (retentionLeases, listener) -> retentionLeaseSyncer.sync(shardId, retentionLeases, listener));
+                (retentionLeases, listener) -> retentionLeaseSyncer.sync(shardId, retentionLeases, listener),
+                this::getSafeCommitInfo);
 
         // the query cache is a node-level thing, however we want the most popular filters
         // to be computed on a per-shard basis
@@ -2612,6 +2614,11 @@ public void removePeerRecoveryRetentionLease(String nodeId, ActionListener<Repli
         replicationTracker.removePeerRecoveryRetentionLease(nodeId, listener);
     }
 
+    private SafeCommitInfo getSafeCommitInfo() {
+        final Engine engine = getEngineOrNull();
+        return engine == null ? SafeCommitInfo.EMPTY : engine.getSafeCommitInfo();
+    }
+
     class ShardEventListener implements Engine.EventListener {
         private final CopyOnWriteArrayList<Consumer<ShardFailure>> delegates = new CopyOnWriteArrayList<>();
 