Skip to content

Commit 2c73969

Browse files
authored
Introduce soft-deletes retention policy based on global checkpoint (#30335)
This commit introduces a soft-deletes retention merge policy based on the global checkpoint. Some notes on this simple retention policy:
- This policy keeps all operations whose seq# is greater than the persisted global checkpoint, plus a configurable number of extra operations prior to the global checkpoint. This is good enough for querying history changes.
- This policy is not watertight for peer-recovery. We send the safe-commit in peer-recovery, thus we need to also send all operations after the local checkpoint of that commit. This is analogous to the min translog generation for recovery.
- This policy is too simple to support rollback.

Relates #29530
1 parent 6e0d0fe commit 2c73969

File tree

7 files changed

+181
-18
lines changed

7 files changed

+181
-18
lines changed

server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

+1
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
131131
ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE_SETTING,
132132
IndexSettings.INDEX_GC_DELETES_SETTING,
133133
IndexSettings.INDEX_SOFT_DELETES_SETTING,
134+
IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING,
134135
IndicesRequestCache.INDEX_CACHE_REQUEST_ENABLED_SETTING,
135136
UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING,
136137
EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING,

server/src/main/java/org/elasticsearch/index/IndexSettings.java

+22
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,14 @@ public final class IndexSettings {
242242
*/
243243
public static final Setting<Boolean> INDEX_SOFT_DELETES_SETTING = Setting.boolSetting("index.soft_deletes", true, Property.IndexScope);
244244

245+
/**
246+
* Controls how many soft-deleted documents will be kept around before being merged away. Keeping more deleted
247+
* documents increases the chance of operation-based recoveries and allows querying a longer history of documents.
248+
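* If soft-deletes is enabled, an engine by default (value 0) retains all soft-deleted operations whose seq# is greater than the
* persisted global checkpoint; this setting retains that many extra operations at or below the persisted global checkpoint.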
249+
**/
250+
public static final Setting<Long> INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING =
251+
Setting.longSetting("index.soft_deletes.retention.operations", 0, 0, Property.IndexScope, Property.Dynamic);
252+
245253
/**
246254
* The maximum number of refresh listeners allowed on this shard.
247255
*/
@@ -287,6 +295,7 @@ public final class IndexSettings {
287295
private final IndexScopedSettings scopedSettings;
288296
private long gcDeletesInMillis = DEFAULT_GC_DELETES.millis();
289297
private final boolean softDeleteEnabled;
298+
private volatile long softDeleteRetentionOperations;
290299
private volatile boolean warmerEnabled;
291300
private volatile int maxResultWindow;
292301
private volatile int maxInnerResultWindow;
@@ -398,6 +407,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
398407
mergeSchedulerConfig = new MergeSchedulerConfig(this);
399408
gcDeletesInMillis = scopedSettings.get(INDEX_GC_DELETES_SETTING).getMillis();
400409
softDeleteEnabled = version.onOrAfter(Version.V_7_0_0_alpha1) && scopedSettings.get(INDEX_SOFT_DELETES_SETTING);
410+
softDeleteRetentionOperations = scopedSettings.get(INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING);
401411
warmerEnabled = scopedSettings.get(INDEX_WARMER_ENABLED_SETTING);
402412
maxResultWindow = scopedSettings.get(MAX_RESULT_WINDOW_SETTING);
403413
maxInnerResultWindow = scopedSettings.get(MAX_INNER_RESULT_WINDOW_SETTING);
@@ -455,6 +465,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
455465
scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
456466
scopedSettings.addSettingsUpdateConsumer(INDEX_SEARCH_IDLE_AFTER, this::setSearchIdleAfter);
457467
scopedSettings.addSettingsUpdateConsumer(MAX_REGEX_LENGTH_SETTING, this::setMaxRegexLength);
468+
scopedSettings.addSettingsUpdateConsumer(INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING, this::setSoftDeleteRetentionOperations);
458469
}
459470

460471
private void setSearchIdleAfter(TimeValue searchIdleAfter) { this.searchIdleAfter = searchIdleAfter; }
@@ -837,4 +848,15 @@ public boolean isExplicitRefresh() {
837848
public boolean isSoftDeleteEnabled() {
838849
return softDeleteEnabled;
839850
}
851+
852+
private void setSoftDeleteRetentionOperations(long ops) {
853+
this.softDeleteRetentionOperations = ops;
854+
}
855+
856+
/**
857+
* Returns the number of extra operations (i.e. soft-deleted documents) to be kept for recovery and history purposes.
858+
*/
859+
public long getSoftDeleteRetentionOperations() {
860+
return this.softDeleteRetentionOperations;
861+
}
840862
}

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

+18-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.logging.log4j.Logger;
2323
import org.apache.logging.log4j.message.ParameterizedMessage;
2424
import org.apache.lucene.document.Field;
25+
import org.apache.lucene.document.LongPoint;
2526
import org.apache.lucene.document.NumericDocValuesField;
2627
import org.apache.lucene.index.DirectoryReader;
2728
import org.apache.lucene.index.IndexCommit;
@@ -34,8 +35,10 @@
3435
import org.apache.lucene.index.MergePolicy;
3536
import org.apache.lucene.index.SegmentCommitInfo;
3637
import org.apache.lucene.index.SegmentInfos;
38+
import org.apache.lucene.index.SoftDeletesRetentionMergePolicy;
3739
import org.apache.lucene.index.Term;
3840
import org.apache.lucene.search.IndexSearcher;
41+
import org.apache.lucene.search.Query;
3942
import org.apache.lucene.search.ReferenceManager;
4043
import org.apache.lucene.search.SearcherFactory;
4144
import org.apache.lucene.search.SearcherManager;
@@ -2002,8 +2005,8 @@ private IndexWriterConfig getIndexWriterConfig() {
20022005
// background merges
20032006
MergePolicy mergePolicy = config().getMergePolicy();
20042007
if (softDeleteEnabled) {
2005-
// TODO: soft-delete retention policy
20062008
iwc.setSoftDeletesField(Lucene.SOFT_DELETE_FIELD);
2009+
mergePolicy = new SoftDeletesRetentionMergePolicy(Lucene.SOFT_DELETE_FIELD, this::softDeletesRetentionQuery, mergePolicy);
20072010
}
20082011
iwc.setMergePolicy(new ElasticsearchMergePolicy(mergePolicy));
20092012
iwc.setSimilarity(engineConfig.getSimilarity());
@@ -2016,6 +2019,20 @@ private IndexWriterConfig getIndexWriterConfig() {
20162019
return iwc;
20172020
}
20182021

2022+
/**
2023+
* Soft-deleted documents (including tombstones) that match this query will be retained and won't be cleaned up by merges.
2024+
*/
2025+
private Query softDeletesRetentionQuery() {
2026+
ensureOpen();
2027+
// TODO: We send the safe commit in peer-recovery, thus we need to retain all operations after the local checkpoint of that commit.
2028+
final long retainedExtraOps = engineConfig.getIndexSettings().getSoftDeleteRetentionOperations();
2029+
// Prefer the global checkpoint that is persisted on disk over the in-memory value.
2030+
// If we failed to fsync the checkpoint but had already used a higher global checkpoint value to clean up soft-deleted ops,
2031+
// then, after a restart, we may not have all the required operations whose seq# is greater than the global checkpoint.
2032+
final long persistedGlobalCheckpoint = translog.getLastSyncedGlobalCheckpoint();
2033+
return LongPoint.newRangeQuery(SeqNoFieldMapper.NAME, persistedGlobalCheckpoint + 1 - retainedExtraOps, Long.MAX_VALUE);
2034+
}
2035+
20192036
/** Extended SearcherFactory that warms the segments if needed when acquiring a new searcher */
20202037
static final class SearchFactory extends EngineSearcherFactory {
20212038
private final Engine.Warmer warmer;

server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java

+80-9
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@
123123
import org.elasticsearch.index.translog.Translog;
124124
import org.elasticsearch.index.translog.TranslogConfig;
125125
import org.elasticsearch.indices.breaker.NoneCircuitBreakerService;
126+
import org.elasticsearch.test.IndexSettingsModule;
126127
import org.hamcrest.MatcherAssert;
127128
import org.hamcrest.Matchers;
128129

@@ -178,6 +179,7 @@
178179
import static org.hamcrest.Matchers.hasItem;
179180
import static org.hamcrest.Matchers.hasKey;
180181
import static org.hamcrest.Matchers.hasSize;
182+
import static org.hamcrest.Matchers.isIn;
181183
import static org.hamcrest.Matchers.lessThanOrEqualTo;
182184
import static org.hamcrest.Matchers.not;
183185
import static org.hamcrest.Matchers.notNullValue;
@@ -251,8 +253,9 @@ public void testVersionMapAfterAutoIDDocument() throws IOException {
251253
}
252254

253255
public void testSegments() throws Exception {
256+
final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
254257
try (Store store = createStore();
255-
InternalEngine engine = createEngine(defaultSettings, store, createTempDir(), NoMergePolicy.INSTANCE)) {
258+
InternalEngine engine = createEngine(config(defaultSettings, store, createTempDir(), NoMergePolicy.INSTANCE, null, null, globalCheckpoint::get))) {
256259
List<Segment> segments = engine.segments(false);
257260
assertThat(segments.isEmpty(), equalTo(true));
258261
assertThat(engine.segmentsStats(false).getCount(), equalTo(0L));
@@ -324,6 +327,8 @@ public void testSegments() throws Exception {
324327

325328

326329
engine.delete(new Engine.Delete("test", "1", newUid(doc), primaryTerm.get()));
330+
globalCheckpoint.set(engine.getLocalCheckpointTracker().getCheckpoint());
331+
engine.getTranslog().sync();
327332
engine.refresh("test");
328333

329334
segments = engine.segments(false);
@@ -1279,9 +1284,13 @@ public void testVersioningNewIndex() throws IOException {
12791284
assertThat(indexResult.getVersion(), equalTo(1L));
12801285
}
12811286

1282-
public void testForceMerge() throws IOException {
1287+
public void testForceMergeWithoutSoftDeletes() throws IOException {
1288+
Settings settings = Settings.builder()
1289+
.put(defaultSettings.getSettings())
1290+
.put(IndexSettings.INDEX_SOFT_DELETES_SETTING.getKey(), false).build();
1291+
IndexMetaData indexMetaData = IndexMetaData.builder(defaultSettings.getIndexMetaData()).settings(settings).build();
12831292
try (Store store = createStore();
1284-
Engine engine = createEngine(config(defaultSettings, store, createTempDir(),
1293+
Engine engine = createEngine(config(IndexSettingsModule.newIndexSettings(indexMetaData), store, createTempDir(),
12851294
new LogByteSizeMergePolicy(), null))) { // use log MP here we test some behavior in ESMP
12861295
int numDocs = randomIntBetween(10, 100);
12871296
for (int i = 0; i < numDocs; i++) {
@@ -1322,6 +1331,66 @@ public void testForceMerge() throws IOException {
13221331
}
13231332
}
13241333

1334+
public void testForceMergeWithSoftDeletesRetention() throws Exception {
1335+
final long retainedExtraOps = randomLongBetween(0, 10);
1336+
Settings.Builder settings = Settings.builder()
1337+
.put(defaultSettings.getSettings())
1338+
.put(IndexSettings.INDEX_SOFT_DELETES_SETTING.getKey(), true)
1339+
.put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), retainedExtraOps);
1340+
final IndexMetaData indexMetaData = IndexMetaData.builder(defaultSettings.getIndexMetaData()).settings(settings).build();
1341+
final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(indexMetaData);
1342+
final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
1343+
final MapperService mapperService = createMapperService("test");
1344+
final Set<String> liveDocs = new HashSet<>();
1345+
try (Store store = createStore();
1346+
Engine engine = createEngine(config(indexSettings, store, createTempDir(), newMergePolicy(), null, null, globalCheckpoint::get))) {
1347+
int numDocs = scaledRandomIntBetween(10, 100);
1348+
for (int i = 0; i < numDocs; i++) {
1349+
ParsedDocument doc = testParsedDocument(Integer.toString(i), null, testDocument(), B_1, null);
1350+
engine.index(indexForDoc(doc));
1351+
liveDocs.add(doc.id());
1352+
}
1353+
for (int i = 0; i < numDocs; i++) {
1354+
ParsedDocument doc = testParsedDocument(Integer.toString(i), null, testDocument(), B_1, null);
1355+
if (randomBoolean()) {
1356+
engine.delete(new Engine.Delete(doc.type(), doc.id(), newUid(doc.id()), primaryTerm.get()));
1357+
liveDocs.remove(doc.id());
1358+
}
1359+
if (randomBoolean()) {
1360+
engine.index(indexForDoc(doc));
1361+
liveDocs.add(doc.id());
1362+
}
1363+
}
1364+
long localCheckpoint = engine.getLocalCheckpointTracker().getCheckpoint();
1365+
globalCheckpoint.set(randomLongBetween(0, localCheckpoint));
1366+
engine.getTranslog().sync();
1367+
engine.forceMerge(true, 1, false, false, false);
1368+
assertConsistentHistoryBetweenTranslogAndLuceneIndex(engine, mapperService);
1369+
Map<Long, Translog.Operation> ops = readAllOperationsInLucene(engine, mapperService)
1370+
.stream().collect(Collectors.toMap(Translog.Operation::seqNo, Function.identity()));
1371+
for (long seqno = 0; seqno <= localCheckpoint; seqno++) {
1372+
long keptIndex = globalCheckpoint.get() + 1 - retainedExtraOps;
1373+
String msg = "seq# [" + seqno + "], global checkpoint [" + globalCheckpoint + "], retained-ops [" + retainedExtraOps + "]";
1374+
if (seqno < keptIndex) {
1375+
Translog.Operation op = ops.get(seqno);
1376+
if (op != null) {
1377+
assertThat(op, instanceOf(Translog.Index.class));
1378+
assertThat(msg, ((Translog.Index) op).id(), isIn(liveDocs));
1379+
}
1380+
} else {
1381+
assertThat(msg, ops.get(seqno), notNullValue());
1382+
}
1383+
}
1384+
settings.put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), 0);
1385+
indexSettings.updateIndexMetaData(IndexMetaData.builder(defaultSettings.getIndexMetaData()).settings(settings).build());
1386+
globalCheckpoint.set(localCheckpoint);
1387+
engine.getTranslog().sync();
1388+
engine.forceMerge(true, 1, false, false, false);
1389+
assertConsistentHistoryBetweenTranslogAndLuceneIndex(engine, mapperService);
1390+
assertThat(readAllOperationsInLucene(engine, mapperService), hasSize(liveDocs.size()));
1391+
}
1392+
}
1393+
13251394
public void testForceMergeAndClose() throws IOException, InterruptedException {
13261395
int numIters = randomIntBetween(2, 10);
13271396
for (int j = 0; j < numIters; j++) {
@@ -2525,14 +2594,16 @@ public void testSkipTranslogReplay() throws IOException {
25252594
Engine.IndexResult indexResult = engine.index(firstIndexRequest);
25262595
assertThat(indexResult.getVersion(), equalTo(1L));
25272596
}
2597+
EngineConfig config = engine.config();
25282598
assertVisibleCount(engine, numDocs);
25292599
engine.close();
2530-
trimUnsafeCommits(engine.config());
2531-
engine = new InternalEngine(engine.config());
2532-
engine.skipTranslogRecovery();
2533-
try (Engine.Searcher searcher = engine.acquireSearcher("test")) {
2534-
TopDocs topDocs = searcher.searcher().search(new MatchAllDocsQuery(), randomIntBetween(numDocs, numDocs + 10));
2535-
assertThat(topDocs.totalHits, equalTo(0L));
2600+
trimUnsafeCommits(config);
2601+
try (InternalEngine engine = new InternalEngine(config)) {
2602+
engine.skipTranslogRecovery();
2603+
try (Engine.Searcher searcher = engine.acquireSearcher("test")) {
2604+
TopDocs topDocs = searcher.searcher().search(new MatchAllDocsQuery(), randomIntBetween(numDocs, numDocs + 10));
2605+
assertThat(topDocs.totalHits, equalTo(0L));
2606+
}
25362607
}
25372608
}
25382609

server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

+11-1
Original file line numberDiff line numberDiff line change
@@ -2350,7 +2350,16 @@ public void testDocStats() throws IOException {
23502350
deleteDoc(indexShard, "_doc", id);
23512351
indexDoc(indexShard, "_doc", id);
23522352
}
2353-
2353+
// Need to update and sync the global checkpoint as the soft-deletes retention MergePolicy depends on it.
2354+
if (indexShard.indexSettings.isSoftDeleteEnabled()) {
2355+
if (indexShard.routingEntry().primary()) {
2356+
indexShard.updateGlobalCheckpointForShard(indexShard.routingEntry().allocationId().getId(),
2357+
indexShard.getLocalCheckpoint());
2358+
} else {
2359+
indexShard.updateGlobalCheckpointOnReplica(indexShard.getLocalCheckpoint(), "test");
2360+
}
2361+
indexShard.sync();
2362+
}
23542363
// flush the buffered deletes
23552364
final FlushRequest flushRequest = new FlushRequest();
23562365
flushRequest.force(false);
@@ -2910,6 +2919,7 @@ public void testSegmentMemoryTrackedInBreaker() throws Exception {
29102919

29112920
// Deleting a doc causes its memory to be freed from the breaker
29122921
deleteDoc(primary, "_doc", "0");
2922+
primary.sync(); // need to sync global checkpoint as the soft-deletes retention MergePolicy depends on it.
29132923
primary.refresh("force refresh");
29142924

29152925
ss = primary.segmentStats(randomBoolean());

server/src/test/java/org/elasticsearch/indices/stats/IndexStatsIT.java

+30-2
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,15 @@
4343
import org.elasticsearch.common.settings.Settings;
4444
import org.elasticsearch.common.xcontent.XContentType;
4545
import org.elasticsearch.index.IndexModule;
46+
import org.elasticsearch.index.IndexService;
4647
import org.elasticsearch.index.IndexSettings;
4748
import org.elasticsearch.index.MergePolicyConfig;
4849
import org.elasticsearch.index.MergeSchedulerConfig;
4950
import org.elasticsearch.index.VersionType;
5051
import org.elasticsearch.index.cache.query.QueryCacheStats;
5152
import org.elasticsearch.index.engine.VersionConflictEngineException;
5253
import org.elasticsearch.index.query.QueryBuilders;
54+
import org.elasticsearch.index.shard.IndexShard;
5355
import org.elasticsearch.index.translog.Translog;
5456
import org.elasticsearch.indices.IndicesQueryCache;
5557
import org.elasticsearch.indices.IndicesRequestCache;
@@ -69,6 +71,7 @@
6971
import java.util.EnumSet;
7072
import java.util.List;
7173
import java.util.Random;
74+
import java.util.Set;
7275
import java.util.concurrent.BrokenBarrierException;
7376
import java.util.concurrent.CopyOnWriteArrayList;
7477
import java.util.concurrent.CountDownLatch;
@@ -1005,10 +1008,15 @@ private void assertCumulativeQueryCacheStats(IndicesStatsResponse response) {
10051008
}
10061009

10071010
public void testFilterCacheStats() throws Exception {
1008-
assertAcked(prepareCreate("index").setSettings(Settings.builder().put(indexSettings()).put("number_of_replicas", 0).build()).get());
1009-
indexRandom(true,
1011+
Settings settings = Settings.builder().put(indexSettings()).put("number_of_replicas", 0).build();
1012+
assertAcked(prepareCreate("index").setSettings(settings).get());
1013+
indexRandom(false, true,
10101014
client().prepareIndex("index", "type", "1").setSource("foo", "bar"),
10111015
client().prepareIndex("index", "type", "2").setSource("foo", "baz"));
1016+
if (IndexSettings.INDEX_SOFT_DELETES_SETTING.get(settings)) {
1017+
persistGlobalCheckpoint("index"); // Need to persist the global checkpoint for the soft-deletes retention MP.
1018+
}
1019+
refresh();
10121020
ensureGreen();
10131021

10141022
IndicesStatsResponse response = client().admin().indices().prepareStats("index").setQueryCache(true).get();
@@ -1039,6 +1047,9 @@ public void testFilterCacheStats() throws Exception {
10391047

10401048
assertEquals(DocWriteResponse.Result.DELETED, client().prepareDelete("index", "type", "1").get().getResult());
10411049
assertEquals(DocWriteResponse.Result.DELETED, client().prepareDelete("index", "type", "2").get().getResult());
1050+
if (IndexSettings.INDEX_SOFT_DELETES_SETTING.get(settings)) {
1051+
persistGlobalCheckpoint("index"); // Need to persist the global checkpoint for the soft-deletes retention MP.
1052+
}
10421053
refresh();
10431054
response = client().admin().indices().prepareStats("index").setQueryCache(true).get();
10441055
assertCumulativeQueryCacheStats(response);
@@ -1172,4 +1183,21 @@ public void testConcurrentIndexingAndStatsRequests() throws BrokenBarrierExcepti
11721183
assertThat(executionFailures.get(), emptyCollectionOf(Exception.class));
11731184
}
11741185

1186+
1187+
/**
1188+
* Persists to disk the global checkpoint on all shards of the given index.
1189+
* This makes sure that the persisted global checkpoint on those shards will be equal to the in-memory value.
1190+
*/
1191+
private void persistGlobalCheckpoint(String index) throws Exception {
1192+
final Set<String> nodes = internalCluster().nodesInclude(index);
1193+
for (String node : nodes) {
1194+
final IndicesService indexServices = internalCluster().getInstance(IndicesService.class, node);
1195+
for (IndexService indexService : indexServices) {
1196+
for (IndexShard indexShard : indexService) {
1197+
indexShard.sync();
1198+
assertThat(indexShard.getLastSyncedGlobalCheckpoint(), equalTo(indexShard.getGlobalCheckpoint()));
1199+
}
1200+
}
1201+
}
1202+
}
11751203
}

0 commit comments

Comments
 (0)