Skip to content

Commit 77720e8

Browse files
authored
Reset starting seqno if we fail to read the last commit (#45106)
Previously, if the metadata snapshot was empty (either because no commit was found or because an error occurred), we did not compute the starting sequence number and used -2 to opt out of operation-based recovery. Since #43463, we have a starting sequence number before reading the last commit. Thus, we need to reset it if we fail to snapshot the store. Closes #45072
1 parent 5322b00 commit 77720e8

File tree

2 files changed

+38
-20
lines changed

2 files changed

+38
-20
lines changed

server/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java

+18-20
Original file line numberDiff line numberDiff line change
@@ -309,25 +309,6 @@ public RecoveryResponse read(StreamInput in) throws IOException {
309309
}
310310
}
311311

312-
/**
313-
* Obtains a snapshot of the store metadata for the recovery target.
314-
*
315-
* @param recoveryTarget the target of the recovery
316-
* @return a snapshot of the store metadata
317-
*/
318-
private static Store.MetadataSnapshot getStoreMetadataSnapshot(final Logger logger, final RecoveryTarget recoveryTarget) {
319-
try {
320-
return recoveryTarget.indexShard().snapshotStoreMetadata();
321-
} catch (final org.apache.lucene.index.IndexNotFoundException e) {
322-
// happens on an empty folder. no need to log
323-
logger.trace("{} shard folder empty, recovering all files", recoveryTarget);
324-
return Store.MetadataSnapshot.EMPTY;
325-
} catch (final IOException e) {
326-
logger.warn("error while listing local files, recovering as if there are none", e);
327-
return Store.MetadataSnapshot.EMPTY;
328-
}
329-
}
330-
331312
/**
332313
* Prepare the start recovery request.
333314
*
@@ -343,7 +324,24 @@ public static StartRecoveryRequest getStartRecoveryRequest(Logger logger, Discov
343324
final StartRecoveryRequest request;
344325
logger.trace("{} collecting local files for [{}]", recoveryTarget.shardId(), recoveryTarget.sourceNode());
345326

346-
final Store.MetadataSnapshot metadataSnapshot = getStoreMetadataSnapshot(logger, recoveryTarget);
327+
Store.MetadataSnapshot metadataSnapshot;
328+
try {
329+
metadataSnapshot = recoveryTarget.indexShard().snapshotStoreMetadata();
330+
} catch (final org.apache.lucene.index.IndexNotFoundException e) {
331+
// happens on an empty folder. no need to log
332+
assert startingSeqNo == UNASSIGNED_SEQ_NO : startingSeqNo;
333+
logger.trace("{} shard folder empty, recovering all files", recoveryTarget);
334+
metadataSnapshot = Store.MetadataSnapshot.EMPTY;
335+
} catch (final IOException e) {
336+
if (startingSeqNo != UNASSIGNED_SEQ_NO) {
337+
logger.warn(new ParameterizedMessage("error while listing local files, resetting the starting sequence number from {} " +
338+
"to unassigned and recovering as if there are none", startingSeqNo), e);
339+
startingSeqNo = UNASSIGNED_SEQ_NO;
340+
} else {
341+
logger.warn("error while listing local files, recovering as if there are none", e);
342+
}
343+
metadataSnapshot = Store.MetadataSnapshot.EMPTY;
344+
}
347345
logger.trace("{} local file count [{}]", recoveryTarget.shardId(), metadataSnapshot.size());
348346
request = new StartRecoveryRequest(
349347
recoveryTarget.shardId(),

server/src/test/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetServiceTests.java

+20
Original file line numberDiff line numberDiff line change
@@ -258,4 +258,24 @@ public void testClosedIndexSkipsLocalRecovery() throws Exception {
258258
assertThat(replica.getLastKnownGlobalCheckpoint(), equalTo(UNASSIGNED_SEQ_NO));
259259
closeShards(replica);
260260
}
261+
262+
public void testResetStartingSeqNoIfLastCommitCorrupted() throws Exception {
263+
IndexShard shard = newStartedShard(false);
264+
populateRandomData(shard);
265+
DiscoveryNode pNode = new DiscoveryNode("foo", buildNewFakeTransportAddress(),
266+
Collections.emptyMap(), Collections.emptySet(), Version.CURRENT);
267+
DiscoveryNode rNode = new DiscoveryNode("foo", buildNewFakeTransportAddress(),
268+
Collections.emptyMap(), Collections.emptySet(), Version.CURRENT);
269+
shard = reinitShard(shard, ShardRoutingHelper.initWithSameId(shard.routingEntry(), RecoverySource.PeerRecoverySource.INSTANCE));
270+
shard.markAsRecovering("peer recovery", new RecoveryState(shard.routingEntry(), pNode, rNode));
271+
shard.prepareForIndexRecovery();
272+
long startingSeqNo = shard.recoverLocallyUpToGlobalCheckpoint();
273+
shard.store().markStoreCorrupted(new IOException("simulated"));
274+
RecoveryTarget recoveryTarget = new RecoveryTarget(shard, null, null);
275+
StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(logger, rNode, recoveryTarget, startingSeqNo);
276+
assertThat(request.startingSeqNo(), equalTo(UNASSIGNED_SEQ_NO));
277+
assertThat(request.metadataSnapshot().size(), equalTo(0));
278+
recoveryTarget.decRef();
279+
closeShards(shard);
280+
}
261281
}

0 commit comments

Comments
 (0)