Commit e5f3a62

Fix missing historyUUID in peer recovery during rolling upgrades from 5.x to 6.3 (#31506)
Today we make sure that a 5.x index commit has all required commit tags in the RecoveryTarget#cleanFiles method. We do the check in RecoveryTarget#cleanFiles because the tags are only needed in a file-based recovery, and we assume that #cleanFiles is always called during a file-based recovery. However, this assumption does not hold if the index is sealed (i.e. synced-flushed). That incorrect assumption would prevent users from performing a rolling upgrade from 5.x to 6.3 if their index was sealed.

Closes #31482
1 parent dae520f commit e5f3a62
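For context, an index is sealed through the synced-flush API that the new tests below exercise. The following is a minimal sketch, not part of this commit, of issuing a synced flush with the Elasticsearch low-level REST client and inspecting the per-shard result; the host, port, and index name are illustrative assumptions.

import org.apache.http.HttpHost;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class SyncedFlushExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical host/port and index name, for illustration only.
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            // Seal the index: a successful synced flush records a sync id in every active shard copy.
            Response resp = client.performRequest("POST", "/recover_synced_flush_index/_flush/synced");
            // The response body contains a "_shards" object with "total", "successful" and "failed"
            // counts, which the tests in this commit assert on.
            System.out.println(resp.getStatusLine());
        }
    }
}

The tests spin this request inside assertBusy because an in-flight global checkpoint sync holds a shard permit and can make an individual synced-flush attempt report a failed copy.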

File tree: 3 files changed (+43, −5 lines)

qa/full-cluster-restart/src/test/java/org/elasticsearch/upgrades/FullClusterRestartIT.java

+15 −2
@@ -713,8 +713,21 @@ public void testRecovery() throws Exception {
 
         // make sure all recoveries are done
         ensureGreen(index);
-        // Explicitly flush so we're sure to have a bunch of documents in the Lucene index
-        client().performRequest("POST", "/_flush");
+        // Recovering a synced-flush index from 5.x to 6.x might be subtle as a 5.x index commit does not have all 6.x commit tags.
+        if (randomBoolean()) {
+            // We have to spin synced-flush requests here because we fire the global checkpoint sync for the last write operation.
+            // A synced-flush request considers the global checkpoint sync as an ongoing operation because it acquires a shard permit.
+            assertBusy(() -> {
+                Response resp = client().performRequest("POST", index + "/_flush/synced");
+                assertOK(resp);
+                Map<String, Object> result = ObjectPath.createFromResponse(resp).evaluate("_shards");
+                assertThat(result.get("successful"), equalTo(result.get("total")));
+                assertThat(result.get("failed"), equalTo(0));
+            });
+        } else {
+            // Explicitly flush so we're sure to have a bunch of documents in the Lucene index
+            assertOK(client().performRequest("POST", "/_flush"));
+        }
         if (shouldHaveTranslog) {
             // Update a few documents so we are sure to have a translog
             indexRandomDocuments(count / 10, false /* Flushing here would invalidate the whole thing....*/, false,

qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/RecoveryIT.java

+25
@@ -283,4 +283,29 @@ public void testSearchGeoPoints() throws Exception {
         }
     }
 
+    public void testRecoverSyncedFlushIndex() throws Exception {
+        final String index = "recover_synced_flush_index";
+        if (CLUSTER_TYPE == ClusterType.OLD) {
+            Settings.Builder settings = Settings.builder()
+                .put(IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1)
+                .put(IndexMetaData.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 1)
+                // if the node with the replica is the first to be restarted, while a replica is still recovering
+                // then delayed allocation will kick in. When the node comes back, the master will search for a copy
+                // but the recovering copy will be seen as invalid and the cluster health won't return to GREEN
+                // before timing out
+                .put(INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "100ms")
+                .put(SETTING_ALLOCATION_MAX_RETRY.getKey(), "0"); // fail faster
+            createIndex(index, settings.build());
+            indexDocs(index, 0, randomInt(5));
+            // We have to spin synced-flush requests here because we fire the global checkpoint sync for the last write operation.
+            // A synced-flush request considers the global checkpoint sync as an ongoing operation because it acquires a shard permit.
+            assertBusy(() -> {
+                Response resp = client().performRequest("POST", index + "/_flush/synced");
+                assertOK(resp);
+                Map<String, Object> result = ObjectPath.createFromResponse(resp).evaluate("_shards");
+                assertThat(result.get("successful"), equalTo(2));
+            });
+        }
+        ensureGreen(index);
+    }
 }

server/src/main/java/org/elasticsearch/indices/recovery/RecoveryTarget.java

+3 −3
@@ -362,6 +362,9 @@ private void ensureRefCount() {
 
     @Override
     public void prepareForTranslogOperations(boolean fileBasedRecovery, int totalTranslogOps) throws IOException {
+        if (fileBasedRecovery && indexShard.indexSettings().getIndexVersionCreated().before(Version.V_6_0_0)) {
+            store.ensureIndexHas6xCommitTags();
+        }
         state().getTranslog().totalOperations(totalTranslogOps);
         indexShard().openEngineAndSkipTranslogRecovery();
     }
@@ -438,9 +441,6 @@ public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaData
         store.incRef();
         try {
             store.cleanupAndVerify("recovery CleanFilesRequestHandler", sourceMetaData);
-            if (indexShard.indexSettings().getIndexVersionCreated().before(Version.V_6_0_0_rc1)) {
-                store.ensureIndexHas6xCommitTags();
-            }
             // TODO: Assign the global checkpoint to the max_seqno of the safe commit if the index version >= 6.2
             final String translogUUID = Translog.createEmptyTranslog(
                 indexShard.shardPath().resolveTranslog(), SequenceNumbers.UNASSIGNED_SEQ_NO, shardId, indexShard.getPrimaryTerm());
