Skip to content

Commit 8b1cdd7

Browse files
Fix SnapshotShardStatus Reporting for Failed Shard (#48556) (#48689)
Fixes the shard snapshot status reporting for failed shards in the corner case of failing the shard because of an exception thrown in `SnapshotShardsService` and not the repository. We were missing the update on the `snapshotStatus` instance in this case which made the transport APIs using this field report back an incorrect status. Fixed by moving the failure handling to the `SnapshotShardsService` for all cases (which also simplifies the code, the ex. wrapping in the repository was pointless as we only used the ex. trace upstream anyway). Also, added an assertion to another test that explicitly checks this failure situation (ex. in the `SnapshotShardsService`) already. Closes #48526
1 parent 8c97129 commit 8b1cdd7

File tree

3 files changed

+12
-11
lines changed

3 files changed

+12
-11
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
import org.apache.lucene.store.IndexInput;
2828
import org.apache.lucene.store.RateLimiter;
2929
import org.apache.lucene.util.SetOnce;
30-
import org.elasticsearch.ExceptionsHelper;
3130
import org.elasticsearch.Version;
3231
import org.elasticsearch.action.ActionListener;
3332
import org.elasticsearch.action.ActionRunnable;
@@ -995,12 +994,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
995994
IndexCommit snapshotIndexCommit, IndexShardSnapshotStatus snapshotStatus, ActionListener<Void> listener) {
996995
final ShardId shardId = store.shardId();
997996
final long startTime = threadPool.absoluteTimeInMillis();
998-
final StepListener<Void> snapshotDoneListener = new StepListener<>();
999-
snapshotDoneListener.whenComplete(listener::onResponse, e -> {
1000-
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), ExceptionsHelper.detailedMessage(e));
1001-
listener.onFailure(e instanceof IndexShardSnapshotFailedException ? (IndexShardSnapshotFailedException) e
1002-
: new IndexShardSnapshotFailedException(store.shardId(), e));
1003-
});
1004997
try {
1005998
logger.debug("[{}] [{}] snapshot to [{}] ...", shardId, snapshotId, metadata.name());
1006999

@@ -1137,8 +1130,8 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
11371130
snapshotId, shardId), e);
11381131
}
11391132
snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis());
1140-
snapshotDoneListener.onResponse(null);
1141-
}, snapshotDoneListener::onFailure);
1133+
listener.onResponse(null);
1134+
}, listener::onFailure);
11421135
if (indexIncrementalFileCount == 0) {
11431136
allFilesUploadedListener.onResponse(Collections.emptyList());
11441137
return;
@@ -1181,7 +1174,7 @@ public void onFailure(Exception e) {
11811174
});
11821175
}
11831176
} catch (Exception e) {
1184-
snapshotDoneListener.onFailure(e);
1177+
listener.onFailure(e);
11851178
}
11861179
}
11871180

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,8 +292,10 @@ public void onResponse(final Void aVoid) {
292292

293293
@Override
294294
public void onFailure(Exception e) {
295+
final String failure = ExceptionsHelper.detailedMessage(e);
296+
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
295297
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
296-
notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.detailedMessage(e));
298+
notifyFailedSnapshotShard(snapshot, shardId, failure);
297299
}
298300
});
299301
}

server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,6 +1224,12 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
12241224
disruption.startDisrupting();
12251225
logger.info("--> restarting data node, which should cause primary shards to be failed");
12261226
internalCluster().restartNode(dataNode, InternalTestCluster.EMPTY_CALLBACK);
1227+
1228+
logger.info("--> wait for shard snapshots to show as failed");
1229+
assertBusy(() -> assertThat(
1230+
client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
1231+
.get(0).getShardsStats().getFailedShards(), greaterThanOrEqualTo(1)), 60L, TimeUnit.SECONDS);
1232+
12271233
unblockNode("test-repo", dataNode);
12281234
disruption.stopDisrupting();
12291235
// check that snapshot completes

0 commit comments

Comments
 (0)