
Commit 752fa87

Fix SnapshotShardStatus Reporting for Failed Shard (#48556)
Fixes the shard snapshot status reporting for failed shards in the corner case where the shard fails because of an exception thrown in `SnapshotShardsService` rather than in the repository. In this case we were missing the update on the `snapshotStatus` instance, so the transport APIs that read this field reported an incorrect status. Fixed by moving the failure handling to `SnapshotShardsService` for all cases, which also simplifies the code: the exception wrapping in the repository was pointless since only the exception's stack trace is used upstream anyway. Also added an assertion to a test that already exercises this failure situation (an exception thrown in `SnapshotShardsService`) explicitly.

Closes #48526
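The core of the fix, shown in the diffs below, is that the shard-level snapshot status is now moved to failed by the caller (`SnapshotShardsService`) in its `onFailure` handler, so the status is updated regardless of whether the exception originated in the repository or in the service itself. A minimal, self-contained sketch of that pattern (simplified model classes, not the actual Elasticsearch types):

```java
import java.util.concurrent.atomic.AtomicReference;

// Simplified stand-ins for IndexShardSnapshotStatus and the shard snapshot callback;
// names and shapes here are assumptions for illustration, not the real Elasticsearch API.
class ShardSnapshotSketch {

    enum Stage { STARTED, DONE, FAILURE }

    static final class ShardStatus {
        final AtomicReference<Stage> stage = new AtomicReference<>(Stage.STARTED);
        volatile String failureReason;

        void moveToDone() { stage.set(Stage.DONE); }

        void moveToFailed(String reason) {  // status APIs read this, so it must always be set on failure
            failureReason = reason;
            stage.set(Stage.FAILURE);
        }
    }

    interface Listener { void onResponse(String generation); void onFailure(Exception e); }

    // The service-level handler: record the failure on the status first, then notify the master.
    static Listener shardSnapshotListener(ShardStatus status, Runnable notifyMaster) {
        return new Listener() {
            @Override public void onResponse(String generation) { status.moveToDone(); }
            @Override public void onFailure(Exception e) {
                status.moveToFailed(e.toString()); // previously skipped when the failure happened outside the repository
                notifyMaster.run();
            }
        };
    }

    public static void main(String[] args) {
        ShardStatus status = new ShardStatus();
        Listener listener = shardSnapshotListener(status, () -> System.out.println("notified master"));
        listener.onFailure(new IllegalStateException("shard closed"));
        System.out.println(status.stage.get() + " / " + status.failureReason); // FAILURE / ...
    }
}
```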

3 files changed (+12, -9 lines)

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 3 additions & 8 deletions
@@ -31,7 +31,6 @@
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RateLimiter;
 import org.apache.lucene.util.SetOnce;
-import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.ActionRunnable;
 import org.elasticsearch.action.StepListener;
@@ -1042,10 +1041,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                               ActionListener<String> listener) {
         final ShardId shardId = store.shardId();
         final long startTime = threadPool.absoluteTimeInMillis();
-        final ActionListener<String> snapshotDoneListener = ActionListener.wrap(listener::onResponse, e -> {
-            snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), ExceptionsHelper.stackTrace(e));
-            listener.onFailure(e instanceof IndexShardSnapshotFailedException ? e : new IndexShardSnapshotFailedException(shardId, e));
-        });
         try {
             final String generation = snapshotStatus.generation();
             logger.debug("[{}] [{}] snapshot to [{}] [{}] ...", shardId, snapshotId, metadata.name(), generation);
@@ -1191,8 +1186,8 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                 }
             }
             snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis(), indexGeneration);
-            snapshotDoneListener.onResponse(indexGeneration);
-        }, snapshotDoneListener::onFailure);
+            listener.onResponse(indexGeneration);
+        }, listener::onFailure);
         if (indexIncrementalFileCount == 0) {
             allFilesUploadedListener.onResponse(Collections.emptyList());
             return;
@@ -1222,7 +1217,7 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
             }));
         }
     } catch (Exception e) {
-        snapshotDoneListener.onFailure(e);
+        listener.onFailure(e);
     }
 }
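The repository change above simply drops the intermediate `snapshotDoneListener` and completes the caller's `listener` directly; the wrapper's only extra work (moving the status to failed and re-wrapping the exception) is now either done by the caller or unused. A rough sketch of the wrap-and-delegate pattern involved (simplified `Listener`/`wrap` stand-ins, not the real `ActionListener` API):

```java
import java.util.function.Consumer;

// Simplified stand-ins for ActionListener and ActionListener.wrap(...); not the real Elasticsearch API.
final class ListenerWrapSketch {

    interface Listener<T> {
        void onResponse(T value);
        void onFailure(Exception e);
    }

    // Build a listener from two callbacks, in the spirit of ActionListener.wrap(onResponse, onFailure).
    static <T> Listener<T> wrap(Consumer<T> onResponse, Consumer<Exception> onFailure) {
        return new Listener<T>() {
            @Override public void onResponse(T value) { onResponse.accept(value); }
            @Override public void onFailure(Exception e) { onFailure.accept(e); }
        };
    }

    public static void main(String[] args) {
        // The caller's listener, as now completed directly by the repository.
        Listener<String> caller = wrap(
            generation -> System.out.println("shard snapshot done, generation " + generation),
            e -> System.out.println("shard snapshot failed: " + e.getMessage()));

        // Before this change, failures first went through an intermediate listener that also
        // moved the status to failed; that bookkeeping now lives in SnapshotShardsService.
        caller.onFailure(new Exception("simulated I/O error"));
    }
}
```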

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 3 additions & 1 deletion
@@ -298,8 +298,10 @@ public void onResponse(String newGeneration) {

                @Override
                public void onFailure(Exception e) {
+                   final String failure = ExceptionsHelper.stackTrace(e);
+                   snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
                    logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
-                   notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.stackTrace(e));
+                   notifyFailedSnapshotShard(snapshot, shardId, failure);
                }
            });
        }
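In the handler above, the failure string recorded on the status and sent to the master is the exception's full stack trace (`ExceptionsHelper.stackTrace(e)`), which is why the extra `IndexShardSnapshotFailedException` wrapping in the repository added nothing. A plain-JDK sketch of rendering an exception as such a string (standard `StringWriter`/`PrintWriter` usage, not the Elasticsearch helper itself):

```java
import java.io.PrintWriter;
import java.io.StringWriter;

final class StackTraceSketch {

    // Equivalent in spirit to ExceptionsHelper.stackTrace(e): render the full trace as a String,
    // which is what ends up on the shard status and in the master notification on failure.
    static String stackTrace(Throwable t) {
        StringWriter out = new StringWriter();
        t.printStackTrace(new PrintWriter(out, true));
        return out.toString();
    }

    public static void main(String[] args) {
        String failure = stackTrace(new IllegalStateException("shard closed during snapshot"));
        System.out.println(failure.split(System.lineSeparator())[0]); // first line of the recorded failure
    }
}
```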

server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java

Lines changed: 6 additions & 0 deletions
@@ -1219,6 +1219,12 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
         disruption.startDisrupting();
         logger.info("--> restarting data node, which should cause primary shards to be failed");
         internalCluster().restartNode(dataNode, InternalTestCluster.EMPTY_CALLBACK);
+
+        logger.info("--> wait for shard snapshots to show as failed");
+        assertBusy(() -> assertThat(
+            client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
+                .get(0).getShardsStats().getFailedShards(), greaterThanOrEqualTo(1)), 60L, TimeUnit.SECONDS);
+
         unblockNode("test-repo", dataNode);
         disruption.stopDisrupting();
         // check that snapshot completes
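The new assertion uses `assertBusy` (from the test framework's `ESTestCase`), which keeps retrying the assertion until it passes or the timeout elapses, since the failed-shard count only becomes visible once the cluster state and status responses have propagated. A minimal sketch of such a retry-until-timeout helper (simplified, not the actual `ESTestCase` implementation):

```java
import java.util.concurrent.TimeUnit;

final class AssertBusySketch {

    // Retry the assertion until it stops throwing or the timeout elapses; rethrow the last failure.
    static void assertBusy(Runnable assertion, long timeout, TimeUnit unit) throws InterruptedException {
        long deadline = System.nanoTime() + unit.toNanos(timeout);
        AssertionError last = new AssertionError("timed out");
        do {
            try {
                assertion.run();
                return;                  // assertion finally passed
            } catch (AssertionError e) {
                last = e;
                Thread.sleep(100);       // back off briefly before the next attempt
            }
        } while (System.nanoTime() < deadline);
        throw last;
    }

    public static void main(String[] args) throws InterruptedException {
        long start = System.currentTimeMillis();
        assertBusy(() -> {
            if (System.currentTimeMillis() - start < 500) {
                throw new AssertionError("failed shards not visible yet");
            }
        }, 60L, TimeUnit.SECONDS);
        System.out.println("condition eventually held");
    }
}
```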
