Skip to content

Commit 3603970

Browse files
Fix SnapshotShardStatus Reporting for Failed Shard (#48556) (#48687)
Fixes the shard snapshot status reporting for failed shards in the corner case where the shard is failed because of an exception thrown in `SnapshotShardsService` rather than in the repository. In this case we were missing the update on the `snapshotStatus` instance, which made the transport APIs that use this field report back an incorrect status. Fixed by moving the failure handling to the `SnapshotShardsService` for all cases (which also simplifies the code; the exception wrapping in the repository was pointless, as we only used the exception's stack trace upstream anyway). Also added an assertion to another test that already explicitly checks this failure situation (an exception thrown in the `SnapshotShardsService`). Closes #48526
1 parent 52e5ceb commit 3603970

File tree

3 files changed

+12
-9
lines changed

3 files changed

+12
-9
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
import org.apache.lucene.store.IndexOutput;
3232
import org.apache.lucene.store.RateLimiter;
3333
import org.apache.lucene.util.SetOnce;
34-
import org.elasticsearch.ExceptionsHelper;
3534
import org.elasticsearch.action.ActionListener;
3635
import org.elasticsearch.action.ActionRunnable;
3736
import org.elasticsearch.action.StepListener;
@@ -1056,10 +1055,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
10561055
ActionListener<String> listener) {
10571056
final ShardId shardId = store.shardId();
10581057
final long startTime = threadPool.absoluteTimeInMillis();
1059-
final ActionListener<String> snapshotDoneListener = ActionListener.wrap(listener::onResponse, e -> {
1060-
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), ExceptionsHelper.stackTrace(e));
1061-
listener.onFailure(e instanceof IndexShardSnapshotFailedException ? e : new IndexShardSnapshotFailedException(shardId, e));
1062-
});
10631058
try {
10641059
final String generation = snapshotStatus.generation();
10651060
logger.debug("[{}] [{}] snapshot to [{}] [{}] ...", shardId, snapshotId, metadata.name(), generation);
@@ -1205,8 +1200,8 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
12051200
}
12061201
}
12071202
snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis(), indexGeneration);
1208-
snapshotDoneListener.onResponse(indexGeneration);
1209-
}, snapshotDoneListener::onFailure);
1203+
listener.onResponse(indexGeneration);
1204+
}, listener::onFailure);
12101205
if (indexIncrementalFileCount == 0) {
12111206
allFilesUploadedListener.onResponse(Collections.emptyList());
12121207
return;
@@ -1232,7 +1227,7 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
12321227
}));
12331228
}
12341229
} catch (Exception e) {
1235-
snapshotDoneListener.onFailure(e);
1230+
listener.onFailure(e);
12361231
}
12371232
}
12381233

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,10 @@ public void onResponse(String newGeneration) {
297297

298298
@Override
299299
public void onFailure(Exception e) {
300+
final String failure = ExceptionsHelper.stackTrace(e);
301+
snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
300302
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
301-
notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.stackTrace(e));
303+
notifyFailedSnapshotShard(snapshot, shardId, failure);
302304
}
303305
});
304306
}

server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,6 +1224,12 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
12241224
disruption.startDisrupting();
12251225
logger.info("--> restarting data node, which should cause primary shards to be failed");
12261226
internalCluster().restartNode(dataNode, InternalTestCluster.EMPTY_CALLBACK);
1227+
1228+
logger.info("--> wait for shard snapshots to show as failed");
1229+
assertBusy(() -> assertThat(
1230+
client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
1231+
.get(0).getShardsStats().getFailedShards(), greaterThanOrEqualTo(1)), 60L, TimeUnit.SECONDS);
1232+
12271233
unblockNode("test-repo", dataNode);
12281234
disruption.stopDisrupting();
12291235
// check that snapshot completes

0 commit comments

Comments (0)