Fix SnapshotShardStatus Reporting for Failed Shard (#48556) (#48689)

original-brownbear · web-flow · commit 8b1cdd7bf466 · 2019-10-30T21:00:53.000+01:00
Fixes the shard snapshot status reporting for failed shards in the corner case of failing the shard because of an exception thrown in `SnapshotShardsService` and not the repository. We were missing the update on the `snapshotStatus` instance in this case which made the transport APIs using this field report back an incorrect status. Fixed by moving the failure handling to the `SnapshotShardsService` for all cases (which also simplifies the code, the ex. wrapping in the repository was pointless as we only used the ex. trace upstream anyway). Also, added an assertion to another test that explicitly checks this failure situation (ex. in the `SnapshotShardsService`) already. Closes #48526
diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
@@ -27,7 +27,6 @@
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RateLimiter;
 import org.apache.lucene.util.SetOnce;
-import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.ActionRunnable;
@@ -995,12 +994,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                               IndexCommit snapshotIndexCommit, IndexShardSnapshotStatus snapshotStatus, ActionListener<Void> listener) {
         final ShardId shardId = store.shardId();
         final long startTime = threadPool.absoluteTimeInMillis();
-        final StepListener<Void> snapshotDoneListener = new StepListener<>();
-        snapshotDoneListener.whenComplete(listener::onResponse, e -> {
-            snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), ExceptionsHelper.detailedMessage(e));
-            listener.onFailure(e instanceof IndexShardSnapshotFailedException ? (IndexShardSnapshotFailedException) e
-                : new IndexShardSnapshotFailedException(store.shardId(), e));
-        });
         try {
             logger.debug("[{}] [{}] snapshot to [{}] ...", shardId, snapshotId, metadata.name());
 
@@ -1137,8 +1130,8 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                         snapshotId, shardId), e);
                 }
                 snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis());
-                snapshotDoneListener.onResponse(null);
-            }, snapshotDoneListener::onFailure);
+                listener.onResponse(null);
+            }, listener::onFailure);
             if (indexIncrementalFileCount == 0) {
                 allFilesUploadedListener.onResponse(Collections.emptyList());
                 return;
@@ -1181,7 +1174,7 @@ public void onFailure(Exception e) {
                 });
             }
         } catch (Exception e) {
-            snapshotDoneListener.onFailure(e);
+            listener.onFailure(e);
         }
     }
 
diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
@@ -292,8 +292,10 @@ public void onResponse(final Void aVoid) {
 
                     @Override
                     public void onFailure(Exception e) {
+                        final String failure = ExceptionsHelper.detailedMessage(e);
+                        snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
                         logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
-                        notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.detailedMessage(e));
+                        notifyFailedSnapshotShard(snapshot, shardId, failure);
                     }
                 });
             }
diff --git a/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java b/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
@@ -1224,6 +1224,12 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
         disruption.startDisrupting();
         logger.info("-->  restarting data node, which should cause primary shards to be failed");
         internalCluster().restartNode(dataNode, InternalTestCluster.EMPTY_CALLBACK);
+
+        logger.info("-->  wait for shard snapshots to show as failed");
+        assertBusy(() -> assertThat(
+            client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
+                .get(0).getShardsStats().getFailedShards(), greaterThanOrEqualTo(1)), 60L, TimeUnit.SECONDS);
+
         unblockNode("test-repo", dataNode);
         disruption.stopDisrupting();
         // check that snapshot completes

Original file line number	Diff line number	Diff line change
`@@ -292,8 +292,10 @@ public void onResponse(final Void aVoid) {`
`292`	`292`
`293`	`293`	`@Override`
`294`	`294`	`public void onFailure(Exception e) {`
	`295`	`+ final String failure = ExceptionsHelper.detailedMessage(e);`
	`296`	`+ snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);`
`295`	`297`	`logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);`
`296`		`- notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.detailedMessage(e));`
	`298`	`+ notifyFailedSnapshotShard(snapshot, shardId, failure);`
`297`	`299`	`}`
`298`	`300`	`});`
`299`	`301`	`}`