From ced1abab75cd311e8e182822d0873fa6cd6e230e Mon Sep 17 00:00:00 2001
From: Armin Braun <me@obrown.io>
Date: Sun, 27 Oct 2019 15:36:59 +0100
Subject: [PATCH] Fix SnapshotShardStatus Reporting for Failed Shard

Fixes the shard snapshot status reporting for failed shards
in the corner case of failing the shard because of an exception
thrown in `SnapshotShardsService` and not the repository.
We were missing the update on the `snapshotStatus` instance in
this case which made the transport APIs using this field report
back an incorrect status.
Fixed by moving the failure handling to the `SnapshotShardsService`
for all cases (which also simplifies the code, the ex. wrapping in
the repository was pointless as we only used the ex. trace upstream
anyway).
Also, added an assertion to another test that explicitly checks this
failure situation (ex. in the `SnapshotShardsService`) already.

Closes #48526
---
 .../repositories/blobstore/BlobStoreRepository.java   | 11 +++--------
 .../snapshots/SnapshotShardsService.java              |  4 +++-
 .../snapshots/DedicatedClusterSnapshotRestoreIT.java  |  6 ++++++
 3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
index e4626578864a8..9c1c24a35fbe4 100644
--- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
+++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
@@ -31,7 +31,6 @@
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RateLimiter;
 import org.apache.lucene.util.SetOnce;
-import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.ActionRunnable;
 import org.elasticsearch.action.StepListener;
@@ -1042,10 +1041,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                               ActionListener<String> listener) {
         final ShardId shardId = store.shardId();
         final long startTime = threadPool.absoluteTimeInMillis();
-        final ActionListener<String> snapshotDoneListener = ActionListener.wrap(listener::onResponse, e -> {
-            snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), ExceptionsHelper.stackTrace(e));
-            listener.onFailure(e instanceof IndexShardSnapshotFailedException ? e : new IndexShardSnapshotFailedException(shardId, e));
-        });
         try {
             final String generation = snapshotStatus.generation();
             logger.debug("[{}] [{}] snapshot to [{}] [{}] ...", shardId, snapshotId, metadata.name(), generation);
@@ -1191,8 +1186,8 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                     }
                 }
                 snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis(), indexGeneration);
-                snapshotDoneListener.onResponse(indexGeneration);
-            }, snapshotDoneListener::onFailure);
+                listener.onResponse(indexGeneration);
+            }, listener::onFailure);
             if (indexIncrementalFileCount == 0) {
                 allFilesUploadedListener.onResponse(Collections.emptyList());
                 return;
@@ -1222,7 +1217,7 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
                 }));
             }
         } catch (Exception e) {
-            snapshotDoneListener.onFailure(e);
+            listener.onFailure(e);
         }
     }
 
diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
index 9b256ecaa077c..96bc4969d9d91 100644
--- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
+++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java
@@ -298,8 +298,10 @@ public void onResponse(String newGeneration) {
 
                     @Override
                     public void onFailure(Exception e) {
+                        final String failure = ExceptionsHelper.stackTrace(e);
+                        snapshotStatus.moveToFailed(threadPool.absoluteTimeInMillis(), failure);
                         logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to snapshot shard", shardId, snapshot), e);
-                        notifyFailedSnapshotShard(snapshot, shardId, ExceptionsHelper.stackTrace(e));
+                        notifyFailedSnapshotShard(snapshot, shardId, failure);
                     }
                 });
             }
diff --git a/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java b/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
index 5dc55ce3b2bba..a11fb9d13bc04 100644
--- a/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
+++ b/server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java
@@ -1219,6 +1219,12 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
         disruption.startDisrupting();
         logger.info("-->  restarting data node, which should cause primary shards to be failed");
         internalCluster().restartNode(dataNode, InternalTestCluster.EMPTY_CALLBACK);
+
+        logger.info("-->  wait for shard snapshots to show as failed");
+        assertBusy(() -> assertThat(
+            client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
+                .get(0).getShardsStats().getFailedShards(), greaterThanOrEqualTo(1)), 60L, TimeUnit.SECONDS);
+
         unblockNode("test-repo", dataNode);
         disruption.stopDisrupting();
         // check that snapshot completes