@@ -1236,6 +1236,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
1236
1236
}, 60L , TimeUnit .SECONDS );
1237
1237
}
1238
1238
1239
+ public void testDataNodeRestartAfterShardSnapshotFailure () throws Exception {
1240
+ logger .info ("--> starting a master node and two data nodes" );
1241
+ internalCluster ().startMasterOnlyNode ();
1242
+ final List <String > dataNodes = internalCluster ().startDataOnlyNodes (2 );
1243
+ logger .info ("--> creating repository" );
1244
+ assertAcked (client ().admin ().cluster ().preparePutRepository ("test-repo" )
1245
+ .setType ("mock" ).setSettings (Settings .builder ()
1246
+ .put ("location" , randomRepoPath ())
1247
+ .put ("compress" , randomBoolean ())
1248
+ .put ("chunk_size" , randomIntBetween (100 , 1000 ), ByteSizeUnit .BYTES )));
1249
+ assertAcked (prepareCreate ("test-idx" , 0 , Settings .builder ()
1250
+ .put ("number_of_shards" , 2 ).put ("number_of_replicas" , 0 )));
1251
+ ensureGreen ();
1252
+ logger .info ("--> indexing some data" );
1253
+ final int numdocs = randomIntBetween (50 , 100 );
1254
+ IndexRequestBuilder [] builders = new IndexRequestBuilder [numdocs ];
1255
+ for (int i = 0 ; i < builders .length ; i ++) {
1256
+ builders [i ] = client ().prepareIndex ("test-idx" , "type1" ,
1257
+ Integer .toString (i )).setSource ("field1" , "bar " + i );
1258
+ }
1259
+ indexRandom (true , builders );
1260
+ flushAndRefresh ();
1261
+ blockAllDataNodes ("test-repo" );
1262
+ logger .info ("--> snapshot" );
1263
+ client (internalCluster ().getMasterName ()).admin ().cluster ()
1264
+ .prepareCreateSnapshot ("test-repo" , "test-snap" ).setWaitForCompletion (false ).setIndices ("test-idx" ).get ();
1265
+ logger .info ("--> restarting first data node, which should cause the primary shard on it to be failed" );
1266
+ internalCluster ().restartNode (dataNodes .get (0 ), InternalTestCluster .EMPTY_CALLBACK );
1267
+
1268
+ logger .info ("--> wait for shard snapshot of first primary to show as failed" );
1269
+ assertBusy (() -> assertThat (
1270
+ client ().admin ().cluster ().prepareSnapshotStatus ("test-repo" ).setSnapshots ("test-snap" ).get ().getSnapshots ()
1271
+ .get (0 ).getShardsStats ().getFailedShards (), is (1 )), 60L , TimeUnit .SECONDS );
1272
+
1273
+ logger .info ("--> restarting second data node, which should cause the primary shard on it to be failed" );
1274
+ internalCluster ().restartNode (dataNodes .get (1 ), InternalTestCluster .EMPTY_CALLBACK );
1275
+
1276
+ // check that snapshot completes with both failed shards being accounted for in the snapshot result
1277
+ assertBusy (() -> {
1278
+ GetSnapshotsResponse snapshotsStatusResponse = client ().admin ().cluster ()
1279
+ .prepareGetSnapshots ("test-repo" ).setSnapshots ("test-snap" ).setIgnoreUnavailable (true ).get ();
1280
+ assertEquals (1 , snapshotsStatusResponse .getSnapshots ().size ());
1281
+ SnapshotInfo snapshotInfo = snapshotsStatusResponse .getSnapshots ().get (0 );
1282
+ assertTrue (snapshotInfo .state ().toString (), snapshotInfo .state ().completed ());
1283
+ assertThat (snapshotInfo .totalShards (), is (2 ));
1284
+ assertThat (snapshotInfo .shardFailures (), hasSize (2 ));
1285
+ }, 60L , TimeUnit .SECONDS );
1286
+ }
1287
+
1239
1288
public void testRetentionLeasesClearedOnRestore () throws Exception {
1240
1289
final String repoName = "test-repo-retention-leases" ;
1241
1290
assertAcked (client ().admin ().cluster ().preparePutRepository (repoName )
0 commit comments