@@ -1234,6 +1234,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
         }, 60L, TimeUnit.SECONDS);
     }
 
+    public void testDataNodeRestartAfterShardSnapshotFailure() throws Exception {
+        logger.info("--> starting a master node and two data nodes");
+        internalCluster().startMasterOnlyNode();
+        final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
+        logger.info("--> creating repository");
+        assertAcked(client().admin().cluster().preparePutRepository("test-repo")
+            .setType("mock").setSettings(Settings.builder()
+                .put("location", randomRepoPath())
+                .put("compress", randomBoolean())
+                .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));
+        assertAcked(prepareCreate("test-idx", 0, Settings.builder()
+            .put("number_of_shards", 2).put("number_of_replicas", 0)));
+        ensureGreen();
+        logger.info("--> indexing some data");
+        final int numdocs = randomIntBetween(50, 100);
+        IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs];
+        for (int i = 0; i < builders.length; i++) {
+            builders[i] = client().prepareIndex("test-idx", "type1",
+                Integer.toString(i)).setSource("field1", "bar " + i);
+        }
+        indexRandom(true, builders);
+        flushAndRefresh();
+        blockAllDataNodes("test-repo");
+        logger.info("--> snapshot");
+        client(internalCluster().getMasterName()).admin().cluster()
+            .prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(false).setIndices("test-idx").get();
+        logger.info("--> restarting first data node, which should cause the primary shard on it to be failed");
+        internalCluster().restartNode(dataNodes.get(0), InternalTestCluster.EMPTY_CALLBACK);
+
+        logger.info("--> wait for shard snapshot of first primary to show as failed");
+        assertBusy(() -> assertThat(
+            client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get().getSnapshots()
+                .get(0).getShardsStats().getFailedShards(), is(1)), 60L, TimeUnit.SECONDS);
+
+        logger.info("--> restarting second data node, which should cause the primary shard on it to be failed");
+        internalCluster().restartNode(dataNodes.get(1), InternalTestCluster.EMPTY_CALLBACK);
+
+        // check that snapshot completes with both failed shards being accounted for in the snapshot result
+        assertBusy(() -> {
+            GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                .prepareGetSnapshots("test-repo").setSnapshots("test-snap").setIgnoreUnavailable(true).get();
+            assertEquals(1, snapshotsStatusResponse.getSnapshots("test-repo").size());
+            SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots("test-repo").get(0);
+            assertTrue(snapshotInfo.state().toString(), snapshotInfo.state().completed());
+            assertThat(snapshotInfo.totalShards(), is(2));
+            assertThat(snapshotInfo.shardFailures(), hasSize(2));
+        }, 60L, TimeUnit.SECONDS);
+    }
+
     public void testRetentionLeasesClearedOnRestore() throws Exception {
         final String repoName = "test-repo-retention-leases";
         assertAcked(client().admin().cluster().preparePutRepository(repoName)