50
50
import org .elasticsearch .common .io .stream .StreamOutput ;
51
51
import org .elasticsearch .common .settings .Settings ;
52
52
import org .elasticsearch .index .IndexNotFoundException ;
53
+ import org .elasticsearch .index .engine .EngineClosedException ;
54
+ import org .elasticsearch .index .shard .IndexShardClosedException ;
53
55
import org .elasticsearch .index .shard .IndexShardNotStartedException ;
54
56
import org .elasticsearch .index .shard .IndexShardState ;
55
57
import org .elasticsearch .index .shard .ShardId ;
@@ -158,15 +160,15 @@ public void testBlocks() throws ExecutionException, InterruptedException {
158
160
ReplicationTask task = maybeTask ();
159
161
160
162
ClusterBlocks .Builder block = ClusterBlocks .builder ()
161
- .addGlobalBlock (new ClusterBlock (1 , "non retryable" , false , true , RestStatus .SERVICE_UNAVAILABLE , ClusterBlockLevel .ALL ));
163
+ .addGlobalBlock (new ClusterBlock (1 , "non retryable" , false , true , RestStatus .SERVICE_UNAVAILABLE , ClusterBlockLevel .ALL ));
162
164
setState (clusterService , ClusterState .builder (clusterService .state ()).blocks (block ));
163
165
TransportReplicationAction .ReroutePhase reroutePhase = action .new ReroutePhase (task , request , listener );
164
166
reroutePhase .run ();
165
167
assertListenerThrows ("primary phase should fail operation" , listener , ClusterBlockException .class );
166
168
assertPhase (task , "failed" );
167
169
168
170
block = ClusterBlocks .builder ()
169
- .addGlobalBlock (new ClusterBlock (1 , "retryable" , true , true , RestStatus .SERVICE_UNAVAILABLE , ClusterBlockLevel .ALL ));
171
+ .addGlobalBlock (new ClusterBlock (1 , "retryable" , true , true , RestStatus .SERVICE_UNAVAILABLE , ClusterBlockLevel .ALL ));
170
172
setState (clusterService , ClusterState .builder (clusterService .state ()).blocks (block ));
171
173
listener = new PlainActionFuture <>();
172
174
reroutePhase = action .new ReroutePhase (task , new Request ().timeout ("5ms" ), listener );
@@ -181,7 +183,7 @@ public void testBlocks() throws ExecutionException, InterruptedException {
181
183
assertPhase (task , "waiting_for_retry" );
182
184
183
185
block = ClusterBlocks .builder ()
184
- .addGlobalBlock (new ClusterBlock (1 , "non retryable" , false , true , RestStatus .SERVICE_UNAVAILABLE , ClusterBlockLevel .ALL ));
186
+ .addGlobalBlock (new ClusterBlock (1 , "non retryable" , false , true , RestStatus .SERVICE_UNAVAILABLE , ClusterBlockLevel .ALL ));
185
187
setState (clusterService , ClusterState .builder (clusterService .state ()).blocks (block ));
186
188
assertListenerThrows ("primary phase should fail operation when moving from a retryable block to a non-retryable one" , listener , ClusterBlockException .class );
187
189
assertIndexShardUninitialized ();
@@ -196,7 +198,7 @@ public void testNotStartedPrimary() throws InterruptedException, ExecutionExcept
196
198
final ShardId shardId = new ShardId (index , "_na_" , 0 );
197
199
// no replicas in order to skip the replication part
198
200
setState (clusterService , state (index , true ,
199
- randomBoolean () ? ShardRoutingState .INITIALIZING : ShardRoutingState .UNASSIGNED ));
201
+ randomBoolean () ? ShardRoutingState .INITIALIZING : ShardRoutingState .UNASSIGNED ));
200
202
ReplicationTask task = maybeTask ();
201
203
202
204
logger .debug ("--> using initial state:\n {}" , clusterService .state ().prettyPrint ());
@@ -221,7 +223,7 @@ public void testNotStartedPrimary() throws InterruptedException, ExecutionExcept
221
223
final IndexShardRoutingTable shardRoutingTable = clusterService .state ().routingTable ().index (index ).shard (shardId .id ());
222
224
final String primaryNodeId = shardRoutingTable .primaryShard ().currentNodeId ();
223
225
final List <CapturingTransport .CapturedRequest > capturedRequests =
224
- transport .getCapturedRequestsByTargetNodeAndClear ().get (primaryNodeId );
226
+ transport .getCapturedRequestsByTargetNodeAndClear ().get (primaryNodeId );
225
227
assertThat (capturedRequests , notNullValue ());
226
228
assertThat (capturedRequests .size (), equalTo (1 ));
227
229
assertThat (capturedRequests .get (0 ).action , equalTo ("testAction[p]" ));
@@ -234,7 +236,7 @@ public void testNotStartedPrimary() throws InterruptedException, ExecutionExcept
234
236
* before the relocation target, there is a time span where relocation source believes active primary to be on
235
237
* relocation target and relocation target believes active primary to be on relocation source. This results in replication
236
238
* requests being sent back and forth.
237
- *
239
+ * <p>
238
240
* This test checks that replication request is not routed back from relocation target to relocation source in case of
239
241
* stale index routing table on relocation target.
240
242
*/
@@ -271,7 +273,7 @@ public void testNoRerouteOnStaleClusterState() throws InterruptedException, Exec
271
273
IndexShardRoutingTable shardRoutingTable = clusterService .state ().routingTable ().index (index ).shard (shardId .id ());
272
274
final String primaryNodeId = shardRoutingTable .primaryShard ().currentNodeId ();
273
275
final List <CapturingTransport .CapturedRequest > capturedRequests =
274
- transport .getCapturedRequestsByTargetNodeAndClear ().get (primaryNodeId );
276
+ transport .getCapturedRequestsByTargetNodeAndClear ().get (primaryNodeId );
275
277
assertThat (capturedRequests , notNullValue ());
276
278
assertThat (capturedRequests .size (), equalTo (1 ));
277
279
assertThat (capturedRequests .get (0 ).action , equalTo ("testAction[p]" ));
@@ -282,7 +284,7 @@ public void testUnknownIndexOrShardOnReroute() throws InterruptedException {
282
284
final String index = "test" ;
283
285
// no replicas in order to skip the replication part
284
286
setState (clusterService , state (index , true ,
285
- randomBoolean () ? ShardRoutingState .INITIALIZING : ShardRoutingState .UNASSIGNED ));
287
+ randomBoolean () ? ShardRoutingState .INITIALIZING : ShardRoutingState .UNASSIGNED ));
286
288
logger .debug ("--> using initial state:\n {}" , clusterService .state ().prettyPrint ());
287
289
Request request = new Request (new ShardId ("unknown_index" , "_na_" , 0 )).timeout ("1ms" );
288
290
PlainActionFuture <Response > listener = new PlainActionFuture <>();
@@ -299,6 +301,61 @@ public void testUnknownIndexOrShardOnReroute() throws InterruptedException {
299
301
assertListenerThrows ("must throw shard not found exception" , listener , ShardNotFoundException .class );
300
302
}
301
303
304
+ public void testStalePrimaryShardOnReroute () throws InterruptedException {
305
+ final String index = "test" ;
306
+ final ShardId shardId = new ShardId (index , "_na_" , 0 );
307
+ // no replicas in order to skip the replication part
308
+ setState (clusterService , stateWithActivePrimary (index , true , randomInt (3 )));
309
+ logger .debug ("--> using initial state:\n {}" , clusterService .state ().prettyPrint ());
310
+ Request request = new Request (shardId );
311
+ boolean timeout = randomBoolean ();
312
+ if (timeout ) {
313
+ request .timeout ("0s" );
314
+ } else {
315
+ request .timeout ("1h" );
316
+ }
317
+ PlainActionFuture <Response > listener = new PlainActionFuture <>();
318
+ ReplicationTask task = maybeTask ();
319
+
320
+ TransportReplicationAction .ReroutePhase reroutePhase = action .new ReroutePhase (task , request , listener );
321
+ reroutePhase .run ();
322
+ CapturingTransport .CapturedRequest [] capturedRequests = transport .getCapturedRequestsAndClear ();
323
+ assertThat (capturedRequests , arrayWithSize (1 ));
324
+ assertThat (capturedRequests [0 ].action , equalTo ("testAction[p]" ));
325
+ assertPhase (task , "waiting_on_primary" );
326
+ transport .handleRemoteError (capturedRequests [0 ].requestId , randomRetryPrimaryException (shardId ));
327
+
328
+
329
+ if (timeout ) {
330
+ // we always try at least one more time on timeout
331
+ assertThat (listener .isDone (), equalTo (false ));
332
+ capturedRequests = transport .getCapturedRequestsAndClear ();
333
+ assertThat (capturedRequests , arrayWithSize (1 ));
334
+ assertThat (capturedRequests [0 ].action , equalTo ("testAction[p]" ));
335
+ assertPhase (task , "waiting_on_primary" );
336
+ transport .handleRemoteError (capturedRequests [0 ].requestId , randomRetryPrimaryException (shardId ));
337
+ assertListenerThrows ("must throw index not found exception" , listener , ElasticsearchException .class );
338
+ assertPhase (task , "failed" );
339
+ } else {
340
+ assertThat (listener .isDone (), equalTo (false ));
341
+ // generate a CS change
342
+ setState (clusterService , clusterService .state ());
343
+ capturedRequests = transport .getCapturedRequestsAndClear ();
344
+ assertThat (capturedRequests , arrayWithSize (1 ));
345
+ assertThat (capturedRequests [0 ].action , equalTo ("testAction[p]" ));
346
+ }
347
+ }
348
+
349
+ private ElasticsearchException randomRetryPrimaryException (ShardId shardId ) {
350
+ return randomFrom (
351
+ new ShardNotFoundException (shardId ),
352
+ new IndexNotFoundException (shardId .getIndex ()),
353
+ new IndexShardClosedException (shardId ),
354
+ new EngineClosedException (shardId ),
355
+ new TransportReplicationAction .RetryOnPrimaryException (shardId , "hello" )
356
+ );
357
+ }
358
+
302
359
public void testRoutePhaseExecutesRequest () {
303
360
final String index = "test" ;
304
361
final ShardId shardId = new ShardId (index , "_na_" , 0 );
@@ -449,7 +506,7 @@ protected Tuple<Response, Request> shardOperationOnPrimary(MetaData metaData, Re
449
506
PlainActionFuture <Response > listener = new PlainActionFuture <>();
450
507
ReplicationTask task = maybeTask ();
451
508
TransportReplicationAction <Request , Request , Response >.PrimaryPhase primaryPhase = actionWithRelocatingReplicasAfterPrimaryOp .new PrimaryPhase (
452
- task , request , createTransportChannel (listener ));
509
+ task , request , createTransportChannel (listener ));
453
510
primaryPhase .run ();
454
511
assertThat ("request was not processed on primary" , request .processedOnPrimary .get (), equalTo (true ));
455
512
ShardRouting relocatingReplicaShard = stateWithRelocatingReplica .getRoutingTable ().shardRoutingTable (index , shardId .id ()).replicaShards ().get (0 );
@@ -485,7 +542,7 @@ protected Tuple<Response, Request> shardOperationOnPrimary(MetaData metaData, Re
485
542
PlainActionFuture <Response > listener = new PlainActionFuture <>();
486
543
ReplicationTask task = maybeTask ();
487
544
TransportReplicationAction <Request , Request , Response >.PrimaryPhase primaryPhase = actionWithDeletedIndexAfterPrimaryOp .new PrimaryPhase (
488
- task , request , createTransportChannel (listener ));
545
+ task , request , createTransportChannel (listener ));
489
546
primaryPhase .run ();
490
547
assertThat ("request was not processed on primary" , request .processedOnPrimary .get (), equalTo (true ));
491
548
assertThat ("replication phase should be skipped if index gets deleted after primary operation" , transport .capturedRequestsByTargetNode ().size (), equalTo (0 ));
@@ -529,8 +586,8 @@ public void testWriteConsistency() throws ExecutionException, InterruptedExcepti
529
586
530
587
setState (clusterService , state (index , true , ShardRoutingState .STARTED , replicaStates ));
531
588
logger .debug ("using consistency level of [{}], assigned shards [{}], total shards [{}]. expecting op to [{}]. using state: \n {}" ,
532
- request .consistencyLevel (), 1 + assignedReplicas , 1 + assignedReplicas + unassignedReplicas , passesWriteConsistency ? "succeed" : "retry" ,
533
- clusterService .state ().prettyPrint ());
589
+ request .consistencyLevel (), 1 + assignedReplicas , 1 + assignedReplicas + unassignedReplicas , passesWriteConsistency ? "succeed" : "retry" ,
590
+ clusterService .state ().prettyPrint ());
534
591
535
592
final IndexShardRoutingTable shardRoutingTable = clusterService .state ().routingTable ().index (index ).shard (shardId .id ());
536
593
PlainActionFuture <Response > listener = new PlainActionFuture <>();
@@ -646,7 +703,7 @@ protected void runReplicateTest(ClusterState state, IndexShardRoutingTable shard
646
703
647
704
TransportChannel channel = createTransportChannel (listener , error ::set );
648
705
TransportReplicationAction <Request , Request , Response >.ReplicationPhase replicationPhase =
649
- action .new ReplicationPhase (task , request , new Response (), request .shardId (), channel , reference );
706
+ action .new ReplicationPhase (task , request , new Response (), request .shardId (), channel , reference );
650
707
651
708
assertThat (replicationPhase .totalShards (), equalTo (totalShards ));
652
709
assertThat (replicationPhase .pending (), equalTo (assignedReplicas ));
@@ -656,7 +713,7 @@ protected void runReplicateTest(ClusterState state, IndexShardRoutingTable shard
656
713
657
714
HashMap <String , Request > nodesSentTo = new HashMap <>();
658
715
boolean executeOnReplica =
659
- action .shouldExecuteReplication (clusterService .state ().getMetaData ().index (shardId .getIndex ()).getSettings ());
716
+ action .shouldExecuteReplication (clusterService .state ().getMetaData ().index (shardId .getIndex ()).getSettings ());
660
717
for (CapturingTransport .CapturedRequest capturedRequest : capturedRequests ) {
661
718
// no duplicate requests
662
719
Request replicationRequest = (Request ) capturedRequest .request ;
@@ -819,7 +876,7 @@ public void testCounterIncrementedWhileReplicationOngoing() throws InterruptedEx
819
876
final ShardId shardId = new ShardId (index , "_na_" , 0 );
820
877
// one replica to make sure replication is attempted
821
878
setState (clusterService , state (index , true ,
822
- ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
879
+ ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
823
880
ShardRouting primaryShard = clusterService .state ().routingTable ().shardRoutingTable (shardId ).primaryShard ();
824
881
indexShardRouting .set (primaryShard );
825
882
logger .debug ("--> using initial state:\n {}" , clusterService .state ().prettyPrint ());
@@ -856,7 +913,7 @@ public void testCounterIncrementedWhileReplicationOngoing() throws InterruptedEx
856
913
public void testReplicasCounter () throws Exception {
857
914
final ShardId shardId = new ShardId ("test" , "_na_" , 0 );
858
915
setState (clusterService , state (shardId .getIndexName (), true ,
859
- ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
916
+ ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
860
917
action = new ActionWithDelay (Settings .EMPTY , "testActionWithExceptions" , transportService , clusterService , threadPool );
861
918
final Action .ReplicaOperationTransportHandler replicaOperationTransportHandler = action .new ReplicaOperationTransportHandler ();
862
919
final ReplicationTask task = maybeTask ();
@@ -895,7 +952,7 @@ public void testCounterDecrementedIfShardOperationThrowsException() throws Inter
895
952
final String index = "test" ;
896
953
final ShardId shardId = new ShardId (index , "_na_" , 0 );
897
954
setState (clusterService , state (index , true ,
898
- ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
955
+ ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
899
956
logger .debug ("--> using initial state:\n {}" , clusterService .state ().prettyPrint ());
900
957
Request request = new Request (shardId ).timeout ("100ms" );
901
958
PlainActionFuture <Response > listener = new PlainActionFuture <>();
@@ -915,7 +972,7 @@ public void testReroutePhaseRetriedAfterDemotedPrimary() {
915
972
final ShardId shardId = new ShardId (index , "_na_" , 0 );
916
973
boolean localPrimary = true ;
917
974
setState (clusterService , state (index , localPrimary ,
918
- ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
975
+ ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
919
976
Action action = new Action (Settings .EMPTY , "testAction" , transportService , clusterService , threadPool ) {
920
977
@ Override
921
978
protected void resolveRequest (MetaData metaData , String concreteIndex , Request request ) {
@@ -967,7 +1024,7 @@ protected void resolveRequest(MetaData metaData, String concreteIndex, Request r
967
1024
// publish a new cluster state
968
1025
boolean localPrimaryOnRetry = randomBoolean ();
969
1026
setState (clusterService , state (index , localPrimaryOnRetry ,
970
- ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
1027
+ ShardRoutingState .STARTED , ShardRoutingState .STARTED ));
971
1028
CapturingTransport .CapturedRequest [] primaryRetry = transport .getCapturedRequestsAndClear ();
972
1029
973
1030
// the request should be retried
@@ -1083,8 +1140,8 @@ class Action extends TransportReplicationAction<Request, Request, Response> {
1083
1140
ClusterService clusterService ,
1084
1141
ThreadPool threadPool ) {
1085
1142
super (settings , actionName , transportService , clusterService , null , threadPool ,
1086
- new ShardStateAction (settings , clusterService , transportService , null , null , threadPool ),
1087
- new ActionFilters (new HashSet <ActionFilter >()), new IndexNameExpressionResolver (Settings .EMPTY ), Request ::new , Request ::new , ThreadPool .Names .SAME );
1143
+ new ShardStateAction (settings , clusterService , transportService , null , null , threadPool ),
1144
+ new ActionFilters (new HashSet <ActionFilter >()), new IndexNameExpressionResolver (Settings .EMPTY ), Request ::new , Request ::new , ThreadPool .Names .SAME );
1088
1145
}
1089
1146
1090
1147
@ Override
0 commit comments