11
11
import org .apache .logging .log4j .LogManager ;
12
12
import org .apache .logging .log4j .Logger ;
13
13
import org .apache .logging .log4j .message .ParameterizedMessage ;
14
- import org .apache .logging .log4j .util .MessageSupplier ;
15
14
import org .elasticsearch .ElasticsearchException ;
16
15
import org .elasticsearch .ExceptionsHelper ;
17
16
import org .elasticsearch .action .ActionListener ;
@@ -616,43 +615,20 @@ public void messageReceived(StartedShardEntry request, TransportChannel channel,
616
615
SHARD_STARTED_ACTION_NAME ,
617
616
request
618
617
);
618
+
619
+ var update = new StartedShardUpdateTask (request , listener );
620
+
619
621
clusterService .submitStateUpdateTask (
620
622
"shard-started " + request ,
621
- request ,
623
+ update ,
622
624
ClusterStateTaskConfig .build (Priority .URGENT ),
623
625
shardStartedClusterStateTaskExecutor ,
624
- new ClusterStateTaskListener () {
625
- @ Override
626
- public void onFailure (Exception e ) {
627
- final MessageSupplier msg = () -> new ParameterizedMessage (
628
- "{} unexpected failure while starting shard [{}]" ,
629
- request .shardId ,
630
- request
631
- );
632
- if (e instanceof FailedToCommitClusterStateException ) {
633
- logger .debug (msg , e );
634
- } else {
635
- logger .error (msg , e );
636
- }
637
- listener .onFailure (e );
638
- }
639
-
640
- @ Override
641
- public void onNoLongerMaster () {
642
- logger .debug ("{} no longer master while starting shard [{}]" , request .shardId , request );
643
- listener .onFailure (new NotMasterException ("shard-started" ));
644
- }
645
-
646
- @ Override
647
- public void clusterStateProcessed (ClusterState oldState , ClusterState newState ) {
648
- listener .onResponse (TransportResponse .Empty .INSTANCE );
649
- }
650
- }
626
+ update
651
627
);
652
628
}
653
629
}
654
630
655
- public static class ShardStartedClusterStateTaskExecutor implements ClusterStateTaskExecutor <StartedShardEntry > {
631
+ public static class ShardStartedClusterStateTaskExecutor implements ClusterStateTaskExecutor <StartedShardUpdateTask > {
656
632
private final AllocationService allocationService ;
657
633
private final RerouteService rerouteService ;
658
634
@@ -662,52 +638,54 @@ public ShardStartedClusterStateTaskExecutor(AllocationService allocationService,
662
638
}
663
639
664
640
@ Override
665
- public ClusterTasksResult <StartedShardEntry > execute (ClusterState currentState , List <StartedShardEntry > tasks ) throws Exception {
666
- ClusterTasksResult .Builder <StartedShardEntry > builder = ClusterTasksResult .builder ();
667
- List <StartedShardEntry > tasksToBeApplied = new ArrayList <>();
641
+ public ClusterTasksResult <StartedShardUpdateTask > execute (ClusterState currentState , List <StartedShardUpdateTask > tasks )
642
+ throws Exception {
643
+ ClusterTasksResult .Builder <StartedShardUpdateTask > builder = ClusterTasksResult .builder ();
644
+ List <StartedShardUpdateTask > tasksToBeApplied = new ArrayList <>();
668
645
List <ShardRouting > shardRoutingsToBeApplied = new ArrayList <>(tasks .size ());
669
646
Set <ShardRouting > seenShardRoutings = new HashSet <>(); // to prevent duplicates
670
647
final Map <Index , IndexLongFieldRange > updatedTimestampRanges = new HashMap <>();
671
- for (StartedShardEntry task : tasks ) {
672
- final ShardRouting matched = currentState .getRoutingTable ().getByAllocationId (task .shardId , task .allocationId );
648
+ for (StartedShardUpdateTask task : tasks ) {
649
+ StartedShardEntry entry = task .getEntry ();
650
+ final ShardRouting matched = currentState .getRoutingTable ().getByAllocationId (entry .shardId , entry .allocationId );
673
651
if (matched == null ) {
674
652
// tasks that correspond to non-existent shards are marked as successful. The reason is that we resend shard started
675
653
// events on every cluster state publishing that does not contain the shard as started yet. This means that old stale
676
654
// requests might still be in flight even after the shard has already been started or failed on the master. We just
677
655
// ignore these requests for now.
678
- logger .debug ("{} ignoring shard started task [{}] (shard does not exist anymore)" , task .shardId , task );
656
+ logger .debug ("{} ignoring shard started task [{}] (shard does not exist anymore)" , entry .shardId , entry );
679
657
builder .success (task );
680
658
} else {
681
- if (matched .primary () && task .primaryTerm > 0 ) {
682
- final IndexMetadata indexMetadata = currentState .metadata ().index (task .shardId .getIndex ());
659
+ if (matched .primary () && entry .primaryTerm > 0 ) {
660
+ final IndexMetadata indexMetadata = currentState .metadata ().index (entry .shardId .getIndex ());
683
661
assert indexMetadata != null ;
684
- final long currentPrimaryTerm = indexMetadata .primaryTerm (task .shardId .id ());
685
- if (currentPrimaryTerm != task .primaryTerm ) {
686
- assert currentPrimaryTerm > task .primaryTerm
662
+ final long currentPrimaryTerm = indexMetadata .primaryTerm (entry .shardId .id ());
663
+ if (currentPrimaryTerm != entry .primaryTerm ) {
664
+ assert currentPrimaryTerm > entry .primaryTerm
687
665
: "received a primary term with a higher term than in the "
688
666
+ "current cluster state (received ["
689
- + task .primaryTerm
667
+ + entry .primaryTerm
690
668
+ "] but current is ["
691
669
+ currentPrimaryTerm
692
670
+ "])" ;
693
671
logger .debug (
694
672
"{} ignoring shard started task [{}] (primary term {} does not match current term {})" ,
695
- task .shardId ,
696
- task ,
697
- task .primaryTerm ,
673
+ entry .shardId ,
674
+ entry ,
675
+ entry .primaryTerm ,
698
676
currentPrimaryTerm
699
677
);
700
678
builder .success (task );
701
679
continue ;
702
680
}
703
681
}
704
682
if (matched .initializing () == false ) {
705
- assert matched .active () : "expected active shard routing for task " + task + " but found " + matched ;
683
+ assert matched .active () : "expected active shard routing for task " + entry + " but found " + matched ;
706
684
// same as above, this might have been a stale in-flight request, so we just ignore.
707
685
logger .debug (
708
686
"{} ignoring shard started task [{}] (shard exists but is not initializing: {})" ,
709
- task .shardId ,
710
- task ,
687
+ entry .shardId ,
688
+ entry ,
711
689
matched
712
690
);
713
691
builder .success (task );
@@ -716,29 +694,29 @@ public ClusterTasksResult<StartedShardEntry> execute(ClusterState currentState,
716
694
if (seenShardRoutings .contains (matched )) {
717
695
logger .trace (
718
696
"{} ignoring shard started task [{}] (already scheduled to start {})" ,
719
- task .shardId ,
720
- task ,
697
+ entry .shardId ,
698
+ entry ,
721
699
matched
722
700
);
723
701
tasksToBeApplied .add (task );
724
702
} else {
725
- logger .debug ("{} starting shard {} (shard started task: [{}])" , task .shardId , matched , task );
703
+ logger .debug ("{} starting shard {} (shard started task: [{}])" , entry .shardId , matched , entry );
726
704
tasksToBeApplied .add (task );
727
705
shardRoutingsToBeApplied .add (matched );
728
706
seenShardRoutings .add (matched );
729
707
730
708
// expand the timestamp range recorded in the index metadata if needed
731
- final Index index = task .shardId .getIndex ();
709
+ final Index index = entry .shardId .getIndex ();
732
710
IndexLongFieldRange currentTimestampMillisRange = updatedTimestampRanges .get (index );
733
711
final IndexMetadata indexMetadata = currentState .metadata ().index (index );
734
712
if (currentTimestampMillisRange == null ) {
735
713
currentTimestampMillisRange = indexMetadata .getTimestampRange ();
736
714
}
737
715
final IndexLongFieldRange newTimestampMillisRange ;
738
716
newTimestampMillisRange = currentTimestampMillisRange .extendWithShardRange (
739
- task .shardId .id (),
717
+ entry .shardId .id (),
740
718
indexMetadata .getNumberOfShards (),
741
- task .timestampRange
719
+ entry .timestampRange
742
720
);
743
721
if (newTimestampMillisRange != currentTimestampMillisRange ) {
744
722
updatedTimestampRanges .put (index , newTimestampMillisRange );
@@ -872,6 +850,43 @@ public int hashCode() {
872
850
}
873
851
}
874
852
853
+ public static class StartedShardUpdateTask implements ClusterStateTaskListener {
854
+ // Pairs one shard-started entry with the listener awaiting its outcome; messageReceived submits the same instance as task and listener.
855
+ private final StartedShardEntry entry ;
856
+ private final ActionListener <TransportResponse .Empty > listener ;
857
+ // Stores both references as given; no null checks or copies are made.
858
+ public StartedShardUpdateTask (StartedShardEntry entry , ActionListener <TransportResponse .Empty > listener ) {
859
+ this .entry = entry ;
860
+ this .listener = listener ;
861
+ }
862
+ // Exposes the wrapped entry; the executor reads shardId, allocationId, primaryTerm and timestampRange from it.
863
+ public StartedShardEntry getEntry () {
864
+ return entry ;
865
+ }
866
+ // Logs the failure at a level chosen by the exception type, then always forwards the same exception to the listener.
867
+ @ Override
868
+ public void onFailure (Exception e ) {
869
+ if (e instanceof NotMasterException ) { // debug only; the exception itself is not handed to the logger
870
+ logger .debug (() -> new ParameterizedMessage ("{} no longer master while starting shard [{}]" , entry .shardId , entry ));
871
+ } else if (e instanceof FailedToCommitClusterStateException ) { // debug, with the exception attached
872
+ logger .debug (() -> new ParameterizedMessage ("{} unexpected failure while starting shard [{}]" , entry .shardId , entry ), e );
873
+ } else { // every other exception type is logged as an error, with the exception attached
874
+ logger .error (() -> new ParameterizedMessage ("{} unexpected failure while starting shard [{}]" , entry .shardId , entry ), e );
875
+ }
876
+ listener .onFailure (e );
877
+ }
878
+ // Answers the listener with the shared empty response; oldState and newState are not inspected.
879
+ @ Override
880
+ public void clusterStateProcessed (ClusterState oldState , ClusterState newState ) {
881
+ listener .onResponse (TransportResponse .Empty .INSTANCE );
882
+ }
883
+ // String form includes both the entry and the listener.
884
+ @ Override
885
+ public String toString () {
886
+ return "StartedShardUpdateTask{entry=" + entry + ", listener=" + listener + "}" ;
887
+ }
888
+ }
889
+
875
890
public static class NoLongerPrimaryShardException extends ElasticsearchException {
876
891
877
892
public NoLongerPrimaryShardException (ShardId shardId , String msg ) {
0 commit comments