27
27
import org .elasticsearch .cluster .ClusterState .Builder ;
28
28
import org .elasticsearch .cluster .ClusterState .VotingConfiguration ;
29
29
import org .elasticsearch .cluster .ClusterStateTaskConfig ;
30
+ import org .elasticsearch .cluster .ClusterStateUpdateTask ;
30
31
import org .elasticsearch .cluster .block .ClusterBlocks ;
31
32
import org .elasticsearch .cluster .coordination .FollowersChecker .FollowerCheckRequest ;
32
33
import org .elasticsearch .cluster .coordination .JoinHelper .InitialJoinAccumulator ;
42
43
import org .elasticsearch .common .Strings ;
43
44
import org .elasticsearch .common .component .AbstractLifecycleComponent ;
44
45
import org .elasticsearch .common .lease .Releasable ;
46
+ import org .elasticsearch .common .settings .ClusterSettings ;
45
47
import org .elasticsearch .common .settings .Setting ;
46
48
import org .elasticsearch .common .settings .Settings ;
47
49
import org .elasticsearch .common .unit .TimeValue ;
64
66
import java .util .Optional ;
65
67
import java .util .Random ;
66
68
import java .util .Set ;
69
+ import java .util .concurrent .atomic .AtomicBoolean ;
67
70
import java .util .function .Supplier ;
68
71
import java .util .stream .Collectors ;
72
+ import java .util .stream .StreamSupport ;
69
73
74
+ import static java .util .Collections .emptySet ;
75
+ import static org .elasticsearch .cluster .coordination .Reconfigurator .CLUSTER_MASTER_NODES_FAILURE_TOLERANCE ;
70
76
import static org .elasticsearch .discovery .DiscoverySettings .NO_MASTER_BLOCK_WRITES ;
71
77
import static org .elasticsearch .gateway .GatewayService .STATE_NOT_RECOVERED_BLOCK ;
72
78
@@ -104,16 +110,18 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
104
110
@ Nullable
105
111
private Releasable leaderCheckScheduler ;
106
112
private long maxTermSeen ;
113
+ private final Reconfigurator reconfigurator ;
107
114
108
115
private Mode mode ;
109
116
private Optional <DiscoveryNode > lastKnownLeader ;
110
117
private Optional <Join > lastJoin ;
111
118
private JoinHelper .JoinAccumulator joinAccumulator ;
112
119
private Optional <CoordinatorPublication > currentPublication = Optional .empty ();
113
120
114
- public Coordinator (Settings settings , TransportService transportService , AllocationService allocationService ,
115
- MasterService masterService , Supplier <CoordinationState .PersistedState > persistedStateSupplier ,
116
- UnicastHostsProvider unicastHostsProvider , ClusterApplier clusterApplier , Random random ) {
121
+ public Coordinator (Settings settings , ClusterSettings clusterSettings , TransportService transportService ,
122
+ AllocationService allocationService , MasterService masterService ,
123
+ Supplier <CoordinationState .PersistedState > persistedStateSupplier , UnicastHostsProvider unicastHostsProvider ,
124
+ ClusterApplier clusterApplier , Random random ) {
117
125
super (settings );
118
126
this .transportService = transportService ;
119
127
this .masterService = masterService ;
@@ -136,6 +144,7 @@ public Coordinator(Settings settings, TransportService transportService, Allocat
136
144
this .nodeRemovalExecutor = new NodeRemovalClusterStateTaskExecutor (allocationService , logger );
137
145
this .clusterApplier = clusterApplier ;
138
146
masterService .setClusterStateSupplier (this ::getStateForMasterService );
147
+ this .reconfigurator = new Reconfigurator (settings , clusterSettings );
139
148
}
140
149
141
150
private Runnable getOnLeaderFailure () {
@@ -269,8 +278,13 @@ private void updateMaxTermSeen(final long term) {
269
278
logger .debug ("updateMaxTermSeen: maxTermSeen = {} > currentTerm = {}, enqueueing term bump" ,
270
279
maxTermSeen , currentTerm );
271
280
} else {
272
- ensureTermAtLeast (getLocalNode (), maxTermSeen );
273
- startElection ();
281
+ try {
282
+ ensureTermAtLeast (getLocalNode (), maxTermSeen );
283
+ startElection ();
284
+ } catch (Exception e ) {
285
+ logger .warn (new ParameterizedMessage ("failed to bump term to {}" , maxTermSeen ), e );
286
+ becomeCandidate ("updateMaxTermSeen" );
287
+ }
274
288
}
275
289
}
276
290
}
@@ -524,6 +538,12 @@ public void invariant() {
524
538
assert lastPublishedNodes .equals (followersChecker .getKnownFollowers ()) :
525
539
lastPublishedNodes + " != " + followersChecker .getKnownFollowers ();
526
540
}
541
+
542
+ assert becomingMaster || activePublication ||
543
+ coordinationState .get ().getLastAcceptedConfiguration ().equals (coordinationState .get ().getLastCommittedConfiguration ())
544
+ : coordinationState .get ().getLastAcceptedConfiguration () + " != "
545
+ + coordinationState .get ().getLastCommittedConfiguration ();
546
+
527
547
} else if (mode == Mode .FOLLOWER ) {
528
548
assert coordinationState .get ().electionWon () == false : getLocalNode () + " is FOLLOWER so electionWon() should be false" ;
529
549
assert lastKnownLeader .isPresent () && (lastKnownLeader .get ().equals (getLocalNode ()) == false );
@@ -582,13 +602,59 @@ public void setInitialConfiguration(final VotingConfiguration votingConfiguratio
582
602
MetaData .Builder metaDataBuilder = MetaData .builder ();
583
603
// automatically generate a UID for the metadata if we need to
584
604
metaDataBuilder .generateClusterUuidIfNeeded (); // TODO generate UUID in bootstrapping tool?
605
+ metaDataBuilder .persistentSettings (Settings .builder ().put (CLUSTER_MASTER_NODES_FAILURE_TOLERANCE .getKey (),
606
+ (votingConfiguration .getNodeIds ().size () - 1 ) / 2 ).build ()); // TODO set this in bootstrapping tool?
585
607
builder .metaData (metaDataBuilder );
586
608
coordinationState .get ().setInitialState (builder .build ());
587
609
preVoteCollector .update (getPreVoteResponse (), null ); // pick up the change to last-accepted version
588
610
startElectionScheduler ();
589
611
}
590
612
}
591
613
614
+ // Package-private for testing
615
+ ClusterState improveConfiguration (ClusterState clusterState ) {
616
+ assert Thread .holdsLock (mutex ) : "Coordinator mutex not held" ;
617
+
618
+ final Set <DiscoveryNode > liveNodes = StreamSupport .stream (clusterState .nodes ().spliterator (), false )
619
+ .filter (this ::hasJoinVoteFrom ).collect (Collectors .toSet ());
620
+ final ClusterState .VotingConfiguration newConfig = reconfigurator .reconfigure (
621
+ liveNodes , emptySet (), clusterState .getLastAcceptedConfiguration ());
622
+ if (newConfig .equals (clusterState .getLastAcceptedConfiguration ()) == false ) {
623
+ assert coordinationState .get ().joinVotesHaveQuorumFor (newConfig );
624
+ return ClusterState .builder (clusterState ).lastAcceptedConfiguration (newConfig ).build ();
625
+ }
626
+
627
+ return clusterState ;
628
+ }
629
+
630
+ private AtomicBoolean reconfigurationTaskScheduled = new AtomicBoolean ();
631
+
632
+ private void scheduleReconfigurationIfNeeded () {
633
+ assert Thread .holdsLock (mutex ) : "Coordinator mutex not held" ;
634
+ assert mode == Mode .LEADER : mode ;
635
+ assert currentPublication .isPresent () == false : "Expected no publication in progress" ;
636
+
637
+ final ClusterState state = getLastAcceptedState ();
638
+ if (improveConfiguration (state ) != state && reconfigurationTaskScheduled .compareAndSet (false , true )) {
639
+ logger .trace ("scheduling reconfiguration" );
640
+ masterService .submitStateUpdateTask ("reconfigure" , new ClusterStateUpdateTask (Priority .URGENT ) {
641
+ @ Override
642
+ public ClusterState execute (ClusterState currentState ) {
643
+ reconfigurationTaskScheduled .set (false );
644
+ synchronized (mutex ) {
645
+ return improveConfiguration (currentState );
646
+ }
647
+ }
648
+
649
+ @ Override
650
+ public void onFailure (String source , Exception e ) {
651
+ reconfigurationTaskScheduled .set (false );
652
+ logger .debug ("reconfiguration failed" , e );
653
+ }
654
+ });
655
+ }
656
+ }
657
+
592
658
// for tests
593
659
boolean hasJoinVoteFrom (DiscoveryNode localNode ) {
594
660
return coordinationState .get ().containsJoinVoteFor (localNode );
@@ -599,19 +665,34 @@ private void handleJoin(Join join) {
599
665
ensureTermAtLeast (getLocalNode (), join .getTerm ()).ifPresent (this ::handleJoin );
600
666
601
667
if (coordinationState .get ().electionWon ()) {
602
- // if we have already won the election then the actual join does not matter for election purposes,
603
- // so swallow any exception
604
- try {
605
- coordinationState .get ().handleJoin (join );
606
- } catch (CoordinationStateRejectedException e ) {
607
- logger .debug (new ParameterizedMessage ("failed to add {} - ignoring" , join ), e );
668
+ // If we have already won the election then the actual join does not matter for election purposes, so swallow any exception
669
+ final boolean isNewJoin = handleJoinIgnoringExceptions (join );
670
+
671
+ // If we haven't completely finished becoming master then there's already a publication scheduled which will, in turn,
672
+ // schedule a reconfiguration if needed. It's benign to schedule a reconfiguration anyway, but it might fail if it wins the
673
+ // race against the election-winning publication and log a big error message, which we can prevent by checking this here:
674
+ final boolean establishedAsMaster = mode == Mode .LEADER && getLastAcceptedState ().term () == getCurrentTerm ();
675
+ if (isNewJoin && establishedAsMaster && publicationInProgress () == false ) {
676
+ scheduleReconfigurationIfNeeded ();
608
677
}
609
678
} else {
610
679
coordinationState .get ().handleJoin (join ); // this might fail and bubble up the exception
611
680
}
612
681
}
613
682
}
614
683
684
+ /**
685
+ * @return true iff the join was from a new node and was successfully added
686
+ */
687
+ private boolean handleJoinIgnoringExceptions (Join join ) {
688
+ try {
689
+ return coordinationState .get ().handleJoin (join );
690
+ } catch (CoordinationStateRejectedException e ) {
691
+ logger .debug (new ParameterizedMessage ("failed to add {} - ignoring" , join ), e );
692
+ return false ;
693
+ }
694
+ }
695
+
615
696
public ClusterState getLastAcceptedState () {
616
697
synchronized (mutex ) {
617
698
return coordinationState .get ().getLastAcceptedState ();
@@ -904,6 +985,10 @@ public void onSuccess(String source) {
904
985
logger .debug ("publication ended successfully: {}" , CoordinatorPublication .this );
905
986
// trigger term bump if new term was found during publication
906
987
updateMaxTermSeen (getCurrentTerm ());
988
+
989
+ if (mode == Mode .LEADER ) {
990
+ scheduleReconfigurationIfNeeded ();
991
+ }
907
992
}
908
993
ackListener .onNodeAck (getLocalNode (), null );
909
994
publishListener .onResponse (null );
@@ -916,8 +1001,7 @@ public void onFailure(Exception e) {
916
1001
assert Thread .holdsLock (mutex ) : "Coordinator mutex not held" ;
917
1002
removePublicationAndPossiblyBecomeCandidate ("Publication.onCompletion(false)" );
918
1003
919
- FailedToCommitClusterStateException exception = new FailedToCommitClusterStateException (
920
- "publication failed" , e );
1004
+ final FailedToCommitClusterStateException exception = new FailedToCommitClusterStateException ("publication failed" , e );
921
1005
ackListener .onNodeAck (getLocalNode (), exception ); // other nodes have acked, but not the master.
922
1006
publishListener .onFailure (exception );
923
1007
}
0 commit comments