@@ -1641,35 +1641,7 @@ private synchronized void stopNodesAndClient(NodeAndClient nodeAndClient) throws IOException {
     }

     private synchronized void stopNodesAndClients(Collection<NodeAndClient> nodeAndClients) throws IOException {
-        final Set<String> excludedNodeIds = new HashSet<>();
-
-        if (autoManageMinMasterNodes && nodeAndClients.size() > 0) {
-
-            final long currentMasters = nodes.values().stream().filter(NodeAndClient::isMasterEligible).count();
-            final long stoppingMasters = nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).count();
-
-            assert stoppingMasters <= currentMasters : currentMasters + " < " + stoppingMasters;
-            if (stoppingMasters != currentMasters && stoppingMasters > 0) {
-                // If stopping few enough master-nodes that there's still a majority left, there is no need to withdraw their votes first.
-                // However, we do not yet have a way to be sure there's a majority left, because the voting configuration may not yet have
-                // been updated when the previous nodes shut down, so we must always explicitly withdraw votes.
-                // TODO add cluster health API to check that voting configuration is optimal so this isn't always needed
-                nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).map(NodeAndClient::getName).forEach(excludedNodeIds::add);
-                assert excludedNodeIds.size() == stoppingMasters;
-
-                logger.info("adding voting config exclusions {} prior to shutdown", excludedNodeIds);
-                try {
-                    client().execute(AddVotingConfigExclusionsAction.INSTANCE,
-                        new AddVotingConfigExclusionsRequest(excludedNodeIds.toArray(new String[0]))).get();
-                } catch (InterruptedException | ExecutionException e) {
-                    throw new AssertionError("unexpected", e);
-                }
-            }
-
-            if (stoppingMasters > 0) {
-                updateMinMasterNodes(getMasterNodesCount() - Math.toIntExact(stoppingMasters));
-            }
-        }
+        final Set<String> excludedNodeIds = excludeMasters(nodeAndClients);

         for (NodeAndClient nodeAndClient : nodeAndClients) {
             removeDisruptionSchemeFromNode(nodeAndClient);
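The comment in the removed block above carries the reasoning that now lives in excludeMasters: stopping master-eligible nodes without first withdrawing their votes is only safe while a majority of the voting configuration survives. A minimal illustration of that majority arithmetic (standalone Java, not part of the change itself):

    // Illustration only: quorum arithmetic for a voting configuration of 3 masters.
    int votingMasters = 3;
    int quorum = votingMasters / 2 + 1;                       // 2 votes needed
    int stopping = 2;
    boolean safeWithoutExclusions = votingMasters - stopping >= quorum;
    // false: stopping 2 of 3 masters without withdrawing their votes first
    // would leave the remaining master unable to win an election.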
@@ -1678,14 +1650,7 @@ private synchronized void stopNodesAndClients(Collection<NodeAndClient> nodeAndClients) throws IOException {
             nodeAndClient.close();
         }

-        if (excludedNodeIds.isEmpty() == false) {
-            logger.info("removing voting config exclusions for {} after shutdown", excludedNodeIds);
-            try {
-                client().execute(ClearVotingConfigExclusionsAction.INSTANCE, new ClearVotingConfigExclusionsRequest()).get();
-            } catch (InterruptedException | ExecutionException e) {
-                throw new AssertionError("unexpected", e);
-            }
-        }
+        removeExclusions(excludedNodeIds);
     }

     /**
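For context, the two helpers extracted here wrap a pair of transport actions that a test could also invoke directly; both calls below are taken verbatim from the diff. A minimal sketch, assuming an ESIntegTestCase-style client() and a made-up node name "node_t0":

    // Withdraw the node's vote before stopping it, so the remaining
    // masters keep a quorum in the voting configuration.
    client().execute(AddVotingConfigExclusionsAction.INSTANCE,
        new AddVotingConfigExclusionsRequest(new String[] { "node_t0" })).get();

    // ... stop or restart the node ...

    // Clear the exclusions afterwards so future reconfigurations are unconstrained.
    client().execute(ClearVotingConfigExclusionsAction.INSTANCE,
        new ClearVotingConfigExclusionsRequest()).get();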
@@ -1751,31 +1716,78 @@ public synchronized void rollingRestart(RestartCallback callback) throws Exception {

     private void restartNode(NodeAndClient nodeAndClient, RestartCallback callback) throws Exception {
         logger.info("Restarting node [{}] ", nodeAndClient.name);
+
         if (activeDisruptionScheme != null) {
             activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
         }
-        final int masterNodesCount = getMasterNodesCount();
-        // special case to allow stopping one node in a two node cluster and keep it functional
-        final boolean updateMinMaster = nodeAndClient.isMasterEligible() && masterNodesCount == 2 && autoManageMinMasterNodes;
-        if (updateMinMaster) {
-            updateMinMasterNodes(masterNodesCount - 1);
-        }
+
+        Set<String> excludedNodeIds = excludeMasters(Collections.singleton(nodeAndClient));
+
         final Settings newSettings = nodeAndClient.closeForRestart(callback,
-            autoManageMinMasterNodes ? getMinMasterNodes(masterNodesCount) : -1);
+            autoManageMinMasterNodes ? getMinMasterNodes(getMasterNodesCount()) : -1);
+
+        removeExclusions(excludedNodeIds);
+
         nodeAndClient.recreateNode(newSettings, () -> rebuildUnicastHostFiles(emptyList()));
         nodeAndClient.startNode();
         if (activeDisruptionScheme != null) {
             activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
         }
-        if (callback.validateClusterForming() || updateMinMaster) {
+
+        if (callback.validateClusterForming() || excludedNodeIds.isEmpty() == false) {
             // we have to validate cluster size if updateMinMaster == true, because we need the
             // second node to join in order to increment min_master_nodes back to 2.
             // we also have to do via the node that was just restarted as it may be that the master didn't yet process
             // the fact it left
             validateClusterFormed(nodeAndClient.name);
         }
-        if (updateMinMaster) {
-            updateMinMasterNodes(masterNodesCount);
+
+        if (excludedNodeIds.isEmpty() == false) {
+            updateMinMasterNodes(getMasterNodesCount());
+        }
+    }
+
+    private Set<String> excludeMasters(Collection<NodeAndClient> nodeAndClients) {
+        final Set<String> excludedNodeIds = new HashSet<>();
+        if (autoManageMinMasterNodes && nodeAndClients.size() > 0) {
+
+            final long currentMasters = nodes.values().stream().filter(NodeAndClient::isMasterEligible).count();
+            final long stoppingMasters = nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).count();
+
+            assert stoppingMasters <= currentMasters : currentMasters + " < " + stoppingMasters;
+            if (stoppingMasters != currentMasters && stoppingMasters > 0) {
+                // If stopping few enough master-nodes that there's still a majority left, there is no need to withdraw their votes first.
+                // However, we do not yet have a way to be sure there's a majority left, because the voting configuration may not yet have
+                // been updated when the previous nodes shut down, so we must always explicitly withdraw votes.
+                // TODO add cluster health API to check that voting configuration is optimal so this isn't always needed
+                nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).map(NodeAndClient::getName).forEach(excludedNodeIds::add);
+                assert excludedNodeIds.size() == stoppingMasters;
+
+                logger.info("adding voting config exclusions {} prior to restart/shutdown", excludedNodeIds);
+                try {
+                    client().execute(AddVotingConfigExclusionsAction.INSTANCE,
+                        new AddVotingConfigExclusionsRequest(excludedNodeIds.toArray(new String[0]))).get();
+                } catch (InterruptedException | ExecutionException e) {
+                    throw new AssertionError("unexpected", e);
+                }
+            }
+
+            if (stoppingMasters > 0) {
+                updateMinMasterNodes(getMasterNodesCount() - Math.toIntExact(stoppingMasters));
+            }
+        }
+        return excludedNodeIds;
+    }
+
+    private void removeExclusions(Set<String> excludedNodeIds) {
+        if (excludedNodeIds.isEmpty() == false) {
+            logger.info("removing voting config exclusions for {} after restart/shutdown", excludedNodeIds);
+            try {
+                Client client = getRandomNodeAndClient(node -> excludedNodeIds.contains(node.name) == false).client(random);
+                client.execute(ClearVotingConfigExclusionsAction.INSTANCE, new ClearVotingConfigExclusionsRequest()).get();
+            } catch (InterruptedException | ExecutionException e) {
+                throw new AssertionError("unexpected", e);
+            }
         }
     }

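With restartNode now routing through excludeMasters/removeExclusions, the old two-node special case is gone. A hypothetical usage sketch of the restart path (the test name and node count are made up; internalCluster(), restartRandomDataNode, RestartCallback, and ensureGreen() follow the usual ESIntegTestCase conventions):

    public void testRestartMasterEligibleNode() throws Exception {
        internalCluster().startNodes(3);  // three master-eligible nodes
        internalCluster().restartRandomDataNode(new InternalTestCluster.RestartCallback() {
            @Override
            public Settings onNodeStopped(String nodeName) {
                // While the node is down, its vote has already been withdrawn by
                // excludeMasters(...), so the other two masters retain a quorum.
                return Settings.EMPTY;
            }
        });
        ensureGreen();  // the restarted node rejoins and the exclusions are cleared
    }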
@@ -1833,7 +1845,6 @@ public synchronized void fullRestart(RestartCallback callback) throws Exception {
         }
     }

-
     /**
      * Returns the name of the current master node in the cluster.
      */