Get test working

nicktindall · nicktindall · commit ce14d2eeec93 · 2024-06-18T15:35:01.000+10:00
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/action/support/master/TransportMasterNodeActionIT.java b/server/src/internalClusterTest/java/org/elasticsearch/action/support/master/TransportMasterNodeActionIT.java
@@ -9,12 +9,13 @@
 package org.elasticsearch.action.support.master;
 
 import org.elasticsearch.ElasticsearchException;
-import org.elasticsearch.action.admin.cluster.state.ClusterStateAction;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.admin.cluster.health.TransportClusterHealthAction;
 import org.elasticsearch.cluster.ClusterState;
-import org.elasticsearch.cluster.ClusterStateApplier;
 import org.elasticsearch.cluster.ClusterStateUpdateTask;
 import org.elasticsearch.cluster.coordination.PublicationTransportHandler;
 import org.elasticsearch.cluster.coordination.StatefulPreVoteCollector;
+import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.util.CollectionUtils;
 import org.elasticsearch.plugins.Plugin;
@@ -25,8 +26,10 @@
 
 import java.util.Collection;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.CyclicBarrier;
 import java.util.concurrent.atomic.AtomicBoolean;
 
+import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 
 public class TransportMasterNodeActionIT extends ESIntegTestCase {
@@ -36,39 +39,49 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
         return CollectionUtils.appendToCopy(super.nodePlugins(), MockTransportService.TestPlugin.class);
     }
 
-    @TestLogging(reason = "wip", value = "org.elasticsearch.transport.TransportService.tracer:TRACE")
+    @TestLogging(reason = "wip", value = "org.elasticsearch.action.support.master.TransportMasterNodeAction:DEBUG")
     public void testRoutingLoopProtection() {
 
         final var newMaster = internalCluster().startMasterOnlyNode();
         final var enoughVotingMastersLatch = new CountDownLatch(1);
-        final var electionWonLatch = new CountDownLatch(1);
-        final var releaseMasterLatch = new CountDownLatch(1);
-
-        final ClusterStateApplier blockingApplier = event -> {
-            if (3 <= event.state().coordinationMetadata().getLastCommittedConfiguration().getNodeIds().size()) {
-                enoughVotingMastersLatch.countDown();
-            }
-
-            if (event.state().nodes().isLocalNodeElectedMaster()) {
-                logger.info("--> new master elected as planned");
-                electionWonLatch.countDown();
-                safeAwait(releaseMasterLatch);
-                logger.info("--> cluster state applications released on new master");
-            }
-        };
+        final var newMasterReceivedReroutedMessageLatch = new CountDownLatch(1);
 
         try {
-            internalCluster().getInstance(ClusterService.class, newMaster).addStateApplier(blockingApplier);
-
-            // need at least 3 voting master nodes for failover
+            /*
+             * Ensure that we've got 5 voting nodes in the cluster, this means even if the original
+             * master manages to accept its own failed state update before standing down, we can still
+             * establish a quorum without its (or our own) join.
+             */
+            internalCluster().getInstance(ClusterService.class, newMaster).addStateApplier(event -> {
+                if (5 <= event.state().coordinationMetadata().getLastCommittedConfiguration().getNodeIds().size()) {
+                    enoughVotingMastersLatch.countDown();
+                }
+            });
+            internalCluster().startMasterOnlyNode();
+            internalCluster().startMasterOnlyNode();
             internalCluster().startMasterOnlyNode();
             safeAwait(enoughVotingMastersLatch);
+            long originalTerm = internalCluster().masterClient().admin().cluster().prepareState().get().getState().term();
+
+            // Configure a latch that will be released when the existing master knows of the new master's election
+            final String originalMasterName = internalCluster().getMasterName();
+            logger.info("Original master was {}, new master will be {}", originalMasterName, newMaster);
+            final var previousMasterKnowsNewMasterIsElectedLatch = new CountDownLatch(1);
+            internalCluster().getInstance(ClusterService.class, originalMasterName).addStateApplier(event -> {
+                DiscoveryNode masterNode = event.state().nodes().getMasterNode();
+                if (masterNode != null && masterNode.getName().equals(newMaster)) {
+                    previousMasterKnowsNewMasterIsElectedLatch.countDown();
+                }
+            });
 
-            final var reroutedMessageReceived = new AtomicBoolean(false);
             for (final var transportService : internalCluster().getInstances(TransportService.class)) {
                 if (transportService.getLocalNode().getName().equals(newMaster)) {
                     continue;
                 }
+
+                /*
+                 * Disable every other nodes' ability to send pre-vote and publish requests
+                 */
                 final var mockTransportService = asInstanceOf(MockTransportService.class, transportService);
                 mockTransportService.addSendBehavior((connection, requestId, action, request, options) -> {
                     if (action.equals(StatefulPreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)
@@ -78,21 +91,53 @@ public void testRoutingLoopProtection() {
                         connection.sendRequest(requestId, action, request, options);
                     }
                 });
-                mockTransportService.addRequestHandlingBehavior(ClusterStateAction.NAME, (handler, request, channel, task) -> {
-                    // assertThat(asInstanceOf(MasterNodeRequest.class, request).masterTerm(), equalTo(originalTerm)) TODO
-                    assertTrue("rerouted message received exactly once", reroutedMessageReceived.compareAndSet(false, true));
-                    handler.messageReceived(request, channel, task);
-                });
+
+                /*
+                 * Assert that no other node receives the re-routed message more than once, and only
+                 * from a node in the original term
+                 */
+                final var reroutedMessageReceived = new AtomicBoolean(false);
+                mockTransportService.addRequestHandlingBehavior(
+                    TransportClusterHealthAction.TYPE.name(),
+                    (handler, request, channel, task) -> {
+                        assertThat(asInstanceOf(MasterNodeRequest.class, request).masterTerm(), equalTo(originalTerm));
+                        assertTrue("rerouted message received exactly once", reroutedMessageReceived.compareAndSet(false, true));
+                        handler.messageReceived(request, channel, task);
+                    }
+                );
             }
 
-            final var doubleReroutedMessageLatch = new CountDownLatch(1);
+            /*
+             * Count down latch when the new master receives the re-routed message, ensure it only receives it once, and
+             * only from a node in the newMaster term
+             */
             MockTransportService.getInstance(newMaster)
-                .addRequestHandlingBehavior(ClusterStateAction.NAME, (handler, request, channel, task) -> {
-                    // assertThat(asInstanceOf(MasterNodeRequest.class, request).masterTerm(), greaterThan(originalTerm)) TODO
-                    assertThat(doubleReroutedMessageLatch.getCount(), greaterThan(0L));
-                    doubleReroutedMessageLatch.countDown();
+                .addRequestHandlingBehavior(TransportClusterHealthAction.TYPE.name(), (handler, request, channel, task) -> {
+                    assertThat(asInstanceOf(MasterNodeRequest.class, request).masterTerm(), greaterThan(originalTerm));
+                    assertThat(newMasterReceivedReroutedMessageLatch.getCount(), greaterThan(0L));
+                    newMasterReceivedReroutedMessageLatch.countDown();
+                    handler.messageReceived(request, channel, task);
                 });
 
+            /*
+             * Block cluster state applier on newMaster to delay clearing of old master, and identifying self as
+             * new master
+             */
+            final var stateApplierBarrier = new CyclicBarrier(2);
+            final var blockingStateApplier = new AtomicBoolean(true);
+            internalCluster().getInstance(ClusterService.class, newMaster).getClusterApplierService().onNewClusterState("test", () -> {
+                if (blockingStateApplier.get()) {
+                    // Meet to signify application is blocked
+                    safeAwait(stateApplierBarrier);
+                    // Wait for the signal to unblock
+                    safeAwait(stateApplierBarrier);
+                }
+                return null;
+            }, ActionListener.noop());
+
+            // Wait until state application is blocked
+            safeAwait(stateApplierBarrier);
+
             // trigger a cluster state update, which fails, causing a master failover
             internalCluster().getCurrentMasterNodeInstance(ClusterService.class)
                 .submitUnbatchedStateUpdateTask("no-op", new ClusterStateUpdateTask() {
@@ -107,24 +152,25 @@ public void onFailure(Exception e) {
                     }
                 });
 
-            safeAwait(electionWonLatch);
+            // Wait until the old master has acknowledged the new master's election
+            safeAwait(previousMasterKnowsNewMasterIsElectedLatch);
+            logger.info("New master is elected");
 
             // perform a TransportMasterNodeAction on the new master, which doesn't know it's the master yet
-            final var stateFuture = client(newMaster).admin().cluster().prepareState().clear().execute();
+            final var stateFuture = client(newMaster).admin().cluster().prepareHealth().execute();
 
-            // wait for the request to come back to the new master, which should now wait for its local term to advance
-            safeAwait(doubleReroutedMessageLatch);
+            // wait for the request to come back to the new master
+            safeAwait(newMasterReceivedReroutedMessageLatch);
+
+            // Unblock state application on new master, allow it to know of its election win
+            blockingStateApplier.set(false);
+            safeAwait(stateApplierBarrier);
 
             assertFalse(stateFuture.isDone());
 
-            releaseMasterLatch.countDown();
             safeGet(stateFuture);
-
         } finally {
             enoughVotingMastersLatch.countDown();
-            electionWonLatch.countDown();
-            releaseMasterLatch.countDown();
-            internalCluster().getInstance(ClusterService.class, newMaster).removeApplier(blockingApplier);
             for (final var transportService : internalCluster().getInstances(TransportService.class)) {
                 asInstanceOf(MockTransportService.class, transportService).clearAllRules();
             }