
Commit af1ff52

Fix TransportMasterNodeAction not Retrying NodeClosedException (#51325) (#51437)
Added NodeClosedException to the retryable remote exceptions, since it's possible to run into this exception instead of a connect exception when the master node is shutting down but still responding to requests.
1 parent ef0a933 commit af1ff52
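
In other words, the master-forwarding handler now also retries when the master answers with a NodeClosedException wrapped in a RemoteTransportException, not only on a connect failure. Below is a minimal sketch of that condition as a standalone helper; the class and method names are illustrative only, since in the actual change the check is inlined in TransportMasterNodeAction's handleException (see the first diff).

    import org.elasticsearch.node.NodeClosedException;
    import org.elasticsearch.transport.ConnectTransportException;
    import org.elasticsearch.transport.RemoteTransportException;
    import org.elasticsearch.transport.TransportException;

    // Illustrative helper, not part of the commit.
    final class MasterRetryCondition {

        // Retry when the master cannot be reached at all, or when it did respond
        // but reported that it is shutting down; in both cases a retry may reach
        // a newly elected master.
        static boolean shouldRetry(TransportException exp) {
            final Throwable cause = exp.unwrapCause();
            return cause instanceof ConnectTransportException
                || (exp instanceof RemoteTransportException && cause instanceof NodeClosedException);
        }
    }

Requiring the RemoteTransportException wrapper presumably distinguishes a remote master that is shutting down (worth retrying against a new master) from a NodeClosedException raised locally, where a retry would not help.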

File tree

3 files changed: +22 -2 lines changed

  server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java
  server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java
  server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java

server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java

Lines changed: 3 additions & 1 deletion
@@ -46,6 +46,7 @@
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.ConnectTransportException;
+import org.elasticsearch.transport.RemoteTransportException;
 import org.elasticsearch.transport.TransportException;
 import org.elasticsearch.transport.TransportService;
 
@@ -180,7 +181,8 @@ protected void doStart(ClusterState clusterState) {
                         @Override
                         public void handleException(final TransportException exp) {
                             Throwable cause = exp.unwrapCause();
-                            if (cause instanceof ConnectTransportException) {
+                            if (cause instanceof ConnectTransportException ||
+                                (exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) {
                                 // we want to retry here a bit to see if a new master is elected
                                 logger.debug("connection exception while trying to forward request with action name [{}] to " +
                                         "master node [{}], scheduling a retry. Error: [{}]",

server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java

Lines changed: 3 additions & 1 deletion
@@ -44,6 +44,7 @@
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.MasterNotDiscoveredException;
+import org.elasticsearch.node.NodeClosedException;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.test.ESTestCase;

@@ -391,7 +392,8 @@ public void testDelegateToFailingMaster() throws ExecutionException, Interrupted
             assertThat(capturedRequest.action, equalTo("internal:testAction"));
 
             if (rejoinSameMaster) {
-                transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error"));
+                transport.handleRemoteError(capturedRequest.requestId,
+                    randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode));
                 assertFalse(listener.isDone());
                 if (randomBoolean()) {
                     // simulate master node removal

server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java

Lines changed: 16 additions & 0 deletions
@@ -28,7 +28,10 @@
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.InternalTestCluster;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.atomic.AtomicBoolean;
 
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;

@@ -294,4 +297,17 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
         assertFalse(healthResponseFuture.get().isTimedOut());
     }
 
+    public void testHealthOnMasterFailover() throws Exception {
+        final String node = internalCluster().startDataOnlyNode();
+        final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();
+        // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
+        // without exceptions
+        for (int i = 0; i < 20; ++i) {
+            responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute());
+            internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
+        }
+        for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
+            assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN);
+        }
+    }
 }
