Skip to content

Commit d56fc72

Browse files
committed
Fix node health-check-related test failures (#59277)
In #52680 we introduced a new health check mechanism. This commit fixes up some sporadic related test failures, and improves the behaviour of the `FollowersChecker` slightly in the case that no retries are configured. Closes #59252 Closes #59172
1 parent c80a9e2 commit d56fc72

File tree

4 files changed

+26
-49
lines changed

4 files changed

+26
-49
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/FollowersChecker.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -344,16 +344,16 @@ public void handleException(TransportException exp) {
344344
failureCountSinceLastSuccess++;
345345

346346
final String reason;
347-
if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
348-
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
349-
reason = "followers check retry count exceeded";
350-
} else if (exp instanceof ConnectTransportException
347+
if (exp instanceof ConnectTransportException
351348
|| exp.getCause() instanceof ConnectTransportException) {
352349
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
353350
reason = "disconnected";
354351
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
355352
logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
356353
reason = "health check failed";
354+
} else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
355+
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
356+
reason = "followers check retry count exceeded";
357357
} else {
358358
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
359359
scheduleNextWakeUp();

server/src/main/java/org/elasticsearch/monitor/fs/FsHealthService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ public StatusInfo getHealth() {
129129

130130
class FsHealthMonitor implements Runnable {
131131

132-
private static final String TEMP_FILE_NAME = ".es_temp_file";
132+
static final String TEMP_FILE_NAME = ".es_temp_file";
133133
private byte[] byteToWrite;
134134

135135
FsHealthMonitor(){

server/src/test/java/org/elasticsearch/cluster/coordination/FollowersCheckerTests.java

Lines changed: 19 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -168,18 +168,7 @@ protected void onSendRequest(long requestId, String action, TransportRequest req
168168
}
169169

170170
public void testFailsNodeThatDoesNotRespond() {
171-
final Builder settingsBuilder = Settings.builder();
172-
if (randomBoolean()) {
173-
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
174-
}
175-
if (randomBoolean()) {
176-
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
177-
}
178-
if (randomBoolean()) {
179-
settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
180-
}
181-
final Settings settings = settingsBuilder.build();
182-
171+
final Settings settings = randomSettings();
183172
testBehaviourOfFailingNode(settings, () -> null,
184173
"followers check retry count exceeded",
185174
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
@@ -188,15 +177,7 @@ public void testFailsNodeThatDoesNotRespond() {
188177
}
189178

190179
public void testFailsNodeThatRejectsCheck() {
191-
final Builder settingsBuilder = Settings.builder();
192-
if (randomBoolean()) {
193-
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
194-
}
195-
if (randomBoolean()) {
196-
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
197-
}
198-
final Settings settings = settingsBuilder.build();
199-
180+
final Settings settings = randomSettings();
200181
testBehaviourOfFailingNode(settings, () -> {
201182
throw new ElasticsearchException("simulated exception");
202183
},
@@ -206,15 +187,7 @@ public void testFailsNodeThatRejectsCheck() {
206187
}
207188

208189
public void testFailureCounterResetsOnSuccess() {
209-
final Builder settingsBuilder = Settings.builder();
210-
if (randomBoolean()) {
211-
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(2, 10));
212-
}
213-
if (randomBoolean()) {
214-
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
215-
}
216-
final Settings settings = settingsBuilder.build();
217-
190+
final Settings settings = randomSettings();
218191
final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
219192
final int maxRecoveries = randomIntBetween(3, 10);
220193

@@ -297,16 +270,7 @@ public String toString() {
297270
}
298271

299272
public void testFailsNodeThatIsUnhealthy() {
300-
final Builder settingsBuilder = Settings.builder();
301-
if (randomBoolean()) {
302-
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
303-
}
304-
if (randomBoolean()) {
305-
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
306-
}
307-
final Settings settings = settingsBuilder.build();
308-
309-
testBehaviourOfFailingNode(settings, () -> {
273+
testBehaviourOfFailingNode(randomSettings(), () -> {
310274
throw new NodeHealthCheckFailureException("non writable exception");
311275
}, "health check failed", 0, () -> new StatusInfo(HEALTHY, "healthy-info"));
312276
}
@@ -321,7 +285,7 @@ private void testBehaviourOfFailingNode(Settings testSettings, Supplier<Transpor
321285
final MockTransport mockTransport = new MockTransport() {
322286
@Override
323287
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
324-
assertFalse(node.equals(localNode));
288+
assertNotEquals(node, localNode);
325289
deterministicTaskQueue.scheduleNow(new Runnable() {
326290
@Override
327291
public void run() {
@@ -674,6 +638,20 @@ private static DiscoveryNode newNode(int nodeId, Map<String, String> attributes,
674638
Version.CURRENT);
675639
}
676640

641+
private static Settings randomSettings() {
642+
final Builder settingsBuilder = Settings.builder();
643+
if (randomBoolean()) {
644+
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
645+
}
646+
if (randomBoolean()) {
647+
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
648+
}
649+
if (randomBoolean()) {
650+
settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
651+
}
652+
return settingsBuilder.build();
653+
}
654+
677655
private static class ExpectsSuccess implements TransportResponseHandler<Empty> {
678656
private final AtomicBoolean responseReceived = new AtomicBoolean();
679657

server/src/test/java/org/elasticsearch/monitor/fs/FsHealthServiceTests.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,7 @@ private static class FileSystemFsyncHungProvider extends FilterFileSystemProvide
305305
AtomicBoolean injectIOException = new AtomicBoolean();
306306
AtomicInteger injectedPaths = new AtomicInteger();
307307

308-
private String pathPrefix = "/";
309-
private long delay;
308+
private final long delay;
310309
private final ThreadPool threadPool;
311310

312311
FileSystemFsyncHungProvider(FileSystem inner, long delay, ThreadPool threadPool) {
@@ -325,7 +324,7 @@ public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options,
325324
@Override
326325
public void force(boolean metaData) throws IOException {
327326
if (injectIOException.get()) {
328-
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
327+
if (path.getFileName().toString().equals(FsHealthService.FsHealthMonitor.TEMP_FILE_NAME)) {
329328
injectedPaths.incrementAndGet();
330329
final long startTimeMillis = threadPool.relativeTimeInMillis();
331330
do {

0 commit comments

Comments
 (0)