Skip to content

Commit 6e8ea7a

Browse files
Work around JVM Bug in LongGCDisruptionTests (elastic#50731) (elastic#50974)
There is a JVM bug causing `Thread#suspend` calls to randomly take multiple seconds breaking these tests that call the method numerous times in a loop. Increasing the timeout would will not work since we may call `suspend` tens if not hundreds of times and even a small number of them experiencing the blocking will lead to multiple minutes of waiting. This PR detects the specific issue by timing the `Thread#suspend` calls and skips the remainder of the test if it timed out because of the JVM bug. Closes elastic#50047
1 parent d8510be commit 6e8ea7a

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

test/framework/src/main/java/org/elasticsearch/test/disruption/LongGCDisruption.java

+18
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import java.util.Random;
3333
import java.util.Set;
3434
import java.util.concurrent.ConcurrentHashMap;
35+
import java.util.concurrent.TimeUnit;
36+
import java.util.concurrent.atomic.AtomicBoolean;
3537
import java.util.concurrent.atomic.AtomicReference;
3638
import java.util.regex.Pattern;
3739
import java.util.stream.Collectors;
@@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
5658
private Set<Thread> suspendedThreads;
5759
private Thread blockDetectionThread;
5860

61+
private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);
62+
5963
public LongGCDisruption(Random random, String disruptedNode) {
6064
super(random);
6165
this.disruptedNode = disruptedNode;
6266
}
6367

68+
/**
69+
* Checks if during disruption we ran into a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
70+
* was observed.
71+
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
72+
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
73+
*/
74+
public boolean sawSlowSuspendBug() {
75+
return sawSlowSuspendBug.get();
76+
}
77+
6478
@Override
6579
public synchronized void startDisrupting() {
6680
if (suspendedThreads == null) {
@@ -251,7 +265,11 @@ protected boolean suspendThreads(Set<Thread> nodeThreads) {
251265
* assuming that it is safe.
252266
*/
253267
boolean definitelySafe = true;
268+
final long startTime = System.nanoTime();
254269
thread.suspend();
270+
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
271+
sawSlowSuspendBug.set(true);
272+
}
255273
// double check the thread is not in a shared resource like logging; if so, let it go and come back
256274
safe:
257275
for (StackTraceElement stackElement : thread.getStackTrace()) {

test/framework/src/test/java/org/elasticsearch/test/disruption/LongGCDisruptionTests.java

+9-4
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
*/
1919
package org.elasticsearch.test.disruption;
2020

21-
import org.elasticsearch.bootstrap.JavaVersion;
2221
import org.elasticsearch.common.Nullable;
2322
import org.elasticsearch.test.ESTestCase;
2423

@@ -115,8 +114,6 @@ protected long getSuspendingTimeoutInMillis() {
115114
* but does keep retrying until all threads can be safely paused
116115
*/
117116
public void testNotBlockingUnsafeStackTraces() throws Exception {
118-
assumeFalse("https://github.com/elastic/elasticsearch/issues/50047",
119-
JavaVersion.current().equals(JavaVersion.parse("11")) || JavaVersion.current().equals(JavaVersion.parse("12")));
120117
final String nodeName = "test_node";
121118
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
122119
@Override
@@ -149,14 +146,22 @@ protected Pattern[] getUnsafeClasses() {
149146
threads[i].start();
150147
}
151148
// make sure some threads are under lock
152-
disruption.startDisrupting();
149+
try {
150+
disruption.startDisrupting();
151+
} catch (RuntimeException e) {
152+
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
153+
return;
154+
}
155+
throw new AssertionError(e);
156+
}
153157
long first = ops.get();
154158
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
155159
Thread.sleep(100);
156160
assertThat(ops.get(), equalTo(first));
157161
disruption.stopDisrupting();
158162
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
159163
} finally {
164+
disruption.stopDisrupting();
160165
stop.set(true);
161166
for (final Thread thread : threads) {
162167
thread.join();

0 commit comments

Comments
 (0)