Skip to content

Commit b683589

Browse files
Dump Stacktrace on Slow IO-Thread Operations (#42000) (#42572)
* Dump Stacktrace on Slow IO-Thread Operations * Follow up to #39729 extending the functionality to actually dump the stack when the thread is blocked not afterwards * Logging the stacktrace after the thread became unblocked is only of limited use because we don't know what happened in the slow callback from that (only whether we were blocked on a read,write,connect etc.) * Relates #41745
1 parent 489616d commit b683589

File tree

3 files changed

+143
-55
lines changed

3 files changed

+143
-55
lines changed

test/framework/src/main/java/org/elasticsearch/transport/nio/MockNioTransport.java

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.elasticsearch.common.network.NetworkService;
3232
import org.elasticsearch.common.recycler.Recycler;
3333
import org.elasticsearch.common.settings.Settings;
34+
import org.elasticsearch.common.unit.TimeValue;
3435
import org.elasticsearch.common.util.PageCacheRecycler;
3536
import org.elasticsearch.indices.breaker.CircuitBreakerService;
3637
import org.elasticsearch.nio.BytesChannelContext;
@@ -57,11 +58,16 @@
5758
import java.nio.ByteBuffer;
5859
import java.nio.channels.ServerSocketChannel;
5960
import java.nio.channels.SocketChannel;
61+
import java.util.Arrays;
6062
import java.util.HashSet;
63+
import java.util.Map;
6164
import java.util.Set;
65+
import java.util.concurrent.ConcurrentHashMap;
6266
import java.util.concurrent.ConcurrentMap;
67+
import java.util.concurrent.TimeUnit;
6368
import java.util.function.Consumer;
6469
import java.util.function.IntFunction;
70+
import java.util.stream.Collectors;
6571

6672
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
6773
import static org.elasticsearch.common.util.concurrent.EsExecutors.daemonThreadFactory;
@@ -70,13 +76,15 @@ public class MockNioTransport extends TcpTransport {
7076
private static final Logger logger = LogManager.getLogger(MockNioTransport.class);
7177

7278
private final ConcurrentMap<String, MockTcpChannelFactory> profileToChannelFactory = newConcurrentMap();
79+
private final TransportThreadWatchdog transportThreadWatchdog;
7380
private volatile NioSelectorGroup nioGroup;
7481
private volatile MockTcpChannelFactory clientChannelFactory;
7582

7683
public MockNioTransport(Settings settings, Version version, ThreadPool threadPool, NetworkService networkService,
7784
PageCacheRecycler pageCacheRecycler, NamedWriteableRegistry namedWriteableRegistry,
7885
CircuitBreakerService circuitBreakerService) {
7986
super(settings, version, threadPool, pageCacheRecycler, circuitBreakerService, namedWriteableRegistry, networkService);
87+
this.transportThreadWatchdog = new TransportThreadWatchdog(threadPool);
8088
}
8189

8290
@Override
@@ -96,7 +104,7 @@ protected void doStart() {
96104
boolean success = false;
97105
try {
98106
nioGroup = new NioSelectorGroup(daemonThreadFactory(this.settings, TcpTransport.TRANSPORT_WORKER_THREAD_NAME_PREFIX), 2,
99-
(s) -> new TestEventHandler(this::onNonChannelException, s, System::nanoTime));
107+
(s) -> new TestEventHandler(this::onNonChannelException, s, transportThreadWatchdog));
100108

101109
ProfileSettings clientProfileSettings = new ProfileSettings(settings, "default");
102110
clientChannelFactory = new MockTcpChannelFactory(true, clientProfileSettings, "client");
@@ -125,6 +133,7 @@ protected void doStart() {
125133
@Override
126134
protected void stopInternal() {
127135
try {
136+
transportThreadWatchdog.stop();
128137
nioGroup.close();
129138
} catch (Exception e) {
130139
logger.warn("unexpected exception while stopping nio group", e);
@@ -319,4 +328,64 @@ public void sendMessage(BytesReference reference, ActionListener<Void> listener)
319328
getContext().sendMessage(BytesReference.toByteBuffers(reference), ActionListener.toBiConsumer(listener));
320329
}
321330
}
331+
332+
static final class TransportThreadWatchdog {
333+
334+
private static final long WARN_THRESHOLD = TimeUnit.MILLISECONDS.toNanos(150);
335+
336+
// Only check every 2s to not flood the logs on a blocked thread.
337+
// We mostly care about long blocks and not random slowness anyway and in tests would randomly catch slow operations that block for
338+
// less than 2s eventually.
339+
private static final TimeValue CHECK_INTERVAL = TimeValue.timeValueSeconds(2);
340+
341+
private final ThreadPool threadPool;
342+
private final ConcurrentHashMap<Thread, Long> registry = new ConcurrentHashMap<>();
343+
344+
private volatile boolean stopped;
345+
346+
TransportThreadWatchdog(ThreadPool threadPool) {
347+
this.threadPool = threadPool;
348+
threadPool.schedule(this::logLongRunningExecutions, CHECK_INTERVAL, ThreadPool.Names.GENERIC);
349+
}
350+
351+
public boolean register() {
352+
Long previousValue = registry.put(Thread.currentThread(), threadPool.relativeTimeInNanos());
353+
return previousValue == null;
354+
}
355+
356+
public void unregister() {
357+
Long previousValue = registry.remove(Thread.currentThread());
358+
assert previousValue != null;
359+
maybeLogElapsedTime(previousValue);
360+
}
361+
362+
private void maybeLogElapsedTime(long startTime) {
363+
long elapsedTime = threadPool.relativeTimeInNanos() - startTime;
364+
if (elapsedTime > WARN_THRESHOLD) {
365+
logger.warn(
366+
new ParameterizedMessage("Slow execution on network thread [{} milliseconds]",
367+
TimeUnit.NANOSECONDS.toMillis(elapsedTime)),
368+
new RuntimeException("Slow exception on network thread"));
369+
}
370+
}
371+
372+
private void logLongRunningExecutions() {
373+
for (Map.Entry<Thread, Long> entry : registry.entrySet()) {
374+
final long elapsedTime = threadPool.relativeTimeInMillis() - entry.getValue();
375+
if (elapsedTime > WARN_THRESHOLD) {
376+
final Thread thread = entry.getKey();
377+
logger.warn("Slow execution on network thread [{}] [{} milliseconds]: \n{}", thread.getName(),
378+
TimeUnit.NANOSECONDS.toMillis(elapsedTime),
379+
Arrays.stream(thread.getStackTrace()).map(Object::toString).collect(Collectors.joining("\n")));
380+
}
381+
}
382+
if (stopped == false) {
383+
threadPool.scheduleUnlessShuttingDown(CHECK_INTERVAL, ThreadPool.Names.GENERIC, this::logLongRunningExecutions);
384+
}
385+
}
386+
387+
public void stop() {
388+
stopped = true;
389+
}
390+
}
322391
}

test/framework/src/main/java/org/elasticsearch/transport/nio/TestEventHandler.java

Lines changed: 64 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,6 @@
1919

2020
package org.elasticsearch.transport.nio;
2121

22-
import org.apache.logging.log4j.LogManager;
23-
import org.apache.logging.log4j.Logger;
24-
import org.apache.logging.log4j.message.ParameterizedMessage;
2522
import org.elasticsearch.nio.ChannelContext;
2623
import org.elasticsearch.nio.EventHandler;
2724
import org.elasticsearch.nio.NioSelector;
@@ -32,185 +29,202 @@
3229
import java.util.Collections;
3330
import java.util.Set;
3431
import java.util.WeakHashMap;
35-
import java.util.concurrent.TimeUnit;
3632
import java.util.function.Consumer;
37-
import java.util.function.LongSupplier;
3833
import java.util.function.Supplier;
3934

4035
public class TestEventHandler extends EventHandler {
4136

42-
private static final Logger logger = LogManager.getLogger(TestEventHandler.class);
43-
4437
private final Set<SocketChannelContext> hasConnectedMap = Collections.newSetFromMap(new WeakHashMap<>());
4538
private final Set<SocketChannelContext> hasConnectExceptionMap = Collections.newSetFromMap(new WeakHashMap<>());
46-
private final LongSupplier relativeNanosSupplier;
39+
private final MockNioTransport.TransportThreadWatchdog transportThreadWatchdog;
4740

48-
TestEventHandler(Consumer<Exception> exceptionHandler, Supplier<NioSelector> selectorSupplier, LongSupplier relativeNanosSupplier) {
41+
TestEventHandler(Consumer<Exception> exceptionHandler, Supplier<NioSelector> selectorSupplier,
42+
MockNioTransport.TransportThreadWatchdog transportThreadWatchdog) {
4943
super(exceptionHandler, selectorSupplier);
50-
this.relativeNanosSupplier = relativeNanosSupplier;
44+
this.transportThreadWatchdog = transportThreadWatchdog;
5145
}
5246

5347
@Override
5448
protected void acceptChannel(ServerChannelContext context) throws IOException {
55-
long startTime = relativeNanosSupplier.getAsLong();
49+
final boolean registered = transportThreadWatchdog.register();
5650
try {
5751
super.acceptChannel(context);
5852
} finally {
59-
maybeLogElapsedTime(startTime);
53+
if (registered) {
54+
transportThreadWatchdog.unregister();
55+
}
6056
}
6157
}
6258

6359
@Override
6460
protected void acceptException(ServerChannelContext context, Exception exception) {
65-
long startTime = relativeNanosSupplier.getAsLong();
61+
final boolean registered = transportThreadWatchdog.register();
6662
try {
6763
super.acceptException(context, exception);
6864
} finally {
69-
maybeLogElapsedTime(startTime);
65+
if (registered) {
66+
transportThreadWatchdog.unregister();
67+
}
7068
}
7169
}
7270

7371
@Override
7472
protected void handleRegistration(ChannelContext<?> context) throws IOException {
75-
long startTime = relativeNanosSupplier.getAsLong();
73+
final boolean registered = transportThreadWatchdog.register();
7674
try {
7775
super.handleRegistration(context);
7876
} finally {
79-
maybeLogElapsedTime(startTime);
77+
if (registered) {
78+
transportThreadWatchdog.unregister();
79+
}
8080
}
8181
}
8282

8383
@Override
8484
protected void registrationException(ChannelContext<?> context, Exception exception) {
85-
long startTime = relativeNanosSupplier.getAsLong();
85+
final boolean registered = transportThreadWatchdog.register();
8686
try {
8787
super.registrationException(context, exception);
8888
} finally {
89-
maybeLogElapsedTime(startTime);
89+
if (registered) {
90+
transportThreadWatchdog.unregister();
91+
}
9092
}
9193
}
9294

9395
public void handleConnect(SocketChannelContext context) throws IOException {
9496
assert hasConnectedMap.contains(context) == false : "handleConnect should only be called is a channel is not yet connected";
95-
long startTime = relativeNanosSupplier.getAsLong();
97+
final boolean registered = transportThreadWatchdog.register();
9698
try {
9799
super.handleConnect(context);
98100
if (context.isConnectComplete()) {
99101
hasConnectedMap.add(context);
100102
}
101103
} finally {
102-
maybeLogElapsedTime(startTime);
104+
if (registered) {
105+
transportThreadWatchdog.unregister();
106+
}
103107
}
104108
}
105109

106110
public void connectException(SocketChannelContext context, Exception e) {
107111
assert hasConnectExceptionMap.contains(context) == false : "connectException should only called at maximum once per channel";
112+
final boolean registered = transportThreadWatchdog.register();
108113
hasConnectExceptionMap.add(context);
109-
long startTime = relativeNanosSupplier.getAsLong();
110114
try {
111115
super.connectException(context, e);
112116
} finally {
113-
maybeLogElapsedTime(startTime);
117+
if (registered) {
118+
transportThreadWatchdog.unregister();
119+
}
114120
}
115121
}
116122

117123
@Override
118124
protected void handleRead(SocketChannelContext context) throws IOException {
119-
long startTime = relativeNanosSupplier.getAsLong();
125+
final boolean registered = transportThreadWatchdog.register();
120126
try {
121127
super.handleRead(context);
122128
} finally {
123-
maybeLogElapsedTime(startTime);
129+
if (registered) {
130+
transportThreadWatchdog.unregister();
131+
}
124132
}
125133
}
126134

127135
@Override
128136
protected void readException(SocketChannelContext context, Exception exception) {
129-
long startTime = relativeNanosSupplier.getAsLong();
137+
final boolean registered = transportThreadWatchdog.register();
130138
try {
131139
super.readException(context, exception);
132140
} finally {
133-
maybeLogElapsedTime(startTime);
141+
if (registered) {
142+
transportThreadWatchdog.unregister();
143+
}
134144
}
135145
}
136146

137147
@Override
138148
protected void handleWrite(SocketChannelContext context) throws IOException {
139-
long startTime = relativeNanosSupplier.getAsLong();
149+
final boolean registered = transportThreadWatchdog.register();
140150
try {
141151
super.handleWrite(context);
142152
} finally {
143-
maybeLogElapsedTime(startTime);
153+
if (registered) {
154+
transportThreadWatchdog.unregister();
155+
}
144156
}
145157
}
146158

147159
@Override
148160
protected void writeException(SocketChannelContext context, Exception exception) {
149-
long startTime = relativeNanosSupplier.getAsLong();
161+
final boolean registered = transportThreadWatchdog.register();
150162
try {
151163
super.writeException(context, exception);
152164
} finally {
153-
maybeLogElapsedTime(startTime);
165+
if (registered) {
166+
transportThreadWatchdog.unregister();
167+
}
154168
}
155169
}
156170

157171
@Override
158172
protected void handleTask(Runnable task) {
159-
long startTime = relativeNanosSupplier.getAsLong();
173+
final boolean registered = transportThreadWatchdog.register();
160174
try {
161175
super.handleTask(task);
162176
} finally {
163-
maybeLogElapsedTime(startTime);
177+
if (registered) {
178+
transportThreadWatchdog.unregister();
179+
}
164180
}
165181
}
166182

167183
@Override
168184
protected void taskException(Exception exception) {
169-
long startTime = relativeNanosSupplier.getAsLong();
185+
final boolean registered = transportThreadWatchdog.register();
170186
try {
171187
super.taskException(exception);
172188
} finally {
173-
maybeLogElapsedTime(startTime);
189+
if (registered) {
190+
transportThreadWatchdog.unregister();
191+
}
174192
}
175193
}
176194

177195
@Override
178196
protected void handleClose(ChannelContext<?> context) throws IOException {
179-
long startTime = relativeNanosSupplier.getAsLong();
197+
final boolean registered = transportThreadWatchdog.register();
180198
try {
181199
super.handleClose(context);
182200
} finally {
183-
maybeLogElapsedTime(startTime);
201+
if (registered) {
202+
transportThreadWatchdog.unregister();
203+
}
184204
}
185205
}
186206

187207
@Override
188208
protected void closeException(ChannelContext<?> context, Exception exception) {
189-
long startTime = relativeNanosSupplier.getAsLong();
209+
final boolean registered = transportThreadWatchdog.register();
190210
try {
191211
super.closeException(context, exception);
192212
} finally {
193-
maybeLogElapsedTime(startTime);
213+
if (registered) {
214+
transportThreadWatchdog.unregister();
215+
}
194216
}
195217
}
196218

197219
@Override
198220
protected void genericChannelException(ChannelContext<?> context, Exception exception) {
199-
long startTime = relativeNanosSupplier.getAsLong();
221+
final boolean registered = transportThreadWatchdog.register();
200222
try {
201223
super.genericChannelException(context, exception);
202224
} finally {
203-
maybeLogElapsedTime(startTime);
204-
}
205-
}
206-
207-
private static final long WARN_THRESHOLD = 150;
208-
209-
private void maybeLogElapsedTime(long startTime) {
210-
long elapsedTime = TimeUnit.NANOSECONDS.toMillis(relativeNanosSupplier.getAsLong() - startTime);
211-
if (elapsedTime > WARN_THRESHOLD) {
212-
logger.warn(new ParameterizedMessage("Slow execution on network thread [{} milliseconds]", elapsedTime),
213-
new RuntimeException("Slow exception on network thread"));
225+
if (registered) {
226+
transportThreadWatchdog.unregister();
227+
}
214228
}
215229
}
216230
}

0 commit comments

Comments
 (0)