Skip to content

Commit bc440dc

Browse files
robertgshaw2-redhat and tlrmchlsmth
authored and committed
[V1] Improve TP>1 Error Handling + Stack Trace (vllm-project#11721)
Co-authored-by: Tyler Michael Smith <[email protected]>
1 parent e5c2734 commit bc440dc

File tree

4 files changed

+40
-21
lines changed

4 files changed

+40
-21
lines changed

vllm/v1/engine/async_llm.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import asyncio
22
import os
3-
import signal
43
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
54

65
from vllm.config import ModelConfig, VllmConfig
@@ -42,21 +41,6 @@ def __init__(
4241
start_engine_loop: bool = True,
4342
) -> None:
4443

45-
# The child processes will send SIGQUIT when unrecoverable
46-
# errors happen. We kill the process tree here so that the
47-
# stack trace is very evident.
48-
# TODO: rather than killing the main process, we should
49-
# figure out how to raise an AsyncEngineDeadError and
50-
# handle at the API server level so we can return a better
51-
# error code to the clients calling VLLM.
52-
def sigquit_handler(signum, frame):
53-
logger.fatal(
54-
"AsyncLLM got SIGQUIT from worker processes, shutting "
55-
"down. See stack trace above for root cause issue.")
56-
kill_process_tree(os.getpid())
57-
58-
signal.signal(signal.SIGQUIT, sigquit_handler)
59-
6044
assert start_engine_loop
6145

6246
self.log_requests = log_requests

vllm/v1/engine/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def signal_handler(signum, frame):
198198
except Exception:
199199
traceback = get_exception_traceback()
200200
logger.error("EngineCore hit an exception: %s", traceback)
201-
parent_process.send_signal(signal.SIGQUIT)
201+
parent_process.send_signal(signal.SIGUSR1)
202202

203203
finally:
204204
if engine_core is not None:

vllm/v1/engine/core_client.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import os
2+
import signal
13
import weakref
24
from abc import ABC, abstractmethod
35
from typing import List, Type
@@ -8,7 +10,8 @@
810

911
from vllm.config import VllmConfig
1012
from vllm.logger import init_logger
11-
from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
13+
from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
14+
make_zmq_socket)
1215
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
1316
EngineCoreProfile, EngineCoreRequest,
1417
EngineCoreRequestType, EngineCoreRequestUnion)
@@ -134,6 +137,20 @@ def __init__(
134137
executor_class: Type[Executor],
135138
log_stats: bool = False,
136139
):
140+
# The child processes will send SIGUSR1 when unrecoverable
141+
# errors happen. We kill the process tree here so that the
142+
# stack trace is very evident.
143+
# TODO(rob): rather than killing the main process, we should
144+
# figure out how to raise an AsyncEngineDeadError and
145+
# handle at the API server level so we can return a better
146+
# error code to the clients calling VLLM.
147+
def sigusr1_handler(signum, frame):
148+
logger.fatal("Got fatal signal from worker processes, shutting "
149+
"down. See stack trace above for root cause issue.")
150+
kill_process_tree(os.getpid())
151+
152+
signal.signal(signal.SIGUSR1, sigusr1_handler)
153+
137154
# Serialization setup.
138155
self.encoder = PickleEncoder()
139156
self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)

vllm/v1/executor/multiproc_executor.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from multiprocessing.process import BaseProcess
1010
from typing import Any, Dict, List, Optional, Tuple
1111

12+
import psutil
1213
import zmq
1314

1415
from vllm.config import VllmConfig
@@ -38,6 +39,19 @@ def __init__(self, vllm_config: VllmConfig) -> None:
3839
# and ensure workers will be terminated.
3940
self._finalizer = weakref.finalize(self, self.shutdown)
4041

42+
# The child processes will send SIGUSR1 when unrecoverable
43+
# errors happen.
44+
def sigusr1_handler(signum, frame):
45+
logger.fatal(
46+
"MulitprocExecutor got fatal signal from worker processes, "
47+
"shutting down. See stack trace above for root cause issue.")
48+
# Propagate error up to parent process.
49+
parent_process = psutil.Process().parent()
50+
parent_process.send_signal(signal.SIGUSR1)
51+
self.shutdown()
52+
53+
signal.signal(signal.SIGUSR1, sigusr1_handler)
54+
4155
self.vllm_config = vllm_config
4256
self.parallel_config = vllm_config.parallel_config
4357

@@ -335,8 +349,11 @@ def signal_handler(signum, frame):
335349
except SystemExit:
336350
logger.debug("Worker interrupted.")
337351

338-
except BaseException as e:
339-
logger.exception(e)
352+
except Exception:
353+
# worker_busy_loop sends exceptions to Executor
354+
# for shutdown, but if there is an error in startup or an
355+
# error with IPC itself, we need to alert the parent.
356+
psutil.Process().parent().send_signal(signal.SIGUSR1)
340357
raise
341358

342359
finally:
@@ -377,9 +394,10 @@ def worker_busy_loop(self):
377394

378395
try:
379396
output = getattr(self.worker, method)(*args, **kwargs)
380-
except BaseException as e:
397+
except Exception as e:
381398
self.worker_response_mq.enqueue(
382399
(WorkerProc.ResponseStatus.FAILURE, e))
400+
logger.exception("WorkerProc hit an exception: %s", exc_info=e)
383401
continue
384402

385403
self.worker_response_mq.enqueue(

0 commit comments

Comments
 (0)