1
1
# SPDX-License-Identifier: Apache-2.0
2
2
3
- import pickle
4
3
import queue
5
4
import signal
6
5
import threading
7
6
import time
8
7
from multiprocessing .connection import Connection
9
- from typing import List , Tuple , Type
8
+ from typing import Any , List , Tuple , Type
10
9
11
10
import psutil
12
11
import zmq
19
18
from vllm .utils import get_exception_traceback , zmq_socket_ctx
20
19
from vllm .v1 .core .kv_cache_utils import get_kv_cache_config
21
20
from vllm .v1 .core .scheduler import Scheduler
22
- from vllm .v1 .engine import (EngineCoreOutputs , EngineCoreProfile ,
23
- EngineCoreRequest , EngineCoreRequestType ,
24
- EngineCoreRequestUnion , EngineCoreResetPrefixCache )
21
+ from vllm .v1 .engine import (EngineCoreOutputs , EngineCoreRequest ,
22
+ EngineCoreRequestType )
25
23
from vllm .v1 .engine .mm_input_mapper import MMInputMapperServer
26
24
from vllm .v1 .executor .abstract import Executor
27
25
from vllm .v1 .request import Request , RequestStatus
28
- from vllm .v1 .serial_utils import MsgpackEncoder , PickleEncoder
26
+ from vllm .v1 .serial_utils import MsgpackDecoder , MsgpackEncoder
29
27
from vllm .version import __version__ as VLLM_VERSION
30
28
31
29
logger = init_logger (__name__ )
@@ -171,7 +169,8 @@ def __init__(
171
169
# and to overlap some serialization/deserialization with the
172
170
# model forward pass.
173
171
# Threads handle Socket <-> Queues and core_busy_loop uses Queue.
174
- self .input_queue : queue .Queue [EngineCoreRequestUnion ] = queue .Queue ()
172
+ self .input_queue : queue .Queue [Tuple [EngineCoreRequestType ,
173
+ Any ]] = queue .Queue ()
175
174
self .output_queue : queue .Queue [EngineCoreOutputs ] = queue .Queue ()
176
175
threading .Thread (target = self .process_input_socket ,
177
176
args = (input_path , ),
@@ -233,7 +232,7 @@ def run_busy_loop(self):
233
232
while True :
234
233
try :
235
234
req = self .input_queue .get (timeout = POLLING_TIMEOUT_S )
236
- self ._handle_client_request (req )
235
+ self ._handle_client_request (* req )
237
236
break
238
237
except queue .Empty :
239
238
logger .debug ("EngineCore busy loop waiting." )
@@ -243,59 +242,51 @@ def run_busy_loop(self):
243
242
except BaseException :
244
243
raise
245
244
246
- # 2) Handle any new client requests (Abort or Add) .
245
+ # 2) Handle any new client requests.
247
246
while not self .input_queue .empty ():
248
247
req = self .input_queue .get_nowait ()
249
- self ._handle_client_request (req )
248
+ self ._handle_client_request (* req )
250
249
251
250
# 3) Step the engine core.
252
251
outputs = self .step ()
253
252
254
253
# 4) Put EngineCoreOutputs into the output queue.
255
254
self .output_queue .put_nowait (outputs )
256
255
257
def _handle_client_request(self, request_type: EngineCoreRequestType,
                           request: Any) -> None:
    """Dispatch a single client request to the matching engine operation.

    Args:
        request_type: Tag identifying which operation the client asked for.
        request: Payload that accompanies the tag. It is forwarded to
            ``add_request`` for ADD, to ``abort_requests`` for ABORT and to
            ``model_executor.profile`` for PROFILE. It is not used for
            RESET_PREFIX_CACHE.
    """

    if request_type == EngineCoreRequestType.ADD:
        self.add_request(request)
    elif request_type == EngineCoreRequestType.ABORT:
        self.abort_requests(request)
    elif request_type == EngineCoreRequestType.RESET_PREFIX_CACHE:
        self.reset_prefix_cache()
    elif request_type == EngineCoreRequestType.PROFILE:
        self.model_executor.profile(request)
    else:
        # Surface unhandled types instead of dropping them silently; the
        # request is still skipped so the busy loop keeps running.
        logger.error("Unrecognized input request type encountered: %s",
                     request_type)
270
268
271
269
def process_input_socket(self, input_path: str):
    """Input socket IO thread.

    Receives two-frame messages ``(request type, request data)`` from a
    PULL socket opened on ``input_path``, decodes the data frame and puts a
    ``(request_type, request)`` tuple on ``self.input_queue``. The loop
    never returns on its own.

    Args:
        input_path: Address handed to ``zmq_socket_ctx`` for the PULL socket.
    """

    # Msgpack serialization decoding: ADD payloads are decoded with a
    # decoder constructed for EngineCoreRequest, everything else with an
    # untyped decoder.
    add_request_decoder = MsgpackDecoder(EngineCoreRequest)
    generic_decoder = MsgpackDecoder()

    with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
        while True:
            # (RequestType, RequestData)
            type_frame, data_frame = socket.recv_multipart(copy=False)
            # Look the type up by its raw bytes value.
            # NOTE(review): presumably raises ValueError for a value that
            # is not a member of EngineCoreRequestType - confirm.
            request_type = EngineCoreRequestType(bytes(type_frame.buffer))

            # Deserialize the request data.
            if request_type == EngineCoreRequestType.ADD:
                decoder = add_request_decoder
            else:
                decoder = generic_decoder
            request = decoder.decode(data_frame.buffer)

            # Push to input queue for core busy loop.
            self.input_queue.put_nowait((request_type, request))
299
290
300
291
def process_output_socket (self , output_path : str ):
301
292
"""Output socket IO thread."""
0 commit comments