
Commit 8d2dca1

youkaichao authored and cennn committed
[core] add sleep and wake up endpoint and v1 support (vllm-project#12987)
Signed-off-by: youkaichao <[email protected]>
Signed-off-by: cennn <[email protected]>
Co-authored-by: cennn <[email protected]>
Signed-off-by: Louis Ulmer <[email protected]>
1 parent 2749bea commit 8d2dca1

File tree: 13 files changed, +160 −9 lines

tests/basic_correctness/test_cumem.py

Lines changed: 8 additions & 4 deletions
@@ -118,14 +118,16 @@ def model(x):
 
 @fork_new_process_for_each_test
 @pytest.mark.parametrize(
-    "model",
+    "model, use_v1",
     [
         # sleep mode with safetensors
-        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
-        "facebook/opt-125m"
+        ("facebook/opt-125m", False),
     ])
-def test_end_to_end(model):
+def test_end_to_end(model: str, use_v1: bool):
+    import os
+    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     load_format = LoadFormat.AUTO
@@ -152,3 +154,5 @@ def test_end_to_end(model):
 
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    del os.environ["VLLM_USE_V1"]
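
The parametrized test above exercises sleep/wake-up end to end on both engines, switching between v0 and v1 via the VLLM_USE_V1 environment variable. As a rough illustration only (not part of this diff), the offline flow it covers looks roughly like the sketch below; it assumes the LLM.sleep()/LLM.wake_up() entrypoints and the enable_sleep_mode flag that the test relies on.

    import os

    # Pick the engine before importing vLLM; "0" would select the v0 path.
    os.environ["VLLM_USE_V1"] = "1"

    from vllm import LLM, SamplingParams

    llm = LLM("facebook/opt-125m", enable_sleep_mode=True)
    params = SamplingParams(temperature=0, max_tokens=10)

    out1 = llm.generate("How are you?", params)
    llm.sleep(level=1)   # release GPU memory while the engine is idle
    llm.wake_up()        # restore engine state before generating again
    out2 = llm.generate("How are you?", params)

    # Outputs should match across the sleep/wake-up cycle, as the test asserts.
    assert out1[0].outputs[0].text == out2[0].outputs[0].text
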
New file
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import requests
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "meta-llama/Llama-3.2-1B"
+
+
+def test_sleep_mode():
+    # dtype, max-len etc set so that this can run in CI
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enable-sleep-mode",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME,
+                            args,
+                            env_dict={
+                                "VLLM_SERVER_DEV_MODE": "1",
+                                "CUDA_VISIBLE_DEVICES": "0"
+                            }) as remote_server:
+        response = requests.post(remote_server.url_for("/sleep"),
+                                 data={"level": "1"})
+        assert response.status_code == 200
+        response = requests.post(remote_server.url_for("/wake_up"))
+        assert response.status_code == 200

vllm/engine/async_llm_engine.py

Lines changed: 6 additions & 0 deletions
@@ -1187,6 +1187,12 @@ async def stop_profile(self) -> None:
     async def reset_prefix_cache(self) -> None:
         self.engine.reset_prefix_cache()
 
+    async def sleep(self, level: int = 1) -> None:
+        self.engine.sleep(level)
+
+    async def wake_up(self) -> None:
+        self.engine.wake_up()
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         self.engine.add_lora(lora_request)
 

vllm/engine/multiprocessing/__init__.py

Lines changed: 11 additions & 1 deletion
@@ -127,6 +127,15 @@ class RPCResetPrefixCacheRequest(Enum):
     RESET_PREFIX_CACHE = 1
 
 
+class RPCSleepRequest(Enum):
+    SLEEP_LEVEL_1 = 1
+    SLEEP_LEVEL_2 = 2
+
+
+class RPCWakeUpRequest(Enum):
+    WAKE_UP = 1
+
+
 @dataclass
 class RPCLoadAdapterRequest:
     lora_request: LoRARequest
@@ -141,7 +150,8 @@ class RPCAdapterLoadedResponse:
 
 RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
                       RPCUProfileRequest, RPCLoadAdapterRequest,
-                      RPCResetPrefixCacheRequest]
+                      RPCResetPrefixCacheRequest, RPCSleepRequest,
+                      RPCWakeUpRequest]
 
 REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
                           RPCError]
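
A note on the new RPC types: the client (next file) constructs RPCSleepRequest(level), so the integer sleep level maps onto the enum by value and the engine recovers it with request.value. A tiny standalone sketch of that round trip, mirroring the enum defined above:

    from enum import Enum

    class RPCSleepRequest(Enum):
        SLEEP_LEVEL_1 = 1
        SLEEP_LEVEL_2 = 2

    # Client side: wrap the int; engine side: unwrap it.
    request = RPCSleepRequest(2)
    assert request is RPCSleepRequest.SLEEP_LEVEL_2
    assert request.value == 2
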

vllm/engine/multiprocessing/client.py

Lines changed: 13 additions & 2 deletions
@@ -31,8 +31,9 @@
                                          RPCLoadAdapterRequest,
                                          RPCProcessRequest,
                                          RPCResetPrefixCacheRequest,
-                                         RPCStartupRequest, RPCStartupResponse,
-                                         RPCUProfileRequest)
+                                         RPCSleepRequest, RPCStartupRequest,
+                                         RPCStartupResponse,
+                                         RPCUProfileRequest, RPCWakeUpRequest)
 from vllm.engine.protocol import EngineClient
 # yapf: enable
 from vllm.envs import VLLM_RPC_TIMEOUT
@@ -685,6 +686,16 @@ async def reset_prefix_cache(self) -> None:
             request=RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE,
             socket=self.input_socket)
 
+    async def sleep(self, level: int = 1) -> None:
+        """Sleep the engine for a given level"""
+        return await self._send_one_way_rpc_request(
+            request=RPCSleepRequest(level), socket=self.input_socket)
+
+    async def wake_up(self) -> None:
+        """Wake up the engine"""
+        return await self._send_one_way_rpc_request(
+            request=RPCWakeUpRequest.WAKE_UP, socket=self.input_socket)
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         # Uses the same I/O as generate requests

vllm/engine/multiprocessing/engine.py

Lines changed: 13 additions & 2 deletions
@@ -20,8 +20,9 @@
                                          RPCLoadAdapterRequest,
                                          RPCProcessRequest,
                                          RPCResetPrefixCacheRequest,
-                                         RPCStartupRequest, RPCStartupResponse,
-                                         RPCUProfileRequest)
+                                         RPCSleepRequest, RPCStartupRequest,
+                                         RPCStartupResponse,
+                                         RPCUProfileRequest, RPCWakeUpRequest)
 # yapf: enable
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -242,6 +243,10 @@ def handle_new_input(self):
                     self._handle_load_adapter_request(request)
                 elif isinstance(request, RPCResetPrefixCacheRequest):
                     self.reset_prefix_cache()
+                elif isinstance(request, RPCSleepRequest):
+                    self.sleep(request.value)
+                elif isinstance(request, RPCWakeUpRequest):
+                    self.wake_up()
                 else:
                     raise ValueError("Unknown RPCRequest Type: "
                                      f"{type(request)}")
@@ -369,6 +374,12 @@ def stop_profile(self) -> None:
     def reset_prefix_cache(self) -> bool:
         return self.engine.reset_prefix_cache()
 
+    def sleep(self, level: int = 1) -> None:
+        self.engine.sleep(level)
+
+    def wake_up(self) -> None:
+        self.engine.wake_up()
+
 
 def signal_handler(*_) -> None:
     raise KeyboardInterrupt("MQLLMEngine terminated")

vllm/engine/protocol.py

Lines changed: 10 additions & 0 deletions
@@ -278,6 +278,16 @@ async def reset_prefix_cache(self) -> None:
         """Reset the prefix cache"""
         ...
 
+    @abstractmethod
+    async def sleep(self, level: int = 1) -> None:
+        """Sleep the engine"""
+        ...
+
+    @abstractmethod
+    async def wake_up(self) -> None:
+        """Wake up the engine"""
+        ...
+
     @abstractmethod
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""

vllm/entrypoints/openai/api_server.py

Lines changed: 18 additions & 0 deletions
@@ -625,6 +625,24 @@ async def reset_prefix_cache(raw_request: Request):
         await engine_client(raw_request).reset_prefix_cache()
         return Response(status_code=200)
 
+    @router.post("/sleep")
+    async def sleep(raw_request: Request):
+        # get POST params
+        level = raw_request.query_params.get("level", "1")
+        logger.info("sleep the engine with level %s", level)
+        await engine_client(raw_request).sleep(int(level))
+        # FIXME: in v0 with frontend multiprocessing, the sleep command
+        # is sent but does not finish yet when we return a response.
+        return Response(status_code=200)
+
+    @router.post("/wake_up")
+    async def wake_up(raw_request: Request):
+        logger.info("wake up the engine")
+        await engine_client(raw_request).wake_up()
+        # FIXME: in v0 with frontend multiprocessing, the wake-up command
+        # is sent but does not finish yet when we return a response.
+        return Response(status_code=200)
+
 
 @router.post("/invocations", dependencies=[Depends(validate_json_request)])
 async def invocations(raw_request: Request):
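
For reference, a hedged client-side sketch (not part of this diff) of calling the new routes against a locally running server; it assumes the default port 8000 and a server started with --enable-sleep-mode and, as in the new test above, VLLM_SERVER_DEV_MODE=1. The sleep level is read from the query string and defaults to "1".

    import requests

    BASE_URL = "http://localhost:8000"  # assumed default host/port

    # Put the engine to sleep; "level" is passed as a query parameter.
    resp = requests.post(f"{BASE_URL}/sleep", params={"level": "1"})
    assert resp.status_code == 200

    # Wake it back up before serving further requests.
    resp = requests.post(f"{BASE_URL}/wake_up")
    assert resp.status_code == 200
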

vllm/entrypoints/openai/serving_transcription.py

Lines changed: 1 addition & 0 deletions
@@ -295,6 +295,7 @@ async def create_transcription(
         # TODO(rob): figure out a way to pipe streaming in.
         # Non-streaming response.
         try:
+            assert result_generator is not None
             async for op in result_generator:
                 result = op
             return TranscriptionResponse(text=result.outputs[0].text)

vllm/v1/engine/async_llm.py

Lines changed: 6 additions & 0 deletions
@@ -361,6 +361,12 @@ async def stop_profile(self) -> None:
     async def reset_prefix_cache(self) -> None:
         await self.engine_core.reset_prefix_cache_async()
 
+    async def sleep(self, level: int = 1) -> None:
+        await self.engine_core.sleep_async(level)
+
+    async def wake_up(self) -> None:
+        await self.engine_core.wake_up_async()
+
     async def add_lora(self, lora_request: LoRARequest) -> None:
         """Load a new LoRA adapter into the engine for future requests."""
         await self.engine_core.add_lora_async(lora_request)

vllm/v1/engine/core.py

Lines changed: 6 additions & 0 deletions
@@ -213,6 +213,12 @@ def profile(self, is_start: bool = True):
     def reset_prefix_cache(self):
         self.scheduler.reset_prefix_cache()
 
+    def sleep(self, level: int = 1):
+        self.model_executor.sleep(level)
+
+    def wake_up(self):
+        self.model_executor.wake_up()
+
     def add_lora(self, lora_request: LoRARequest) -> None:
         self.model_executor.add_lora(lora_request)
 

vllm/v1/engine/core_client.py

Lines changed: 30 additions & 0 deletions
@@ -81,6 +81,12 @@ def profile(self, is_start: bool = True) -> None:
     def reset_prefix_cache(self) -> None:
         raise NotImplementedError
 
+    def sleep(self, level: int = 1) -> None:
+        raise NotImplementedError
+
+    def wake_up(self) -> None:
+        raise NotImplementedError
+
     def abort_requests(self, request_ids: List[str]) -> None:
         raise NotImplementedError
 
@@ -99,6 +105,12 @@ async def profile_async(self, is_start: bool = True) -> None:
     async def reset_prefix_cache_async(self) -> None:
         raise NotImplementedError
 
+    async def sleep_async(self, level: int = 1) -> None:
+        raise NotImplementedError
+
+    async def wake_up_async(self) -> None:
+        raise NotImplementedError
+
     async def abort_requests_async(self, request_ids: List[str]) -> None:
         raise NotImplementedError
 
@@ -138,6 +150,12 @@ def profile(self, is_start: bool = True) -> None:
     def reset_prefix_cache(self) -> None:
         self.engine_core.reset_prefix_cache()
 
+    def sleep(self, level: int = 1) -> None:
+        self.engine_core.sleep(level)
+
+    def wake_up(self) -> None:
+        self.engine_core.wake_up()
+
     def add_lora(self, lora_request: LoRARequest) -> None:
         self.engine_core.add_lora(lora_request)
 
@@ -307,6 +325,12 @@ def reset_prefix_cache(self) -> None:
     def add_lora(self, lora_request: LoRARequest) -> None:
         self._call_utility("add_lora", lora_request)
 
+    def sleep(self, level: int = 1) -> None:
+        self._call_utility("sleep", level)
+
+    def wake_up(self) -> None:
+        self._call_utility("wake_up")
+
 
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
@@ -384,5 +408,11 @@ async def profile_async(self, is_start: bool = True) -> None:
     async def reset_prefix_cache_async(self) -> None:
         await self._call_utility_async("reset_prefix_cache")
 
+    async def sleep_async(self, level: int = 1) -> None:
+        await self._call_utility_async("sleep", level)
+
+    async def wake_up_async(self) -> None:
+        await self._call_utility_async("wake_up")
+
     async def add_lora_async(self, lora_request: LoRARequest) -> None:
         await self._call_utility_async("add_lora", lora_request)

vllm/v1/engine/llm_engine.py

Lines changed: 6 additions & 0 deletions
@@ -169,6 +169,12 @@ def stop_profile(self):
     def reset_prefix_cache(self):
         self.engine_core.reset_prefix_cache()
 
+    def sleep(self, level: int = 1):
+        self.engine_core.sleep(level)
+
+    def wake_up(self):
+        self.engine_core.wake_up()
+
     def get_tokenizer_group(
         self,
         group_type: Type[_G] = BaseTokenizerGroup,
