
Commit 7423cf0

Authored by reidliu41
[Misc] refactor example - cpu_offload_lmcache (#17460)
Signed-off-by: reidliu41 <[email protected]>
Co-authored-by: reidliu41 <[email protected]>
1 parent 460a2b1 commit 7423cf0

File tree

3 files changed, +64 -70 lines changed

examples/lmcache/README.md

Lines changed: 2 additions & 2 deletions

@@ -44,8 +44,8 @@ The main script generates several log files:
 
 ## 2. CPU Offload Examples
 
-- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
-- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
+- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
+- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1
 
 ## 3. KV Cache Sharing

examples/lmcache/cpu_offload_lmcache_v0.py renamed to examples/lmcache/cpu_offload_lmcache.py

Lines changed: 62 additions & 11 deletions
@@ -1,22 +1,37 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 This file demonstrates the example usage of cpu offloading
-with LMCache.
+with LMCache in vLLM v1 or v0.
+
+Usage:
+
+    Specify vLLM version
+
+    -v v0 : Use LMCacheConnector
+            model = mistralai/Mistral-7B-Instruct-v0.2
+            (Includes enable_chunked_prefill = True)
+
+    -v v1 : Use LMCacheConnectorV1 (default)
+            model = meta-llama/Meta-Llama-3.1-8B-Instruct
+            (Without enable_chunked_prefill)
 
 Note that `lmcache` is needed to run this example.
 Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
 Learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
+import argparse
 import contextlib
 import os
 import time
+from dataclasses import asdict
 
 from lmcache.experimental.cache_engine import LMCacheEngineBuilder
 from lmcache.integration.vllm.utils import ENGINE_NAME
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
+from vllm.engine.arg_utils import EngineArgs
 
 
 def setup_environment_variables():
@@ -32,18 +47,32 @@ def setup_environment_variables():
 
 
 @contextlib.contextmanager
-def build_llm_with_lmcache():
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+def build_llm_with_lmcache(lmcache_connector: str, model: str,
+                           vllm_version: str):
+    ktc = KVTransferConfig(
+        kv_connector=lmcache_connector,
+        kv_role="kv_both",
+    )
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              enable_chunked_prefill=True,
-              gpu_memory_utilization=0.8)
-
+    if vllm_version == "v0":
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+            enable_chunked_prefill=True,  # Only in v0
+        )
+    else:
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+        )
+
+    llm = LLM(**asdict(llm_args))
     try:
         yield llm
     finally:
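
The pattern introduced in the hunk above collects the engine options in an EngineArgs dataclass and expands them into the LLM constructor with dataclasses.asdict, instead of passing keyword arguments to LLM directly. A minimal standalone sketch of that same pattern, with a small placeholder model that is not the one used by the example:

from dataclasses import asdict

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs

# EngineArgs is a dataclass, so asdict() yields the keyword arguments
# that LLM() accepts, mirroring the hunk above.
# "facebook/opt-125m" is only a small placeholder model for illustration.
llm_args = EngineArgs(model="facebook/opt-125m", max_model_len=2048)
llm = LLM(**asdict(llm_args))

Building the options as EngineArgs first is what lets the v0 and v1 branches share everything except enable_chunked_prefill.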
@@ -57,6 +86,9 @@ def print_output(
     sampling_params: SamplingParams,
     req_str: str,
 ):
+    # Should be able to see logs like the following:
+    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+    # This indicates that the KV cache has been stored in LMCache.
     start = time.time()
     outputs = llm.generate(prompt, sampling_params)
     print("-" * 50)
@@ -68,10 +100,29 @@ def print_output(
     print("-" * 50)
 
 
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v",
+                        "--version",
+                        choices=["v0", "v1"],
+                        default="v1",
+                        help="Specify vLLM version (default: v1)")
+    return parser.parse_args()
+
+
 def main():
+    args = parse_args()
+
+    if args.version == "v0":
+        lmcache_connector = "LMCacheConnector"
+        model = "mistralai/Mistral-7B-Instruct-v0.2"
+    else:
+        lmcache_connector = "LMCacheConnectorV1"
+        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
     setup_environment_variables()
 
-    with build_llm_with_lmcache() as llm:
+    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
 
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts

examples/lmcache/cpu_offload_lmcache_v1.py

Lines changed: 0 additions & 57 deletions
This file was deleted.
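
Taken together, the three file changes fold the former v0 and v1 scripts into a single argparse-driven example. The following is a rough end-to-end sketch reassembled from the hunks above; the environment-variable setup is omitted, and the prompt and the cleanup call in the `finally` block are illustrative assumptions, since the diff does not show those lines.

import argparse
from contextlib import contextmanager
from dataclasses import asdict

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs


@contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
    ktc = KVTransferConfig(kv_connector=lmcache_connector, kv_role="kv_both")
    # enable_chunked_prefill is only set on the v0 path, matching the hunk above.
    extra = {"enable_chunked_prefill": True} if vllm_version == "v0" else {}
    llm_args = EngineArgs(model=model,
                          kv_transfer_config=ktc,
                          max_model_len=8000,
                          gpu_memory_utilization=0.8,
                          **extra)
    llm = LLM(**asdict(llm_args))
    try:
        yield llm
    finally:
        # Cleanup implied by the LMCacheEngineBuilder/ENGINE_NAME imports;
        # the body of this block is not shown in the diff, so this is an assumption.
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--version", choices=["v0", "v1"], default="v1",
                        help="Specify vLLM version (default: v1)")
    args = parser.parse_args()

    if args.version == "v0":
        connector, model = "LMCacheConnector", "mistralai/Mistral-7B-Instruct-v0.2"
    else:
        connector, model = "LMCacheConnectorV1", "meta-llama/Meta-Llama-3.1-8B-Instruct"

    with build_llm_with_lmcache(connector, model, args.version) as llm:
        sampling_params = SamplingParams(temperature=0.0, max_tokens=32)
        # Placeholder prompt: the real example sends two requests that share a
        # long common prefix so the second can reuse the offloaded KV cache.
        outputs = llm.generate(["Hello, how are you?"], sampling_params)
        for output in outputs:
            print(output.outputs[0].text)


if __name__ == "__main__":
    main()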
