
Commit b747119

Author: reidliu41
[Misc] refactor example - cpu_offload_lmcache
Signed-off-by: reidliu41 <[email protected]>
1 parent b74d888 commit b747119

File tree

3 files changed (+65, -70 lines)

examples/lmcache/README.md

Lines changed: 2 additions & 2 deletions
@@ -44,8 +44,8 @@ The main script generates several log files:
 
 ## 2. CPU Offload Examples
 
-- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
-- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
+- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
+- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1
 
 ## 3. KV Cache Sharing
 
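For context, both commands documented above run a single script: the `-v`/`--version` flag selects the LMCache connector and the model. Below is a minimal sketch of that dispatch, reusing the argparse option and the connector/model names from the refactored example further down; the helper `select_connector_and_model` is illustrative and not part of the actual script.

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--version",
                        choices=["v0", "v1"],
                        default="v1",
                        help="Specify vLLM version (default: v1)")
    return parser.parse_args()

def select_connector_and_model(version: str):
    # Illustrative helper (not in the example script): map the version flag
    # to the connector/model pairing used by the refactored example.
    if version == "v0":
        return "LMCacheConnector", "mistralai/Mistral-7B-Instruct-v0.2"
    return "LMCacheConnectorV1", "meta-llama/Meta-Llama-3.1-8B-Instruct"

if __name__ == "__main__":
    args = parse_args()
    connector, model = select_connector_and_model(args.version)
    print(f"Selected {connector} with model {model}")

The real script passes the selected connector and model into `build_llm_with_lmcache()`, as the diff below shows.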
examples/lmcache/cpu_offload_lmcache_v0.py renamed to examples/lmcache/cpu_offload_lmcache.py

Lines changed: 63 additions & 11 deletions
@@ -1,22 +1,37 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 This file demonstrates the example usage of cpu offloading
-with LMCache.
+with LMCache in vLLM v1 or v0.
+
+Usage:
+
+    Specify vLLM version
+
+    -v v0 : Use LMCacheConnector
+            model = mistralai/Mistral-7B-Instruct-v0.2
+            (Includes enable_chunked_prefill = True)
+
+    -v v1 : Use LMCacheConnectorV1 (default)
+            model = meta-llama/Meta-Llama-3.1-8B-Instruct
+            (Without enable_chunked_prefill)
 
 Note that `lmcache` is needed to run this example.
 Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
 Learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
+import argparse
 import contextlib
 import os
 import time
+from dataclasses import asdict
 
 from lmcache.experimental.cache_engine import LMCacheEngineBuilder
 from lmcache.integration.vllm.utils import ENGINE_NAME
 
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
+from vllm.engine.arg_utils import EngineArgs
 
 
 def setup_environment_variables():
@@ -32,18 +47,33 @@ def setup_environment_variables():
 
 
 @contextlib.contextmanager
-def build_llm_with_lmcache():
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+def build_llm_with_lmcache(lmcache_connector: str, model: str,
+                           vllm_version: str):
+    ktc = KVTransferConfig(
+        kv_connector=lmcache_connector,
+        kv_role="kv_both",
+    )
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              enable_chunked_prefill=True,
-              gpu_memory_utilization=0.8)
-
+    llm_args: EngineArgs
+    if vllm_version == "v0":
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+            enable_chunked_prefill=True,  # Only in v0
+        )
+    else:
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+        )
+
+    llm = LLM(**asdict(llm_args))
     try:
         yield llm
     finally:
@@ -57,6 +87,9 @@ def print_output(
     sampling_params: SamplingParams,
     req_str: str,
 ):
+    # Should be able to see logs like the following:
+    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+    # This indicates that the KV cache has been stored in LMCache.
     start = time.time()
     outputs = llm.generate(prompt, sampling_params)
     print("-" * 50)
@@ -68,10 +101,29 @@
     print("-" * 50)
 
 
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v",
+                        "--version",
+                        choices=["v0", "v1"],
+                        default="v1",
+                        help="Specify vLLM version (default: v1)")
+    return parser.parse_args()
+
+
 def main():
+    args = parse_args()
+
+    if args.version == "v0":
+        lmcache_connector = "LMCacheConnector"
+        model = "mistralai/Mistral-7B-Instruct-v0.2"
+    else:
+        lmcache_connector = "LMCacheConnectorV1"
+        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
     setup_environment_variables()
 
-    with build_llm_with_lmcache() as llm:
+    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
 
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
examples/lmcache/cpu_offload_lmcache_v1.py

Lines changed: 0 additions & 57 deletions
This file was deleted.
