
Commit bfe5158

Author: reidliu41

[Misc] refactor example - cpu_offload_lmcache

Signed-off-by: reidliu41 <[email protected]>

1 parent ece5a8b · commit bfe5158

File tree: 3 files changed, +57 -68 lines changed


examples/lmcache/README.md

Lines changed: 2 additions & 2 deletions

@@ -44,8 +44,8 @@ The main script generates several log files:
 
 ## 2. CPU Offload Examples
 
-- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
-- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
+- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
+- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1
 
 ## 3. KV Cache Sharing
 
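For readers skimming only the README change: the new `-v` flag is handled by the renamed script shown in the next diff. As a quick orientation, here is a minimal sketch of that dispatch in isolation; the `select_backend` helper is purely illustrative and not part of the commit (the actual script does this inline in `main()`), while the flag, connector names, and model names are taken from the diff below.

import argparse


def parse_args():
    # Same flag the renamed example script accepts: -v/--version, v0 or v1.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--version",
                        choices=["v0", "v1"],
                        default="v1",
                        help="Specify vLLM version (default: v1)")
    return parser.parse_args()


def select_backend(version: str) -> tuple[str, str]:
    # Illustrative helper (not in the commit): map the flag to the
    # LMCache connector and demo model used for each vLLM version.
    if version == "v0":
        return "LMCacheConnector", "mistralai/Mistral-7B-Instruct-v0.2"
    return "LMCacheConnectorV1", "meta-llama/Meta-Llama-3.1-8B-Instruct"


if __name__ == "__main__":
    args = parse_args()
    connector, model = select_backend(args.version)
    print(f"{args.version}: {connector} with {model}")
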
examples/lmcache/cpu_offload_lmcache_v0.py renamed to examples/lmcache/cpu_offload_lmcache.py

Lines changed: 55 additions & 9 deletions
@@ -1,7 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 """
 This file demonstrates the example usage of cpu offloading
-with LMCache.
+with LMCache in vLLM v1 or v0.
+
+Usage:
+
+    Specify vLLM version
+
+    -v v0 : Use LMCacheConnector
+            model = mistralai/Mistral-7B-Instruct-v0.2
+            (Includes enable_chunked_prefill = True)
+
+    -v v1 : Use LMCacheConnectorV1 (default)
+            model = meta-llama/Meta-Llama-3.1-8B-Instruct
+            (Without enable_chunked_prefill)
 
 Note that `lmcache` is needed to run this example.
 Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
@@ -11,6 +23,7 @@
 import contextlib
 import os
 import time
+import argparse
 
 from lmcache.experimental.cache_engine import LMCacheEngineBuilder
 from lmcache.integration.vllm.utils import ENGINE_NAME
@@ -32,17 +45,28 @@ def setup_environment_variables():
 
 
 @contextlib.contextmanager
-def build_llm_with_lmcache():
+def build_llm_with_lmcache(
+    lmcache_connector: str,
+    model: str,
+    vllm_version: str
+):
     ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+        f'{{"kv_connector":"{lmcache_connector}", "kv_role":"kv_both"}}')
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              enable_chunked_prefill=True,
-              gpu_memory_utilization=0.8)
+    llm_args = {
+        "model": model,
+        "kv_transfer_config": ktc,
+        "max_model_len": 8000,
+        "gpu_memory_utilization": 0.8,
+    }
+
+    # Only for v0
+    if vllm_version == "v0":
+        llm_args["enable_chunked_prefill"] = True
+
+    llm = LLM(**llm_args)
 
     try:
         yield llm
@@ -57,6 +81,9 @@ def print_output(
     sampling_params: SamplingParams,
     req_str: str,
 ):
+    # Should be able to see logs like the following:
+    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+    # This indicates that the KV cache has been stored in LMCache.
     start = time.time()
     outputs = llm.generate(prompt, sampling_params)
     print("-" * 50)
@@ -68,10 +95,29 @@ def print_output(
     print("-" * 50)
 
 
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v",
+                        "--version",
+                        choices=["v0", "v1"],
+                        default="v1",
+                        help="Specify vLLM version (default: v1)")
+    return parser.parse_args()
+
+
 def main():
+    args = parse_args()
+
+    if args.version == "v0":
+        lmcache_connector = "LMCacheConnector"
+        model = "mistralai/Mistral-7B-Instruct-v0.2"
+    else:
+        lmcache_connector = "LMCacheConnectorV1"
+        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
     setup_environment_variables()
 
-    with build_llm_with_lmcache() as llm:
+    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
 
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
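Taken out of diff form, the construction pattern this file introduces looks roughly like the sketch below. It is a condensed reading of the diff, not the full example: the import path for `KVTransferConfig` is assumed for vLLM at the time of this commit, and the LMCache engine teardown that the real script performs after the `yield` is only noted in a comment.

import contextlib

from vllm import LLM
from vllm.config import KVTransferConfig  # assumed import path for this vLLM version


@contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
    # Build the KV transfer config from a JSON string; the connector name
    # (LMCacheConnector or LMCacheConnectorV1) is injected via the f-string.
    ktc = KVTransferConfig.from_cli(
        f'{{"kv_connector":"{lmcache_connector}", "kv_role":"kv_both"}}')

    # Collect keyword arguments in a dict so version-specific options can be
    # added conditionally before the LLM is constructed.
    llm_args = {
        "model": model,
        "kv_transfer_config": ktc,
        "max_model_len": 8000,
        "gpu_memory_utilization": 0.8,
    }
    if vllm_version == "v0":
        # Chunked prefill is enabled explicitly only on the v0 path.
        llm_args["enable_chunked_prefill"] = True

    llm = LLM(**llm_args)
    try:
        yield llm
    finally:
        # The real example also tears down the LMCache engine here
        # (it imports LMCacheEngineBuilder and ENGINE_NAME for that purpose).
        del llm

The dict-plus-`**llm_args` pattern keeps both version paths on a single `LLM(...)` call instead of duplicating the constructor, which is what lets this commit collapse the separate v0 and v1 scripts into one file.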

examples/lmcache/cpu_offload_lmcache_v1.py

Lines changed: 0 additions & 57 deletions
This file was deleted.
