# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of CPU offloading
- with LMCache.
+ with LMCache in vLLM v1 or v0.
+
+ Usage:
+
+     Specify vLLM version
+
+     -v v0 : Use LMCacheConnector
+             model = mistralai/Mistral-7B-Instruct-v0.2
+             (Includes enable_chunked_prefill = True)
+
+     -v v1 : Use LMCacheConnectorV1 (default)
+             model = meta-llama/Meta-Llama-3.1-8B-Instruct
+             (Without enable_chunked_prefill)

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python 3.10 or higher, CUDA 12.1.
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
+ import argparse
import contextlib
import os
import time
+ from dataclasses import asdict

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
+ from vllm.engine.arg_utils import EngineArgs


def setup_environment_variables():
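The body of setup_environment_variables() lies outside the hunks shown in this commit. Purely as a hedged illustration of what such a setup typically configures (the variable names follow LMCache's documented settings; the values below are placeholders, not taken from this commit), CPU offloading is driven by a handful of LMCache environment variables:

import os

# Illustrative values only; not part of this commit.
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"   # use LMCache's experimental cache engine
os.environ["LMCACHE_CHUNK_SIZE"] = "256"          # tokens per KV-cache chunk
os.environ["LMCACHE_LOCAL_CPU"] = "True"          # enable the local CPU memory backend
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"  # CPU cache budget in GB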
@@ -32,18 +47,32 @@ def setup_environment_variables():


@contextlib.contextmanager
- def build_llm_with_lmcache():
-     ktc = KVTransferConfig.from_cli(
-         '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+ def build_llm_with_lmcache(lmcache_connector: str, model: str,
+                            vllm_version: str):
+     ktc = KVTransferConfig(
+         kv_connector=lmcache_connector,
+         kv_role="kv_both",
+     )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-               kv_transfer_config=ktc,
-               max_model_len=8000,
-               enable_chunked_prefill=True,
-               gpu_memory_utilization=0.8)
-
+     if vllm_version == "v0":
+         llm_args = EngineArgs(
+             model=model,
+             kv_transfer_config=ktc,
+             max_model_len=8000,
+             gpu_memory_utilization=0.8,
+             enable_chunked_prefill=True,  # Only in v0
+         )
+     else:
+         llm_args = EngineArgs(
+             model=model,
+             kv_transfer_config=ktc,
+             max_model_len=8000,
+             gpu_memory_utilization=0.8,
+         )
+
+     llm = LLM(**asdict(llm_args))
    try:
        yield llm
    finally:
@@ -57,6 +86,9 @@ def print_output(
    sampling_params: SamplingParams,
    req_str: str,
):
+     # Should be able to see logs like the following:
+     # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+     # This indicates that the KV cache has been stored in LMCache.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
@@ -68,10 +100,29 @@ def print_output(
    print("-" * 50)


+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-v",
+                         "--version",
+                         choices=["v0", "v1"],
+                         default="v1",
+                         help="Specify vLLM version (default: v1)")
+     return parser.parse_args()
+
+
def main():
+     args = parse_args()
+
+     if args.version == "v0":
+         lmcache_connector = "LMCacheConnector"
+         model = "mistralai/Mistral-7B-Instruct-v0.2"
+     else:
+         lmcache_connector = "LMCacheConnectorV1"
+         model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
    setup_environment_variables()

-     with build_llm_with_lmcache() as llm:
+     with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
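Taken together, the change makes this one example cover both vLLM engine versions through a single -v flag. As a minimal sketch of how the updated script might be exercised end to end, assuming it is saved as cpu_offload_lmcache.py on the current path (the filename and the availability of both model checkpoints are assumptions, not stated in the commit):

import subprocess
import sys

# Run the example once per vLLM version; "-v" selects the connector/model
# pair described in the module docstring (the default is "v1").
for version in ("v1", "v0"):
    subprocess.run(
        [sys.executable, "cpu_offload_lmcache.py", "-v", version],
        check=True,
    )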