# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of CPU offloading
-with LMCache.
+with LMCache in vLLM v1 or v0.
+
+Usage:
+
+    Specify vLLM version
+
+    -v v0 : Use LMCacheConnector
+            model = mistralai/Mistral-7B-Instruct-v0.2
+            (Includes enable_chunked_prefill = True)
+
+    -v v1 : Use LMCacheConnectorV1 (default)
+            model = meta-llama/Meta-Llama-3.1-8B-Instruct
+            (Without enable_chunked_prefill)
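+
+    Example invocation (the script filename below is an assumption):
+        python cpu_offload_lmcache.py -v v1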

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python 3.10 or higher, CUDA 12.1.
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
+import argparse
import contextlib
import os
import time
+from dataclasses import asdict

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
+from vllm.engine.arg_utils import EngineArgs


def setup_environment_variables():
@@ -32,18 +47,33 @@ def setup_environment_variables():


@contextlib.contextmanager
-def build_llm_with_lmcache():
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+def build_llm_with_lmcache(lmcache_connector: str, model: str,
+                           vllm_version: str):
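+    # kv_role="kv_both" lets this engine both store KV blocks into LMCache
+    # and reuse KV blocks previously stored there (producer and consumer).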
+    ktc = KVTransferConfig(
+        kv_connector=lmcache_connector,
+        kv_role="kv_both",
+    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 48GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              enable_chunked_prefill=True,
-              gpu_memory_utilization=0.8)
-
+    llm_args: EngineArgs
+    if vllm_version == "v0":
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+            enable_chunked_prefill=True,  # Only in v0
+        )
+    else:
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+        )
+
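+    # EngineArgs is a dataclass, so asdict() turns it into the keyword
+    # arguments expected by the LLM constructor.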
+    llm = LLM(**asdict(llm_args))
    try:
        yield llm
    finally:
@@ -57,6 +87,9 @@ def print_output(
    sampling_params: SamplingParams,
    req_str: str,
):
+    # Should be able to see logs like the following:
+    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+    # This indicates that the KV cache has been stored in LMCache.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
@@ -68,10 +101,29 @@ def print_output(
    print("-" * 50)


+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v",
+                        "--version",
+                        choices=["v0", "v1"],
+                        default="v1",
+                        help="Specify vLLM version (default: v1)")
+    return parser.parse_args()
+
+
def main():
+    args = parse_args()
+
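+    # Pick the LMCache connector and demo model that match the requested
+    # vLLM engine version (see the module docstring above).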
+    if args.version == "v0":
+        lmcache_connector = "LMCacheConnector"
+        model = "mistralai/Mistral-7B-Instruct-v0.2"
+    else:
+        lmcache_connector = "LMCacheConnectorV1"
+        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
    setup_environment_variables()

-    with build_llm_with_lmcache() as llm:
+    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts