# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
- with LMCache.
+ with LMCache in vLLM v1 or v0.
+
+ Usage:
+
+     Specify vLLM version
+
+     -v v0 : Use LMCacheConnector
+             model = mistralai/Mistral-7B-Instruct-v0.2
+             (Includes enable_chunked_prefill = True)
+
+     -v v1 : Use LMCacheConnectorV1 (default)
+             model = meta-llama/Meta-Llama-3.1-8B-Instruct
+             (Without enable_chunked_prefill)

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
@@ -11,6 +23,7 @@
import contextlib
import os
import time
+ import argparse

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
@@ -32,17 +45,28 @@ def setup_environment_variables():


@contextlib.contextmanager
- def build_llm_with_lmcache():
+ def build_llm_with_lmcache(
+     lmcache_connector: str,
+     model: str,
+     vllm_version: str
+ ):
    ktc = KVTransferConfig.from_cli(
-         '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+         f'{{"kv_connector":"{lmcache_connector}", "kv_role":"kv_both"}}')
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-               kv_transfer_config=ktc,
-               max_model_len=8000,
-               enable_chunked_prefill=True,
-               gpu_memory_utilization=0.8)
+     llm_args = {
+         "model": model,
+         "kv_transfer_config": ktc,
+         "max_model_len": 8000,
+         "gpu_memory_utilization": 0.8,
+     }
+
+     # Only for v0
+     if vllm_version == "v0":
+         llm_args["enable_chunked_prefill"] = True
+
+     llm = LLM(**llm_args)

    try:
        yield llm
@@ -57,6 +81,9 @@ def print_output(
    sampling_params: SamplingParams,
    req_str: str,
):
+     # Should be able to see logs like the following:
+     # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+     # This indicates that the KV cache has been stored in LMCache.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
@@ -68,10 +95,29 @@ def print_output(
    print("-" * 50)


+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("-v",
+                         "--version",
+                         choices=["v0", "v1"],
+                         default="v1",
+                         help="Specify vLLM version (default: v1)")
+     return parser.parse_args()
+
+
def main():
+     args = parse_args()
+
+     if args.version == "v0":
+         lmcache_connector = "LMCacheConnector"
+         model = "mistralai/Mistral-7B-Instruct-v0.2"
+     else:
+         lmcache_connector = "LMCacheConnectorV1"
+         model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
    setup_environment_variables()

-     with build_llm_with_lmcache() as llm:
+     with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
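
For quick reference, the patched helper boils down to roughly the following for the default `-v v1` path. This is an illustrative sketch assembled from the hunks above, not part of the patch itself; the `vllm` import paths are assumed to be the standard `from vllm import LLM` and `from vllm.config import KVTransferConfig`, which sit outside the shown context.

```python
# Sketch: what build_llm_with_lmcache() effectively constructs for -v v1
# (values copied from the diff above; import paths assumed, see note).
from vllm import LLM
from vllm.config import KVTransferConfig

ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')

llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
          kv_transfer_config=ktc,
          max_model_len=8000,
          gpu_memory_utilization=0.8)

# For -v v0 the connector is "LMCacheConnector", the model is
# "mistralai/Mistral-7B-Instruct-v0.2", and enable_chunked_prefill=True
# is passed to LLM() as well.
```

Either path is selected at run time through the new `-v/--version` flag added by `parse_args()`, with `v1` as the default.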