18
18
import requests
19
19
from openai import OpenAI
20
20
21
- from vllm .assets .audio import AudioAsset
22
21
from vllm .utils import FlexibleArgumentParser
23
22
24
23
# Modify OpenAI's API key and API base to use vLLM's API server.
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
151
150
print ("Chat completion output:" , result )
152
151
153
152
153
# Video input inference
def run_video() -> None:
    """Query the served model with video input, two ways.

    Sends the same "What's in this video?" question twice: once with the
    raw HTTP video URL in the payload, and once with the video content
    inlined as a base64 ``data:`` URL. Prints both completions.

    Uses the module-level ``client`` and ``model``; performs network I/O
    and returns nothing.
    """
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    def _create_chat_completion(url: str):
        # Both requests share the identical message shape; only the
        # video URL (direct vs. base64 data URL) differs.
        return client.chat.completions.create(
            messages=[{
                "role":
                "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's in this video?"
                    },
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": url
                        },
                    },
                ],
            }],
            model=model,
            max_completion_tokens=64,
        )

    ## Use video url in the payload
    chat_completion_from_url = _create_chat_completion(video_url)
    result = chat_completion_from_url.choices[0].message.content
    # Fixed label: the original said "image url", copied from the image
    # example — this is video input.
    print("Chat completion output from video url:", result)

    ## Use base64 encoded video in the payload
    chat_completion_from_base64 = _create_chat_completion(
        f"data:video/mp4;base64,{video_base64}")
    result = chat_completion_from_base64.choices[0].message.content
    # Fixed label: was "base64 encoded image".
    print("Chat completion output from base64 encoded video:", result)
+
154
209
# Audio input inference
155
210
def run_audio () -> None :
211
+ from vllm .assets .audio import AudioAsset
212
+
156
213
audio_url = AudioAsset ("winning_call" ).url
157
214
audio_base64 = encode_base64_content_from_url (audio_url )
158
215
@@ -240,6 +297,7 @@ def run_audio() -> None:
240
297
"text-only" : run_text_only ,
241
298
"single-image" : run_single_image ,
242
299
"multi-image" : run_multi_image ,
300
+ "video" : run_video ,
243
301
"audio" : run_audio ,
244
302
}
245
303
@@ -253,12 +311,11 @@ def main(args) -> None:
253
311
# Command-line entry point: pick which multimodal demo to run.
parser = FlexibleArgumentParser(
    description='Demo on using OpenAI client for online inference with '
    'multimodal language models served with vLLM.')
# Deriving choices from the dispatch map keeps the CLI in sync when new
# chat types are registered.
valid_chat_types = list(example_function_map.keys())
parser.add_argument('--chat-type',
                    '-c',
                    type=str,
                    default="single-image",
                    choices=valid_chat_types,
                    help='Conversation type with multimodal data.')
args = parser.parse_args()
main(args)
0 commit comments