
Commit b85a977

[Doc] Add video example to openai client for multimodal (#11521)
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>

1 parent eec906d · commit b85a977

File tree

2 files changed: +114, −11 lines


docs/source/usage/multimodal_inputs.md

Lines changed: 49 additions & 3 deletions
```diff
@@ -294,12 +294,62 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 ### Video
 
-Instead of {code}`image_url`, you can pass a video file via {code}`video_url`.
+Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
 
-You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference.
+First, launch the OpenAI-compatible server:
+
+```bash
+vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
+```
+
+Then, you can use the OpenAI client as follows:
+```python
+from openai import OpenAI
+
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+# Query the server for the name of the served model.
+models = client.models.list()
+model = models.data[0].id
+
+video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+
+## Use video url in the payload
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this video?"
+            },
+            {
+                "type": "video_url",
+                "video_url": {
+                    "url": video_url
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_completion_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print("Chat completion output from video url:", result)
+```
+
+Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
 
 ````{note}
-By default, the timeout for fetching videos through HTTP URL url is `30` seconds.
+By default, the timeout for fetching videos through HTTP URL is `30` seconds.
 You can override this by setting the environment variable:
 
 ```console
```
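The snippet above passes a remote HTTP URL straight through to the server. If the video is a local file instead, the same `video_url` content part accepts a base64 data URL, as shown in the example file below. A minimal sketch, assuming a local MP4 at the hypothetical path `sample_video.mp4`:

```python
import base64

# Hypothetical local file; substitute any MP4 on disk.
with open("sample_video.mp4", "rb") as f:
    video_base64 = base64.b64encode(f.read()).decode("utf-8")

# Drop-in replacement for `video_url` in the payload above.
video_data_url = f"data:video/mp4;base64,{video_base64}"
```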

examples/openai_chat_completion_client_for_multimodal.py

Lines changed: 65 additions & 8 deletions
```diff
@@ -18,7 +18,6 @@
 import requests
 from openai import OpenAI
 
-from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
```
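The `AudioAsset` import is removed from module level here; it reappears as a local import inside `run_audio()` in the next hunk, presumably so it is only loaded when the audio demo is actually selected.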
```diff
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
     print("Chat completion output:", result)
 
 
+# Video input inference
+def run_video() -> None:
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    video_base64 = encode_base64_content_from_url(video_url)
+
+    ## Use video url in the payload
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": video_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from video url:", result)
+
+    ## Use base64 encoded video in the payload
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": f"data:video/mp4;base64,{video_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded video:", result)
+
+
 # Audio input inference
 def run_audio() -> None:
+    from vllm.assets.audio import AudioAsset
+
     audio_url = AudioAsset("winning_call").url
     audio_base64 = encode_base64_content_from_url(audio_url)
```

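`encode_base64_content_from_url` is defined earlier in this example file and is unchanged by this commit, so it does not appear in the diff. For context, a helper matching these call sites might look roughly like the following sketch (the exact implementation in the repo may differ):

```python
import base64

import requests


def encode_base64_content_from_url(content_url: str) -> str:
    """Fetch content from a remote URL and encode it as base64."""
    with requests.get(content_url) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")
```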
```diff
@@ -240,6 +297,7 @@ def run_audio() -> None:
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "video": run_video,
     "audio": run_audio,
 }
 
```
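`example_function_map` is what ties each `--chat-type` value to its demo function, so registering `run_video` here is the only step needed to expose the new modality; the argparse change below picks it up automatically.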

```diff
@@ -253,12 +311,11 @@ def main(args) -> None:
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online inference with '
         'multimodal language models served with vLLM.')
-    parser.add_argument(
-        '--chat-type',
-        '-c',
-        type=str,
-        default="single-image",
-        choices=["text-only", "single-image", "multi-image", "audio"],
-        help='Conversation type with multimodal data.')
+    parser.add_argument('--chat-type',
+                        '-c',
+                        type=str,
+                        default="single-image",
+                        choices=list(example_function_map.keys()),
+                        help='Conversation type with multimodal data.')
     args = parser.parse_args()
     main(args)
```
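Deriving `choices` from the keys of `example_function_map` keeps the CLI options in sync with the registered demos, so future modalities will not need a second edit here. After this change, the video demo runs with `python examples/openai_chat_completion_client_for_multimodal.py --chat-type video`.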
