Commit 5460d18
feat: trtllm-serve multimodal support (#3590)
* feat: trtllm-serve multimodal support
* remove disable argument
* remove disable
* add and separate tests and move the doc
* remove block_resue arg from serve.py

Signed-off-by: yechank <[email protected]>
Co-authored-by: Haohang Huang <[email protected]>
1 parent ce83296 commit 5460d18

14 files changed: +665 −41 lines

docs/source/commands/trtllm-serve.rst

+37
@@ -66,6 +66,43 @@ Another example uses ``curl``:
    :language: bash
    :linenos:
 
+Multimodal Serving
+~~~~~~~~~~~~~~~~~~
+
+For multimodal models (e.g., Qwen2-VL), you'll need to create a configuration file and start the server with additional options.
+
+First, create a configuration file:
+
+.. code-block:: bash
+
+   cat > ./extra-llm-api-config.yml <<EOF
+   kv_cache_config:
+     enable_block_reuse: false
+   EOF
+
+Then, start the server with the configuration file:
+
+.. code-block:: bash
+
+   trtllm-serve Qwen/Qwen2-VL-7B-Instruct \
+       --extra_llm_api_options ./extra-llm-api-config.yml \
+       --backend pytorch
+
+Completions API
+~~~~~~~~~~~~~~~
+
+You can query the Completions API with any HTTP client; a typical example is the OpenAI Python client:
+
+.. literalinclude:: ../../../examples/serve/openai_completion_client_for_multimodal.py
+   :language: python
+   :linenos:
+
+Another example uses ``curl``:
+
+.. literalinclude:: ../../../examples/serve/curl_completion_client_for_multimodal.sh
+   :language: bash
+   :linenos:
+
 Benchmark
 ---------
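Once the server reports ready, a quick smoke test confirms the model is being served. A minimal sketch, assuming the default port 8000 and that trtllm-serve exposes the standard OpenAI-compatible `/v1/models` route (the API key value is arbitrary for a local server):

# Smoke test for a freshly started trtllm-serve instance (sketch).
# Assumes default host/port and an OpenAI-compatible /v1/models route.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="tensorrt_llm")

# The served multimodal model should show up in the model list.
for model in client.models.list():
    print(model.id)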

examples/serve/curl_chat_client.sh

+1 −1

@@ -3,7 +3,7 @@
 curl http://localhost:8000/v1/chat/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": TinyLlama-1.1B-Chat-v1.0,
+        "model": "TinyLlama-1.1B-Chat-v1.0",
         "messages":[{"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": "Where is New York?"}],
         "max_tokens": 16,
examples/serve/… (new file)

@@ -0,0 +1,28 @@
+#! /usr/bin/env bash
+
+# Single image inference
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen2-VL-7B-Instruct",
+        "messages":[{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Describe the natural environment in the image."
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
+                    }
+                }
+            ]
+        }],
+        "max_tokens": 64,
+        "temperature": 0
+    }'

examples/serve/curl_completion_client.sh

+1 −1

@@ -3,7 +3,7 @@
 curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": TinyLlama-1.1B-Chat-v1.0,
+        "model": "TinyLlama-1.1B-Chat-v1.0",
         "prompt": "Where is New York?",
         "max_tokens": 16,
         "temperature": 0
examples/serve/… (new file)

@@ -0,0 +1,36 @@
+### OpenAI Chat Client
+
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="tensorrt_llm",
+)
+
+# Single image inference
+response = client.chat.completions.create(
+    model="Qwen2-VL-7B-Instruct",
+    messages=[{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": [{
+            "type": "text",
+            "text": "Describe the natural environment in the image."
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"
+            }
+        }]
+    }],
+    max_tokens=64,
+)
+print(response)
+
+# TODO
+# multi-image inference
+# video inference

tensorrt_llm/_torch/models/modeling_llava_next.py

+1 −1

@@ -136,7 +136,7 @@ def __call__(
         self, inputs: TextPrompt, sampling_params: SamplingParams
     ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
         text_prompt, mm_data = inputs.get("prompt"), inputs.get(
-            "multi_modal_data")
+            "multi_modal_data", {})
         assert 'image' in mm_data
 
         input_ids = self.tokenizer(
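The new `{}` default changes the failure mode for requests that carry no multimodal payload: `dict.get` without a default returns `None`, and the `'image' in mm_data` membership test then raises a `TypeError` instead of a clean `AssertionError`. A standalone sketch of both behaviors, with a plain dict standing in for `TextPrompt`:

inputs = {"prompt": "Describe the image."}  # no "multi_modal_data" key

# Old behavior: .get() returns None, and membership tests on None raise.
mm_data = inputs.get("multi_modal_data")
try:
    assert "image" in mm_data
except TypeError as err:
    print("TypeError:", err)  # argument of type 'NoneType' is not iterable

# New behavior: the {} default keeps the assert well-defined.
mm_data = inputs.get("multi_modal_data", {})
try:
    assert "image" in mm_data
except AssertionError:
    print("AssertionError: request carried no image")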

tensorrt_llm/_torch/models/modeling_qwen2vl.py

+1 −1

@@ -312,7 +312,7 @@ def __call__(
         sampling_params: SamplingParams,
     ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
         text_prompt, mm_data, mm_processor_kwargs = inputs.get("prompt"), \
-            inputs.get("multi_modal_data"), inputs.get("mm_processor_kwargs", {})
+            inputs.get("multi_modal_data", {}), inputs.get("mm_processor_kwargs", {})
 
         # NOTE: Since we are passed in Tensor images, we don't need to rescale them.
         mm_processor_kwargs['do_rescale'] = False
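The `do_rescale` override pairs with the NOTE above: images reaching this processor are already float tensors in `[0, 1]`, so letting the image processor rescale again would divide by 255 twice. A minimal sketch of that arithmetic in plain `torch`, independent of the actual processor:

import torch

raw = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)  # raw pixels: need one 1/255 rescale
scaled = raw.float() / 255.0                                # what this code path receives

# Rescaling a second time squashes values into [0, 1/255] and washes out the image.
double_scaled = scaled / 255.0
print(scaled.max().item(), double_scaled.max().item())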

tensorrt_llm/_torch/models/modeling_vila.py

+2 −2

@@ -1093,8 +1093,8 @@ def __call__(
         (3) passed input_ids and mm_embed via LlmRequest's prompt_token_ids and prompt_embedding_table fields respectively. LlmRequests can be inflight batched, and the mm_embed is passed to LLM model as `multi_modal_data` which is List[torch.Tensor] for batched requests.
         """
 
-        text_prompt = inputs["prompt"]
-        mm_data = inputs["multi_modal_data"]
+        text_prompt, mm_data = inputs.get("prompt"), inputs.get(
+            "multi_modal_data", {})
         mm_processor_kwargs = inputs.get("mm_processor_kwargs", {})
 
         text_prompt = _apply_chat_template(text_prompt, self.conv_mode,
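Unlike the two changes above, this one also replaces hard indexing, so a text-only request now degrades gracefully instead of raising `KeyError` before any handling can occur. A small sketch of the before/after behavior:

inputs = {"prompt": "Hello"}  # text-only request: no "multi_modal_data" key

# Old behavior: hard indexing raises immediately.
try:
    mm_data = inputs["multi_modal_data"]
except KeyError as err:
    print("KeyError:", err)

# New behavior: .get() with a {} default lets the prompt flow through.
text_prompt, mm_data = inputs.get("prompt"), inputs.get("multi_modal_data", {})
print(text_prompt, mm_data)  # Hello {}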
