From 49c1b75b421ebd517c9faba8a6910b10a4be29e2 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Tue, 29 Apr 2025 07:00:57 +0000
Subject: [PATCH 1/3] [Frontend] Support `chat_template_kwargs` in `LLM.chat`

Signed-off-by: DarkLight1337
---
 tests/entrypoints/llm/test_chat.py | 34 +++++++++++++++++++++++++++++-
 vllm/entrypoints/llm.py            | 21 +++++++++++-------
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index 6a4862123b5..c9cd35043d4 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -109,7 +109,8 @@ def test_llm_chat_tokenization_no_double_bos():
     ]
     outputs = llm.chat(messages)
     assert len(outputs) == 1
-    prompt_token_ids = getattr(outputs[0], "prompt_token_ids", None)
+
+    prompt_token_ids = outputs[0].prompt_token_ids
     assert prompt_token_ids is not None
 
     bos_token = llm.get_tokenizer().bos_token_id
@@ -117,3 +118,34 @@
     # Ensure we have a single BOS
     assert prompt_token_ids[0] == bos_token
     assert prompt_token_ids[1] != bos_token, "Double BOS"
+
+
+@pytest.mark.parametrize("enable_thinking", [True, False])
+def test_chat_extra_kwargs(enable_thinking):
+    llm = LLM(model="Qwen/Qwen3-0.6B", enforce_eager=True)
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "What is 1+1?"
+        },
+    ]
+
+    outputs = llm.chat(
+        messages,
+        chat_template_kwargs={"enable_thinking": enable_thinking},
+    )
+    assert len(outputs) == 1
+
+    prompt_token_ids = outputs[0].prompt_token_ids
+    assert prompt_token_ids is not None
+
+    think_id = llm.get_tokenizer().get_vocab()["<think>"]
+
+    if enable_thinking:
+        assert think_id in prompt_token_ids
+    else:
+        assert think_id not in prompt_token_ids
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 653e61a11eb..948e8f36e0e 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -656,6 +656,7 @@ def chat(
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
         tools: Optional[list[dict[str, Any]]] = None,
+        chat_template_kwargs: Optional[dict[str, Any]] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
     ) -> list[RequestOutput]:
         """
@@ -696,6 +697,8 @@ def chat(
             continue_final_message: If True, continues the final message in
                 the conversation instead of starting a new one. Cannot be
                 ``True`` if ``add_generation_prompt`` is also ``True``.
+            chat_template_kwargs: Additional kwargs to pass to the chat
+                template.
             mm_processor_kwargs: Multimodal processor kwarg overrides for this
                 chat request. Only used for offline requests.
 
@@ -726,6 +729,14 @@ def chat(
             trust_remote_code=model_config.trust_remote_code,
         )
 
+        _chat_template_kwargs: dict[str, Any] = dict(
+            chat_template=chat_template,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=continue_final_message,
+            tools=tools,
+        )
+        _chat_template_kwargs.update(chat_template_kwargs or {})
+
         prompts: list[Union[TokensPrompt, TextPrompt]] = []
 
         for msgs in list_of_messages:
@@ -743,20 +754,14 @@ def chat(
                 prompt_token_ids = apply_mistral_chat_template(
                     tokenizer,
                     messages=msgs,
-                    chat_template=chat_template,
-                    tools=tools,
-                    add_generation_prompt=add_generation_prompt,
-                    continue_final_message=continue_final_message,
+                    **_chat_template_kwargs,
                 )
             else:
                 prompt_str = apply_hf_chat_template(
                     tokenizer,
                     trust_remote_code=model_config.trust_remote_code,
                     conversation=conversation,
-                    chat_template=chat_template,
-                    tools=tools,
-                    add_generation_prompt=add_generation_prompt,
-                    continue_final_message=continue_final_message,
+                    **_chat_template_kwargs,
                 )
                 # Special tokens are already included in chat templates so
                 # should not be added by the tokenizer in this case.

From 1c88d69bd41df8d9870586e06f07e91a17c2f32b Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Tue, 29 Apr 2025 07:44:52 +0000
Subject: [PATCH 2/3] Fix

Signed-off-by: DarkLight1337
---
 tests/entrypoints/llm/test_chat.py | 87 ++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 21 deletions(-)

diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index c9cd35043d4..8a5c7c2e078 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -1,15 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
+import weakref
 
 import pytest
 
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 
 from ..openai.test_vision import TEST_IMAGE_URLS
 
 
-def test_chat():
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
+@pytest.fixture(scope="module")
+def text_llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+              enforce_eager=True,
+              seed=0)
 
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
+def test_chat(text_llm):
     prompt1 = "Explain the concept of entropy."
     messages = [
         {
@@ -21,13 +37,11 @@ def test_chat():
             "content": prompt1
         },
     ]
-    outputs = llm.chat(messages)
+    outputs = text_llm.chat(messages)
     assert len(outputs) == 1
 
 
-def test_multi_chat():
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
-
+def test_multi_chat(text_llm):
     prompt1 = "Explain the concept of entropy."
     prompt2 = "Explain what among us is."
 
@@ -55,13 +69,14 @@ def test_multi_chat():
 
     messages = [conversation1, conversation2]
 
-    outputs = llm.chat(messages)
+    outputs = text_llm.chat(messages)
     assert len(outputs) == 2
 
 
-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
-def test_chat_multi_image(image_urls: list[str]):
+@pytest.fixture(scope="module")
+def vision_llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         max_model_len=4096,
@@ -69,8 +84,20 @@ def test_chat_multi_image(image_urls: list[str]):
         enforce_eager=True,
         trust_remote_code=True,
         limit_mm_per_prompt={"image": 2},
+        seed=0,
     )
 
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(vision_llm, image_urls: list[str]):
     messages = [{
         "role":
         "user",
@@ -87,16 +114,15 @@ def test_chat_multi_image(image_urls: list[str]):
             },
         ],
     }]
-    outputs = llm.chat(messages)
+    outputs = vision_llm.chat(messages)
     assert len(outputs) >= 0
 
 
-def test_llm_chat_tokenization_no_double_bos():
+def test_llm_chat_tokenization_no_double_bos(text_llm):
     """
     LLM.chat() should not add special tokens when using chat templates.
     Check we get a single BOS token for llama chat.
     """
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True)
     messages = [
         {
             "role": "system",
@@ -107,22 +133,40 @@ def test_llm_chat_tokenization_no_double_bos():
             "content": "Hello!"
         },
     ]
-    outputs = llm.chat(messages)
+    outputs = text_llm.chat(messages)
     assert len(outputs) == 1
 
     prompt_token_ids = outputs[0].prompt_token_ids
     assert prompt_token_ids is not None
 
-    bos_token = llm.get_tokenizer().bos_token_id
+    bos_token = text_llm.get_tokenizer().bos_token_id
 
     # Ensure we have a single BOS
     assert prompt_token_ids[0] == bos_token
     assert prompt_token_ids[1] != bos_token, "Double BOS"
 
 
+@pytest.fixture(scope="module")
+def thinking_llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(
+        model="Qwen/Qwen3-0.6B",
+        max_model_len=4096,
+        enforce_eager=True,
+        seed=0,
+    )
+
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
 @pytest.mark.parametrize("enable_thinking", [True, False])
-def test_chat_extra_kwargs(enable_thinking):
-    llm = LLM(model="Qwen/Qwen3-0.6B", enforce_eager=True)
+def test_chat_extra_kwargs(thinking_llm, enable_thinking):
     messages = [
         {
             "role": "system",
@@ -134,7 +178,7 @@ def test_chat_extra_kwargs(enable_thinking):
         },
     ]
 
-    outputs = llm.chat(
+    outputs = thinking_llm.chat(
         messages,
         chat_template_kwargs={"enable_thinking": enable_thinking},
     )
@@ -143,9 +187,10 @@ def test_chat_extra_kwargs(enable_thinking):
     prompt_token_ids = outputs[0].prompt_token_ids
     assert prompt_token_ids is not None
 
-    think_id = llm.get_tokenizer().get_vocab()["<think>"]
+    think_id = thinking_llm.get_tokenizer().get_vocab()["<think>"]
 
     if enable_thinking:
-        assert think_id in prompt_token_ids
-    else:
         assert think_id not in prompt_token_ids
+    else:
+        # The chat template includes dummy thinking process
+        assert think_id in prompt_token_ids

From 212977e9b03aeefe2f1f56e3c3d9a6f49ca4f0bb Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Tue, 29 Apr 2025 07:49:48 +0000
Subject: [PATCH 3/3] Avoid OOM

Signed-off-by: DarkLight1337
---
 tests/entrypoints/llm/test_chat.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index 8a5c7c2e078..742a6668344 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -9,7 +9,7 @@
 from ..openai.test_vision import TEST_IMAGE_URLS
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def text_llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
@@ -73,7 +73,7 @@ def test_multi_chat(text_llm):
     assert len(outputs) == 2
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def vision_llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
@@ -146,7 +146,7 @@ def test_llm_chat_tokenization_no_double_bos(text_llm):
     assert prompt_token_ids[1] != bos_token, "Double BOS"
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def thinking_llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
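
For reference, a minimal offline usage sketch of the argument added by this series (not part of the patches themselves). The Qwen3 model name and its chat template's `enable_thinking` switch come from the new test; the `SamplingParams` cap is only illustrative.

# Offline example of passing extra chat-template kwargs through LLM.chat().
# Template-specific switches (here Qwen3's `enable_thinking`) are merged on
# top of the defaults (chat_template, add_generation_prompt,
# continue_final_message, tools) before the template is rendered.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-0.6B", enforce_eager=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "What is 1+1?"},
]

outputs = llm.chat(
    messages,
    sampling_params=SamplingParams(max_tokens=32),  # illustrative cap
    chat_template_kwargs={"enable_thinking": False},
)
print(outputs[0].outputs[0].text)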