
[Frontend][Feature] support tool calling for internlm/internlm2_5-7b-chat model #8405


Merged
merged 25 commits into main from add-internlm2-for-tool-use on Oct 4, 2024
Changes from 16 commits
Commits (25)
87b6352
[add] add tools call for internlm2
sydnash Sep 12, 2024
5355659
Merge branch 'main' into add-internlm2-for-tool-use
sydnash Sep 12, 2024
68cd89d
[add] add some comments
sydnash Sep 12, 2024
d17f006
[add] add some comments
sydnash Sep 12, 2024
2d7d9d4
[fix] fix internlm2 tool chat template, fix the internlm2 tool call o…
sydnash Sep 13, 2024
12352e7
[add] add tool parser plugin doc
sydnash Sep 13, 2024
11bed0d
[add] add tool parser plugin doc
sydnash Sep 13, 2024
8a8b840
[fix] fix the stream tool call for internlm2
sydnash Sep 13, 2024
00c5da2
[fix] comment
sydnash Sep 13, 2024
882c764
[merge] resolve conflict
sydnash Sep 13, 2024
12b1035
[fix] use metavar to display the help info for --tool-call-parser, ad…
sydnash Sep 14, 2024
ed5b3fd
[add] got valid tool parsers from ToolParserManager
sydnash Sep 14, 2024
ea2c089
[fix] fix build for docs
sydnash Sep 14, 2024
36ad5d0
[fix] internlm's tool call out may arguments or parameters
sydnash Sep 15, 2024
cf981c0
[merge] resolve conflict
sydnash Sep 18, 2024
647db0d
refactor the tool parser to internlm, fix the test case of streamed_args
sydnash Sep 26, 2024
064ca1f
merge main
sydnash Sep 27, 2024
106909c
[fix] fix internlm parallel test, remove vllm/version.py
sydnash Sep 28, 2024
e242501
[format]
sydnash Sep 29, 2024
0a5ddf4
[format]
sydnash Sep 29, 2024
1db530d
[fix] fix the mistral tool call error. recover vllm/version.py and de…
sydnash Sep 29, 2024
dc94a22
[fix] change vocab property to get_vocab method in mistral_tool_parse…
sydnash Sep 29, 2024
3048233
Merge remote-tracking branch 'origin/main' into add-internlm2-for-too…
sydnash Sep 29, 2024
a2f938f
[fix] remove --tokenizer-mode mistral for mistral test. fix the syste…
sydnash Oct 3, 2024
4b619a2
[merge] merge from main
sydnash Oct 3, 2024
3 changes: 2 additions & 1 deletion docs/requirements-docs.txt
@@ -12,4 +12,5 @@ torch
py-cpuinfo
transformers
mistral_common >= 1.3.4
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
75 changes: 73 additions & 2 deletions docs/source/serving/openai_compatible_server.md
@@ -157,8 +157,9 @@ vLLM will use guided decoding to ensure the response matches the tool parameter
To enable this feature, you should set the following flags:
* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. Tells vLLM that you want to enable the model to generate its own tool calls when it
deems appropriate.
* `--tool-call-parser` -- select the tool parser to use - currently either `hermes`, `mistral`, or `internlm`. Additional tool parsers
will continue to be added in the future, and you can also register your own tool parsers with `--tool-parser-plugin`.
* `--tool-parser-plugin` -- **optional** a tool parser plugin used to register user-defined tool parsers into vLLM; the tool parser names registered by the plugin can then be passed to `--tool-call-parser`.
* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages
that contain previously generated tool calls. Hermes and Mistral models have tool-compatible chat templates in their
`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat
@@ -197,3 +198,73 @@ when tools are provided, that results in much better reliability when working wi


Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`

#### Internlm Models
Supported models:
* `internlm/internlm2_5-7b-chat` (confirmed)
* Additional internlm2.5 function-calling models are compatible as well

Known issues:
* Although this implementation also supports InternLM2, the tool call results are not stable when tested with the `internlm/internlm2-chat-7b` model.

Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja`
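
As a quick sanity check, a minimal client-side request might look like the sketch below (this assumes a locally running server started with the flags above; the `get_current_weather` tool schema is illustrative, mirroring the one used in the tool-use tests):

```python
from openai import OpenAI

# Assumes the server was started with something like:
#   vllm serve internlm/internlm2_5-7b-chat --trust-remote-code \
#     --enable-auto-tool-choice --tool-call-parser internlm \
#     --chat-template examples/tool_chat_template_internlm2_tool.jinja
client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")

# An illustrative OpenAI-style function schema.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "state": {"type": "string"},
            },
            "required": ["city", "state"],
        },
    },
}]

response = client.chat.completions.create(
    model="internlm/internlm2_5-7b-chat",
    messages=[{"role": "user", "content": "What's the weather in Dallas, TX?"}],
    tools=tools,
    tool_choice="auto",
)
# If the model decided to call the tool, the parsed call appears here.
print(response.choices[0].message.tool_calls)
```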


### How to write a tool parser plugin

A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py.

Here is a summary of a plugin file:

```python

# import the required packages
from typing import Sequence, Union

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage,
                                              ExtractedToolCallInformation)
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.transformers_utils.tokenizer import AnyTokenizer


# define a tool parser and register it to vllm.
# the name list in register_module can be used
# in --tool-call-parser. you can define as many
# tool parsers as you want here.
@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser):

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

    # adjust the request, e.g. set skip_special_tokens
    # to False so that tool call tokens are not dropped
    # from the output
    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        return request

    # implement tool call parsing for streaming calls; return a
    # DeltaMessage for the newly parsed chunk, or None if there is
    # nothing new to emit yet
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        return DeltaMessage(content=delta_text)

    # implement tool call parsing for non-streaming calls
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

```

Then you can enable this plugin on the command line like this:
```
--enable-auto-tool-choice \
--tool-parser-plugin <absolute path of the plugin file> \
--tool-call-parser example \
--chat-template <your chat template> \
```
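
Putting it together, a full launch command might look like the following sketch (the model, plugin path, and chat template are placeholders to swap for your own):
```
vllm serve internlm/internlm2_5-7b-chat \
    --trust-remote-code \
    --enable-auto-tool-choice \
    --tool-parser-plugin /path/to/example_tool_parser.py \
    --tool-call-parser example \
    --chat-template examples/tool_chat_template_internlm2_tool.jinja
```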
60 changes: 60 additions & 0 deletions examples/tool_chat_template_internlm2_tool.jinja
@@ -0,0 +1,60 @@
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set loop_messages = messages %}
{%- endif %}

{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}

{{- bos_token }}
{%- if system_message is defined %}
{{- "<|im_start|>system\n" + system_message + "<|im_end|>\n" }}
{%- endif %}

{%- if tools is not none %}
{{- "<|im_start|>system name=<|plugin|>\n[" }}
{%- for tool in tools %}
{{- tool.function|tojson }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" }}
{%- endif %}
{%- endfor %}
{{- "<|im_end|>\n" }}
{%- endif %}

{%- for message in loop_messages %}
{%- if message["role"] == "user" %}
{{- "<|im_start|>user\n" + message["content"] + "<|im_end|>\n"}}
{%- elif message.tool_calls is defined and message.tool_calls is not none %}
{%- set content = message["content"] if message["content"] else "" %}
{{- "<|im_start|>assistant\n" + content }}
{%- for tool_call in message.tool_calls %}
{%- set function=tool_call.function %}
{{- "<|action_start|><|plugin|>\n" }}
{{- '{"name": "' + function.name + '", '}}
{{- '"arguments": ' + function.arguments|tojson + '}' }}
{{- "<|action_end|>" }}
{%- endfor %}
{{- "<|im_end|>\n" }}
{%- elif message["role"] == "assistant" %}
{{- "<|im_start|>assistant\n" + message["content"] + "<|im_end|>\n"}}
{%- elif message["role"] == "tool_results" or message["role"] == "tool" or message["role"] == "function" %}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{{- "<|im_start|>environment name=<|plugin|>\n" + content|string + "<|im_end|>\n" }}
{%- else %}
{{- raise_exception("Only user and assistant and tool_results and tool and function roles are supported, with the exception of an initial optional system message!") }}
{%- endif %}
{%- endfor %}

{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
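
For orientation, an assistant turn that carries a tool call renders to roughly the following under this template (a hand-derived illustration, assuming `function.arguments` is already a JSON object rather than a string):

```
<|im_start|>assistant
<|action_start|><|plugin|>
{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX"}}<|action_end|><|im_end|>
```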
12 changes: 9 additions & 3 deletions tests/tool_use/test_parallel_tool_calls.py
@@ -6,17 +6,20 @@

from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL,
WEATHER_TOOL)
WEATHER_TOOL, ServerConfig)


# test: getting the model to generate parallel tool calls (streaming/not)
# when requested. NOTE that not all models may support this, so some exclusions
# may be added in the future. e.g. llama 3.1 models are not designed to support
# parallel tool calls.
@pytest.mark.asyncio
async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
async def test_parallel_tool_calls(client: openai.AsyncOpenAI,
server_config: ServerConfig):
models = await client.models.list()
model_name: str = models.data[0].id
if server_config.get("skip_parallel", False):
pytest.skip(f"skip parallel test for {model_name}")
chat_completion = await client.chat.completions.create(
messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
temperature=0,
@@ -136,9 +139,12 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI):
# test: providing parallel tool calls back to the model to get a response
# (streaming/not)
@pytest.mark.asyncio
async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI):
async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI,
server_config: ServerConfig):
models = await client.models.list()
model_name: str = models.data[0].id
if server_config.get("skip_parallel", False):
pytest.skip(f"skip parallel test for {model_name}")
chat_completion = await client.chat.completions.create(
messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
temperature=0,
25 changes: 21 additions & 4 deletions tests/tool_use/utils.py
@@ -1,4 +1,4 @@
from typing import Dict, List
from typing import Dict, List, Optional

from openai.types.chat import (ChatCompletionMessageParam,
ChatCompletionToolParam)
@@ -10,6 +10,7 @@
class ServerConfig(TypedDict):
model: str
arguments: List[str]
skip_parallel: Optional[bool]


# universal args for all models go here. also good if you need to test locally
@@ -23,7 +24,9 @@ class ServerConfig(TypedDict):
"arguments": [
"--tool-call-parser", "hermes", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
]
],
"skip_parallel":
False
},
"mistral": {
"model":
@@ -32,7 +35,21 @@
"--tool-call-parser", "mistral", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
"--ignore-patterns=\"consolidated.safetensors\""
]
],
"skip_parallel":
False
},
"internlm": {
"model":
"internlm/internlm2_5-7b-chat",
"arguments": [
"--tool-call-parser", "internlm", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_internlm2_tool.jinja"),
"--trust_remote_code"
],
"skip_parallel":
True
}
}

@@ -55,7 +72,7 @@ class ServerConfig(TypedDict):
"type":
"string",
"description":
"the two-letter abbreviation for the state "
"must the two-letter abbreviation for the state "
"that the city is in, e.g. 'CA' which would "
"mean 'California'"
},
10 changes: 10 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -52,6 +52,7 @@
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_tokenization import (
OpenAIServingTokenization)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
@@ -526,6 +527,15 @@ async def run_server(args, **uvicorn_kwargs) -> None:
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)

if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
ToolParserManager.import_tool_parser(args.tool_parser_plugin)

valid_tool_parsers = ToolParserManager.tool_parsers.keys()
if args.enable_auto_tool_choice \
and args.tool_call_parser not in valid_tool_parsers:
raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
f"(choose from {{ {','.join(valid_tool_parsers)} }})")

temp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
temp_socket.bind(("", args.port))

14 changes: 13 additions & 1 deletion vllm/entrypoints/openai/cli_args.py
@@ -12,6 +12,7 @@
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
PromptAdapterPath)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.utils import FlexibleArgumentParser


@@ -171,16 +172,27 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"Enable auto tool choice for supported models. Use --tool-call-parser"
"to specify which parser to use")

valid_tool_parsers = ToolParserManager.tool_parsers.keys()
parser.add_argument(
"--tool-call-parser",
type=str,
choices=["mistral", "hermes"],
Contributor:

I think we still want to specify the choices of ["mistral", "hermes", "internlm2_5"], but make this optional in the event that --enable-auto-tool-choice is called with --tool-parser-plugin.

It would be good for people to know which tool call parsers are available by default, and this makes sure that the expected values get into the auto-generated documentation.

sydnash (Contributor Author) · Sep 13, 2024:

If we specify `choices` in `add_argument`, the user cannot pass a `--tool-call-parser` that was registered via `--tool-parser-plugin`.

Maybe we can put the default choices into the help information instead.

Contributor:

I think what I'm trying to say is that you could keep the choices of ["mistral", "hermes", "internlm2_5"] and then do something like this in serving_chat.py:

Current state:

if self.enable_auto_tools:
    try:
        self.tool_parser = ToolParserManager.get_tool_parser(tool_parser)
    except Exception as e:
        raise TypeError("Error: --enable-auto-tool-choice requires tool_parser:'{tool_parser}' which has not  been registered") from e

Possible changes:

# if a plugin is not specified; we can do this already
if self.enable_auto_tools and not self.tool_parser_plugin:
    plugin_name = tool_parser # one of the options from the CLI argument, e.g. hermes or mistral

# if a plugin is specified - this may require some refactoring to get the tool parser plugin loaded in serving chat
elif self.enable_auto_tools and self.tool_parser_plugin:
    # get the name of the plugin loaded from `--tool-parser-plugin`
    plugin_name = get_plugin_name_somehow_from_loaded_plugin()

# handle additional cases here
try: 
    self.tool_parser = ToolParserManager.get_tool_parser(plugin_name)
except Exception as e:
    raise TypeError("You must specify a valid value for --tool-call-parser OR a value tool parser plugin"

sydnash (Contributor Author) · Sep 13, 2024:

But in my design, a plugin can register any number of tool parsers into vLLM, and the user can use --tool-call-parser to select whichever one they want, just like the default tool parsers shipped with vLLM.

I added some documentation in docs/source/serving/openai_compatible_server.md; maybe you can take a look at that.

Contributor:

Ohhhh, I see. Hmm. I'm not sure what the best pattern would be for the arguments here, then. @DarkLight1337 @mgoin do y'all have any thoughts?

sydnash (Contributor Author):

Maybe we can use metavar instead of choices to display help information.

    valid_tool_parsers = ["mistral", "hermes", "internlm2", "internlm2_5"]
    parser.add_argument(
        "--tool-call-parser",
        type=str,
        metavar=
        "{" + ",".join(valid_tool_parsers) +  "} or name registered in "
        "--tool-parser-plugin",
        default=None,
        help=
        "Select the tool call parser depending on the model that you're using."
        " This is used to parse the model-generated tool call into OpenAI API "
        "format. Required for --enable-auto-tool-choice.")

the help will look like this:

--tool-call-parser {mistral,hermes,internlm2,internlm2_5} or name registered in --tool-parser-plugin
                        Select the tool call parser depending on the model
                        that you're using. This is used to parse the model-
                        generated tool call into OpenAI API format. Required
                        for --enable-auto-tool-choice.

We can also move the plugin import and the tool call parser check into run_server, so that an invalid tool call parser name is caught quickly (before the model loads):

if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
    ToolParserManager.import_tool_parser(args.tool_parser_plugin)

if args.enable_auto_tool_choice:
    if args.tool_call_parser not in ToolParserManager.tool_parsers.keys():
        raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
                       f"chose from {{ {','.join(ToolParserManager.tool_parsers.keys())} }}")

The error info looks like this:

Traceback (most recent call last):
  File "/LocalRun/jun.dai/conda/envs/vllm_env/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/LocalRun/jun.dai/code/github/sydnash/vllm/vllm/scripts.py", line 165, in main
    args.dispatch_function(args)
  File "/LocalRun/jun.dai/code/github/sydnash/vllm/vllm/scripts.py", line 37, in serve
    asyncio.run(run_server(args))
  File "/LocalRun/jun.dai/conda/envs/vllm_env/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/LocalRun/jun.dai/conda/envs/vllm_env/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/LocalRun/jun.dai/code/github/sydnash/vllm/vllm/entrypoints/openai/api_server.py", line 505, in run_server
    raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
KeyError: 'invalid tool call parser: internlm3 chose from { hermes,internlm2,internlm2_5,mistral,internlm }'

metavar="{" + ",".join(valid_tool_parsers) + "} or name registered in "
"--tool-parser-plugin",
default=None,
help=
"Select the tool call parser depending on the model that you're using."
" This is used to parse the model-generated tool call into OpenAI API "
"format. Required for --enable-auto-tool-choice.")

parser.add_argument(
"--tool-parser-plugin",
type=str,
default="",
help=
"Specify the tool parser plugin used to parse the model-generated tool"
" calls into OpenAI API format; the names registered in this plugin "
"can be used in --tool-call-parser.")

parser = AsyncEngineArgs.add_cli_args(parser)

parser.add_argument('--max-log-len',