
Commit 002848a

[Bugfix][Structured Output] Support outlines with reasoning outputs
Signed-off-by: Ce Gao <[email protected]>
1 parent 7f89a59 commit 002848a

File tree

- examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- vllm/model_executor/guided_decoding/__init__.py
- vllm/model_executor/guided_decoding/outlines_logits_processors.py
- vllm/model_executor/guided_decoding/reasoner/__init__.py
- vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py
- vllm/model_executor/guided_decoding/reasoner/reasoner.py
- vllm/model_executor/guided_decoding/xgrammar_decoding.py

7 files changed: +102 -9 lines changed


examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py

Lines changed: 67 additions & 2 deletions
@@ -33,6 +33,42 @@
 models = client.models.list()
 model = models.data[0].id
 
+# Guided decoding by Regex
+prompt = ("What is the capital of France?")
+
+completion = client.chat.completions.create(
+    model=model,
+    messages=[{
+        "role": "user",
+        "content": prompt,
+    }],
+    extra_body={
+        "guided_regex": "(Paris|London)",
+    },
+)
+print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+print("content: ", completion.choices[0].message.content)
+
+
+class People(BaseModel):
+    name: str
+    age: int
+
+
+json_schema = People.model_json_schema()
+
+prompt = ("Generate a JSON with the name and age of one random person.")
+completion = client.chat.completions.create(
+    model=model,
+    messages=[{
+        "role": "user",
+        "content": prompt,
+    }],
+    extra_body={"guided_json": json_schema},
+)
+print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+print("content: ", completion.choices[0].message.content)
+
 
 # Guided decoding by JSON using Pydantic schema
 class CarType(str, Enum):
@@ -51,7 +87,7 @@ class CarDescription(BaseModel):
 json_schema = CarDescription.model_json_schema()
 
 prompt = ("Generate a JSON with the brand, model and car_type of"
-          "the most iconic car from the 90's, think in 100 tokens")
+          "the most iconic car from the 90's")
 completion = client.chat.completions.create(
     model=model,
     messages=[{
@@ -60,5 +96,34 @@ class CarDescription(BaseModel):
     }],
     extra_body={"guided_json": json_schema},
 )
-print("content", completion.choices[0].message.content)
 print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+print("content: ", completion.choices[0].message.content)
+
+# Guided decoding by Grammar
+simplified_sql_grammar = """
+?start: select_statement
+
+?select_statement: "SELECT " column_list " FROM " table_name
+
+?column_list: column_name ("," column_name)*
+
+?table_name: identifier
+
+?column_name: identifier
+
+?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+"""
+
+# This may be very slow https://github.com/vllm-project/vllm/issues/12122
+prompt = ("Generate an SQL query to show the 'username' and 'email'"
+          "from the 'users' table.")
+completion = client.chat.completions.create(
+    model=model,
+    messages=[{
+        "role": "user",
+        "content": prompt,
+    }],
+    extra_body={"guided_grammar": simplified_sql_grammar},
+)
+print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+print("content: ", completion.choices[0].message.content)

vllm/model_executor/guided_decoding/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ async def get_guided_decoding_logits_processor(
     reasoner = get_reasoner(tokenizer, reasoning_backend)
 
     guided_params = maybe_backend_fallback(guided_params)
+
     # CFG grammar not supported by LMFE, so we use outlines instead
     if guided_params.backend_name == 'outlines':
         # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193

vllm/model_executor/guided_decoding/outlines_logits_processors.py

Lines changed: 9 additions & 5 deletions
@@ -43,7 +43,7 @@ class BaseLogitsProcessor:
 
     def __init__(self, guide: Guide, reasoner: Optional[Reasoner]):
         self._guide: Guide = guide
-        self._reasoner = reasoner
+        self._reasoner: Optional[Reasoner] = reasoner
         # CFGState is used for the FSM state for CFGGuide
         self._fsm_state: DefaultDict[int, Union[int,
                                                 CFGState]] = defaultdict(int)
@@ -54,10 +54,14 @@ def __call__(self, input_ids: List[int],
 
         # Skip the structured logits processing if reasoning is not finished.
         # reasoner is not None only when `--enable-reasoning` is set.
-        if self._reasoner is not None and \
-            not self._reasoner.is_reasoning_end(
-                input_ids):
-            return scores
+        if self._reasoner is not None:
+            if not self._reasoner.is_reasoning_end(input_ids):
+                return scores
+            else:
+                # Remove the reasoning tokens from the input_ids
+                # We need this because our implementation relies on the
+                # hash of the input_ids to store the FSM state.
+                input_ids = self._reasoner.extract_content(input_ids)
 
         seq_id = hash(tuple(input_ids))
 
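
Why the stripping matters: BaseLogitsProcessor keys its FSM state on hash(tuple(input_ids)), and the state dict is a defaultdict(int). Stripping the reasoning prefix means the first token generated after reasoning hashes the empty tuple, which the defaultdict maps to the guide's initial state, exactly as if no reasoning had happened. A minimal sketch of that mechanism, with 7 standing in for the real end-of-reasoning token id:

    # Illustrative only: 7 stands in for the reasoner's end-of-reasoning token.
    from collections import defaultdict

    fsm_state = defaultdict(int)    # mirrors BaseLogitsProcessor._fsm_state
    END = 7

    input_ids = [5, 9, 8, END]      # reasoning just finished, no content yet
    content = input_ids[input_ids.index(END) + 1:]  # what extract_content returns: []
    seq_id = hash(tuple(content))   # hash(()) on the first structured step
    assert fsm_state[seq_id] == 0   # defaultdict yields the FSM's initial state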

vllm/model_executor/guided_decoding/reasoner/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -4,10 +4,13 @@
 
 from transformers import PreTrainedTokenizer
 
+from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding.reasoner.deepseek_reasoner import ( # noqa: E501
     DeepSeekReasoner)
 from vllm.model_executor.guided_decoding.reasoner.reasoner import Reasoner
 
+logger = init_logger(__name__)
+
 
 def get_reasoner(tokenizer: PreTrainedTokenizer,
                  reasoning_backend: str | None) -> Reasoner | None:
@@ -17,7 +20,12 @@ def get_reasoner(tokenizer: PreTrainedTokenizer,
     elif reasoning_backend == "deepseek_r1":
         return DeepSeekReasoner.from_tokenizer(tokenizer)
     else:
-        raise ValueError(f"Unknown reasoning backend '{reasoning_backend}'")
+        # Raise a warning for unknown reasoning backend and return None
+        # We cannot raise an error here because some reasoning models
+        # may not have a corresponding Reasoner class.
+        logger.warning("Unknown reasoning backend %s for structured outputs ",
+                       reasoning_backend)
+        return None
 
 
 __all__ = ["Reasoner", "get_reasoner"]

vllm/model_executor/guided_decoding/reasoner/deepseek_reasoner.py

Lines changed: 11 additions & 0 deletions
@@ -26,3 +26,14 @@ def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.end_token_id in input_ids
+
+    def extract_content(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract the content after the end tokens
+        """
+        if self.end_token_id not in input_ids:
+            return input_ids
+        elif input_ids.index(self.end_token_id) + 1 == len(input_ids):
+            return []
+        else:
+            return input_ids[input_ids.index(self.end_token_id) + 1:]
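
extract_content covers three cases: no end token yet, end token as the final id, and content following the end token. A standalone replica of those branches, with 7 standing in for the real end-of-reasoning token id (DeepSeekReasoner resolves the actual id from the tokenizer):

    # Standalone replica of the three branches above; illustrative values only.
    def extract_content(input_ids: list[int], end_token_id: int = 7) -> list[int]:
        if end_token_id not in input_ids:
            return input_ids               # still reasoning: pass through
        cut = input_ids.index(end_token_id) + 1
        return input_ids[cut:]             # [] when the end token is last

    assert extract_content([1, 2, 3]) == [1, 2, 3]
    assert extract_content([1, 2, 7]) == []
    assert extract_content([1, 2, 7, 4, 5]) == [4, 5]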

vllm/model_executor/guided_decoding/reasoner/reasoner.py

Lines changed: 4 additions & 0 deletions
@@ -17,3 +17,7 @@ def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
     @abstractmethod
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         pass
+
+    @abstractmethod
+    def extract_content(self, input_ids: list[int]) -> list[int]:
+        pass
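
These hooks are the whole contract a new reasoning model has to satisfy. A hypothetical implementation, not part of this commit, for a model that closes its reasoning block with a single "</think>" token (shown duck-typed so it does not depend on the base class's exact fields):

    # Hypothetical reasoner; a real one would subclass Reasoner as
    # DeepSeekReasoner does.
    from dataclasses import dataclass

    from transformers import PreTrainedTokenizer


    @dataclass
    class ToyReasoner:
        end_token_id: int

        @classmethod
        def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> "ToyReasoner":
            # Assumes the closing tag is a single token in the vocabulary.
            return cls(end_token_id=tokenizer.convert_tokens_to_ids("</think>"))

        def is_reasoning_end(self, input_ids: list[int]) -> bool:
            return self.end_token_id in input_ids

        def extract_content(self, input_ids: list[int]) -> list[int]:
            if self.end_token_id not in input_ids:
                return input_ids
            return input_ids[input_ids.index(self.end_token_id) + 1:]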

vllm/model_executor/guided_decoding/xgrammar_decoding.py

Lines changed: 1 addition & 1 deletion
@@ -392,7 +392,7 @@ def __call__(self, input_ids: list[int],
     def clone(self) -> XGrammarLogitsProcessor:
         """Create a new instance with shared compiled grammar
         but separate state"""
-        new_processor = XGrammarLogitsProcessor(self.config)
+        new_processor = XGrammarLogitsProcessor(self.config, self.reasoner)
 
         # Share the compiled grammar context (immutable after compilation)
         new_processor.ctx = self.ctx
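
The fix is a single argument: clone() produces per-sequence copies, and before this change a clone was built without the reasoner, losing the reasoning-aware skip. A runnable toy model of the clone semantics the fix restores (class and attribute names here are stand-ins, not vLLM's):

    # Toy stand-ins to make the invariant checkable without a real grammar.
    class _Ctx: ...
    class _Reasoner: ...

    class ToyProcessor:
        def __init__(self, ctx, reasoner):
            self.ctx, self.reasoner = ctx, reasoner
            self.state = []                # per-sequence mutable state

        def clone(self) -> "ToyProcessor":
            # The fixed line: pass the reasoner through to the copy too.
            return ToyProcessor(self.ctx, self.reasoner)

    original = ToyProcessor(_Ctx(), _Reasoner())
    clone = original.clone()
    assert clone.ctx is original.ctx            # shared compiled grammar context
    assert clone.reasoner is original.reasoner  # previously dropped, now carried over
    assert clone.state is not original.state    # independent per-sequence state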
