
Commit bb6f831

Include matched stop string/token in responses
Currently a finish_reason of "stop" is returned if any of the following are encountered:

- One of the provided stop strings
- One of the provided stop tokens
- The EOS token

It can be useful to know specifically which of these caused the sequence generation to stop, especially since by default the stop strings/tokens are omitted from the output text (and the output token_ids?).

This PR adds a "stop_reason" field to the CompletionOutput class, which will contain the matched stop string or the integer stop token id. It will be None otherwise, including in the EOS token case, which means in particular that EOS can be inferred from (finish_reason == "stop" and stop_reason is None).

I've also added the field to the OpenAI server responses, but I'm not sure whether it should be included there, since it isn't part of the official API.
1 parent cfc15a1 commit bb6f831
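As a rough illustration of how the new field could be consumed once this change lands, here is a minimal sketch against the offline LLM API; the model name, stop string, and stop token id are placeholders, not part of this commit.

from vllm import LLM, SamplingParams

# Placeholder model and stop values, purely for illustration.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(stop=["\n\n"], stop_token_ids=[50118], max_tokens=64)

for request_output in llm.generate(["Write one sentence about tokens."], params):
    out = request_output.outputs[0]
    if out.finish_reason == "stop":
        if out.stop_reason is None:
            # EOS case: finish_reason is "stop" but no stop string/token matched.
            print("stopped on the EOS token")
        elif isinstance(out.stop_reason, int):
            print(f"stopped on stop token id {out.stop_reason}")
        else:
            print(f"stopped on stop string {out.stop_reason!r}")
    else:
        print(f"finished with reason: {out.finish_reason}")  # e.g. "length"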

File tree

6 files changed: +26 -6 lines changed


vllm/engine/llm_engine.py

Lines changed: 5 additions & 2 deletions
@@ -933,12 +933,15 @@ def _check_stop(self, seq: Sequence,
             if seq.output_text.endswith(stop_str):
                 self._finalize_sequence(seq, sampling_params, stop_str)
                 seq.status = SequenceStatus.FINISHED_STOPPED
+                seq.stop_reason = stop_str
                 return
-        if seq.get_last_token_id() in sampling_params.stop_token_ids:
+        last_token_id = seq.get_last_token_id()
+        if last_token_id in sampling_params.stop_token_ids:
             stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(
-                seq.get_last_token_id())
+                last_token_id)
             self._finalize_sequence(seq, sampling_params, stop_str)
             seq.status = SequenceStatus.FINISHED_STOPPED
+            seq.stop_reason = last_token_id
             return

         # Check if the sequence has reached max_model_len.
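The precedence implemented above can be summarized in a standalone sketch (plain Python, not vLLM code): stop strings are matched against the decoded text first, then stop token ids are checked against the last generated token, and the EOS token deliberately leaves stop_reason as None.

from typing import List, Optional, Tuple, Union

def resolve_stop(output_text: str,
                 last_token_id: int,
                 stop: List[str],
                 stop_token_ids: List[int],
                 eos_token_id: int) -> Tuple[Optional[str], Union[int, str, None]]:
    """Return (finish_reason, stop_reason) for the current decoding step."""
    for stop_str in stop:
        if output_text.endswith(stop_str):
            return "stop", stop_str      # matched stop string
    if last_token_id in stop_token_ids:
        return "stop", last_token_id     # matched stop token id
    if last_token_id == eos_token_id:
        return "stop", None              # EOS: finish_reason only
    return None, None                    # still generating

assert resolve_stop("Hello\n", 13, ["\n"], [], 2) == ("stop", "\n")
assert resolve_stop("Hello", 13, [], [13], 2) == ("stop", 13)
assert resolve_stop("Hello", 2, [], [], 2) == ("stop", None)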

vllm/entrypoints/openai/protocol.py

Lines changed: 4 additions & 0 deletions
@@ -187,6 +187,7 @@ class CompletionResponseChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class CompletionResponse(BaseModel):
@@ -203,6 +204,7 @@ class CompletionResponseStreamChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class CompletionStreamResponse(BaseModel):
@@ -224,6 +226,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class ChatCompletionResponse(BaseModel):
@@ -245,6 +248,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     delta: DeltaMessage
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class ChatCompletionStreamResponse(BaseModel):
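To show what the extra field looks like once serialized, here is a sketch that builds one of the models above directly; it assumes CompletionResponseChoice also declares an index field (not visible in this hunk), and the values are illustrative.

from vllm.entrypoints.openai.protocol import CompletionResponseChoice

choice = CompletionResponseChoice(
    index=0,
    text="Hello world",
    finish_reason="stop",
    stop_reason="\n\n",  # a matched stop string; may also be an int token id, or None for EOS
)
# Serializes the choice including the non-standard stop_reason field, roughly:
# {"index":0,"text":"Hello world","logprobs":null,"finish_reason":"stop","stop_reason":"\n\n"}
print(choice.model_dump_json())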

vllm/entrypoints/openai/serving_chat.py

Lines changed: 3 additions & 1 deletion
@@ -194,7 +194,8 @@ async def chat_completion_stream_generator(
                     index=i,
                     delta=DeltaMessage(content=delta_text),
                     logprobs=logprobs,
-                    finish_reason=output.finish_reason)
+                    finish_reason=output.finish_reason,
+                    stop_reason=output.stop_reason)
                 chunk = ChatCompletionStreamResponse(
                     id=request_id,
                     object=chunk_object_type,
@@ -248,6 +249,7 @@ async def chat_completion_full_generator(
                 message=ChatMessage(role=role, content=output.text),
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
             )
             choices.append(choice_data)
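A sketch of how a client could read the field from the streaming chat endpoint of a locally running vLLM OpenAI-compatible server; the URL, model name, and stop list are placeholders, and clients should treat stop_reason as optional since it is not part of the official API.

import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "my-model",                                  # placeholder
        "messages": [{"role": "user", "content": "Say hi"}],
        "stop": ["\n"],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    choice = json.loads(payload)["choices"][0]
    if choice.get("finish_reason") is not None:               # final chunk for this choice
        print("finish_reason:", choice["finish_reason"])
        print("stop_reason:", choice.get("stop_reason"))      # str, int, or None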

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,7 @@ async def completion_stream_generator(
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
+                    stop_reason = output.stop_reason
                     response_json = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
@@ -94,6 +95,7 @@ async def completion_stream_generator(
                                 text=delta_text,
                                 logprobs=logprobs,
                                 finish_reason=finish_reason,
+                                stop_reason=stop_reason,
                             )
                         ]).model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
@@ -117,6 +119,7 @@ async def completion_stream_generator(
                                 text="",
                                 logprobs=logprobs,
                                 finish_reason=output.finish_reason,
+                                stop_reason=output.stop_reason,
                             )
                         ],
                         usage=final_usage,
@@ -195,6 +198,7 @@ def request_output_to_completion_response(
                 text=output_text,
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
             )
             choices.append(choice_data)
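The non-streaming completions path carries the same field on each choice; a minimal sketch of reading it back (placeholder URL and model name).

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"model": "my-model", "prompt": "Hello", "max_tokens": 16, "stop": ["\n"]},
).json()
for choice in resp["choices"]:
    # stop_reason may be a string, an int token id, or None (e.g. EOS or length).
    print(choice["finish_reason"], choice.get("stop_reason"))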

vllm/outputs.py

Lines changed: 9 additions & 3 deletions
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Union
 import time

 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
@@ -18,6 +18,9 @@ class CompletionOutput:
         logprobs: The log probabilities of the top probability words at each
             position if the logprobs are requested.
         finish_reason: The reason why the sequence is finished.
+        stop_reason: The stop string or token id that caused the completion to stop,
+            None if the completion finished for some other reason including
+            encountering the EOS token.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -29,6 +32,7 @@ def __init__(
         cumulative_logprob: float,
         logprobs: Optional[SampleLogprobs],
         finish_reason: Optional[str] = None,
+        stop_reason: Union[int, str, None] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.index = index
@@ -37,6 +41,7 @@ def __init__(
         self.cumulative_logprob = cumulative_logprob
         self.logprobs = logprobs
         self.finish_reason = finish_reason
+        self.stop_reason = stop_reason
         self.lora_request = lora_request

     def finished(self) -> bool:
@@ -48,7 +53,8 @@ def __repr__(self) -> str:
                 f"token_ids={self.token_ids}, "
                 f"cumulative_logprob={self.cumulative_logprob}, "
                 f"logprobs={self.logprobs}, "
-                f"finish_reason={self.finish_reason})")
+                f"finish_reason={self.finish_reason}, "
+                f"stop_reason={self.stop_reason})")


 class RequestOutput:
@@ -111,7 +117,7 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
             output = CompletionOutput(seqs.index(seq), seq.output_text,
                                       seq.get_output_token_ids(),
                                       seq.get_cumulative_logprob(), logprobs,
-                                      finshed_reason)
+                                      finshed_reason, seq.stop_reason)
             outputs.append(output)

         # Every sequence in the sequence group should have the same prompt.
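Since __repr__ changed as well, the new field shows up when a CompletionOutput is printed; a small sketch with made-up constructor values.

from vllm.outputs import CompletionOutput

out = CompletionOutput(index=0, text="Hello", token_ids=[9906],
                       cumulative_logprob=-1.23, logprobs=None,
                       finish_reason="stop", stop_reason=50256)
print(out)  # the printed repr now reports stop_reason alongside finish_reason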

vllm/sequence.py

Lines changed: 1 addition & 0 deletions
@@ -150,6 +150,7 @@ def __init__(
         # Initialize the logical token blocks with the prompt token ids.
         self._append_tokens_to_blocks(prompt_token_ids)
         self.status = SequenceStatus.WAITING
+        self.stop_reason: Union[int, str, None] = None

         # Used for incremental detokenization
         self.prefix_offset = 0
