
Commit bb6f831

Include matched stop string/token in responses
Currently a finish_reason of "stop" is returned if any of the following are encountered:

- One of the provided stop strings
- One of the provided stop tokens
- The EOS token

It can be useful to know specifically which of these caused the sequence generation to stop, especially since by default the stop strings/tokens are omitted from the output text (and the output token_ids?).

This PR adds a "stop_reason" field to the CompletionOutput class, which will contain the matched stop string or the integer stop token id. It will be None otherwise, including in the EOS token case, which means in particular that EOS can be inferred from (finish_reason == "stop" and stop_reason is None).

I've also added the field to the OpenAI server responses, but I'm not sure whether it should be included there, since it isn't part of the official API.
1 parent cfc15a1 commit bb6f831
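As a rough illustration of how the new field could be consumed once this change lands, here is a minimal sketch against the offline LLM API; the model name, stop string, and stop token id are placeholders, not part of this commit.

from vllm import LLM, SamplingParams

# Placeholder model and stop values, purely for illustration.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(stop=["\n\n"], stop_token_ids=[50118], max_tokens=64)

for request_output in llm.generate(["Write one sentence about tokens."], params):
    out = request_output.outputs[0]
    if out.finish_reason == "stop":
        if out.stop_reason is None:
            # EOS case: finish_reason is "stop" but no stop string/token matched.
            print("stopped on the EOS token")
        elif isinstance(out.stop_reason, int):
            print(f"stopped on stop token id {out.stop_reason}")
        else:
            print(f"stopped on stop string {out.stop_reason!r}")
    else:
        print(f"finished with reason: {out.finish_reason}")  # e.g. "length"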

File tree

6 files changed: +26 -6 lines changed


vllm/engine/llm_engine.py

Lines changed: 5 additions & 2 deletions
@@ -933,12 +933,15 @@ def _check_stop(self, seq: Sequence,
             if seq.output_text.endswith(stop_str):
                 self._finalize_sequence(seq, sampling_params, stop_str)
                 seq.status = SequenceStatus.FINISHED_STOPPED
+                seq.stop_reason = stop_str
                 return
-        if seq.get_last_token_id() in sampling_params.stop_token_ids:
+        last_token_id = seq.get_last_token_id()
+        if last_token_id in sampling_params.stop_token_ids:
             stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(
-                seq.get_last_token_id())
+                last_token_id)
             self._finalize_sequence(seq, sampling_params, stop_str)
             seq.status = SequenceStatus.FINISHED_STOPPED
+            seq.stop_reason = last_token_id
             return

         # Check if the sequence has reached max_model_len.
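The precedence implemented above can be summarized in a standalone sketch (plain Python, not vLLM code): stop strings are matched against the decoded text first, then stop token ids are checked against the last generated token, and the EOS token deliberately leaves stop_reason as None.

from typing import List, Optional, Tuple, Union

def resolve_stop(output_text: str,
                 last_token_id: int,
                 stop: List[str],
                 stop_token_ids: List[int],
                 eos_token_id: int) -> Tuple[Optional[str], Union[int, str, None]]:
    """Return (finish_reason, stop_reason) for the current decoding step."""
    for stop_str in stop:
        if output_text.endswith(stop_str):
            return "stop", stop_str      # matched stop string
    if last_token_id in stop_token_ids:
        return "stop", last_token_id     # matched stop token id
    if last_token_id == eos_token_id:
        return "stop", None              # EOS: finish_reason only
    return None, None                    # still generating

assert resolve_stop("Hello\n", 13, ["\n"], [], 2) == ("stop", "\n")
assert resolve_stop("Hello", 13, [], [13], 2) == ("stop", 13)
assert resolve_stop("Hello", 2, [], [], 2) == ("stop", None)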

vllm/entrypoints/openai/protocol.py

Lines changed: 4 additions & 0 deletions
@@ -187,6 +187,7 @@ class CompletionResponseChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class CompletionResponse(BaseModel):
@@ -203,6 +204,7 @@ class CompletionResponseStreamChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class CompletionStreamResponse(BaseModel):
@@ -224,6 +226,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class ChatCompletionResponse(BaseModel):
@@ -245,6 +248,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     delta: DeltaMessage
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length"]] = None
+    stop_reason: Union[None, int, str] = None


 class ChatCompletionStreamResponse(BaseModel):
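To show what the extra field looks like once serialized, here is a sketch that builds one of the models above directly; it assumes CompletionResponseChoice also declares an index field (not visible in this hunk), and the values are illustrative.

from vllm.entrypoints.openai.protocol import CompletionResponseChoice

choice = CompletionResponseChoice(
    index=0,
    text="Hello world",
    finish_reason="stop",
    stop_reason="\n\n",  # a matched stop string; may also be an int token id, or None for EOS
)
# Serializes the choice including the non-standard stop_reason field, roughly:
# {"index":0,"text":"Hello world","logprobs":null,"finish_reason":"stop","stop_reason":"\n\n"}
print(choice.model_dump_json())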

vllm/entrypoints/openai/serving_chat.py

Lines changed: 3 additions & 1 deletion
@@ -194,7 +194,8 @@ async def chat_completion_stream_generator(
                     index=i,
                     delta=DeltaMessage(content=delta_text),
                     logprobs=logprobs,
-                    finish_reason=output.finish_reason)
+                    finish_reason=output.finish_reason,
+                    stop_reason=output.stop_reason)
                 chunk = ChatCompletionStreamResponse(
                     id=request_id,
                     object=chunk_object_type,
@@ -248,6 +249,7 @@ async def chat_completion_full_generator(
                 message=ChatMessage(role=role, content=output.text),
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
             )
             choices.append(choice_data)
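A sketch of how a client could read the field from the streaming chat endpoint of a locally running vLLM OpenAI-compatible server; the URL, model name, and stop list are placeholders, and clients should treat stop_reason as optional since it is not part of the official API.

import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "my-model",                                  # placeholder
        "messages": [{"role": "user", "content": "Say hi"}],
        "stop": ["\n"],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    choice = json.loads(payload)["choices"][0]
    if choice.get("finish_reason") is not None:               # final chunk for this choice
        print("finish_reason:", choice["finish_reason"])
        print("stop_reason:", choice.get("stop_reason"))      # str, int, or None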

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,7 @@ async def completion_stream_generator(
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
+                    stop_reason = output.stop_reason
                     response_json = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
@@ -94,6 +95,7 @@ async def completion_stream_generator(
                                 text=delta_text,
                                 logprobs=logprobs,
                                 finish_reason=finish_reason,
+                                stop_reason=stop_reason,
                             )
                         ]).model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
@@ -117,6 +119,7 @@ async def completion_stream_generator(
                                 text="",
                                 logprobs=logprobs,
                                 finish_reason=output.finish_reason,
+                                stop_reason=output.stop_reason,
                             )
                         ],
                         usage=final_usage,
@@ -195,6 +198,7 @@ def request_output_to_completion_response(
                 text=output_text,
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
             )
             choices.append(choice_data)
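The non-streaming completions path carries the same field on each choice; a minimal sketch of reading it back (placeholder URL and model name).

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"model": "my-model", "prompt": "Hello", "max_tokens": 16, "stop": ["\n"]},
).json()
for choice in resp["choices"]:
    # stop_reason may be a string, an int token id, or None (e.g. EOS or length).
    print(choice["finish_reason"], choice.get("stop_reason"))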

vllm/outputs.py

Lines changed: 9 additions & 3 deletions
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Union
 import time

 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
@@ -18,6 +18,9 @@ class CompletionOutput:
         logprobs: The log probabilities of the top probability words at each
             position if the logprobs are requested.
         finish_reason: The reason why the sequence is finished.
+        stop_reason: The stop string or token id that caused the completion to stop,
+            None if the completion finished for some other reason including
+            encountering the EOS token.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -29,6 +32,7 @@ def __init__(
         cumulative_logprob: float,
         logprobs: Optional[SampleLogprobs],
         finish_reason: Optional[str] = None,
+        stop_reason: Union[int, str, None] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.index = index
@@ -37,6 +41,7 @@ def __init__(
         self.cumulative_logprob = cumulative_logprob
         self.logprobs = logprobs
         self.finish_reason = finish_reason
+        self.stop_reason = stop_reason
         self.lora_request = lora_request

     def finished(self) -> bool:
@@ -48,7 +53,8 @@ def __repr__(self) -> str:
                 f"token_ids={self.token_ids}, "
                 f"cumulative_logprob={self.cumulative_logprob}, "
                 f"logprobs={self.logprobs}, "
-                f"finish_reason={self.finish_reason})")
+                f"finish_reason={self.finish_reason}, "
+                f"stop_reason={self.stop_reason})")


 class RequestOutput:
@@ -111,7 +117,7 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
             output = CompletionOutput(seqs.index(seq), seq.output_text,
                                       seq.get_output_token_ids(),
                                       seq.get_cumulative_logprob(), logprobs,
-                                      finshed_reason)
+                                      finshed_reason, seq.stop_reason)
             outputs.append(output)

         # Every sequence in the sequence group should have the same prompt.
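Since __repr__ changed as well, the new field shows up when a CompletionOutput is printed; a small sketch with made-up constructor values.

from vllm.outputs import CompletionOutput

out = CompletionOutput(index=0, text="Hello", token_ids=[9906],
                       cumulative_logprob=-1.23, logprobs=None,
                       finish_reason="stop", stop_reason=50256)
print(out)  # the printed repr now reports stop_reason alongside finish_reason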

vllm/sequence.py

Lines changed: 1 addition & 0 deletions
@@ -150,6 +150,7 @@ def __init__(
         # Initialize the logical token blocks with the prompt token ids.
         self._append_tokens_to_blocks(prompt_token_ids)
         self.status = SequenceStatus.WAITING
+        self.stop_reason: Union[int, str, None] = None

         # Used for incremental detokenization
         self.prefix_offset = 0
