2
2
3
3
import asyncio
4
4
from dataclasses import dataclass
5
- from typing import Dict , List , Optional
5
+ from typing import Dict , List , Optional , Union
6
6
7
7
from vllm .outputs import RequestOutput
8
8
from vllm .sampling_params import RequestOutputKind
@@ -164,6 +164,7 @@ def process_outputs(
164
164
165
165
new_token_ids = engine_core_output .new_token_ids
166
166
finish_reason = engine_core_output .finish_reason
167
+ stop_reason = engine_core_output .stop_reason
167
168
168
169
# TODO(andy): prompt logprobs + chunked prefill can
169
170
# result in engine core returning an output for a
@@ -181,9 +182,10 @@ def process_outputs(
181
182
182
183
# 2) Detokenize the token ids into text and check for stop
183
184
# strings.
184
- stop_reason = req_state .detokenizer .update (new_token_ids )
185
- if stop_reason :
185
+ stop_string = req_state .detokenizer .update (new_token_ids )
186
+ if stop_string and finish_reason != FinishReason . STOP :
186
187
finish_reason = FinishReason .STOP
188
+ stop_reason = stop_string
187
189
188
190
# 3) Compute sample and prompt logprobs for request,
189
191
# if required.
@@ -250,7 +252,7 @@ def _make_request_output(
250
252
request_state : RequestState ,
251
253
new_token_ids : List [int ],
252
254
finish_reason : Optional [FinishReason ],
253
- stop_reason : Optional [ str ],
255
+ stop_reason : Union [ int , str , None ],
254
256
) -> Optional [RequestOutput ]:
255
257
256
258
finished = finish_reason is not None
0 commit comments