 import pytest
 from transformers import AutoTokenizer

+from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
-from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.output_processor import OutputProcessor

 TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
+TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
+                                              VLLM_CONFIG.scheduler_config,
+                                              VLLM_CONFIG.parallel_config,
+                                              VLLM_CONFIG.lora_config)
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

 FULL_STRINGS = [
@@ -66,7 +73,7 @@ def get_outputs(self) -> List[EngineCoreOutput]:
     "request_output_kind",
     [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 def test_incremental_detokenization(request_output_kind: RequestOutputKind):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)

     # Make N requests.
@@ -93,7 +100,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):

     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)

     gen_strings = {}
     gen_tokens = {}
@@ -104,7 +111,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
             break

         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         assert len(requests_to_abort) == 0

         # Update tracking.
@@ -128,13 +137,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
         assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
         assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"

-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()


 @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
 def test_stop_string(include_stop_str_in_output: bool):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
     engine_core = MockEngineCore(GENERATION_TOKENS)

     # Make N requests.
@@ -162,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool):

     # Add requests to the detokenizer.
     for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)

     gen_strings = {}
     aborted = []
@@ -173,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool):
             break

         # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
         for request_output in request_outputs:
             # If aborted, we should not get a request output.
             assert request_output.request_id not in aborted
@@ -214,5 +225,71 @@ def test_stop_string(include_stop_str_in_output: bool):
             assert gen_str == ref_str_exc_stop, (
                 f"{gen_str=}, {ref_str_exc_stop=}")

-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
+
+
+def test_iteration_stats():
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
+    engine_core = MockEngineCore(GENERATION_TOKENS)
+
+    # Make N requests.
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt=prompt,
+            prompt_token_ids=prompt_tokens,
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(),
+        ) for idx, (
+            prompt,
+            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+    ]
+
+    # Add all requests except one to the OutputProcessor.
+    num_active = len(GENERATION_TOKENS) - 1
+    for request in requests[:num_active]:
+        output_processor.add_request(request)
+    inactive_request = requests[num_active]
+
+    # First iteration has 2 prefills.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(
+        [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add a new request - prefill and 2 decodes in this step.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active