from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                compute_encoder_budget)
- from vllm.v1.core.guided_decoding import GuidedDecodingManager
- from vllm.v1.core.guided_decoding.grammar import Grammar
+ from vllm.v1.core.guided_decoding import Grammar
from vllm.v1.core.kv_cache_manager import KVCacheManager
from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
from vllm.v1.metrics.stats import SchedulerStats
@@ -40,13 +39,11 @@ def __init__(
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        lora_config: Optional[LoRAConfig],
-         decoding_config: DecodingConfig,
    ) -> None:
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.model_config = model_config
-         self.decoding_config = decoding_config
        # TODO: Support LoRA.
        assert lora_config is None, "V1 does not support LoRA yet."
        # Scheduling constraints.
@@ -103,21 +100,6 @@ def __init__(
        self.encoder_cache_manager = EncoderCacheManager(
            cache_size=encoder_cache_size)

-         # A request queue for grammar compilation
-         self.grammar: Deque[Request] = deque()
-         # initialize the tokenizer on the scheduler (this is used for constrained decoding)
-         tokenizer_group = init_tokenizer_from_configs(
-             model_config=model_config,
-             scheduler_config=scheduler_config,
-             parallel_config=parallel_config,
-             lora_config=lora_config)
-         tokenizer_group.ping()
-         # setup guided decoding, right now uses xgrammar
-         self.guided_decoding_manager = GuidedDecodingManager(
-             backend=decoding_config.guided_decoding_backend,
-             tokenizer_group=tokenizer_group,
-             model_config=model_config)
-
    def schedule(self) -> "SchedulerOutput":
        # NOTE(woosuk) on the scheduling algorithm:
        # There's no "decoding phase" nor "prefill phase" in the scheduler.
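For context on the block removed above: it wired a tokenizer group and a guided-decoding backend into the scheduler so grammars could be compiled off the scheduling hot path. A minimal, self-contained sketch of that idea, using hypothetical names (AsyncGrammarCompiler, compile_fn) rather than vLLM's actual classes:

# Sketch only (assumed names): background grammar compilation returning Futures.
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable


class AsyncGrammarCompiler:
    """Compile guided-decoding grammars in a background thread pool."""

    def __init__(self, compile_fn: Callable[[str], Any], max_workers: int = 4):
        self._compile_fn = compile_fn  # e.g. an xgrammar-backed compile function
        self._pool = ThreadPoolExecutor(max_workers=max_workers)

    def submit(self, schema: str) -> Future:
        # Returns immediately; the scheduler polls the Future later instead of
        # blocking a scheduling step on compilation.
        return self._pool.submit(self._compile_fn, schema)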
@@ -133,25 +115,6 @@ def schedule(self) -> "SchedulerOutput":
        scheduled_running_reqs: List[Request] = []
        preempted_reqs: List[Request] = []

-         # we need to check the grammar queue for any requests that have finished FSM compilation
-         newly_grammar_reqs: List[Request] = []
-         scheduled_grammar_reqs: Deque[Request] = deque()
-         while self.grammar:
-             request = self.grammar.popleft()
-             try:
-                 # When request first added via add_request, then it will be a future call
-                 # check timeout and add it directly to previous queue
-                 request.grammar = request.grammar.result(timeout=0.05)
-                 request.status = RequestStatus.WAITING
-                 newly_grammar_reqs.append(request)
-             except futures._base.TimeoutError:
-                 scheduled_grammar_reqs.append(request)
-         self.grammar = scheduled_grammar_reqs
-
-         # append all newly ready requests to waiting queue with higher priority
-         for req in newly_grammar_reqs:
-             self.waiting.appendleft(req)
-
        req_to_new_block_ids: Dict[str, List[int]] = {}
        num_scheduled_tokens: Dict[str, int] = {}
        token_budget = self.max_num_scheduled_tokens
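The polling loop removed above is the consumer side of that pattern: each pending request gets a short, bounded result() call, ready requests are promoted to the front of the waiting queue, and everything else is requeued so a slow compilation never stalls a scheduling step. A rough, self-contained sketch (the request objects here are stand-ins, not vLLM's Request class):

# Sketch only (assumed fields): non-blocking promotion of requests whose
# grammar Future has finished compiling.
from collections import deque
from concurrent.futures import TimeoutError as FutureTimeoutError
from typing import Deque


def poll_grammar_queue(pending: Deque, waiting: Deque, timeout: float = 0.05) -> None:
    still_pending: Deque = deque()
    while pending:
        request = pending.popleft()
        try:
            # Resolve the Future into the compiled grammar if it is ready.
            request.grammar = request.grammar.result(timeout=timeout)
            waiting.appendleft(request)  # ready requests get priority
        except FutureTimeoutError:
            still_pending.append(request)
    pending.extend(still_pending)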
@@ -238,13 +201,6 @@ def schedule(self) -> "SchedulerOutput":
                self.encoder_cache_manager.allocate(request, i)
                encoder_budget = new_encoder_budget

-             # Track if we need guided decoding
-             # Create individual bitmask for requests with grammar
-             if request.grammar is not None:
-                 if request.request_id not in guided_decoding_bitmasks:
-                     bitmask = request.grammar.allocate_bitmask(1, vocab_size)
-                     guided_decoding_bitmasks[request.request_id] = bitmask
-
        # Next, schedule the WAITING requests.
        if not preempted_reqs:
            while self.waiting:
@@ -258,7 +214,8 @@ def schedule(self) -> "SchedulerOutput":
                request = self.waiting[0]

                # allocate bitmask on request on first round
-                 if request.grammar: request.allocate_grammar_bitmask(vocab_size=vocab_size)
+                 if request.grammar:
+                     request.allocate_grammar_bitmask(vocab_size=vocab_size)

                # Get already-cached tokens.
                computed_blocks, num_computed_tokens = \
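The allocate_grammar_bitmask call added above gives each guided request its own token bitmask. As a rough illustration of what such a mask can look like, here is a minimal sketch assuming a packed int32 layout (one bit per vocabulary token) and a hypothetical helper that masks logits; the actual layout is owned by the guided-decoding backend (e.g. xgrammar):

# Sketch only (assumed layout): packed token bitmask and logits masking.
import torch


def allocate_token_bitmask(batch_size: int, vocab_size: int) -> torch.Tensor:
    # One int32 word stores the allow/deny bits for 32 vocabulary entries.
    words_per_row = (vocab_size + 31) // 32
    return torch.zeros(batch_size, words_per_row, dtype=torch.int32)


def apply_token_bitmask(logits: torch.Tensor, bitmask: torch.Tensor) -> None:
    # Unpack the bits into a boolean mask of shape (batch, vocab) and set
    # the logits of disallowed tokens to -inf before sampling.
    vocab_size = logits.shape[-1]
    bits = torch.arange(32, dtype=torch.int32)
    allowed = ((bitmask.unsqueeze(-1) >> bits) & 1).bool()
    allowed = allowed.flatten(start_dim=1)[:, :vocab_size]
    logits.masked_fill_(~allowed, float("-inf"))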
@@ -356,8 +313,12 @@ def schedule(self) -> "SchedulerOutput":
        ]
        running_reqs_data = [
            self._make_running_request_data(
-                 req, req_to_new_block_ids[req.request_id],
-                 req.num_computed_tokens, grammar=req.grammar, grammar_bitmask=req.grammar_bitmask) for req in scheduled_running_reqs
+                 req,
+                 req_to_new_block_ids[req.request_id],
+                 req.num_computed_tokens,
+                 grammar=req.grammar,
+                 grammar_bitmask=req.grammar_bitmask)
+             for req in scheduled_running_reqs
        ]
        preempted_req_ids = {req.request_id for req in preempted_reqs}

@@ -375,7 +336,6 @@ def schedule(self) -> "SchedulerOutput":
            # It contains the request IDs that are finished in between
            # the previous and the current steps.
            finished_req_ids=self.finished_req_ids,
-             guided_decoding_bitmasks=guided_decoding_bitmasks,
            free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
        )

@@ -398,7 +358,7 @@ def _make_running_request_data(
            req_data.new_block_ids = new_block_ids
            req_data.num_computed_tokens = num_computed_tokens
            req_data.grammar = grammar
-                 req_data.grammar_bitmask = grammar_bitmask
+             req_data.grammar_bitmask = grammar_bitmask
        else:
            req_data = RunningRequestData.from_request(request, new_block_ids,
                                                       num_computed_tokens)
@@ -480,6 +440,8 @@ def update_from_output(
        scheduler_output: "SchedulerOutput",
        model_runner_output: "ModelRunnerOutput",
    ) -> EngineCoreOutputs:
+         # concern: batchsize >>> 1000
+         # compilation << update
        # NOTE(woosuk): This method doesn't consider speculative decoding.
        sampled_token_ids = model_runner_output.sampled_token_ids
        num_scheduled_tokens = scheduler_output.num_scheduled_tokens
@@ -560,11 +522,7 @@ def _check_stop(self, request: Request) -> bool:

    def add_request(self, request: Request) -> None:
        self.requests[request.request_id] = request
-
-         if self.guided_decoding_manager.collect(request):
-             self.grammar.append(request)
-         else:
-             self.waiting.append(request)
+         self.waiting.append(request)

    def finish_requests(
        self,
@@ -648,7 +606,8 @@ def from_request(
                   sampling_params=request.sampling_params,
                   block_ids=block_ids,
                   num_computed_tokens=num_computed_tokens,
-                    grammar=request.grammar, grammar_bitmask=request.grammar_bitmask)
+                    grammar=request.grammar,
+                    grammar_bitmask=request.grammar_bitmask)


@dataclass
@@ -671,7 +630,8 @@ def from_request(
        return cls(req_id=request.request_id,
                   block_ids=block_ids,
                   num_computed_tokens=num_computed_tokens,
-                    grammar=request.grammar, grammar_bitmask=request.grammar_bitmask)
+                    grammar=request.grammar,
+                    grammar_bitmask=request.grammar_bitmask)


@dataclass
@@ -694,7 +654,8 @@ def from_request(
        return cls(req_id=request.request_id,
                   new_block_ids=new_block_ids,
                   num_computed_tokens=num_computed_tokens,
-                    grammar=request.grammar, grammar_bitmask=request.grammar_bitmask)
+                    grammar=request.grammar,
+                    grammar_bitmask=request.grammar_bitmask)


@dataclass