@@ -199,6 +199,8 @@ def __init__(
         self.logit_bias: list[Optional[dict[int,
                                             float]]] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token is
+        # allowed, the value is False, since we use masked_fill_ to set -inf.
         self.allowed_token_ids_mask: Optional[torch.Tensor] = None
         self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
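For context, the inverted semantics above mean `True` marks a token to be filled with `-inf`. A minimal sketch of how such a mask would be consumed, assuming a `logits` tensor of shape `[num_reqs, vocab_size]` (shapes and values here are illustrative, not the PR's code):

```python
import torch

# Hypothetical shapes for illustration only.
num_reqs, vocab_size = 2, 8
logits = torch.randn(num_reqs, vocab_size)

# True = disallowed (filled with -inf); allowed tokens stay False,
# matching the NOTE above.
mask = torch.zeros(num_reqs, vocab_size, dtype=torch.bool)
mask[0] = True
mask[0, [1, 3]] = False  # request 0 only allows tokens 1 and 3

logits.masked_fill_(mask, float("-inf"))
```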
@@ -300,6 +302,7 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                           self.vocab_size,
                                                           dtype=torch.bool,
@@ -309,8 +312,10 @@ def add_request(
                 self.vocab_size,
                 dtype=torch.bool,
                 device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
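The behavioral fix in this hunk: the old code set the allowed ids to `True`, which under a `masked_fill_(mask, -inf)` consumer would erase exactly the tokens that were supposed to survive. The new order first masks the whole row, then un-masks the allowed ids. A minimal self-contained sketch (identifiers are illustrative):

```python
import torch

vocab_size = 8
allowed_token_ids = [1, 3]  # hypothetical request parameter

# Corrected order: start fully masked, then un-mask the allowed ids.
row = torch.ones(vocab_size, dtype=torch.bool)
row[allowed_token_ids] = False

logits = torch.randn(vocab_size)
logits.masked_fill_(row, float("-inf"))
assert torch.isfinite(logits[allowed_token_ids]).all()  # allowed tokens survive
```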
@@ -359,6 +364,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         return req_index
 
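In `remove_request`, `fill_(False)` leaves the freed row fully un-masked, so a slot reused by a later request masks nothing until that request's `allowed_token_ids` are applied. A one-line sketch under the same inverted semantics:

```python
import torch

row = torch.ones(8, dtype=torch.bool)  # stale mask from a removed request
row.fill_(False)  # False everywhere: no token gets -inf for the reused slot
assert not row.any()
```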