
Commit 21fab45

houseroad authored and lulmer committed
[Bugfix][V1] Fix allowed_token_ids for v1 Sampler (vllm-project#14169)
Signed-off-by: Lu Fang <[email protected]>
Signed-off-by: Louis Ulmer <[email protected]>
1 parent 2d0ac48 · commit 21fab45

2 files changed: +12, -4 lines


vllm/v1/engine/processor.py

Lines changed: 5 additions & 3 deletions
@@ -92,10 +92,12 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if not all(0 <= tid < self.model_config.vocab_size
-                   for tid in params.allowed_token_ids):
+        if not params.allowed_token_ids:
+            raise ValueError("allowed_token_ids is not None and empty!")
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
             raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
 
     def process_inputs(
         self,
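
The new checks can be exercised in isolation. Below is a minimal sketch of
the behavior this hunk enforces, using a hypothetical standalone helper
(validate_allowed_token_ids is not vLLM's actual API; in the engine the
checks run on the Processor and read the vocab size via
model_config.get_vocab_size()):

    from typing import Optional

    def validate_allowed_token_ids(
            allowed_token_ids: Optional[list[int]],
            vocab_size: int) -> None:
        # None means the feature is unused; an explicit empty list is an error.
        if allowed_token_ids is None:
            return
        if not allowed_token_ids:
            raise ValueError("allowed_token_ids is not None and empty!")
        if not all(0 <= tid < vocab_size for tid in allowed_token_ids):
            raise ValueError("allowed_token_ids contains out-of-vocab token id!")

    validate_allowed_token_ids(None, vocab_size=32000)       # OK: feature unused
    validate_allowed_token_ids([1, 5, 9], vocab_size=32000)  # OK: ids in vocab
    for bad in ([], [-1], [32000]):
        try:
            validate_allowed_token_ids(bad, vocab_size=32000)
        except ValueError as err:
            print(f"{bad!r} rejected: {err}")

Note the hunk also switches from the self.model_config.vocab_size attribute
to the self.model_config.get_vocab_size() accessor when validating ids.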

vllm/v1/worker/gpu_input_batch.py

Lines changed: 7 additions & 1 deletion
@@ -199,6 +199,8 @@ def __init__(
         self.logit_bias: list[Optional[dict[int,
                                             float]]] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token is
+        # allowed, the value is False, since we use masked_fill_ to set -inf.
         self.allowed_token_ids_mask: Optional[torch.Tensor] = None
         self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
 
@@ -300,6 +302,7 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                           self.vocab_size,
                                                           dtype=torch.bool,
@@ -309,8 +312,10 @@ def add_request(
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
@@ -359,6 +364,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         return req_index
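
Why the polarity flip matters: the sampler consumes this mask with
masked_fill_, which writes -inf wherever the mask is True. A request's row
must therefore start all-True (everything banned) with only the allowed ids
flipped to False; the pre-fix code started from zeros and set the allowed
ids to True, masking out exactly the tokens the user asked to allow. A
minimal standalone PyTorch sketch of the intended semantics (not the
sampler's actual code):

    import torch

    vocab_size, allowed = 8, [2, 5]

    # Build one request's row the way add_request now does:
    # all True (banned), then allowed ids set to False.
    mask = torch.ones(vocab_size, dtype=torch.bool)
    mask[allowed] = False

    logits = torch.randn(vocab_size)
    # masked_fill_ writes -inf where the mask is True, so only the
    # allowed ids keep finite logits and remain sampleable.
    logits.masked_fill_(mask, float("-inf"))
    print(logits)  # finite only at indices 2 and 5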
