vllm-project · ywang96 · Mar 5, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025
@@ -92,10 +92,13 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if not all(0 <= tid < self.model_config.vocab_size
+        if params.allowed_token_ids is not None and len(
+                params.allowed_token_ids) == 0:
+            raise ValueError("allowed_token_ids is not None and empty!")
+        if not all(0 <= tid < self.model_config.get_vocab_size()
-        if params.allowed_token_ids is not None and len(
-                params.allowed_token_ids) == 0:
-            raise ValueError("allowed_token_ids is not None and empty!")
-        if not all(0 <= tid < self.model_config.get_vocab_size()
+        if not params.allowed_token_ids:
+            raise ValueError("allowed_token_ids cannot be empty")
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size
-        if params.allowed_token_ids is not None and len(
-                params.allowed_token_ids) == 0:
-            raise ValueError("allowed_token_ids is not None and empty!")
-        if not all(0 <= tid < self.model_config.get_vocab_size()
+        if not params.allowed_token_ids:
+            raise ValueError("allowed_token_ids cannot be empty")
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size
                    for tid in params.allowed_token_ids):
             raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
 
     def process_inputs(
         self,

@@ -300,17 +300,17 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
-                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
-                                                          self.vocab_size,
-                                                          dtype=torch.bool,
-                                                          device=self.device)
-                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
+                self.allowed_token_ids_mask = torch.ones(self.max_num_reqs,
+                                                         self.vocab_size,
+                                                         dtype=torch.bool,
+                                                         device=self.device)
+                self.allowed_token_ids_mask_cpu_tensor = torch.ones(
                     self.max_num_reqs,
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
@@ -359,7 +359,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
+            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(True)
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None: