
[BugFix][V1] Fix int32 token index overflow when preparing input ids #16806


Merged
merged 7 commits on Apr 23, 2025
Changes from 5 commits
9 changes: 8 additions & 1 deletion vllm/v1/worker/gpu_model_runner.py
@@ -274,6 +274,11 @@ def __init__(
pin_memory=self.pin_memory)
self.seq_lens_np = self.seq_lens_cpu.numpy()

max_token_idx = self.max_num_reqs * self.max_model_len - 1
# if max token idx exceeds int32 max, use int64 to avoid overflow
self.token_indices_dtype = np.int32 \
if max_token_idx <= np.iinfo(np.int32).max else np.int64

def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
"""Update the cached states and the persistent batch with the scheduler
output.
@@ -521,8 +526,10 @@ def _prepare_inputs(
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
# where M is the max_model_len.
# For long context, may need to cast to int64 to avoid overflow
token_indices = (positions_np +
req_indices * self.input_batch.token_ids_cpu.shape[1])
req_indices.astype(self.token_indices_dtype) *
self.input_batch.token_ids_cpu.shape[1])

# NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large
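For reference, the standalone NumPy sketch below (not vLLM code; the sizes are made up for illustration) shows the overflow this change guards against: once max_num_reqs * max_model_len - 1 exceeds the int32 range, the flattened token index wraps around unless the request indices are widened first, which is what the astype(self.token_indices_dtype) cast above does.

```python
import numpy as np

# Hypothetical sizes, chosen so the largest flattened token index
# (max_num_reqs * max_model_len - 1) no longer fits in int32.
max_num_reqs = 4096
max_model_len = 1_048_576  # 1M-token context window

# Same dtype selection as the PR adds in __init__.
max_token_idx = max_num_reqs * max_model_len - 1
token_indices_dtype = (np.int32 if max_token_idx <= np.iinfo(np.int32).max
                       else np.int64)  # -> np.int64 here

# One token of the last request slot: request index and in-request position.
req_indices = np.array([max_num_reqs - 1], dtype=np.int32)
positions_np = np.array([7], dtype=np.int64)

# The int32 multiply wraps around (NumPy integer overflow is not checked)
# and produces a negative index.
overflowed = positions_np + req_indices * np.int32(max_model_len)

# Widening first, as the fixed code does, keeps the index exact.
correct = positions_np + req_indices.astype(token_indices_dtype) * max_model_len

print(overflowed, correct)  # e.g. [-1048569] vs [4293918727]
```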
9 changes: 8 additions & 1 deletion vllm/v1/worker/tpu_model_runner.py
@@ -219,6 +219,11 @@ def __init__(
self.arange_np = np.arange(self.max_num_tokens, dtype=np.int32)
self.num_reqs_paddings = _get_req_paddings(
min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)

max_token_idx = self.max_num_reqs * self.max_model_len - 1
# if max token idx exceeds int32 max, use int64 to avoid overflow
self.token_indices_dtype = np.int32 \
if max_token_idx <= np.iinfo(np.int32).max else np.int64

def _update_num_xla_graphs(self, case_str):
check_comp = self.check_recompilation and not self.enforce_eager
@@ -457,8 +462,10 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
# where M is the max_model_len.
# For long context, may need to cast to int64 to avoid overflow
token_indices = (positions_np +
req_indices * self.input_batch.token_ids_cpu.shape[1])
req_indices.astype(self.token_indices_dtype) *
Collaborator

Shall we just create req_indices as token_indices_dtype instead of upcasting here?

Collaborator

req_indices may be used in the following logic.

Another idea is to always use an int64 index for such use cases, but that would probably require a bigger change/refactor. Thoughts, @WoosukKwon? Shall we just land this fix and think about a more thorough check in a follow-up PR?

Contributor

I think we can create self.arange_np with dtype self.token_indices_dtype here. The increase in host memory is negligible even at 10M, and we don't pay the cast at each scheduler step.
Computation is in i64 anyway, since self.positions_cpu is already i64, so this wouldn't be the only instance.

Collaborator Author
@sarckk Apr 22, 2025

Makes sense. Maybe we can just keep self.arange_np in int64, similar to self.positions_cpu?

self.input_batch.token_ids_cpu.shape[1])

# NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large
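As a footnote to the review thread above, the alternative that was floated (keeping the arange buffer in int64 so no per-step cast is needed) would look roughly like the standalone sketch below. The sizes and the np.repeat-based construction of req_indices are illustrative assumptions, not the exact runner code; it reuses the [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] example from the diff comment.

```python
import numpy as np

# Illustrative sizes only.
max_num_tokens = 8192
max_model_len = 1_048_576  # plays the role of M below

# Alternative discussed above: allocate the arange buffer as int64 once at
# init. The extra host memory is 4 bytes per slot, negligible even for very
# large token budgets, and no astype() is needed on each scheduler step.
arange_np = np.arange(max_num_tokens, dtype=np.int64)

# Per-request scheduled token counts for 3 requests (same example as the
# diff comment: positions [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]).
num_scheduled_tokens = np.array([2, 5, 3])
req_indices = np.repeat(arange_np[:3], num_scheduled_tokens)  # already int64
positions_np = np.concatenate([np.arange(n) for n in num_scheduled_tokens])

# [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2],
# computed in int64 end to end, so it cannot wrap for long contexts.
token_indices = positions_np + req_indices * max_model_len
print(token_indices.dtype)  # int64
```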