From 5994ac23b5255c66a366293c61a16cac966a7d32 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 5 Mar 2025 10:10:59 -0800
Subject: [PATCH] [V1][BugFix] Fix for mixed top_k batch

We weren't properly handling the case where there are multiple temperature > 0 requests in a batch where some specify top_k and some don't.

Signed-off-by: Nick Hill
---
 vllm/v1/worker/gpu_input_batch.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index c0e9ff0286d..d5ed615ab23 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -264,9 +264,12 @@ def add_request(
         self.top_p_cpu[req_index] = sampling_params.top_p
         if sampling_params.top_p < 1:
             self.top_p_reqs.add(req_id)
-        self.top_k_cpu[req_index] = sampling_params.top_k
-        if sampling_params.top_k > 0:
+        top_k = sampling_params.top_k
+        if 0 < top_k < self.vocab_size:
             self.top_k_reqs.add(req_id)
+        else:
+            top_k = self.vocab_size
+        self.top_k_cpu[req_index] = top_k
         self.min_p_cpu[req_index] = sampling_params.min_p
         self.frequency_penalties_cpu[
             req_index] = sampling_params.frequency_penalty