
Commit e4564cb

jinzhen-lin authored and tjtanaa committed
[Kernel] fix moe_align_block_size error condition (vllm-project#12239)
Signed-off-by: Jinzhen Lin <[email protected]>
1 parent 98b8414 commit e4564cb

File tree

csrc/moe/moe_align_sum_kernels.cu
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/siglip.py

3 files changed: +13 -11 lines

csrc/moe/moe_align_sum_kernels.cu

Lines changed: 9 additions & 5 deletions
@@ -33,7 +33,9 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
 
   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
-  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1);
+  token_cnts_t* tokens_cnts =
+      (token_cnts_t*)(shared_mem + num_experts +
+                      1);  // 2d tensor with shape (blockDim.x + 1, num_experts)
 
   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
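Context on why the offset matters (not part of the commit): the kernel lays dynamic shared memory out as cumsum, an int32_t array of length num_experts + 1, immediately followed by the tokens_cnts table of shape (blockDim.x + 1, num_experts), so the counts pointer has to skip num_experts + 1 ints rather than blockDim.x + 1. Below is a minimal sketch of the size calculation that layout implies; the helper name and the num_thread parameter are illustrative, not taken from the file.

#include <cstddef>
#include <cstdint>

// Illustrative only: bytes of dynamic shared memory implied by the layout above,
// i.e. cumsum[(num_experts + 1)] int32_t values followed by
// tokens_cnts[(num_thread + 1) * num_experts] entries of token_cnts_t.
template <typename token_cnts_t>
std::size_t moe_align_shared_mem_bytes(int num_experts, int num_thread) {
  std::size_t cumsum_bytes = (num_experts + 1) * sizeof(int32_t);
  std::size_t cnts_bytes =
      static_cast<std::size_t>(num_thread + 1) * num_experts * sizeof(token_cnts_t);
  return cumsum_bytes + cnts_bytes;
}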
@@ -234,14 +236,16 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
 
   bool use_global_memory = false;
   bool use_i16 = false;  // Use uint16_t for shared memory token counts
-  if (shared_mem_i16 > device_max_shared_mem) {
-    use_global_memory = true;
-  } else if (shared_mem_i32 > device_max_shared_mem &&
+  if (shared_mem_i32 < device_max_shared_mem) {
+    // Do nothing in this case. We're all set to use int32_t token counts
+  } else if (shared_mem_i16 < device_max_shared_mem &&
              topk_ids.numel() <= 65535) {
     // when nelements of topk_ids is smaller than 65535 (max value of uint16),
     // element value of token_cnts would also smaller than 65535,
     // so we can use uint16 as dtype of token_cnts
     use_i16 = true;
+  } else {
+    use_global_memory = true;
   }
 
   if (use_global_memory) {
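Read as a whole, the rewritten branch is a three-way choice: prefer int32 counts in shared memory when they fit, fall back to uint16 counts when that smaller footprint fits and every count is provably below 65536 (counts are bounded by topk_ids.numel()), and otherwise spill the counts to global memory. A hedged restatement of that decision order follows; the enum and helper are illustrative and not part of the source.

#include <cstddef>
#include <cstdint>

// Illustrative restatement of the corrected selection order above.
enum class MoeCountsStorage { kSharedI32, kSharedI16, kGlobal };

inline MoeCountsStorage choose_counts_storage(std::size_t shared_mem_i32,
                                              std::size_t shared_mem_i16,
                                              std::int64_t topk_numel,
                                              std::size_t device_max_shared_mem) {
  if (shared_mem_i32 < device_max_shared_mem) {
    return MoeCountsStorage::kSharedI32;  // int32 counts fit in shared memory
  } else if (shared_mem_i16 < device_max_shared_mem && topk_numel <= 65535) {
    // every count is at most topk_ids.numel() <= 65535, so uint16 cannot overflow
    return MoeCountsStorage::kSharedI16;
  }
  return MoeCountsStorage::kGlobal;  // neither fits: keep the counts in global memory
}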
@@ -342,4 +346,4 @@ void moe_sum(torch::Tensor& input,  // [num_tokens, topk, hidden_size]
       at::sum_out(output, input, 1);
       break;
   }
-}
+}

vllm/model_executor/models/paligemma.py

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
             "up_proj",
         ],
     }
-
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

vllm/model_executor/models/siglip.py

Lines changed: 3 additions & 5 deletions
@@ -348,12 +348,10 @@ def __init__(
         if quant_config and quant_config.get_name() == "bitsandbytes":
             quantizable = True
         else:
-            # For other quantization, we require the hidden size to be a
+            # For other quantization, we require the hidden size to be a
             # multiple of 64
-            quantizable = (
-                config.hidden_size % 64 == 0
-                and config.intermediate_size % 64 == 0
-            )
+            quantizable = (config.hidden_size % 64 == 0
+                           and config.intermediate_size % 64 == 0)
         self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
