[Core] Default to using per_token quantization for fp8 when cutlass is supported. (vllm-project#8651)

elfiegg · mgoin · Isotr0py · commit e58e3773bf12 · 2025-02-02T21:35:02.000+08:00
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,8 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            use_per_token_if_dynamic=False)
+            # Default to using per_token quantization if cutlass is supported
+            use_per_token_if_dynamic=self.cutlass_fp8_supported)
 
 
 class Fp8MoEMethod(FusedMoEMethodBase):