[DeepseekR1] bring back dequant option and fix acc (vllm-project#883)

xuechendi · michalkuligowski · commit ebb622d8af2c · 2025-03-14T13:28:16.000+01:00
1. use VLLM_ENABLE_RUNTIME_DEQUANT=1 to run with runtime dequantize
2. use VLLM_DMOE_DYNAMIC_SCALE=1 to run with dynamic dequantize +
dynamic MOE
3.acc looks good as below 
```
VLLM_DMOE_DYNAMIC_SCALE=1 python scripts/run_lm_eval.py -l 64 --batch_size 8
{"gsm8k": {"alias": "gsm8k", "exact_match,strict-match": 0.96875, "exact_match_stderr,strict-match": 0.021921011700381302, "exact_match,flexible-extract": 0.96875, "exact_match_stderr,flexible-extract": 0.021921011700381302}}{"e2e time(secs)": 938.2986768169999}
```

---------

Signed-off-by: Chendi Xue &lt;chendi.xue@intel.com&gt;
diff --git a/scripts/DEEPSEEK_R1_ON_GAUDI.md b/scripts/DEEPSEEK_R1_ON_GAUDI.md
@@ -11,19 +11,39 @@ cd vllm;  pip install -r requirements-hpu.txt; VLLM_TARGET_DEVICE=hpu pip instal
 huggingface-cli download --local-dir ${YOUR_PATH}/DeepSeek-R1 deepseek-ai/DeepSeek-R1
 ```
 
-# Option 1. run with dynamic quantization
+# Option 1. run with runtime dequantize with block-based scale
 > expect new DynamicMOE kernel ready in few weeks.
 > Current Performance is worse than static quantization due to lack of dynamic MOE support.
 ## step 1. run example
 ```
-python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
+VLLM_ENABLE_RUNTIME_DEQUANT=1 python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
 ```
-## step 2. run benchmark
+## step 2. run lm_eval
 ```
-bash scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh
+VLLM_ENABLE_RUNTIME_DEQUANT=1 python scripts/run_lm_eval.py -l 64 --batch_size 1 --ep_size 1
+{"gsm8k": {"alias": "gsm8k", "exact_match,strict-match": 0.96875, "exact_match_stderr,strict-match": 0.021921011700381302, "exact_match,flexible-extract": 0.96875, "exact_match_stderr,flexible-extract": 0.021921011700381302}}{"e2e time(secs)": 938.2986768169999}
 ```
 
-# Option 2. run with static quantization
+# Option 2. run with dynamic quantization
+> expect new DynamicMOE kernel ready in few weeks.
+> Current Performance is worse than static quantization due to lack of dynamic MOE support.
+## step 1. run example
+```
+# if you're testing with patched kernel
+# use VLLM_DMOE_DYNAMIC_SCALE=1 to enable dynamic scaling supported DynamicMOE
+VLLM_DMOE_DYNAMIC_SCALE=1 python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
+```
+## step 2. run lm_eval
+```
+VLLM_DMOE_DYNAMIC_SCALE=1 python scripts/run_lm_eval.py -l 64 --batch_size 1
+{"gsm8k": {"alias": "gsm8k", "exact_match,strict-match": 0.96875, "exact_match_stderr,strict-match": 0.021921011700381302, "exact_match,flexible-extract": 0.96875, "exact_match_stderr,flexible-extract": 0.021921011700381302}}{"e2e time(secs)": 938.2986768169999}
+```
+## step 3. run benchmark
+```
+VLLM_DMOE_DYNAMIC_SCALE=1 bash scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh
+```
+
+# Option 3. run with static quantization
 > current best performance
 ## step 1. Prepare static quantization model
 ```
diff --git a/scripts/run_lm_eval.py b/scripts/run_lm_eval.py
@@ -5,7 +5,7 @@
 import json
 import time
 
-model_path = "/data/models/DeepSeek-R1-static/"
+model_path = "/data/models/DeepSeek-R1/"
 #model_path = "/mnt/workdisk/dohayon/Projects/R1/DeepSeek-R1-fp8/"
 # model_path = "deepseek-ai/DeepSeek-V2-Lite"
 
@@ -16,8 +16,8 @@
 parser.add_argument("--tokenizer", type=str, default=model_path, help="The model path.")
 parser.add_argument("--tp_size", type=int, default=8, help="Tensor Parallelism size.")
 parser.add_argument("--ep_size", type=int, default=8, help="Expert Parallelism size.")
-parser.add_argument("-l", "--limit", type=int, default=256, help="test request counts.")
-parser.add_argument("--batch_size", type=int, default=128, help="The batch size.")
+parser.add_argument("-l", "--limit", type=int, default=64, help="test request counts.")
+parser.add_argument("--batch_size", type=int, default=1, help="The batch size.")
 args = parser.parse_args()
 
 os.environ["VLLM_SKIP_WARMUP"] = "true"
@@ -30,10 +30,11 @@
     os.environ["VLLM_MOE_N_SLICE"] = "4"
     os.environ["VLLM_EP_SIZE"] = "1"
 
-os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "0"
+os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
 os.environ["PT_HPU_WEIGHT_SHARING"] = "0"
 
-os.environ['VLLM_DMOE_DYNAMIC_SCALE']='1'
+#os.environ['VLLM_DMOE_DYNAMIC_SCALE']='1'
+#os.environ['VLLM_ENABLE_RUNTIME_DEQUANT']='1'
 
 if __name__ == "__main__":
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -610,14 +610,15 @@ def weight_loader(self, param: torch.nn.Parameter,
                     expert_data=expert_data,
                     tp_rank=tp_rank,
                     expert_id=expert_id)
-            elif current_platform.is_hpu():
-                self._load_per_channel_weight_scale(
-                    shard_id=shard_id,
-                    shard_dim=shard_dim,
-                    loaded_weight=loaded_weight,
-                    expert_data=expert_data,
-                    tp_rank=tp_rank,
-                    expert_id=expert_id)
+            # elif current_platform.is_hpu():
+            #     print(f"!!!!!!!!!!!!! HPU load per channel weight scale")
+            #     self._load_per_channel_weight_scale(
+            #         shard_id=shard_id,
+            #         shard_dim=shard_dim,
+            #         loaded_weight=loaded_weight,
+            #         expert_data=expert_data,
+            #         tp_rank=tp_rank,
+            #         expert_id=expert_id)
             elif quant_method in [
                     FusedMoeWeightScaleSupported.GROUP.value,
                     FusedMoeWeightScaleSupported.BLOCK.value,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -37,7 +37,8 @@
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     dynamic_quant,
     dequant_block_fp8_weight_naive,
-    apply_block_fp8_linear_hpu_dynamic)
+    apply_block_fp8_linear_hpu_dynamic,
+    apply_block_fp8_linear_hpu_dequant)
 
 if current_platform.is_hpu():
     import habana_frameworks.torch as htorch
@@ -58,6 +59,7 @@ def __init__(
         ignored_layers: Optional[List[str]] = None,
         weight_block_size: Optional[List[int]] = None,
     ) -> None:
+        self.enable_runtime_dequant = os.environ.get("VLLM_ENABLE_RUNTIME_DEQUANT", "0") in ["1", "true"]
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
             logger.warning("Detected fp8 checkpoint. Please note that the "
@@ -282,17 +284,24 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     layer.weight.data,
                     layer.weight_scale_inv.data,
                     self.quant_config.weight_block_size)
-                weight, weight_scale_inv = dynamic_quant(dequant_block_fp8_weight_naive(
-                    weight,
-                    layer.weight_scale_inv.data,
-                    self.quant_config.weight_block_size,
-                    original_M=orig_M,
-                    original_N=orig_N,
-                    do_unpad=True))
-                weight_scale_inv = weight_scale_inv.squeeze(-1)
-                layer.weight.data.copy_(weight)
-                layer.weight_scale_inv = Parameter(weight_scale_inv,
-                                                   requires_grad=False)
+                if self.quant_config.enable_runtime_dequant:
+                    layer.weight = torch.nn.Parameter(weight, requires_grad=False)
+                    orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32), requires_grad=False)
+                    orig_N = torch.nn.Parameter(torch.tensor(orig_N, dtype=torch.int32), requires_grad=False)
+                    layer.register_parameter("orig_M", orig_M)
+                    layer.register_parameter("orig_N", orig_N)
+                else:
+                    weight, weight_scale_inv = dynamic_quant(dequant_block_fp8_weight_naive(
+                        weight,
+                        layer.weight_scale_inv.data,
+                        self.quant_config.weight_block_size,
+                        original_M=orig_M,
+                        original_N=orig_N,
+                        do_unpad=True))
+                    weight_scale_inv = weight_scale_inv.squeeze(-1)
+                    layer.weight.data.copy_(weight)
+                    layer.weight_scale_inv = Parameter(weight_scale_inv,
+                                                    requires_grad=False)
                 return
             if current_platform.is_rocm():
                 weight, weight_scale_inv, _ = \
@@ -404,13 +413,26 @@ def apply(self,
         if self.block_quant:
             assert self.quant_config.weight_block_size is not None
             if current_platform.is_hpu():
-                return apply_block_fp8_linear_hpu_dynamic(
-                    input=x,
-                    weight=layer.weight,
-                    weight_scale=layer.weight_scale_inv,
-                    input_scale=layer.input_scale,
-                    bias=bias,
-                )
+                if self.quant_config.enable_runtime_dequant:
+                    return apply_block_fp8_linear_hpu_dequant(
+                        input=x,
+                        weight=layer.weight,
+                        block_size=self.quant_config.weight_block_size,
+                        weight_scale=layer.weight_scale_inv,
+                        input_scale=layer.input_scale,
+                        bias=bias,
+                        original_M=layer.orig_M,
+                        original_N=layer.orig_N,
+                        do_unpad=True,
+                    )
+                else:
+                    return apply_block_fp8_linear_hpu_dynamic(
+                        input=x,
+                        weight=layer.weight,
+                        weight_scale=layer.weight_scale_inv,
+                        input_scale=layer.input_scale,
+                        bias=bias,
+                    )
             return apply_w8a8_block_fp8_linear(
                 input=x,
                 weight=layer.weight,
@@ -615,6 +637,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
         # TODO (rob): refactor block quant into separate class.
         if self.block_quant:
             if current_platform.is_hpu():
+                if self.quant_config.enable_runtime_dequant:
+                    return
                 w13_weight, w13_weight_scale_inv = dynamic_quant(dequant_block_fp8_weight_naive(
                     layer.w13_weight.data,
                     layer.w13_weight_scale_inv.data,
@@ -946,6 +970,7 @@ def do_dynamic_moe_with_static_scaling(x, topk_ids, topk_weights, w13_weight_fp8
                                             activation="silu",
                                             experts_min=min_expert + ep_shift,
                                             experts_max=max_expert - 1 + ep_shift)
+                htorch.core.mark_step()
                 if i == 0:
                     final_hidden_states = current_hidden_states
                 else:
@@ -976,6 +1001,40 @@ def do_dynamic_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp
                                             activation="silu",
                                             experts_min=min_expert + ep_shift,
                                             experts_max=max_expert - 1 + ep_shift)
+                htorch.core.mark_step()
+                if i == 0:
+                    final_hidden_states = current_hidden_states
+                else:
+                    final_hidden_states.add_(current_hidden_states)
+            return final_hidden_states
+
+        def do_dynamic_moe_with_dequant(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, moe_n_slice, n_expert_slice, w13_weight_scale_inv_fp8=None, w2_weight_scale_inv_fp8=None):
+            w13_weight = dequant_block_fp8_weight_naive(w13_weight_fp8,
+                                                        w13_weight_scale_inv_fp8,
+                                                        block_size=self.quant_config.weight_block_size,
+                                                        dtype=x.dtype)
+            w2_weight = dequant_block_fp8_weight_naive(w2_weight_fp8,
+                                                    w2_weight_scale_inv_fp8,
+                                                    block_size=self.quant_config.weight_block_size,
+                                                    dtype=x.dtype)
+            for i in range(moe_n_slice):
+                min_expert = i * n_expert_slice
+                max_expert = (i + 1) * n_expert_slice
+
+                w13_list_slice = [w13_weight[j, ...] for j in range(min_expert, max_expert)]
+                w2_list_slice = [w2_weight[j, ...] for j in range(min_expert, max_expert)]
+
+                current_hidden_states = torch.ops.hpu.mixture_of_experts(
+                                            hidden_states=x,
+                                            expert_routing_table=topk_ids.to(torch.int64),
+                                            router_weights=topk_weights.to(x.dtype),
+                                            w12=w13_list_slice,
+                                            w3=w2_list_slice,
+                                            permuted_weights=True,
+                                            activation="silu",
+                                            experts_min=min_expert + ep_shift,
+                                            experts_max=max_expert - 1 + ep_shift)
+                htorch.core.mark_step()
                 if i == 0:
                     final_hidden_states = current_hidden_states
                 else:
@@ -1003,7 +1062,9 @@ def do_dynamic_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp
             moe_n_slice = self.moe_n_slice
 
         if self.quant_config.activation_scheme == "dynamic":
-            if not use_static_moe and self.enable_dmoe_dynamic_scale:
+            if self.quant_config.enable_runtime_dequant:
+                final_hidden_states = do_dynamic_moe_with_dequant(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, moe_n_slice, n_expert_slice, w13_weight_scale_inv_fp8, w2_weight_scale_inv_fp8)
+            elif not use_static_moe and self.enable_dmoe_dynamic_scale:
                 final_hidden_states = do_dynamic_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, moe_n_slice, n_expert_slice, w13_weight_scale_inv_fp8, w2_weight_scale_inv_fp8)
             else:
                 final_hidden_states = do_static_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, actual_total_experts, actual_num_experts, w13_weight_scale_inv_fp8, w2_weight_scale_inv_fp8)
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -230,6 +230,28 @@ def apply_block_fp8_linear_hpu_dynamic(
     return output.to(dtype=input.dtype).view(*output_shape)
 
 
+def apply_block_fp8_linear_hpu_dequant(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    original_M: Optional[torch.Tensor] = None,
+    original_N: Optional[torch.Tensor] = None,
+    do_unpad: bool = False,
+) -> torch.Tensor:
+    assert input_scale is None
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    original_M = original_M.data.item()
+    original_N = original_N.data.item()
+    weight = dequant_block_fp8_weight_naive(weight, weight_scale, block_size, input.dtype, original_M, original_N, do_unpad)
+    output = torch.nn.functional.linear(input_2d, weight, bias=None)
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
+
 def input_to_float8(
         x: torch.Tensor,
         dtype: Optional[torch.dtype] = None