Skip to content

Commit ebb622d

Browse files
xuechendimichalkuligowski
authored andcommitted
[DeepseekR1] bring back dequant option and fix acc (vllm-project#883)
1. use VLLM_ENABLE_RUNTIME_DEQUANT=1 to run with runtime dequantize 2. use VLLM_DMOE_DYNAMIC_SCALE=1 to run with dynamic dequantize + dynamic MOE 3.acc looks good as below ``` VLLM_DMOE_DYNAMIC_SCALE=1 python scripts/run_lm_eval.py -l 64 --batch_size 8 {"gsm8k": {"alias": "gsm8k", "exact_match,strict-match": 0.96875, "exact_match_stderr,strict-match": 0.021921011700381302, "exact_match,flexible-extract": 0.96875, "exact_match_stderr,flexible-extract": 0.021921011700381302}}{"e2e time(secs)": 938.2986768169999} ``` --------- Signed-off-by: Chendi Xue <[email protected]>
1 parent 35c4b81 commit ebb622d

File tree

5 files changed

+143
-38
lines changed

5 files changed

+143
-38
lines changed

scripts/DEEPSEEK_R1_ON_GAUDI.md

+25-5
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,39 @@ cd vllm; pip install -r requirements-hpu.txt; VLLM_TARGET_DEVICE=hpu pip instal
1111
huggingface-cli download --local-dir ${YOUR_PATH}/DeepSeek-R1 deepseek-ai/DeepSeek-R1
1212
```
1313

14-
# Option 1. run with dynamic quantization
14+
# Option 1. run with runtime dequantize with block-based scale
1515
> expect new DynamicMOE kernel ready in few weeks.
1616
> Current Performance is worse than static quantization due to lack of dynamic MOE support.
1717
## step 1. run example
1818
```
19-
python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
19+
VLLM_ENABLE_RUNTIME_DEQUANT=1 python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
2020
```
21-
## step 2. run benchmark
21+
## step 2. run lm_eval
2222
```
23-
bash scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh
23+
VLLM_ENABLE_RUNTIME_DEQUANT=1 python scripts/run_lm_eval.py -l 64 --batch_size 1 --ep_size 1
24+
{"gsm8k": {"alias": "gsm8k", "exact_match,strict-match": 0.96875, "exact_match_stderr,strict-match": 0.021921011700381302, "exact_match,flexible-extract": 0.96875, "exact_match_stderr,flexible-extract": 0.021921011700381302}}{"e2e time(secs)": 938.2986768169999}
2425
```
2526

26-
# Option 2. run with static quantization
27+
# Option 2. run with dynamic quantization
28+
> expect new DynamicMOE kernel ready in few weeks.
29+
> Current Performance is worse than static quantization due to lack of dynamic MOE support.
30+
## step 1. run example
31+
```
32+
# if you're testing with patched kernel
33+
# use VLLM_DMOE_DYNAMIC_SCALE=1 to enable dynamic scaling supported DynamicMOE
34+
VLLM_DMOE_DYNAMIC_SCALE=1 python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
35+
```
36+
## step 2. run lm_eval
37+
```
38+
VLLM_DMOE_DYNAMIC_SCALE=1 python scripts/run_lm_eval.py -l 64 --batch_size 1
39+
{"gsm8k": {"alias": "gsm8k", "exact_match,strict-match": 0.96875, "exact_match_stderr,strict-match": 0.021921011700381302, "exact_match,flexible-extract": 0.96875, "exact_match_stderr,flexible-extract": 0.021921011700381302}}{"e2e time(secs)": 938.2986768169999}
40+
```
41+
## step 3. run benchmark
42+
```
43+
VLLM_DMOE_DYNAMIC_SCALE=1 bash scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh
44+
```
45+
46+
# Option 3. run with static quantization
2747
> current best performance
2848
## step 1. Prepare static quantization model
2949
```

scripts/run_lm_eval.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import json
66
import time
77

8-
model_path = "/data/models/DeepSeek-R1-static/"
8+
model_path = "/data/models/DeepSeek-R1/"
99
#model_path = "/mnt/workdisk/dohayon/Projects/R1/DeepSeek-R1-fp8/"
1010
# model_path = "deepseek-ai/DeepSeek-V2-Lite"
1111

@@ -16,8 +16,8 @@
1616
parser.add_argument("--tokenizer", type=str, default=model_path, help="The model path.")
1717
parser.add_argument("--tp_size", type=int, default=8, help="Tensor Parallelism size.")
1818
parser.add_argument("--ep_size", type=int, default=8, help="Expert Parallelism size.")
19-
parser.add_argument("-l", "--limit", type=int, default=256, help="test request counts.")
20-
parser.add_argument("--batch_size", type=int, default=128, help="The batch size.")
19+
parser.add_argument("-l", "--limit", type=int, default=64, help="test request counts.")
20+
parser.add_argument("--batch_size", type=int, default=1, help="The batch size.")
2121
args = parser.parse_args()
2222

2323
os.environ["VLLM_SKIP_WARMUP"] = "true"
@@ -30,10 +30,11 @@
3030
os.environ["VLLM_MOE_N_SLICE"] = "4"
3131
os.environ["VLLM_EP_SIZE"] = "1"
3232

33-
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "0"
33+
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
3434
os.environ["PT_HPU_WEIGHT_SHARING"] = "0"
3535

36-
os.environ['VLLM_DMOE_DYNAMIC_SCALE']='1'
36+
#os.environ['VLLM_DMOE_DYNAMIC_SCALE']='1'
37+
#os.environ['VLLM_ENABLE_RUNTIME_DEQUANT']='1'
3738

3839
if __name__ == "__main__":
3940

vllm/model_executor/layers/fused_moe/layer.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -610,14 +610,15 @@ def weight_loader(self, param: torch.nn.Parameter,
610610
expert_data=expert_data,
611611
tp_rank=tp_rank,
612612
expert_id=expert_id)
613-
elif current_platform.is_hpu():
614-
self._load_per_channel_weight_scale(
615-
shard_id=shard_id,
616-
shard_dim=shard_dim,
617-
loaded_weight=loaded_weight,
618-
expert_data=expert_data,
619-
tp_rank=tp_rank,
620-
expert_id=expert_id)
613+
# elif current_platform.is_hpu():
614+
# print(f"!!!!!!!!!!!!! HPU load per channel weight scale")
615+
# self._load_per_channel_weight_scale(
616+
# shard_id=shard_id,
617+
# shard_dim=shard_dim,
618+
# loaded_weight=loaded_weight,
619+
# expert_data=expert_data,
620+
# tp_rank=tp_rank,
621+
# expert_id=expert_id)
621622
elif quant_method in [
622623
FusedMoeWeightScaleSupported.GROUP.value,
623624
FusedMoeWeightScaleSupported.BLOCK.value,

vllm/model_executor/layers/quantization/fp8.py

+81-20
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@
3737
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
3838
dynamic_quant,
3939
dequant_block_fp8_weight_naive,
40-
apply_block_fp8_linear_hpu_dynamic)
40+
apply_block_fp8_linear_hpu_dynamic,
41+
apply_block_fp8_linear_hpu_dequant)
4142

4243
if current_platform.is_hpu():
4344
import habana_frameworks.torch as htorch
@@ -58,6 +59,7 @@ def __init__(
5859
ignored_layers: Optional[List[str]] = None,
5960
weight_block_size: Optional[List[int]] = None,
6061
) -> None:
62+
self.enable_runtime_dequant = os.environ.get("VLLM_ENABLE_RUNTIME_DEQUANT", "0") in ["1", "true"]
6163
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
6264
if is_checkpoint_fp8_serialized:
6365
logger.warning("Detected fp8 checkpoint. Please note that the "
@@ -282,17 +284,24 @@ def process_weights_after_loading(self, layer: Module) -> None:
282284
layer.weight.data,
283285
layer.weight_scale_inv.data,
284286
self.quant_config.weight_block_size)
285-
weight, weight_scale_inv = dynamic_quant(dequant_block_fp8_weight_naive(
286-
weight,
287-
layer.weight_scale_inv.data,
288-
self.quant_config.weight_block_size,
289-
original_M=orig_M,
290-
original_N=orig_N,
291-
do_unpad=True))
292-
weight_scale_inv = weight_scale_inv.squeeze(-1)
293-
layer.weight.data.copy_(weight)
294-
layer.weight_scale_inv = Parameter(weight_scale_inv,
295-
requires_grad=False)
287+
if self.quant_config.enable_runtime_dequant:
288+
layer.weight = torch.nn.Parameter(weight, requires_grad=False)
289+
orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32), requires_grad=False)
290+
orig_N = torch.nn.Parameter(torch.tensor(orig_N, dtype=torch.int32), requires_grad=False)
291+
layer.register_parameter("orig_M", orig_M)
292+
layer.register_parameter("orig_N", orig_N)
293+
else:
294+
weight, weight_scale_inv = dynamic_quant(dequant_block_fp8_weight_naive(
295+
weight,
296+
layer.weight_scale_inv.data,
297+
self.quant_config.weight_block_size,
298+
original_M=orig_M,
299+
original_N=orig_N,
300+
do_unpad=True))
301+
weight_scale_inv = weight_scale_inv.squeeze(-1)
302+
layer.weight.data.copy_(weight)
303+
layer.weight_scale_inv = Parameter(weight_scale_inv,
304+
requires_grad=False)
296305
return
297306
if current_platform.is_rocm():
298307
weight, weight_scale_inv, _ = \
@@ -404,13 +413,26 @@ def apply(self,
404413
if self.block_quant:
405414
assert self.quant_config.weight_block_size is not None
406415
if current_platform.is_hpu():
407-
return apply_block_fp8_linear_hpu_dynamic(
408-
input=x,
409-
weight=layer.weight,
410-
weight_scale=layer.weight_scale_inv,
411-
input_scale=layer.input_scale,
412-
bias=bias,
413-
)
416+
if self.quant_config.enable_runtime_dequant:
417+
return apply_block_fp8_linear_hpu_dequant(
418+
input=x,
419+
weight=layer.weight,
420+
block_size=self.quant_config.weight_block_size,
421+
weight_scale=layer.weight_scale_inv,
422+
input_scale=layer.input_scale,
423+
bias=bias,
424+
original_M=layer.orig_M,
425+
original_N=layer.orig_N,
426+
do_unpad=True,
427+
)
428+
else:
429+
return apply_block_fp8_linear_hpu_dynamic(
430+
input=x,
431+
weight=layer.weight,
432+
weight_scale=layer.weight_scale_inv,
433+
input_scale=layer.input_scale,
434+
bias=bias,
435+
)
414436
return apply_w8a8_block_fp8_linear(
415437
input=x,
416438
weight=layer.weight,
@@ -615,6 +637,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
615637
# TODO (rob): refactor block quant into separate class.
616638
if self.block_quant:
617639
if current_platform.is_hpu():
640+
if self.quant_config.enable_runtime_dequant:
641+
return
618642
w13_weight, w13_weight_scale_inv = dynamic_quant(dequant_block_fp8_weight_naive(
619643
layer.w13_weight.data,
620644
layer.w13_weight_scale_inv.data,
@@ -946,6 +970,7 @@ def do_dynamic_moe_with_static_scaling(x, topk_ids, topk_weights, w13_weight_fp8
946970
activation="silu",
947971
experts_min=min_expert + ep_shift,
948972
experts_max=max_expert - 1 + ep_shift)
973+
htorch.core.mark_step()
949974
if i == 0:
950975
final_hidden_states = current_hidden_states
951976
else:
@@ -976,6 +1001,40 @@ def do_dynamic_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp
9761001
activation="silu",
9771002
experts_min=min_expert + ep_shift,
9781003
experts_max=max_expert - 1 + ep_shift)
1004+
htorch.core.mark_step()
1005+
if i == 0:
1006+
final_hidden_states = current_hidden_states
1007+
else:
1008+
final_hidden_states.add_(current_hidden_states)
1009+
return final_hidden_states
1010+
1011+
def do_dynamic_moe_with_dequant(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, moe_n_slice, n_expert_slice, w13_weight_scale_inv_fp8=None, w2_weight_scale_inv_fp8=None):
1012+
w13_weight = dequant_block_fp8_weight_naive(w13_weight_fp8,
1013+
w13_weight_scale_inv_fp8,
1014+
block_size=self.quant_config.weight_block_size,
1015+
dtype=x.dtype)
1016+
w2_weight = dequant_block_fp8_weight_naive(w2_weight_fp8,
1017+
w2_weight_scale_inv_fp8,
1018+
block_size=self.quant_config.weight_block_size,
1019+
dtype=x.dtype)
1020+
for i in range(moe_n_slice):
1021+
min_expert = i * n_expert_slice
1022+
max_expert = (i + 1) * n_expert_slice
1023+
1024+
w13_list_slice = [w13_weight[j, ...] for j in range(min_expert, max_expert)]
1025+
w2_list_slice = [w2_weight[j, ...] for j in range(min_expert, max_expert)]
1026+
1027+
current_hidden_states = torch.ops.hpu.mixture_of_experts(
1028+
hidden_states=x,
1029+
expert_routing_table=topk_ids.to(torch.int64),
1030+
router_weights=topk_weights.to(x.dtype),
1031+
w12=w13_list_slice,
1032+
w3=w2_list_slice,
1033+
permuted_weights=True,
1034+
activation="silu",
1035+
experts_min=min_expert + ep_shift,
1036+
experts_max=max_expert - 1 + ep_shift)
1037+
htorch.core.mark_step()
9791038
if i == 0:
9801039
final_hidden_states = current_hidden_states
9811040
else:
@@ -1003,7 +1062,9 @@ def do_dynamic_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp
10031062
moe_n_slice = self.moe_n_slice
10041063

10051064
if self.quant_config.activation_scheme == "dynamic":
1006-
if not use_static_moe and self.enable_dmoe_dynamic_scale:
1065+
if self.quant_config.enable_runtime_dequant:
1066+
final_hidden_states = do_dynamic_moe_with_dequant(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, moe_n_slice, n_expert_slice, w13_weight_scale_inv_fp8, w2_weight_scale_inv_fp8)
1067+
elif not use_static_moe and self.enable_dmoe_dynamic_scale:
10071068
final_hidden_states = do_dynamic_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, moe_n_slice, n_expert_slice, w13_weight_scale_inv_fp8, w2_weight_scale_inv_fp8)
10081069
else:
10091070
final_hidden_states = do_static_moe_with_dynamic_scaling(x, topk_ids, topk_weights, w13_weight_fp8, w2_weight_fp8, actual_total_experts, actual_num_experts, w13_weight_scale_inv_fp8, w2_weight_scale_inv_fp8)

vllm/model_executor/layers/quantization/utils/fp8_utils.py

+22
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,28 @@ def apply_block_fp8_linear_hpu_dynamic(
230230
return output.to(dtype=input.dtype).view(*output_shape)
231231

232232

233+
def apply_block_fp8_linear_hpu_dequant(
234+
input: torch.Tensor,
235+
weight: torch.Tensor,
236+
block_size: List[int],
237+
weight_scale: torch.Tensor,
238+
input_scale: Optional[torch.Tensor] = None,
239+
bias: Optional[torch.Tensor] = None,
240+
original_M: Optional[torch.Tensor] = None,
241+
original_N: Optional[torch.Tensor] = None,
242+
do_unpad: bool = False,
243+
) -> torch.Tensor:
244+
assert input_scale is None
245+
# View input as 2D matrix for fp8 methods
246+
input_2d = input.view(-1, input.shape[-1])
247+
original_M = original_M.data.item()
248+
original_N = original_N.data.item()
249+
weight = dequant_block_fp8_weight_naive(weight, weight_scale, block_size, input.dtype, original_M, original_N, do_unpad)
250+
output = torch.nn.functional.linear(input_2d, weight, bias=None)
251+
if bias is not None:
252+
output = output + bias
253+
return output.to(dtype=input.dtype).view(*input.shape[:-1], -1)
254+
233255
def input_to_float8(
234256
x: torch.Tensor,
235257
dtype: Optional[torch.dtype] = None

0 commit comments

Comments
 (0)