Commit 35c4b81

xuechendi, mengniwang95, and yangulei authored and committed

[Deepseek r1] dynamic/static quant fp8 support + fp8 KV cache (vllm-project#881)

Signed-off-by: Chendi Xue <[email protected]>
Signed-off-by: Mengni Wang <[email protected]>
Co-authored-by: Mengni Wang <[email protected]>
Co-authored-by: Youlei Yang <[email protected]>
1 parent 117555d commit 35c4b81

18 files changed: +659, -1219 lines

scripts/DEEPSEEK_R1_ON_GAUDI.md

Lines changed: 67 additions & 0 deletions

# Install

```
git clone https://github.com/HabanaAI/vllm-fork.git; cd vllm-fork; git checkout deepseek_r1
pip install -r requirements-hpu.txt; VLLM_TARGET_DEVICE=hpu pip install -e . --no-build-isolation
```

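A quick sanity check that the editable install is importable (a minimal sketch, not part of the original steps):

```python
# Confirm the editable vLLM build installed from this fork is importable.
import vllm

print(vllm.__version__)
```
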
# Prepare model

```
huggingface-cli download --local-dir ${YOUR_PATH}/DeepSeek-R1 deepseek-ai/DeepSeek-R1
```

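The same download can be scripted through `huggingface_hub`, the library behind `huggingface-cli`; a minimal sketch (note the full checkpoint is several hundred GB):

```python
# Programmatic equivalent of the huggingface-cli command above.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="deepseek-ai/DeepSeek-R1",
                  local_dir="/your/path/DeepSeek-R1")  # placeholder path
```
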
# Option 1. Run with dynamic quantization

> Expect the new DynamicMOE kernel to be ready in a few weeks.
> Current performance is worse than static quantization due to the lack of dynamic MOE support.

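For orientation, "dynamic" here means the FP8 scale is recomputed from each live tensor at runtime rather than loaded from calibration. A minimal sketch of the idea, assuming PyTorch's `float8_e4m3fn` (the HPU kernels and FP8 format details differ):

```python
import torch

def dynamic_fp8_quant(x: torch.Tensor):
    # The scale is derived from the tensor on every call; this per-call
    # work is what pre-calibrated (static) scales avoid.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    scale = x.abs().amax() / fp8_max
    x_fp8 = (x / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return x_fp8, scale
```
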
## Step 1. Run example

```
python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
```

## Step 2. Run benchmark

```
bash scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh
```

# Option 2. Run with static quantization

> Currently the best performance.

## Step 1. Prepare the static quantization model

```
python scripts/convert_block_fp8_to_channel_fp8.py --model_path ${YOUR_PATH}/DeepSeek-R1 --qmodel_path ${YOUR_PATH}/DeepSeek-R1-static --input_scales_path scripts/DeepSeek-R1-BF16-w8afp8-static-no-ste_input_scale_inv.pkl.gz
```

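As the script name suggests, this step converts DeepSeek-R1's block-wise FP8 weight scales into per-channel scales and attaches pre-calibrated input scales from the bundled `.pkl.gz` file. A hedged sketch of the scale conversion (illustrative names only, not the actual script code; DeepSeek-R1 ships 128x128 block scales):

```python
import torch

def block_to_channel_fp8(w_fp8: torch.Tensor, block_scale: torch.Tensor,
                         block: int = 128):
    """Requantize a (out, hidden) FP8 weight from block to channel scales."""
    out_dim, hidden = w_fp8.shape
    # Expand the per-block scale grid back to full resolution...
    full = block_scale.repeat_interleave(block, 0)[:out_dim] \
                      .repeat_interleave(block, 1)[:, :hidden]
    w = w_fp8.float() * full                       # dequantize
    # ...then requantize with one scale per output channel.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    ch_scale = w.abs().amax(dim=1, keepdim=True) / fp8_max
    w_q = (w / ch_scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return w_q, ch_scale.squeeze(1)
```
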
## Step 2. Run example

```
python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1-static
```

## Step 3. Run benchmark

```
bash scripts/benchmark-staticfp8-i1k-o1k-ep8-bestperf.sh
```

# Others. Run with multiple nodes

```
# head node
HABANA_VISIBLE_MODULES='0,1,2,3,4,5,6,7' \
PT_HPU_WEIGHT_SHARING=0 \
PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1 \
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
ray start --head --resources='{"HPU": 8, "TPU": 0}'
```

```
# worker node
HABANA_VISIBLE_MODULES='0,1,2,3,4,5,6,7' \
PT_HPU_WEIGHT_SHARING=0 \
PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1 \
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
ray start --address="${head_ip}:6379" --resources='{"HPU": 8, "TPU": 0}'
```

```
python scripts/run_example_tp_2nodes.py --model ${YOUR_PATH}/DeepSeek-R1-static
```

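Before running the two-node example, it can help to confirm that both nodes registered with Ray; a minimal check from the head node (a sketch using the Ray Python API):

```python
import ray

# Attach to the cluster started by `ray start` above and list aggregate
# resources; two 8-HPU nodes should report {'HPU': 16.0, ...}.
ray.init(address="auto")
print(ray.cluster_resources())
```
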
Binary file not shown.

scripts/run_static-online-i1k-o1k-ep8-bestperf.sh renamed to scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh

Lines changed: 18 additions & 17 deletions

@@ -11,29 +11,29 @@ if [ $((total_len % 128)) -ne 0 ]; then
 fi
 ep_size=8
 moe_n_slice=1
-gpu_utils=0.82
-bs=192
-num_prompts=192
+gpu_utils=0.92
+bs=448
+num_prompts=448
 request_rate=inf
-log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
+log_name="[dynamicfp8-dmoe-fp8kv]static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
 
 VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
 VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
-# model="/data/models/DeepSeek-R1/"
-# tokenizer="/data/models/DeepSeek-R1/"
-model="/data/models/DeepSeek-R1/"
-tokenizer="/data/models/DeepSeek-R1/"
+model="/data/models/DeepSeek-R1-dynamic/"
+tokenizer="/data/models/DeepSeek-R1-dynamic/"
+# model="/data/models/DeepSeek-R1-static/"
+# tokenizer="/data/models/DeepSeek-R1-static/"
 model_name="DeepSeek-R1"
 
-#VLLM_USE_STATIC_MOE=1 \
+#VLLM_DMOE_DYNAMIC_SCALE=1 \
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=${moe_n_slice} \
 VLLM_EP_SIZE=${ep_size} \
 VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
 PT_HPU_WEIGHT_SHARING=0 \
 VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=4 \
+VLLM_PROMPT_BS_BUCKET_MAX=16 \
 VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
 VLLM_PROMPT_SEQ_BUCKET_MAX=${in_len} \
 VLLM_DECODE_BS_BUCKET_MIN=${bs} \
@@ -49,9 +49,10 @@ python -m vllm.entrypoints.openai.api_server \
 --dtype bfloat16 \
 --use-v2-block-manager \
 --num_scheduler_steps ${multi_step}\
---max-model-len ${total_len} \
+--max-model-len 4096 \
 --distributed_executor_backend mp \
 --gpu_memory_utilization ${gpu_utils} \
+--kv_cache_dtype "fp8_inc" \
 --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
@@ -77,13 +78,13 @@ echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 
-start_time=$(date +%s)
-echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
-end_time=$(date +%s)
-echo "Time elapsed: $((end_time - start_time))s"
+# start_time=$(date +%s)
+# echo "Start to benchmark"
+# python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
+# end_time=$(date +%s)
+# echo "Time elapsed: $((end_time - start_time))s"
 
-sleep 10
+# sleep 10
 
 kill ${pid}
 kill ${hl_pid}
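
For reference, the decode block-bucket bounds in this script are plain arithmetic over the 128-token KV-cache block size it assumes; a worked check with the values set above:

```python
# Worked check of VLLM_DECODE_BLOCK_BUCKET_MIN/MAX with this script's
# values: in_len=1024, out_len=1024, bs=448, block size 128.
in_len, out_len, bs, block = 1024, 1024, 448, 128
total_len = in_len + out_len                   # 2048 (multiple of 128)
bucket_min = in_len * bs // block              # 3584
bucket_max = total_len * bs // block + block   # 7296
print(bucket_min, bucket_max)                  # -> 3584 7296
```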

scripts/run_static-online-i1k-o3k-ep8-bestperf.sh renamed to scripts/benchmark-staticfp8-i1k-o1k-ep8-bestperf.sh

Lines changed: 17 additions & 17 deletions

@@ -1,7 +1,7 @@
 #!/bin/bash
 tp_parrallel=8
 in_len=1024
-out_len=3072
+out_len=1024
 multi_step=1
 total_len=$((in_len + out_len))
 # if total_len is not multiple of 128, round up to the next multiple of 128
@@ -11,28 +11,27 @@ if [ $((total_len % 128)) -ne 0 ]; then
 fi
 ep_size=8
 moe_n_slice=1
-gpu_utils=0.82
-bs=96
-num_prompts=96
+gpu_utils=0.92
+bs=448
+num_prompts=448
 request_rate=inf
-log_name="[0211]static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
+log_name="[staticfp8-dmoe-fp8kv]static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
 
 VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
 VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
-# model="/data/models/DeepSeek-R1/"
-# tokenizer="/data/models/DeepSeek-R1/"
-model="/data/models/DeepSeek-R1/"
-tokenizer="/data/models/DeepSeek-R1/"
+model="/data/models/DeepSeek-R1-static/"
+tokenizer="/data/models/DeepSeek-R1-static/"
 model_name="DeepSeek-R1"
 
+
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=${moe_n_slice} \
 VLLM_EP_SIZE=${ep_size} \
 VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
 PT_HPU_WEIGHT_SHARING=0 \
 VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=4 \
+VLLM_PROMPT_BS_BUCKET_MAX=16 \
 VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
 VLLM_PROMPT_SEQ_BUCKET_MAX=${in_len} \
 VLLM_DECODE_BS_BUCKET_MIN=${bs} \
@@ -48,9 +47,10 @@ python -m vllm.entrypoints.openai.api_server \
 --dtype bfloat16 \
 --use-v2-block-manager \
 --num_scheduler_steps ${multi_step}\
---max-model-len ${total_len} \
+--max-model-len 4096 \
 --distributed_executor_backend mp \
 --gpu_memory_utilization ${gpu_utils} \
+--kv_cache_dtype "fp8_inc" \
 --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
@@ -76,13 +76,13 @@ echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 
-start_time=$(date +%s)
-echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
-end_time=$(date +%s)
-echo "Time elapsed: $((end_time - start_time))s"
+# start_time=$(date +%s)
+# echo "Start to benchmark"
+# python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
+# end_time=$(date +%s)
+# echo "Time elapsed: $((end_time - start_time))s"
 
-sleep 10
+# sleep 10
 
 kill ${pid}
 kill ${hl_pid}
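
Once either script has the API server up, it can be smoke-tested like any OpenAI-compatible endpoint. A hedged example, assuming port 8080 (the port the benchmark client targets) and that the served model name defaults to the model path:

```python
# Minimal completion request against the server started by the script.
import requests

resp = requests.post(
    "http://localhost:8080/v1/completions",
    json={
        "model": "/data/models/DeepSeek-R1-static/",  # assumed served name
        "prompt": "DeepSeek-R1 on Gaudi is",
        "max_tokens": 32,
    },
)
print(resp.json()["choices"][0]["text"])
```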
