Commit 35c4b81

xuechendi, mengniwang95, and yangulei authored and committed

[Deepseek r1] dynamic/static quant fp8 support + fp8 KV cache (vllm-project#881)

Signed-off-by: Chendi Xue <[email protected]>
Signed-off-by: Mengni Wang <[email protected]>
Co-authored-by: Mengni Wang <[email protected]>
Co-authored-by: Youlei Yang <[email protected]>
1 parent 117555d commit 35c4b81

18 files changed: +659, -1219 lines

scripts/DEEPSEEK_R1_ON_GAUDI.md

Lines changed: 67 additions & 0 deletions

# Install

```
git clone https://github.com/HabanaAI/vllm-fork.git; cd vllm-fork; git checkout deepseek_r1
pip install -r requirements-hpu.txt; VLLM_TARGET_DEVICE=hpu pip install -e . --no-build-isolation
```

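A quick sanity check that the editable install is importable (a minimal sketch, not part of the original steps):

```python
# Confirm the editable vLLM build installed from this fork is importable.
import vllm

print(vllm.__version__)
```
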
# Prepare model

```
huggingface-cli download --local-dir ${YOUR_PATH}/DeepSeek-R1 deepseek-ai/DeepSeek-R1
```

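The same download can be scripted through `huggingface_hub`, the library behind `huggingface-cli`; a minimal sketch (note the full checkpoint is several hundred GB):

```python
# Programmatic equivalent of the huggingface-cli command above.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="deepseek-ai/DeepSeek-R1",
                  local_dir="/your/path/DeepSeek-R1")  # placeholder path
```
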
# Option 1. Run with dynamic quantization

> Expect the new DynamicMOE kernel to be ready in a few weeks.
> Current performance is worse than static quantization due to the lack of dynamic MOE support.

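For orientation, "dynamic" here means the FP8 scale is recomputed from each live tensor at runtime rather than loaded from calibration. A minimal sketch of the idea, assuming PyTorch's `float8_e4m3fn` (the HPU kernels and FP8 format details differ):

```python
import torch

def dynamic_fp8_quant(x: torch.Tensor):
    # The scale is derived from the tensor on every call; this per-call
    # work is what pre-calibrated (static) scales avoid.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
    scale = x.abs().amax() / fp8_max
    x_fp8 = (x / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return x_fp8, scale
```
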
## Step 1. Run example

```
python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1
```

## Step 2. Run benchmark

```
bash scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh
```

# Option 2. Run with static quantization

> Currently the best performance.

## Step 1. Prepare the static quantization model

```
python scripts/convert_block_fp8_to_channel_fp8.py --model_path ${YOUR_PATH}/DeepSeek-R1 --qmodel_path ${YOUR_PATH}/DeepSeek-R1-static --input_scales_path scripts/DeepSeek-R1-BF16-w8afp8-static-no-ste_input_scale_inv.pkl.gz
```

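As the script name suggests, this step converts DeepSeek-R1's block-wise FP8 weight scales into per-channel scales and attaches pre-calibrated input scales from the bundled `.pkl.gz` file. A hedged sketch of the scale conversion (illustrative names only, not the actual script code; DeepSeek-R1 ships 128x128 block scales):

```python
import torch

def block_to_channel_fp8(w_fp8: torch.Tensor, block_scale: torch.Tensor,
                         block: int = 128):
    """Requantize a (out, hidden) FP8 weight from block to channel scales."""
    out_dim, hidden = w_fp8.shape
    # Expand the per-block scale grid back to full resolution...
    full = block_scale.repeat_interleave(block, 0)[:out_dim] \
                      .repeat_interleave(block, 1)[:, :hidden]
    w = w_fp8.float() * full                       # dequantize
    # ...then requantize with one scale per output channel.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    ch_scale = w.abs().amax(dim=1, keepdim=True) / fp8_max
    w_q = (w / ch_scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return w_q, ch_scale.squeeze(1)
```
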
## Step 2. Run example

```
python scripts/run_example_tp.py --model ${YOUR_PATH}/DeepSeek-R1-static
```

## Step 3. Run benchmark

```
bash scripts/benchmark-staticfp8-i1k-o1k-ep8-bestperf.sh
```

# Others. Run with multiple nodes

```
# head node
HABANA_VISIBLE_MODULES='0,1,2,3,4,5,6,7' \
PT_HPU_WEIGHT_SHARING=0 \
PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1 \
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
ray start --head --resources='{"HPU": 8, "TPU": 0}'
```

```
# worker node
HABANA_VISIBLE_MODULES='0,1,2,3,4,5,6,7' \
PT_HPU_WEIGHT_SHARING=0 \
PT_HPUGRAPH_DISABLE_TENSOR_CACHE=1 \
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
ray start --address="${head_ip}:6379" --resources='{"HPU": 8, "TPU": 0}'
```

```
python scripts/run_example_tp_2nodes.py --model ${YOUR_PATH}/DeepSeek-R1-static
```

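Before running the two-node example, it can help to confirm that both nodes registered with Ray; a minimal check from the head node (a sketch using the Ray Python API):

```python
import ray

# Attach to the cluster started by `ray start` above and list aggregate
# resources; two 8-HPU nodes should report {'HPU': 16.0, ...}.
ray.init(address="auto")
print(ray.cluster_resources())
```
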
Binary file not shown.

scripts/run_static-online-i1k-o1k-ep8-bestperf.sh renamed to scripts/benchmark-dynamicfp8-i1k-o1k-ep8-bestperf.sh

Lines changed: 18 additions & 17 deletions

@@ -11,29 +11,29 @@ if [ $((total_len % 128)) -ne 0 ]; then
 fi
 ep_size=8
 moe_n_slice=1
-gpu_utils=0.82
-bs=192
-num_prompts=192
+gpu_utils=0.92
+bs=448
+num_prompts=448
 request_rate=inf
-log_name="static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
+log_name="[dynamicfp8-dmoe-fp8kv]static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
 
 VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
 VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
-# model="/data/models/DeepSeek-R1/"
-# tokenizer="/data/models/DeepSeek-R1/"
-model="/data/models/DeepSeek-R1/"
-tokenizer="/data/models/DeepSeek-R1/"
+model="/data/models/DeepSeek-R1-dynamic/"
+tokenizer="/data/models/DeepSeek-R1-dynamic/"
+# model="/data/models/DeepSeek-R1-static/"
+# tokenizer="/data/models/DeepSeek-R1-static/"
 model_name="DeepSeek-R1"
 
-#VLLM_USE_STATIC_MOE=1 \
+#VLLM_DMOE_DYNAMIC_SCALE=1 \
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=${moe_n_slice} \
 VLLM_EP_SIZE=${ep_size} \
 VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
 PT_HPU_WEIGHT_SHARING=0 \
 VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=4 \
+VLLM_PROMPT_BS_BUCKET_MAX=16 \
 VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
 VLLM_PROMPT_SEQ_BUCKET_MAX=${in_len} \
 VLLM_DECODE_BS_BUCKET_MIN=${bs} \
@@ -49,9 +49,10 @@ python -m vllm.entrypoints.openai.api_server \
 --dtype bfloat16 \
 --use-v2-block-manager \
 --num_scheduler_steps ${multi_step}\
---max-model-len ${total_len} \
+--max-model-len 4096 \
 --distributed_executor_backend mp \
 --gpu_memory_utilization ${gpu_utils} \
+--kv_cache_dtype "fp8_inc" \
 --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
@@ -77,13 +78,13 @@ echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 
-start_time=$(date +%s)
-echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
-end_time=$(date +%s)
-echo "Time elapsed: $((end_time - start_time))s"
+# start_time=$(date +%s)
+# echo "Start to benchmark"
+# python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
+# end_time=$(date +%s)
+# echo "Time elapsed: $((end_time - start_time))s"
 
-sleep 10
+# sleep 10
 
 kill ${pid}
 kill ${hl_pid}
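
For reference, the decode block-bucket bounds in this script are plain arithmetic over the 128-token KV-cache block size it assumes; a worked check with the values set above:

```python
# Worked check of VLLM_DECODE_BLOCK_BUCKET_MIN/MAX with this script's
# values: in_len=1024, out_len=1024, bs=448, block size 128.
in_len, out_len, bs, block = 1024, 1024, 448, 128
total_len = in_len + out_len                   # 2048 (multiple of 128)
bucket_min = in_len * bs // block              # 3584
bucket_max = total_len * bs // block + block   # 7296
print(bucket_min, bucket_max)                  # -> 3584 7296
```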

scripts/run_static-online-i1k-o3k-ep8-bestperf.sh renamed to scripts/benchmark-staticfp8-i1k-o1k-ep8-bestperf.sh

Lines changed: 17 additions & 17 deletions

@@ -1,7 +1,7 @@
 #!/bin/bash
 tp_parrallel=8
 in_len=1024
-out_len=3072
+out_len=1024
 multi_step=1
 total_len=$((in_len + out_len))
 # if total_len is not multiple of 128, round up to the next multiple of 128
@@ -11,28 +11,27 @@ if [ $((total_len % 128)) -ne 0 ]; then
 fi
 ep_size=8
 moe_n_slice=1
-gpu_utils=0.82
-bs=96
-num_prompts=96
+gpu_utils=0.92
+bs=448
+num_prompts=448
 request_rate=inf
-log_name="[0211]static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
+log_name="[staticfp8-dmoe-fp8kv]static-online-gaudi3-${gpu_utils}util-TPparallel${tp_parrallel}-EP${ep_size}-loop${moe_n_slice}moegroups-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}_mdllen${total_len}"
 
 VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
 VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128 + 128))
-# model="/data/models/DeepSeek-R1/"
-# tokenizer="/data/models/DeepSeek-R1/"
-model="/data/models/DeepSeek-R1/"
-tokenizer="/data/models/DeepSeek-R1/"
+model="/data/models/DeepSeek-R1-static/"
+tokenizer="/data/models/DeepSeek-R1-static/"
 model_name="DeepSeek-R1"
 
+
 HABANA_VISIBLE_DEVICES="ALL" \
 VLLM_MOE_N_SLICE=${moe_n_slice} \
 VLLM_EP_SIZE=${ep_size} \
 VLLM_MLA_DISABLE_REQUANTIZATION=1 \
 PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
 PT_HPU_WEIGHT_SHARING=0 \
 VLLM_PROMPT_BS_BUCKET_MIN=1 \
-VLLM_PROMPT_BS_BUCKET_MAX=4 \
+VLLM_PROMPT_BS_BUCKET_MAX=16 \
 VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
 VLLM_PROMPT_SEQ_BUCKET_MAX=${in_len} \
 VLLM_DECODE_BS_BUCKET_MIN=${bs} \
@@ -48,9 +47,10 @@ python -m vllm.entrypoints.openai.api_server \
 --dtype bfloat16 \
 --use-v2-block-manager \
 --num_scheduler_steps ${multi_step}\
---max-model-len ${total_len} \
+--max-model-len 4096 \
 --distributed_executor_backend mp \
 --gpu_memory_utilization ${gpu_utils} \
+--kv_cache_dtype "fp8_inc" \
 --trust_remote_code 2>&1 | tee benchmark_logs/${log_name}_serving.log &
 pid=$(($!-1))
 
@@ -76,13 +76,13 @@ echo "Time elapsed: $((end_time - start_time))s"
 
 sleep 10
 
-start_time=$(date +%s)
-echo "Start to benchmark"
-python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
-end_time=$(date +%s)
-echo "Time elapsed: $((end_time - start_time))s"
+# start_time=$(date +%s)
+# echo "Start to benchmark"
+# python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 2>&1 | tee benchmark_logs/${log_name}_run2.log
+# end_time=$(date +%s)
+# echo "Time elapsed: $((end_time - start_time))s"
 
-sleep 10
+# sleep 10
 
 kill ${pid}
 kill ${hl_pid}
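
Once either script has the API server up, it can be smoke-tested like any OpenAI-compatible endpoint. A hedged example, assuming port 8080 (the port the benchmark client targets) and that the served model name defaults to the model path:

```python
# Minimal completion request against the server started by the script.
import requests

resp = requests.post(
    "http://localhost:8080/v1/completions",
    json={
        "model": "/data/models/DeepSeek-R1-static/",  # assumed served name
        "prompt": "DeepSeek-R1 on Gaudi is",
        "max_tokens": 32,
    },
)
print(resp.json()["choices"][0]["text"])
```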
