log_sum="log/service_model_device.txt"

model_ids=("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # "facebook/opt-1.3b" "huggyllama/llama-7b")
- num_devices=(2)
+ num_lpu_devices=(2) # 4
+ num_gpu_devices=(0)

current_datetime=$(date "+%Y-%m-%d %H:%M:%S")
echo "$current_datetime"
echo "$current_datetime" >> ${log_sum}

# LLMEngine Test
for model_id in "${model_ids[@]}"; do
- for num_device in "${num_devices[@]}"; do
+ for num_lpu_device in "${num_lpu_devices[@]}"; do
+ for num_gpu_device in "${num_gpu_devices[@]}"; do
# IFS='\' read -ra parts <<< "$model_id"
# model_name="${parts[-1]}"
model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
echo "*********************************"
- echo "**** Start inference_${model_name}_${num_device}"
+ echo "**** Start inference_${model_name}_${num_lpu_device}_${num_gpu_device}"
echo "*********************************"
- python lpu_inference_arg.py -m ${model_id} -n ${num_device} > log/inference_${model_name}_${num_device}.txt
+ python lpu_inference_arg.py -m ${model_id} -l ${num_lpu_device} -g ${num_gpu_device} > log/inference_${model_name}_${num_lpu_device}_${num_gpu_device}.txt
echo "*********************************" >> ${log_sum}
- echo "[Testbench] The Result of log/inference_${model_name}_${num_device}.txt" >> ${log_sum}
- tail -n 1 "log/inference_${model_name}_${num_device}.txt" >> ${log_sum}
+ echo "[Testbench] The Result of log/inference_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
+ tail -n 1 "log/inference_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
echo " " >> ${log_sum}
+ done
done
done

# LLMEngineAsync Test with vLLM serve
for model_id in "${model_ids[@]}"; do
- for num_device in "${num_devices[@]}"; do
+ for num_lpu_device in "${num_lpu_devices[@]}"; do
+ for num_gpu_device in "${num_gpu_devices[@]}"; do
model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
echo "*********************************"
- echo "**** Start serving_${model_name}_${num_device}"
+ echo "**** Start serving_${model_name}_${num_lpu_device}_${num_gpu_device}"
echo "*********************************"
- python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --tensor-parallel-size ${num_device} &
+ python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --num-lpu-devices ${num_lpu_device} --num-gpu-devices ${num_gpu_device} &

# Waiting for server
while ! nc -z localhost "8000"; do
@@ -41,7 +45,7 @@ for model_id in "${model_ids[@]}"; do
done
echo "[Testbench] The server is ready!"

- python lpu_client.py > log/vllm_serve_${model_name}_${num_device}.txt
+ python lpu_client.py > log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt

# Waiting for process kill
PID=$(jobs -p | tail -n 1)
@@ -60,22 +64,24 @@ for model_id in "${model_ids[@]}"; do

# Write log in text file
echo "*********************************" >> ${log_sum}
- echo "The Result of log/vllm_serve_${model_name}_${num_device}.txt" >> ${log_sum}
- tail -n 1 "log/vllm_serve_${model_name}_${num_device}.txt" >> ${log_sum}
+ echo "The Result of log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
+ tail -n 1 "log/vllm_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
echo " " >> ${log_sum}
+ done
done
done



# OpenAI API Test
model_id=${model_ids[0]}
- num_device=${num_devices[0]}
+ num_lpu_device=${num_lpu_devices[0]}
+ num_gpu_device=${num_gpu_devices[0]}
model_name=$(echo "$model_id" | awk -F'/' '{print $NF}')
echo "*********************************"
- echo "**** Start serving_${model_name}_${num_device}"
+ echo "**** Start serving_${model_name}_${num_lpu_device}_${num_gpu_device}"
echo "*********************************"
- python -m vllm.entrypoints.api_server --model ${model_id} --device fpga --tensor-parallel-size ${num_device} &
+ python -m vllm.entrypoints.openai.api_server --model ${model_id} --device fpga --num-lpu-devices ${num_lpu_device} --num-gpu-devices ${num_gpu_device} &

# Waiting for server
while ! nc -z localhost "8000"; do
@@ -84,7 +90,7 @@ while ! nc -z localhost "8000"; do
done
echo "[Testbench] The server is ready!"

- python lpu_openai_completion_client.py > log/openai_serve_${model_name}_${num_device}.txt
+ python lpu_openai_completion_client.py > log/openai_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt

# Waiting for process kill
PID=$(jobs -p | tail -n 1)
@@ -103,8 +109,6 @@

# Write log in text file
echo "*********************************" >> ${log_sum}
- echo "The Result of log/openai_serve_${model_name}_${num_device}.txt" >> ${log_sum}
- tail -n 1 "log/openai_serve_${model_name}_${num_device}.txt" >> ${log_sum}
+ echo "The Result of log/openai_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
+ tail -n 1 "log/openai_serve_${model_name}_${num_lpu_device}_${num_gpu_device}.txt" >> ${log_sum}
echo " " >> ${log_sum}
-
-
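
For reference, the serve-and-test pattern the testbench repeats, written out once as a standalone sketch. The entrypoint, flags, port, and log-file naming come from the diff above; the concrete model id and device counts are example values, and the sleep 1 stands in for the elided body of the wait loop (not shown in this diff):

# Launch the server with the device-count flags introduced by this commit (example values).
python -m vllm.entrypoints.api_server \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
    --device fpga \
    --num-lpu-devices 2 \
    --num-gpu-devices 0 &

# Poll until the server accepts TCP connections on port 8000.
while ! nc -z localhost 8000; do
    sleep 1
done

# Run the client, capture its output, then stop the background server.
python lpu_client.py > log/vllm_serve_TinyLlama-1.1B-Chat-v1.0_2_0.txt
kill "$(jobs -p | tail -n 1)"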
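An aside on the model_name extraction: the commented-out IFS/read lines and the awk pipeline both take the last '/'-separated component of the model id. A minimal sketch of the same result with bash parameter expansion, which avoids the awk subprocess (equivalent behavior assumed; not part of this commit):

model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # example value from the diff
model_name=${model_id##*/}                     # strip everything through the last '/'
echo "$model_name"                             # prints: TinyLlama-1.1B-Chat-v1.0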