@@ -1,23 +1,39 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
+
 import argparse
 import dataclasses
 import json
+import os
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 import numpy as np
 import torch
+from benchmark_utils import convert_to_pytorch_benchmark_format
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
 
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={"latency": results["latencies"]},
+        extra_info={k: results[k]
+                    for k in ["avg_latency", "percentiles"]})
+    if pt_records:
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
 
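
Note: the new helper reads three keys from the `results` dict that `main()` builds further down ("latencies", "avg_latency", "percentiles") and derives its output path from `--output-json`. A minimal sketch of that derivation with a hand-built dict; the values and filename below are hypothetical, and the record layout returned by `convert_to_pytorch_benchmark_format` is not shown in this diff:

    import os

    # Keys the helper expects, as assembled later in main() (values made up):
    results = {
        "avg_latency": 1.23,                      # mean latency, in seconds
        "latencies": [1.20, 1.23, 1.26],          # per-iteration latencies
        "percentiles": {"50": 1.23, "90": 1.26},  # percentage -> latency
    }

    # The PyTorch-format records land next to the plain JSON output:
    output_json = "latency.json"  # hypothetical --output-json value
    pt_file = f"{os.path.splitext(output_json)[0]}.pytorch.json"
    assert pt_file == "latency.pytorch.json"
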
@@ -54,7 +70,8 @@ def llm_generate():
                     beam_width=args.n,
                     max_tokens=args.output_len,
                     ignore_eos=True,
-                ))
+                ),
+            )
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
@@ -64,7 +81,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         torch.profiler.ProfilerActivity.CUDA,
                     ],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir))) as p:
+                        str(profile_dir)),
+            ) as p:
                 llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
@@ -81,9 +99,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
     if args.profile:
         profile_dir = args.profile_result_dir
         if not profile_dir:
-            profile_dir = Path(
-                "."
-            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            profile_dir = (Path(".") / "vllm_benchmark_result" /
+                           f"latency_result_{time.time()}")
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -95,9 +112,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")
 
     # Output JSON results if specified
     if args.output_json:
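
Aside: `np.percentile` accepts a list of percentages and returns one value per entry, which is what lets the `zip` above pair them back up. A quick self-contained example with made-up latencies:

    import numpy as np

    latencies = np.array([1.0, 2.0, 3.0, 4.0])  # made-up values, in seconds
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    for percentage, percentile in zip(percentages, percentiles):
        print(f"{percentage}% percentile latency: {percentile} seconds")
    # e.g. the 50% entry is 2.5 (linear interpolation between 2.0 and 3.0)
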
@@ -108,43 +125,51 @@ def run_to_completion(profile_dir: Optional[str] = None):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion.")
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1,
+        help="Number of generated sequences per prompt.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
+    parser.add_argument("--num-iters",
                         type=int,
                         default=30,
-                        help='Number of iterations to run.')
+                        help="Number of iterations to run.")
     parser.add_argument(
-        '--profile',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--profile",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-result-dir",
         type=str,
         default=None,
-        help=('path to save the pytorch profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        help=("path to save the pytorch profiler output. Can be visualized "
+              "with ui.perfetto.dev or Tensorboard."),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
        type=str,
         default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
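
Net effect: with `--output-json latency.json` (hypothetical filename), the script now emits `latency.pytorch.json` next to the usual results file. A sketch of reading both back; the exact record schema comes from `convert_to_pytorch_benchmark_format` in `benchmark_utils`, which is outside this diff:

    import json

    with open("latency.json") as f:
        results = json.load(f)  # avg_latency, latencies, percentiles

    with open("latency.pytorch.json") as f:
        pt_records = json.load(f)  # records in PyTorch benchmark format

    print(results["avg_latency"], len(pt_records))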