
Commit 14df7b8

huydhn authored and lulmer committed
Run v1 benchmark and integrate with PyTorch OSS benchmark database (vllm-project#13068)
Signed-off-by: Huy Do <[email protected]>
Signed-off-by: Louis Ulmer <[email protected]>
1 parent 3edeeca commit 14df7b8

7 files changed (+167, -45)


.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh (+5)
@@ -345,6 +345,11 @@ main() {
   check_gpus
   check_hf_token

+  # Set to v1 to run v1 benchmark
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
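
With this guard, the engine version is selected from the environment when the nightly script runs. A minimal invocation sketch (the script path and ENGINE_VERSION come from this diff; the GPU and HF token prerequisites the script already checks for are assumed to be in place):

    # Select the v1 engine for the benchmark run; leaving ENGINE_VERSION unset (or v0) keeps the current behavior.
    ENGINE_VERSION=v1 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh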

.buildkite/nightly-benchmarks/tests/latency-tests.json (+1, -1)
@@ -29,4 +29,4 @@
       "num-iters": 15
     }
   }
-]
+]

benchmarks/benchmark_latency.py (+58, -33)
@@ -1,14 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
+
 import argparse
 import dataclasses
 import json
+import os
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import Any, Dict, List, Optional

 import numpy as np
 import torch
+from benchmark_utils import convert_to_pytorch_benchmark_format
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
@@ -18,6 +21,19 @@
 from vllm.utils import FlexibleArgumentParser


+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={"latency": results["latencies"]},
+        extra_info={k: results[k]
+                    for k in ["avg_latency", "percentiles"]})
+    if pt_records:
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)

@@ -54,7 +70,8 @@ def llm_generate():
                     beam_width=args.n,
                     max_tokens=args.output_len,
                     ignore_eos=True,
-                ))
+                ),
+            )

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
@@ -64,7 +81,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
                     torch.profiler.ProfilerActivity.CUDA,
                 ],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir))) as p:
+                    str(profile_dir)),
+            ) as p:
                 llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
@@ -81,9 +99,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
     if args.profile:
         profile_dir = args.profile_result_dir
         if not profile_dir:
-            profile_dir = Path(
-                "."
-            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            profile_dir = (Path(".") / "vllm_benchmark_result" /
+                           f"latency_result_{time.time()}")
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -95,9 +112,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")

     # Output JSON results if specified
     if args.output_json:
@@ -108,43 +125,51 @@ def run_to_completion(profile_dir: Optional[str] = None):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)


-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion.")
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1,
+        help="Number of generated sequences per prompt.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
+    parser.add_argument("--num-iters",
                         type=int,
                         default=30,
-                        help='Number of iterations to run.')
+                        help="Number of iterations to run.")
     parser.add_argument(
-        '--profile',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--profile",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-result-dir",
         type=str,
         default=None,
-        help=('path to save the pytorch profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        help=("path to save the pytorch profiler output. Can be visualized "
+              "with ui.perfetto.dev or Tensorboard."),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )

     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
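
With the hook above in place, the extra records are only emitted when the opt-in environment variable checked in benchmark_utils.py is set and --output-json is given. A hedged usage sketch (the model name and output file name are illustrative placeholders; --model comes from the EngineArgs CLI options the script already registers):

    # Writes latency_results.json as before, plus latency_results.pytorch.json next to it.
    SAVE_TO_PYTORCH_BENCHMARK_FORMAT=1 python benchmarks/benchmark_latency.py \
        --model facebook/opt-125m \
        --input-len 32 --output-len 128 --batch-size 8 \
        --output-json latency_results.json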

benchmarks/benchmark_serving.py (+39, -9)
@@ -56,6 +56,8 @@
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser

+from benchmark_utils import convert_to_pytorch_benchmark_format
+
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

(The docstring changes in this hunk only strip trailing whitespace; the wording is unchanged.)

@@ -402,21 +404,21 @@ async def get_request(
     burstiness: float = 1.0,
 ) -> AsyncGenerator[Tuple[str, int, int], None]:
     """
-    Asynchronously generates requests at a specified rate 
+    Asynchronously generates requests at a specified rate
     with OPTIONAL burstiness.
-    
+
     Args:
-        input_requests: 
+        input_requests:
             A list of input requests, each represented as a tuple.
-        request_rate: 
+        request_rate:
             The rate at which requests are generated (requests/s).
-        burstiness (optional): 
-            The burstiness factor of the request generation. 
+        burstiness (optional):
+            The burstiness factor of the request generation.
             Only takes effect when request_rate is not inf.
             Default value is 1, which follows a Poisson process.
             Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results 
-            in more bursty requests, while a higher burstiness value 
+            A lower burstiness value (0 < burstiness < 1) results
+            in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
     """
     input_requests = iter(input_requests)
@@ -817,6 +819,32 @@ def parse_goodput(slo_pairs):
     return goodput_config_dict


+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any],
+                                     file_name: str) -> None:
+    metrics = [
+        "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
+        "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+    ]
+    # These raw data might be useful, but they are rather big. They can be added
+    # later if needed
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={k: [results[k]]
+                 for k in metrics},
+        extra_info={
+            k: results[k]
+            for k in results if k not in metrics and k not in ignored_metrics
+        })
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -997,6 +1025,7 @@ def main(args: argparse.Namespace):
         file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w", encoding='utf-8') as outfile:
             json.dump(result_json, outfile)
+        save_to_pytorch_benchmark_format(args, result_json, file_name)


 if __name__ == "__main__":
@@ -1014,7 +1043,8 @@ def main(args: argparse.Namespace):
         default=None,
         help="Server or API base url if not using http host and port.",
     )
-    parser.add_argument("--host", type=str, default="localhost")
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(
         "--endpoint",

benchmarks/benchmark_serving_guided.py (+2, -1)
@@ -731,7 +731,8 @@ def main(args: argparse.Namespace):
         default=None,
         help="Server or API base url if not using http host and port.",
     )
-    parser.add_argument("--host", type=str, default="localhost")
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(
         "--endpoint",

benchmarks/benchmark_throughput.py (+23, -1)
@@ -3,13 +3,15 @@
 import argparse
 import dataclasses
 import json
+import os
 import random
 import time
 from functools import cache
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import torch
 import uvloop
+from benchmark_utils import convert_to_pytorch_benchmark_format
 from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
@@ -338,6 +340,25 @@ def run_mii(
     return end - start


+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={
+            "requests_per_second": [results["requests_per_second"]],
+            "tokens_per_second": [results["tokens_per_second"]],
+        },
+        extra_info={
+            k: results[k]
+            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+        })
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -435,6 +456,7 @@ def main(args: argparse.Namespace):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)


 if __name__ == "__main__":
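
The throughput script follows the same pattern: requests_per_second and tokens_per_second each become a record, and the extra file is only produced when the opt-in variable is set and --output-json is given. A hedged sketch with an illustrative model and the script's usual synthetic-input flags, which are assumed unchanged by this commit:

    SAVE_TO_PYTORCH_BENCHMARK_FORMAT=1 python benchmarks/benchmark_throughput.py \
        --model facebook/opt-125m \
        --input-len 256 --output-len 256 --num-prompts 100 \
        --output-json throughput_results.json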

benchmarks/benchmark_utils.py (+39, new file)
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import os
+from typing import Any, Dict, List
+
+
+def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                        metrics: Dict[str, List],
+                                        extra_info: Dict[str, Any]) -> List:
+    """
+    Save the benchmark results in the format used by PyTorch OSS benchmark with
+    on metric per record
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    """
+    records = []
+    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
+        return records
+
+    for name, benchmark_values in metrics.items():
+        record = {
+            "benchmark": {
+                "name": "vLLM benchmark",
+                "extra_info": {
+                    "args": vars(args),
+                },
+            },
+            "model": {
+                "name": args.model,
+            },
+            "metric": {
+                "name": name,
+                "benchmark_values": benchmark_values,
+                "extra_info": extra_info,
+            },
+        }
+        records.append(record)
+
+    return records
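
Each emitted record pairs one metric with the model name and the full argument namespace, which is what lets the PyTorch OSS benchmark database index runs by model and metric. Continuing the illustrative latency run above (the file name is the placeholder used there; jq is a dependency the nightly script installs in this same commit):

    # Show which metric each record carries, for which model, and its values.
    jq '.[] | {model: .model.name, metric: .metric.name, values: .metric.benchmark_values}' \
        latency_results.pytorch.json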
