From ce79bc5e514d4e3c143dc35a675fdcaa3f7c760e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 17:03:30 -0800 Subject: [PATCH 01/17] Run v1 benchmark Signed-off-by: Huy Do --- .../scripts/run-performance-benchmarks.sh | 13 +- .../tests/latency-tests.json | 2 +- benchmarks/benchmark_latency.py | 147 ++++++++++++------ 3 files changed, 111 insertions(+), 51 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 0d16a83781a..261d340a30d 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -345,6 +345,15 @@ main() { check_gpus check_hf_token + # Set to v1 to run v1 benchmark + VLLM_VERSION=$1 + if [[ "${VLLM_VERSION:-v0}" == "v1" ]]; then + export VLLM_USE_V1=1 + fi + + # Set to 0 to run the benchmark script locally without uploading to Buildkite + UPLOAD_TO_BUILDKITE=$2 + # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) @@ -371,7 +380,9 @@ main() { pip install tabulate pandas python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py - upload_to_buildkite + if [[ "${UPLOAD_TO_BUILDKITE:-1}" == "1" ]]; then + upload_to_buildkite + fi } main "$@" diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json index 1841186da15..7762a239f96 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -29,4 +29,4 @@ "num-iters": 15 } } -] \ No newline at end of file +] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 89631294531..b4ff489be5e 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,11 +1,13 @@ # 
SPDX-License-Identifier: Apache-2.0 """Benchmark the latency of processing a single batch of requests.""" + +import os import argparse import dataclasses import json import time from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Dict, Any import numpy as np import torch @@ -18,6 +20,38 @@ from vllm.utils import FlexibleArgumentParser +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: Dict[str, Any] +) -> None: + # https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": args, + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": "latency", + "benchmark_values": results.get("latencies", []), + "extra_info": { + "avg_latency": results.get("avg_latency", 0), + "percentiles": results.get("percentiles", {}), + }, + }, + } + + if os.environ.get("SAVE_IN_PYTORCH_BENCHMARK_FORMAT", False): + output_file = ( + f"{os.path.splitext(args.output_json)[0]}_pytorch_format.json" + ) + with open(output_file, "w") as f: + json.dump(record, f) + + def main(args: argparse.Namespace): print(args) @@ -35,18 +69,18 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, ) print(sampling_params) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_prompts: List[PromptType] = [{ - "prompt_token_ids": batch - } for batch in dummy_prompt_token_ids.tolist()] + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: List[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] def llm_generate(): if not args.use_beam_search: - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm.generate( + dummy_prompts, sampling_params=sampling_params, use_tqdm=False + ) else: llm.beam_search( dummy_prompts, @@ 
-54,17 +88,20 @@ def llm_generate(): beam_width=args.n, max_tokens=args.output_len, ignore_eos=True, - )) + ), + ) def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir) + ), + ) as p: llm_generate() print(p.key_averages().table(sort_by="self_cuda_time_total")) else: @@ -81,9 +118,11 @@ def run_to_completion(profile_dir: Optional[str] = None): if args.profile: profile_dir = args.profile_result_dir if not profile_dir: - profile_dir = Path( - "." - ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + profile_dir = ( + Path(".") + / "vllm_benchmark_result" + / f"latency_result_{time.time()}" + ) print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -95,9 +134,9 @@ def run_to_completion(profile_dir: Optional[str] = None): latencies = np.array(latencies) percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) - print(f'Avg latency: {np.mean(latencies)} seconds') + print(f"Avg latency: {np.mean(latencies)} seconds") for percentage, percentile in zip(percentages, percentiles): - print(f'{percentage}% percentile latency: {percentile} seconds') + print(f"{percentage}% percentile latency: {percentile} seconds") # Output JSON results if specified if args.output_json: @@ -108,43 +147,53 @@ def run_to_completion(profile_dir: Optional[str] = None): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) -if __name__ == '__main__': +if __name__ == "__main__": parser = FlexibleArgumentParser( - 
description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=30, - help='Number of iterations to run.') + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) parser.add_argument( - '--profile-result-dir', + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-result-dir", type=str, default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) + help=( + "path to save the pytorch profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard." 
+ ), + ) parser.add_argument( - '--output-json', + "--output-json", type=str, default=None, - help='Path to save the latency results in JSON format.') + help="Path to save the latency results in JSON format.", + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() From d84671fdef6764a8033358b082d7d64dd35087bf Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 17:16:59 -0800 Subject: [PATCH 02/17] Fix env variables Signed-off-by: Huy Do --- .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 261d340a30d..eb2ad434945 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -346,14 +346,10 @@ main() { check_hf_token # Set to v1 to run v1 benchmark - VLLM_VERSION=$1 if [[ "${VLLM_VERSION:-v0}" == "v1" ]]; then export VLLM_USE_V1=1 fi - # Set to 0 to run the benchmark script locally without uploading to Buildkite - UPLOAD_TO_BUILDKITE=$2 - # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) @@ -380,6 +376,7 @@ main() { pip install tabulate pandas python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + # Set to 0 to run the benchmark script locally without uploading to Buildkite if [[ "${UPLOAD_TO_BUILDKITE:-1}" == "1" ]]; then upload_to_buildkite fi From 27caeb30a0fc29c1f6dc7890ec8898ad46a6e500 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 17:33:29 -0800 Subject: [PATCH 03/17] Another tweak in the output format Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_latency.py 
b/benchmarks/benchmark_latency.py index b4ff489be5e..4924cd78efb 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -28,7 +28,7 @@ def save_to_pytorch_benchmark_format( "benchmark": { "name": "vLLM benchmark", "extra_info": { - "args": args, + "args": str(args), }, }, "model": { @@ -45,9 +45,9 @@ def save_to_pytorch_benchmark_format( } if os.environ.get("SAVE_IN_PYTORCH_BENCHMARK_FORMAT", False): - output_file = ( - f"{os.path.splitext(args.output_json)[0]}_pytorch_format.json" - ) + # Don't use json suffix here as we don't want convert-results-json-to-markdown.py + # to pick it up + output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" with open(output_file, "w") as f: json.dump(record, f) From 4ae88ea29e35d1437e7365d56937db2d44d8d813 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 17:39:43 -0800 Subject: [PATCH 04/17] Use vars Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 4924cd78efb..96d0eeee6f3 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -28,7 +28,7 @@ def save_to_pytorch_benchmark_format( "benchmark": { "name": "vLLM benchmark", "extra_info": { - "args": str(args), + "args": vars(args), }, }, "model": { From c04e53ff56dc6602c743fda7cbaff8b33b7634f6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 17:47:30 -0800 Subject: [PATCH 05/17] No need to skip buildkite upload Signed-off-by: Huy Do --- .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index eb2ad434945..0a9cf84a873 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ 
b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -376,10 +376,7 @@ main() { pip install tabulate pandas python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py - # Set to 0 to run the benchmark script locally without uploading to Buildkite - if [[ "${UPLOAD_TO_BUILDKITE:-1}" == "1" ]]; then - upload_to_buildkite - fi + upload_to_buildkite } main "$@" From 10971bdfd6943c829fe990c584a099cf58368669 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 18:02:04 -0800 Subject: [PATCH 06/17] Fix typo Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 96d0eeee6f3..ff5e252c1ad 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -44,7 +44,7 @@ def save_to_pytorch_benchmark_format( }, } - if os.environ.get("SAVE_IN_PYTORCH_BENCHMARK_FORMAT", False): + if os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): # Don't use json suffix here as we don't want convert-results-json-to-markdown.py # to pick it up output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" From 93c3b8510f9b3720d8b95998e61361716a051722 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Feb 2025 18:51:03 -0800 Subject: [PATCH 07/17] Fix pre-commit Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 67 +++++++++++++++------------------ 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index ff5e252c1ad..48a43c1daa4 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 """Benchmark the latency of processing a single batch of requests.""" -import os import argparse import dataclasses import json +import os import time from pathlib import Path -from typing import List, Optional, Dict, Any +from typing 
import Any, Dict, List, Optional import numpy as np import torch @@ -20,9 +20,8 @@ from vllm.utils import FlexibleArgumentParser -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: Dict[str, Any] -) -> None: +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: Dict[str, Any]) -> None: # https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database record = { "benchmark": { @@ -45,8 +44,8 @@ def save_to_pytorch_benchmark_format( } if os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): - # Don't use json suffix here as we don't want convert-results-json-to-markdown.py - # to pick it up + # Don't use json suffix here as we don't want + # convert-results-json-to-markdown.py to pick it up output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" with open(output_file, "w") as f: json.dump(record, f) @@ -69,18 +68,18 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, ) print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: List[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompts: List[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] def llm_generate(): if not args.use_beam_search: - llm.generate( - dummy_prompts, sampling_params=sampling_params, use_tqdm=False - ) + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) else: llm.beam_search( dummy_prompts, @@ -94,13 +93,12 @@ def llm_generate(): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - 
str(profile_dir) - ), + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir)), ) as p: llm_generate() print(p.key_averages().table(sort_by="self_cuda_time_total")) @@ -118,11 +116,8 @@ def run_to_completion(profile_dir: Optional[str] = None): if args.profile: profile_dir = args.profile_result_dir if not profile_dir: - profile_dir = ( - Path(".") - / "vllm_benchmark_result" - / f"latency_result_{time.time()}" - ) + profile_dir = (Path(".") / "vllm_benchmark_result" / + f"latency_result_{time.time()}") print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -153,8 +148,7 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == "__main__": parser = FlexibleArgumentParser( description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) + "requests till completion.") parser.add_argument("--input-len", type=int, default=32) parser.add_argument("--output-len", type=int, default=128) parser.add_argument("--batch-size", type=int, default=8) @@ -171,9 +165,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default=10, help="Number of iterations to run for warmup.", ) - parser.add_argument( - "--num-iters", type=int, default=30, help="Number of iterations to run." - ) + parser.add_argument("--num-iters", + type=int, + default=30, + help="Number of iterations to run.") parser.add_argument( "--profile", action="store_true", @@ -183,10 +178,8 @@ def run_to_completion(profile_dir: Optional[str] = None): "--profile-result-dir", type=str, default=None, - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard." - ), + help=("path to save the pytorch profiler output. 
Can be visualized " + "with ui.perfetto.dev or Tensorboard."), ) parser.add_argument( "--output-json", From 7cddf747e1746ad42e15bc3c779c3450537bb6f9 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 23:17:32 -0800 Subject: [PATCH 08/17] Address review comments Signed-off-by: Huy Do --- .../scripts/run-performance-benchmarks.sh | 2 +- benchmarks/benchmark_latency.py | 47 ++++++------------- benchmarks/benchmark_throughput.py | 21 +++++++++ benchmarks/benchmark_utils.py | 39 +++++++++++++++ 4 files changed, 76 insertions(+), 33 deletions(-) create mode 100644 benchmarks/benchmark_utils.py diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 0a9cf84a873..9425cb07ec0 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -346,7 +346,7 @@ main() { check_hf_token # Set to v1 to run v1 benchmark - if [[ "${VLLM_VERSION:-v0}" == "v1" ]]; then + if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then export VLLM_USE_V1=1 fi diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 48a43c1daa4..157cdafc861 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -5,9 +5,10 @@ import dataclasses import json import os +import sys import time from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import List, Optional import numpy as np import torch @@ -19,36 +20,8 @@ from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser - -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any]) -> None: - # https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database - record = { - "benchmark": { - "name": "vLLM benchmark", - "extra_info": { - "args": vars(args), - }, - }, - "model": { - "name": 
args.model, - }, - "metric": { - "name": "latency", - "benchmark_values": results.get("latencies", []), - "extra_info": { - "avg_latency": results.get("avg_latency", 0), - "percentiles": results.get("percentiles", {}), - }, - }, - } - - if os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): - # Don't use json suffix here as we don't want - # convert-results-json-to-markdown.py to pick it up - output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" - with open(output_file, "w") as f: - json.dump(record, f) +sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from benchmark_utils import save_to_pytorch_benchmark_format def main(args: argparse.Namespace): @@ -142,7 +115,17 @@ def run_to_completion(profile_dir: Optional[str] = None): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) + + pt_records = save_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] + for k in ["avg_latency", "percentiles"]}) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" + with open(pt_file, "w") as f: + json.dump(pt_records, f) if __name__ == "__main__": diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 658eab6a278..40d2c0daa64 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -3,7 +3,9 @@ import argparse import dataclasses import json +import os import random +import sys import time from functools import cache from typing import Dict, List, Optional, Tuple @@ -26,6 +28,9 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators +sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from benchmark_utils import save_to_pytorch_benchmark_format + 
@dataclasses.dataclass class SampleRequest: @@ -436,6 +441,22 @@ def main(args: argparse.Namespace): with open(args.output_json, "w") as f: json.dump(results, f, indent=4) + pt_records = save_to_pytorch_benchmark_format( + args=args, + metrics={ + "requests_per_second": results["requests_per_second"], + "tokens_per_second": results["tokens_per_second"], + }, + extra_info={ + k: results[k] + for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" + with open(pt_file, "w") as f: + json.dump(pt_records, f) + if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py new file mode 100644 index 00000000000..90bb6a5dcc2 --- /dev/null +++ b/benchmarks/benchmark_utils.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +from typing import Any, Dict, List + + +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + metrics: Dict[str, List], + extra_info: Dict[str, Any]) -> List: + """ + Save the benchmark results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + records.append(record) + + return records From a14865c1b9d7ba201d986c8a027bad64d6b75c84 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 11 Feb 2025 23:42:44 -0800 Subject: [PATCH 
09/17] Add benchmark_serving Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 30 ++++++++++++---------- benchmarks/benchmark_serving.py | 38 ++++++++++++++++++++++------ benchmarks/benchmark_throughput.py | 40 ++++++++++++++++-------------- benchmarks/benchmark_utils.py | 6 ++--- 4 files changed, 72 insertions(+), 42 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 157cdafc861..f337deb83bb 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -8,7 +8,7 @@ import sys import time from pathlib import Path -from typing import List, Optional +from typing import Any, Dict, List, Optional import numpy as np import torch @@ -21,7 +21,21 @@ from vllm.utils import FlexibleArgumentParser sys.path.append(os.path.dirname(os.path.realpath(__file__))) -from benchmark_utils import save_to_pytorch_benchmark_format +from benchmark_utils import convert_to_pytorch_benchmark_format + + +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: Dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] + for k in ["avg_latency", "percentiles"]}) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" + with open(pt_file, "w") as f: + json.dump(pt_records, f) def main(args: argparse.Namespace): @@ -115,17 +129,7 @@ def run_to_completion(profile_dir: Optional[str] = None): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) - - pt_records = save_to_pytorch_benchmark_format( - args=args, - metrics={"latency": results["latencies"]}, - extra_info={k: results[k] - for k in ["avg_latency", "percentiles"]}) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" - with open(pt_file, "w") as 
f: - json.dump(pt_records, f) + save_to_pytorch_benchmark_format(args, results) if __name__ == "__main__": diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1044bef5941..888aa2bc470 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,6 +31,7 @@ import json import os import random +import sys import time import warnings from dataclasses import dataclass @@ -55,6 +56,9 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from benchmark_utils import convert_to_pytorch_benchmark_format + MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -372,21 +376,21 @@ async def get_request( burstiness: float = 1.0, ) -> AsyncGenerator[Tuple[str, int, int], None]: """ - Asynchronously generates requests at a specified rate + Asynchronously generates requests at a specified rate with OPTIONAL burstiness. - + Args: - input_requests: + input_requests: A list of input requests, each represented as a tuple. - request_rate: + request_rate: The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. + burstiness (optional): + The burstiness factor of the request generation. Only takes effect when request_rate is not inf. Default value is 1, which follows a Poisson process. Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. 
""" input_requests = iter(input_requests) @@ -787,6 +791,23 @@ def parse_goodput(slo_pairs): return goodput_config_dict +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: Dict[str, Any], + file_name: str) -> None: + metrics = ["ttft", "tpot", "itl", "e2el"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: results[k] + for k in metrics}, + extra_info={k: results[k] + for k in results if k not in metrics}) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch" + with open(pt_file, "w") as f: + json.dump(pt_records, f) + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -959,6 +980,7 @@ def main(args: argparse.Namespace): file_name = os.path.join(args.result_dir, file_name) with open(file_name, "w", encoding='utf-8') as outfile: json.dump(result_json, outfile) + save_to_pytorch_benchmark_format(args, result_json, file_name) if __name__ == "__main__": diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 40d2c0daa64..9f7d6103202 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -8,7 +8,7 @@ import sys import time from functools import cache -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import torch import uvloop @@ -29,7 +29,7 @@ from vllm.utils import FlexibleArgumentParser, merge_async_iterators sys.path.append(os.path.dirname(os.path.realpath(__file__))) -from benchmark_utils import save_to_pytorch_benchmark_format +from benchmark_utils import convert_to_pytorch_benchmark_format @dataclasses.dataclass @@ -343,6 +343,25 @@ def run_mii( return end - start +def save_to_pytorch_benchmark_format(args: argparse.Namespace, + results: Dict[str, Any]) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={ + "requests_per_second": 
[results["requests_per_second"]], + "tokens_per_second": [results["tokens_per_second"]], + }, + extra_info={ + k: results[k] + for k in ["elapsed_time", "num_requests", "total_num_tokens"] + }) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" + with open(pt_file, "w") as f: + json.dump(pt_records, f) + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -440,22 +459,7 @@ def main(args: argparse.Namespace): } with open(args.output_json, "w") as f: json.dump(results, f, indent=4) - - pt_records = save_to_pytorch_benchmark_format( - args=args, - metrics={ - "requests_per_second": results["requests_per_second"], - "tokens_per_second": results["tokens_per_second"], - }, - extra_info={ - k: results[k] - for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" - with open(pt_file, "w") as f: - json.dump(pt_records, f) + save_to_pytorch_benchmark_format(args, results) if __name__ == "__main__": diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 90bb6a5dcc2..6f01cf20e17 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -5,9 +5,9 @@ from typing import Any, Dict, List -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - metrics: Dict[str, List], - extra_info: Dict[str, Any]) -> List: +def convert_to_pytorch_benchmark_format(args: argparse.Namespace, + metrics: Dict[str, List], + extra_info: Dict[str, Any]) -> List: """ Save the benchmark results in the format used by PyTorch OSS benchmark with on metric per record From 19f436d590dad413ca9a95e4b57061e0a472eca6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 12 Feb 2025 00:20:50 -0800 Subject: [PATCH 10/17] There are only ttfts and itls Signed-off-by: Huy Do --- 
benchmarks/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 888aa2bc470..05a442b91b5 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -794,7 +794,7 @@ def parse_goodput(slo_pairs): def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: Dict[str, Any], file_name: str) -> None: - metrics = ["ttft", "tpot", "itl", "e2el"] + metrics = ["ttfts", "itls"] pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: results[k] From 6874d6596807e522cfb58ea49fced4cf57edc47a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 12 Feb 2025 00:45:54 -0800 Subject: [PATCH 11/17] Ignore some raw metrics in serving benchmark Signed-off-by: Huy Do --- benchmarks/benchmark_serving.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 05a442b91b5..d3024aacd78 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -794,13 +794,18 @@ def parse_goodput(slo_pairs): def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: Dict[str, Any], file_name: str) -> None: - metrics = ["ttfts", "itls"] + metrics = ["median_ttft_ms", "median_itl_ms"] + # These raw data might be useful, but they are rather big. 
They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts"] pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: results[k] for k in metrics}, - extra_info={k: results[k] - for k in results if k not in metrics}) + extra_info={ + k: results[k] + for k in results if k not in metrics and k not in ignored_metrics + }) if pt_records: # Don't use json suffix here as we don't want CI to pick it up pt_file = f"{os.path.splitext(file_name)[0]}.pytorch" From 826887cc4307289b9b70c481fea8afbe60628dc9 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 12 Feb 2025 01:01:25 -0800 Subject: [PATCH 12/17] Another typo Signed-off-by: Huy Do --- benchmarks/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index d3024aacd78..1bd3372aa47 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -800,7 +800,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, ignored_metrics = ["ttfts", "itls", "generated_texts"] pt_records = convert_to_pytorch_benchmark_format( args=args, - metrics={k: results[k] + metrics={k: [results[k]] for k in metrics}, extra_info={ k: results[k] From 42288cba68807b0f9f53031afdf0299585817178 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 14 Feb 2025 11:35:38 -0800 Subject: [PATCH 13/17] Add the rest of serving metrics Signed-off-by: Huy Do --- benchmarks/benchmark_serving.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1bd3372aa47..9c57bc36617 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -794,10 +794,14 @@ def parse_goodput(slo_pairs): def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: Dict[str, Any], file_name: str) -> None: - metrics = ["median_ttft_ms", "median_itl_ms"] + metrics = [ + "median_ttft_ms", 
"mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", + "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", + "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + ] # These raw data might be useful, but they are rather big. They can be added # later if needed - ignored_metrics = ["ttfts", "itls", "generated_texts"] + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: [results[k]] From 61ca7c2cfaf26346a4c51d8787a7d9c54871d9b4 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 14 Feb 2025 23:43:20 -0800 Subject: [PATCH 14/17] Remove redundant sys.path Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 5 +---- benchmarks/benchmark_serving.py | 2 -- benchmarks/benchmark_throughput.py | 5 +---- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f337deb83bb..21c9929a5b2 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -5,13 +5,13 @@ import dataclasses import json import os -import sys import time from pathlib import Path from typing import Any, Dict, List, Optional import numpy as np import torch +from benchmark_utils import convert_to_pytorch_benchmark_format from tqdm import tqdm from vllm import LLM, SamplingParams @@ -20,9 +20,6 @@ from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser -sys.path.append(os.path.dirname(os.path.realpath(__file__))) -from benchmark_utils import convert_to_pytorch_benchmark_format - def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: Dict[str, Any]) -> None: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 9c57bc36617..1077caa621d 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,7 +31,6 @@ import json import os import random -import sys import time import warnings from 
dataclasses import dataclass @@ -56,7 +55,6 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -sys.path.append(os.path.dirname(os.path.realpath(__file__))) from benchmark_utils import convert_to_pytorch_benchmark_format MILLISECONDS_TO_SECONDS_CONVERSION = 1000 diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 9f7d6103202..604da40fe7f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -5,13 +5,13 @@ import json import os import random -import sys import time from functools import cache from typing import Any, Dict, List, Optional, Tuple import torch import uvloop +from benchmark_utils import convert_to_pytorch_benchmark_format from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, @@ -28,9 +28,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators -sys.path.append(os.path.dirname(os.path.realpath(__file__))) -from benchmark_utils import convert_to_pytorch_benchmark_format - @dataclasses.dataclass class SampleRequest: From a4e24e489c50ca15472b299e4b70cc8b12d5dc07 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 15 Feb 2025 00:25:40 -0800 Subject: [PATCH 15/17] Use 127.0.0.1 for ipv4 Signed-off-by: Huy Do --- benchmarks/benchmark_serving.py | 2 +- benchmarks/benchmark_serving_guided.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 46616aa7ecf..f31784ebe66 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1043,7 +1043,7 @@ def main(args: argparse.Namespace): default=None, help="Server or API base url if not using http host and port.", ) - parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--host", type=str, default="127.0.0.1") 
parser.add_argument("--port", type=int, default=8000) parser.add_argument( "--endpoint", diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py index 561e500d8b6..ec085960f7b 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_guided.py @@ -731,7 +731,7 @@ def main(args: argparse.Namespace): default=None, help="Server or API base url if not using http host and port.", ) - parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--host", type=str, default="127.0.0.1") parser.add_argument("--port", type=int, default=8000) parser.add_argument( "--endpoint", From fd8fc67535599b8cfa1b0f2b6ad41a2fccb0b2d8 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 15 Feb 2025 00:43:48 -0800 Subject: [PATCH 16/17] Using .pytorch is weird, let's just use .json Signed-off-by: Huy Do --- benchmarks/benchmark_latency.py | 3 +-- benchmarks/benchmark_serving.py | 2 +- benchmarks/benchmark_throughput.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 21c9929a5b2..b041626550b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -29,8 +29,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}) if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" with open(pt_file, "w") as f: json.dump(pt_records, f) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index f31784ebe66..e94357f6a91 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -840,7 +840,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, }) if pt_records: # Don't use json suffix here as we don't want CI to
pick it up - pt_file = f"{os.path.splitext(file_name)[0]}.pytorch" + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" with open(pt_file, "w") as f: json.dump(pt_records, f) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 604da40fe7f..f7d87f1b336 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -354,7 +354,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, }) if pt_records: # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch" + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" with open(pt_file, "w") as f: json.dump(pt_records, f) From 85910f50d56d52720edab2c0aa87f5a7e1e8a9c0 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 15 Feb 2025 23:00:06 -0800 Subject: [PATCH 17/17] Add a comment Signed-off-by: Huy Do --- benchmarks/benchmark_serving.py | 1 + benchmarks/benchmark_serving_guided.py | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e94357f6a91..9760737ccec 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1043,6 +1043,7 @@ def main(args: argparse.Namespace): default=None, help="Server or API base url if not using http host and port.", ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 parser.add_argument("--host", type=str, default="127.0.0.1") parser.add_argument("--port", type=int, default=8000) parser.add_argument( diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py index ec085960f7b..04942b06ffd 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_guided.py @@ -731,6 +731,7 @@ def main(args: argparse.Namespace): default=None, help="Server or API base url if not using http host and port.", ) + # Use 127.0.0.1 here instead of localhost to force the use of 
ipv4 parser.add_argument("--host", type=str, default="127.0.0.1") parser.add_argument("--port", type=int, default=8000) parser.add_argument(