From ce79bc5e514d4e3c143dc35a675fdcaa3f7c760e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 17:03:30 -0800
Subject: [PATCH 01/17] Run v1 benchmark

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../scripts/run-performance-benchmarks.sh     |  13 +-
 .../tests/latency-tests.json                  |   2 +-
 benchmarks/benchmark_latency.py               | 147 ++++++++++++------
 3 files changed, 111 insertions(+), 51 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 0d16a83781a..261d340a30d 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -345,6 +345,15 @@ main() {
   check_gpus
   check_hf_token
 
+  # Set to v1 to run v1 benchmark
+  VLLM_VERSION=$1
+  if [[ "${VLLM_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
+  # Set to 0 to run the benchmark script locally without uploading to Buildkite
+  UPLOAD_TO_BUILDKITE=$2
+
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@@ -371,7 +380,9 @@ main() {
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
 
-  upload_to_buildkite
+  if [[ "${UPLOAD_TO_BUILDKITE:-1}" == "1" ]]; then
+    upload_to_buildkite
+  fi
 }
 
 main "$@"
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json
index 1841186da15..7762a239f96 100644
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@@ -29,4 +29,4 @@
             "num-iters": 15
         }
     }
-]
\ No newline at end of file
+]
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 89631294531..b4ff489be5e 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
+
+import os
 import argparse
 import dataclasses
 import json
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Dict, Any
 
 import numpy as np
 import torch
@@ -18,6 +20,38 @@
 from vllm.utils import FlexibleArgumentParser
 
 
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: Dict[str, Any]
+) -> None:
+    # https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    record = {
+        "benchmark": {
+            "name": "vLLM benchmark",
+            "extra_info": {
+                "args": args,
+            },
+        },
+        "model": {
+            "name": args.model,
+        },
+        "metric": {
+            "name": "latency",
+            "benchmark_values": results.get("latencies", []),
+            "extra_info": {
+                "avg_latency": results.get("avg_latency", 0),
+                "percentiles": results.get("percentiles", {}),
+            },
+        },
+    }
+
+    if os.environ.get("SAVE_IN_PYTORCH_BENCHMARK_FORMAT", False):
+        output_file = (
+            f"{os.path.splitext(args.output_json)[0]}_pytorch_format.json"
+        )
+        with open(output_file, "w") as f:
+            json.dump(record, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
 
@@ -35,18 +69,18 @@ def main(args: argparse.Namespace):
         max_tokens=args.output_len,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: List[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: List[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]
 
     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(
+                dummy_prompts, sampling_params=sampling_params, use_tqdm=False
+            )
         else:
             llm.beam_search(
                 dummy_prompts,
@@ -54,17 +88,20 @@ def llm_generate():
                     beam_width=args.n,
                     max_tokens=args.output_len,
                     ignore_eos=True,
-                ))
+                ),
+            )
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
             with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir))) as p:
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    str(profile_dir)
+                ),
+            ) as p:
                 llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
@@ -81,9 +118,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
     if args.profile:
         profile_dir = args.profile_result_dir
         if not profile_dir:
-            profile_dir = Path(
-                "."
-            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            profile_dir = (
+                Path(".")
+                / "vllm_benchmark_result"
+                / f"latency_result_{time.time()}"
+            )
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -95,9 +134,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")
 
     # Output JSON results if specified
     if args.output_json:
@@ -108,43 +147,53 @@ def run_to_completion(profile_dir: Optional[str] = None):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=30,
-                        help='Number of iterations to run.')
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion."
+    )
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1,
+        help="Number of generated sequences per prompt.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument(
-        '--profile',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
+    parser.add_argument(
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
+    parser.add_argument(
+        "--profile-result-dir",
         type=str,
         default=None,
-        help=('path to save the pytorch profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        help=(
+            "path to save the pytorch profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard."
+        ),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()

From d84671fdef6764a8033358b082d7d64dd35087bf Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 17:16:59 -0800
Subject: [PATCH 02/17] Fix env variables

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 261d340a30d..eb2ad434945 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -346,14 +346,10 @@ main() {
   check_hf_token
 
   # Set to v1 to run v1 benchmark
-  VLLM_VERSION=$1
   if [[ "${VLLM_VERSION:-v0}" == "v1" ]]; then
     export VLLM_USE_V1=1
   fi
 
-  # Set to 0 to run the benchmark script locally without uploading to Buildkite
-  UPLOAD_TO_BUILDKITE=$2
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@@ -380,6 +376,7 @@ main() {
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
 
+  # Set to 0 to run the benchmark script locally without uploading to Buildkite
   if [[ "${UPLOAD_TO_BUILDKITE:-1}" == "1" ]]; then
     upload_to_buildkite
   fi

From 27caeb30a0fc29c1f6dc7890ec8898ad46a6e500 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 17:33:29 -0800
Subject: [PATCH 03/17] Another tweak in the output format

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index b4ff489be5e..4924cd78efb 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -28,7 +28,7 @@ def save_to_pytorch_benchmark_format(
         "benchmark": {
             "name": "vLLM benchmark",
             "extra_info": {
-                "args": args,
+                "args": str(args),
             },
         },
         "model": {
@@ -45,9 +45,9 @@ def save_to_pytorch_benchmark_format(
     }
 
     if os.environ.get("SAVE_IN_PYTORCH_BENCHMARK_FORMAT", False):
-        output_file = (
-            f"{os.path.splitext(args.output_json)[0]}_pytorch_format.json"
-        )
+        # Don't use json suffix here as we don't want convert-results-json-to-markdown.py
+        # to pick it up
+        output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
         with open(output_file, "w") as f:
             json.dump(record, f)
 

From 4ae88ea29e35d1437e7365d56937db2d44d8d813 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 17:39:43 -0800
Subject: [PATCH 04/17] Use vars

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 4924cd78efb..96d0eeee6f3 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -28,7 +28,7 @@ def save_to_pytorch_benchmark_format(
         "benchmark": {
             "name": "vLLM benchmark",
             "extra_info": {
-                "args": str(args),
+                "args": vars(args),
             },
         },
         "model": {

From c04e53ff56dc6602c743fda7cbaff8b33b7634f6 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 17:47:30 -0800
Subject: [PATCH 05/17] No need to skip buildkite upload

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index eb2ad434945..0a9cf84a873 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -376,10 +376,7 @@ main() {
   pip install tabulate pandas
   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
 
-  # Set to 0 to run the benchmark script locally without uploading to Buildkite
-  if [[ "${UPLOAD_TO_BUILDKITE:-1}" == "1" ]]; then
-    upload_to_buildkite
-  fi
+  upload_to_buildkite
 }
 
 main "$@"

From 10971bdfd6943c829fe990c584a099cf58368669 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 18:02:04 -0800
Subject: [PATCH 06/17] Fix typo

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 96d0eeee6f3..ff5e252c1ad 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -44,7 +44,7 @@ def save_to_pytorch_benchmark_format(
         },
     }
 
-    if os.environ.get("SAVE_IN_PYTORCH_BENCHMARK_FORMAT", False):
+    if os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
         # Don't use json suffix here as we don't want convert-results-json-to-markdown.py
         # to pick it up
         output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"

From 93c3b8510f9b3720d8b95998e61361716a051722 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 10 Feb 2025 18:51:03 -0800
Subject: [PATCH 07/17] Fix pre-commit

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py | 67 +++++++++++++++------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index ff5e252c1ad..48a43c1daa4 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
 
-import os
 import argparse
 import dataclasses
 import json
+import os
 import time
 from pathlib import Path
-from typing import List, Optional, Dict, Any
+from typing import Any, Dict, List, Optional
 
 import numpy as np
 import torch
@@ -20,9 +20,8 @@
 from vllm.utils import FlexibleArgumentParser
 
 
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: Dict[str, Any]
-) -> None:
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
     # https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
     record = {
         "benchmark": {
@@ -45,8 +44,8 @@ def save_to_pytorch_benchmark_format(
     }
 
     if os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        # Don't use json suffix here as we don't want convert-results-json-to-markdown.py
-        # to pick it up
+        # Don't use json suffix here as we don't want
+        # convert-results-json-to-markdown.py to pick it up
         output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
         with open(output_file, "w") as f:
             json.dump(record, f)
@@ -69,18 +68,18 @@ def main(args: argparse.Namespace):
         max_tokens=args.output_len,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: List[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_prompts: List[PromptType] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
 
     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(
-                dummy_prompts, sampling_params=sampling_params, use_tqdm=False
-            )
+            llm.generate(dummy_prompts,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
@@ -94,13 +93,12 @@ def llm_generate():
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
             with torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
-                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    str(profile_dir)
-                ),
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA,
+                    ],
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                        str(profile_dir)),
             ) as p:
                 llm_generate()
             print(p.key_averages().table(sort_by="self_cuda_time_total"))
@@ -118,11 +116,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
     if args.profile:
         profile_dir = args.profile_result_dir
         if not profile_dir:
-            profile_dir = (
-                Path(".")
-                / "vllm_benchmark_result"
-                / f"latency_result_{time.time()}"
-            )
+            profile_dir = (Path(".") / "vllm_benchmark_result" /
+                           f"latency_result_{time.time()}")
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -153,8 +148,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
+        "requests till completion.")
     parser.add_argument("--input-len", type=int, default=32)
     parser.add_argument("--output-len", type=int, default=128)
     parser.add_argument("--batch-size", type=int, default=8)
@@ -171,9 +165,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
         default=10,
         help="Number of iterations to run for warmup.",
     )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
+    parser.add_argument("--num-iters",
+                        type=int,
+                        default=30,
+                        help="Number of iterations to run.")
     parser.add_argument(
         "--profile",
         action="store_true",
@@ -183,10 +178,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
         "--profile-result-dir",
         type=str,
         default=None,
-        help=(
-            "path to save the pytorch profiler output. Can be visualized "
-            "with ui.perfetto.dev or Tensorboard."
-        ),
+        help=("path to save the pytorch profiler output. Can be visualized "
+              "with ui.perfetto.dev or Tensorboard."),
     )
     parser.add_argument(
         "--output-json",

From 7cddf747e1746ad42e15bc3c779c3450537bb6f9 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 11 Feb 2025 23:17:32 -0800
Subject: [PATCH 08/17] Address review comments

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 .../scripts/run-performance-benchmarks.sh     |  2 +-
 benchmarks/benchmark_latency.py               | 47 ++++++-------------
 benchmarks/benchmark_throughput.py            | 21 +++++++++
 benchmarks/benchmark_utils.py                 | 39 +++++++++++++++
 4 files changed, 76 insertions(+), 33 deletions(-)
 create mode 100644 benchmarks/benchmark_utils.py

diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index 0a9cf84a873..9425cb07ec0 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -346,7 +346,7 @@ main() {
   check_hf_token
 
   # Set to v1 to run v1 benchmark
-  if [[ "${VLLM_VERSION:-v0}" == "v1" ]]; then
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
     export VLLM_USE_V1=1
   fi
 
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 48a43c1daa4..157cdafc861 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -5,9 +5,10 @@
 import dataclasses
 import json
 import os
+import sys
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import List, Optional
 
 import numpy as np
 import torch
@@ -19,36 +20,8 @@
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
-
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: Dict[str, Any]) -> None:
-    # https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-    record = {
-        "benchmark": {
-            "name": "vLLM benchmark",
-            "extra_info": {
-                "args": vars(args),
-            },
-        },
-        "model": {
-            "name": args.model,
-        },
-        "metric": {
-            "name": "latency",
-            "benchmark_values": results.get("latencies", []),
-            "extra_info": {
-                "avg_latency": results.get("avg_latency", 0),
-                "percentiles": results.get("percentiles", {}),
-            },
-        },
-    }
-
-    if os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        # Don't use json suffix here as we don't want
-        # convert-results-json-to-markdown.py to pick it up
-        output_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
-        with open(output_file, "w") as f:
-            json.dump(record, f)
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from benchmark_utils import save_to_pytorch_benchmark_format
 
 
 def main(args: argparse.Namespace):
@@ -142,7 +115,17 @@ def run_to_completion(profile_dir: Optional[str] = None):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
+
+        pt_records = save_to_pytorch_benchmark_format(
+            args=args,
+            metrics={"latency": results["latencies"]},
+            extra_info={k: results[k]
+                        for k in ["avg_latency", "percentiles"]})
+        if pt_records:
+            # Don't use json suffix here as we don't want CI to pick it up
+            pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
+            with open(pt_file, "w") as f:
+                json.dump(pt_records, f)
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 658eab6a278..40d2c0daa64 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -3,7 +3,9 @@
 import argparse
 import dataclasses
 import json
+import os
 import random
+import sys
 import time
 from functools import cache
 from typing import Dict, List, Optional, Tuple
@@ -26,6 +28,9 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from benchmark_utils import save_to_pytorch_benchmark_format
+
 
 @dataclasses.dataclass
 class SampleRequest:
@@ -436,6 +441,22 @@ def main(args: argparse.Namespace):
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
 
+        pt_records = save_to_pytorch_benchmark_format(
+            args=args,
+            metrics={
+                "requests_per_second": results["requests_per_second"],
+                "tokens_per_second": results["tokens_per_second"],
+            },
+            extra_info={
+                k: results[k]
+                for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+            })
+        if pt_records:
+            # Don't use json suffix here as we don't want CI to pick it up
+            pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
+            with open(pt_file, "w") as f:
+                json.dump(pt_records, f)
+
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(description="Benchmark the throughput.")
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
new file mode 100644
index 00000000000..90bb6a5dcc2
--- /dev/null
+++ b/benchmarks/benchmark_utils.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import os
+from typing import Any, Dict, List
+
+
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     metrics: Dict[str, List],
+                                     extra_info: Dict[str, Any]) -> List:
+    """
+    Save the benchmark results in the format used by PyTorch OSS benchmark with
+    on metric per record
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    """
+    records = []
+    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
+        return records
+
+    for name, benchmark_values in metrics.items():
+        record = {
+            "benchmark": {
+                "name": "vLLM benchmark",
+                "extra_info": {
+                    "args": vars(args),
+                },
+            },
+            "model": {
+                "name": args.model,
+            },
+            "metric": {
+                "name": name,
+                "benchmark_values": benchmark_values,
+                "extra_info": extra_info,
+            },
+        }
+        records.append(record)
+
+    return records

From a14865c1b9d7ba201d986c8a027bad64d6b75c84 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 11 Feb 2025 23:42:44 -0800
Subject: [PATCH 09/17] Add benchmark_serving

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py    | 30 ++++++++++++----------
 benchmarks/benchmark_serving.py    | 38 ++++++++++++++++++++++------
 benchmarks/benchmark_throughput.py | 40 ++++++++++++++++--------------
 benchmarks/benchmark_utils.py      |  6 ++---
 4 files changed, 72 insertions(+), 42 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 157cdafc861..f337deb83bb 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -8,7 +8,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 import numpy as np
 import torch
@@ -21,7 +21,21 @@
 from vllm.utils import FlexibleArgumentParser
 
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from benchmark_utils import save_to_pytorch_benchmark_format
+from benchmark_utils import convert_to_pytorch_benchmark_format
+
+
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={"latency": results["latencies"]},
+        extra_info={k: results[k]
+                    for k in ["avg_latency", "percentiles"]})
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
 
 
 def main(args: argparse.Namespace):
@@ -115,17 +129,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
-
-        pt_records = save_to_pytorch_benchmark_format(
-            args=args,
-            metrics={"latency": results["latencies"]},
-            extra_info={k: results[k]
-                        for k in ["avg_latency", "percentiles"]})
-        if pt_records:
-            # Don't use json suffix here as we don't want CI to pick it up
-            pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
-            with open(pt_file, "w") as f:
-                json.dump(pt_records, f)
+        save_to_pytorch_benchmark_format(args, results)
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 1044bef5941..888aa2bc470 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -31,6 +31,7 @@
 import json
 import os
 import random
+import sys
 import time
 import warnings
 from dataclasses import dataclass
@@ -55,6 +56,9 @@
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from benchmark_utils import convert_to_pytorch_benchmark_format
+
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
 
@@ -372,21 +376,21 @@ async def get_request(
     burstiness: float = 1.0,
 ) -> AsyncGenerator[Tuple[str, int, int], None]:
     """
-    Asynchronously generates requests at a specified rate 
+    Asynchronously generates requests at a specified rate
     with OPTIONAL burstiness.
-    
+
     Args:
-        input_requests: 
+        input_requests:
             A list of input requests, each represented as a tuple.
-        request_rate: 
+        request_rate:
             The rate at which requests are generated (requests/s).
-        burstiness (optional): 
-            The burstiness factor of the request generation. 
+        burstiness (optional):
+            The burstiness factor of the request generation.
             Only takes effect when request_rate is not inf.
             Default value is 1, which follows a Poisson process.
             Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results 
-            in more bursty requests, while a higher burstiness value 
+            A lower burstiness value (0 < burstiness < 1) results
+            in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
     """
     input_requests = iter(input_requests)
@@ -787,6 +791,23 @@ def parse_goodput(slo_pairs):
     return goodput_config_dict
 
 
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any],
+                                     file_name: str) -> None:
+    metrics = ["ttft", "tpot", "itl", "e2el"]
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={k: results[k]
+                 for k in metrics},
+        extra_info={k: results[k]
+                    for k in results if k not in metrics})
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -959,6 +980,7 @@ def main(args: argparse.Namespace):
             file_name = os.path.join(args.result_dir, file_name)
         with open(file_name, "w", encoding='utf-8') as outfile:
             json.dump(result_json, outfile)
+        save_to_pytorch_benchmark_format(args, result_json, file_name)
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 40d2c0daa64..9f7d6103202 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -8,7 +8,7 @@
 import sys
 import time
 from functools import cache
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import uvloop
@@ -29,7 +29,7 @@
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from benchmark_utils import save_to_pytorch_benchmark_format
+from benchmark_utils import convert_to_pytorch_benchmark_format
 
 
 @dataclasses.dataclass
@@ -343,6 +343,25 @@ def run_mii(
     return end - start
 
 
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: Dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={
+            "requests_per_second": [results["requests_per_second"]],
+            "tokens_per_second": [results["tokens_per_second"]],
+        },
+        extra_info={
+            k: results[k]
+            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+        })
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -440,22 +459,7 @@ def main(args: argparse.Namespace):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
-
-        pt_records = save_to_pytorch_benchmark_format(
-            args=args,
-            metrics={
-                "requests_per_second": results["requests_per_second"],
-                "tokens_per_second": results["tokens_per_second"],
-            },
-            extra_info={
-                k: results[k]
-                for k in ["elapsed_time", "num_requests", "total_num_tokens"]
-            })
-        if pt_records:
-            # Don't use json suffix here as we don't want CI to pick it up
-            pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
-            with open(pt_file, "w") as f:
-                json.dump(pt_records, f)
+        save_to_pytorch_benchmark_format(args, results)
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 90bb6a5dcc2..6f01cf20e17 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -5,9 +5,9 @@
 from typing import Any, Dict, List
 
 
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     metrics: Dict[str, List],
-                                     extra_info: Dict[str, Any]) -> List:
+def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                        metrics: Dict[str, List],
+                                        extra_info: Dict[str, Any]) -> List:
     """
     Save the benchmark results in the format used by PyTorch OSS benchmark with
-    on metric per record
+    one metric per record

From 19f436d590dad413ca9a95e4b57061e0a472eca6 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 12 Feb 2025 00:20:50 -0800
Subject: [PATCH 10/17] There are only ttfts and itls

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 888aa2bc470..05a442b91b5 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -794,7 +794,7 @@ def parse_goodput(slo_pairs):
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                      results: Dict[str, Any],
                                      file_name: str) -> None:
-    metrics = ["ttft", "tpot", "itl", "e2el"]
+    metrics = ["ttfts", "itls"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: results[k]

From 6874d6596807e522cfb58ea49fced4cf57edc47a Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 12 Feb 2025 00:45:54 -0800
Subject: [PATCH 11/17] Ignore some raw metrics in serving benchmark

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_serving.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 05a442b91b5..d3024aacd78 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -794,13 +794,18 @@ def parse_goodput(slo_pairs):
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                      results: Dict[str, Any],
                                      file_name: str) -> None:
-    metrics = ["ttfts", "itls"]
+    metrics = ["median_ttft_ms", "median_itl_ms"]
+    # These raw data might be useful, but they are rather big. They can be added
+    # later if needed
+    ignored_metrics = ["ttfts", "itls", "generated_texts"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: results[k]
                  for k in metrics},
-        extra_info={k: results[k]
-                    for k in results if k not in metrics})
+        extra_info={
+            k: results[k]
+            for k in results if k not in metrics and k not in ignored_metrics
+        })
     if pt_records:
         # Don't use json suffix here as we don't want CI to pick it up
         pt_file = f"{os.path.splitext(file_name)[0]}.pytorch"

From 826887cc4307289b9b70c481fea8afbe60628dc9 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 12 Feb 2025 01:01:25 -0800
Subject: [PATCH 12/17] Another typo

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index d3024aacd78..1bd3372aa47 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -800,7 +800,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     ignored_metrics = ["ttfts", "itls", "generated_texts"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
-        metrics={k: results[k]
+        metrics={k: [results[k]]
                  for k in metrics},
         extra_info={
             k: results[k]

From 42288cba68807b0f9f53031afdf0299585817178 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 14 Feb 2025 11:35:38 -0800
Subject: [PATCH 13/17] Add the rest of serving metrics

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_serving.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 1bd3372aa47..9c57bc36617 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -794,10 +794,14 @@ def parse_goodput(slo_pairs):
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                      results: Dict[str, Any],
                                      file_name: str) -> None:
-    metrics = ["median_ttft_ms", "median_itl_ms"]
+    metrics = [
+        "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
+        "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+    ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts"]
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]

From 61ca7c2cfaf26346a4c51d8787a7d9c54871d9b4 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 14 Feb 2025 23:43:20 -0800
Subject: [PATCH 14/17] Remove redundant sys.path

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py    | 5 +----
 benchmarks/benchmark_serving.py    | 2 --
 benchmarks/benchmark_throughput.py | 5 +----
 3 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index f337deb83bb..21c9929a5b2 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -5,13 +5,13 @@
 import dataclasses
 import json
 import os
-import sys
 import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 import numpy as np
 import torch
+from benchmark_utils import convert_to_pytorch_benchmark_format
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
@@ -20,9 +20,6 @@
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from benchmark_utils import convert_to_pytorch_benchmark_format
-
 
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                      results: Dict[str, Any]) -> None:
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 9c57bc36617..1077caa621d 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -31,7 +31,6 @@
 import json
 import os
 import random
-import sys
 import time
 import warnings
 from dataclasses import dataclass
@@ -56,7 +55,6 @@
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 from benchmark_utils import convert_to_pytorch_benchmark_format
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 9f7d6103202..604da40fe7f 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -5,13 +5,13 @@
 import json
 import os
 import random
-import sys
 import time
 from functools import cache
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import uvloop
+from benchmark_utils import convert_to_pytorch_benchmark_format
 from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
@@ -28,9 +28,6 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
-sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from benchmark_utils import convert_to_pytorch_benchmark_format
-
 
 @dataclasses.dataclass
 class SampleRequest:

From a4e24e489c50ca15472b299e4b70cc8b12d5dc07 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 15 Feb 2025 00:25:40 -0800
Subject: [PATCH 15/17] Use 127.0.0.1 for ipv4

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_serving.py        | 2 +-
 benchmarks/benchmark_serving_guided.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 46616aa7ecf..f31784ebe66 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1043,7 +1043,7 @@ def main(args: argparse.Namespace):
         default=None,
         help="Server or API base url if not using http host and port.",
     )
-    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(
         "--endpoint",
diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
index 561e500d8b6..ec085960f7b 100644
--- a/benchmarks/benchmark_serving_guided.py
+++ b/benchmarks/benchmark_serving_guided.py
@@ -731,7 +731,7 @@ def main(args: argparse.Namespace):
         default=None,
         help="Server or API base url if not using http host and port.",
     )
-    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(
         "--endpoint",

From fd8fc67535599b8cfa1b0f2b6ad41a2fccb0b2d8 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 15 Feb 2025 00:43:48 -0800
Subject: [PATCH 16/17] Using .pytorch is weird, let's just use .json

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_latency.py    | 3 +--
 benchmarks/benchmark_serving.py    | 2 +-
 benchmarks/benchmark_throughput.py | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 21c9929a5b2..b041626550b 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -29,8 +29,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
         extra_info={k: results[k]
                     for k in ["avg_latency", "percentiles"]})
     if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
         with open(pt_file, "w") as f:
             json.dump(pt_records, f)
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index f31784ebe66..e94357f6a91 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -840,7 +840,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
         })
     if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
+        # Keep the PyTorch-format records in a separate .pytorch.json file
-        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch"
+        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
         with open(pt_file, "w") as f:
             json.dump(pt_records, f)
 
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 604da40fe7f..f7d87f1b336 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -354,7 +354,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
         })
     if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
+        # Keep the PyTorch-format records in a separate .pytorch.json file
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch"
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
         with open(pt_file, "w") as f:
             json.dump(pt_records, f)
 

From 85910f50d56d52720edab2c0aa87f5a7e1e8a9c0 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 15 Feb 2025 23:00:06 -0800
Subject: [PATCH 17/17] Add a comment

Signed-off-by: Huy Do <huydhn@gmail.com>
---
 benchmarks/benchmark_serving.py        | 1 +
 benchmarks/benchmark_serving_guided.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index e94357f6a91..9760737ccec 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1043,6 +1043,7 @@ def main(args: argparse.Namespace):
         default=None,
         help="Server or API base url if not using http host and port.",
     )
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
     parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(
diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
index ec085960f7b..04942b06ffd 100644
--- a/benchmarks/benchmark_serving_guided.py
+++ b/benchmarks/benchmark_serving_guided.py
@@ -731,6 +731,7 @@ def main(args: argparse.Namespace):
         default=None,
         help="Server or API base url if not using http host and port.",
     )
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
     parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(