
Commit 61125a8

update benchmarking guide with latest results with vllm v1 (#559)
* update benchmarking guide with latest results with vllm v1
* update graph
1 parent 16ded66 · commit 61125a8

3 files changed: +4 -4 lines changed

site-src/performance/benchmark/index.md

+3 -4
@@ -45,7 +45,7 @@ The LPG benchmark tool works by sending traffic to the specified target IP and p
 # Get gateway IP
 GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
 # Get LoadBalancer k8s service IP
-SVC_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+SVC_IP=$(kubectl get service/vllm-llama2-7b -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
 
 echo $GW_IP
 echo $SVC_IP
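Before kicking off a run, it can help to confirm the gateway address actually routes. A minimal smoke-test sketch, assuming the gateway listener is on port 80 and fronts an OpenAI-compatible `/v1/completions` endpoint; the port and model name are illustrative, not part of this commit:

```bash
# Hypothetical smoke test against the gateway IP fetched above.
# Port 80 and the model name are assumptions; use the values from your
# own gateway listener and InferenceModel configuration.
curl -i "http://${GW_IP}:80/v1/completions" \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Say hello", "max_tokens": 10}'
```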
@@ -62,8 +62,7 @@ the script below will watch for that log line and then start downloading results
 ```bash
 benchmark_id='my-benchmark' ./tools/benchmark/download-benchmark-results.bash
 ```
-
-1. After the script finishes, you should see benchmark results under `./tools/benchmark/output/default-run/my-benchmark/results/json` folder.
+1. After the script finishes, you should see benchmark results under `./tools/benchmark/output/default-run/my-benchmark/results/json` folder. Here is a [sample json file](./sample.json).
 
 ### Tips
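The linked sample shows the shape of each result file, so the headline numbers can be pulled out with `jq` before opening the notebook. A minimal sketch, assuming the files land under the folder named above; the key names follow the sample.json added in this commit, and the latency values there appear to be milliseconds:

```bash
# Summarize the key metrics of every downloaded result file.
# Key names match sample.json from this commit; the glob path is illustrative.
for f in ./tools/benchmark/output/default-run/my-benchmark/results/json/*.json; do
  jq -c '{qps: .metrics.request_rate,
          throughput_rps: .metrics.throughput_rps,
          output_tokens_per_min: .metrics.output_tokens_per_min,
          p90_latency: .metrics.p90_latency}' "$f"
done
```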

@@ -93,6 +92,6 @@ This guide shows how to run the jupyter notebook using vscode.
 ```
 
 1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. At the end you should
-   see a bar chart like below:
+   see a bar chart like below where **"ie"** represents inference extension. This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json).
 
 ![alt text](example-bar-chart.png)
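The notebook can also be executed without vscode; a minimal sketch using jupyter's stock `nbconvert` runner (assumes the notebook's Python dependencies are already installed in the active environment):

```bash
# Run every cell headlessly and save an executed copy next to the original.
jupyter nbconvert --to notebook --execute \
  --output benchmark-executed.ipynb ./tools/benchmark/benchmark.ipynb
```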
site-src/performance/benchmark/sample.json

+1 -0
@@ -0,0 +1 @@
+{"metrics": {"num_prompts_attempted": 59999, "num_prompts_succeeded": 59999, "request_rate": 200.0, "server_metrics": {}, "benchmark_time": 377.69680404663086, "throughput_rps": 158.85757929948576, "throughput": 35786.07723228514, "total_output_token": 13516287, "output_tokens_per_min": 2147164.6339371083, "total_input_tokens": 15092072, "input_tokens_per_min": 2397490.024533549, "total_tokens": 28608359, "tokens_per_min": 4544654.658470658, "avg_per_token_latency": 0.038136584066158385, "median_per_token_latency": 0.03260710797991071, "sd_per_token_latency": 0.039995399094383204, "min_per_token_latency": 0.00010268625128206123, "max_per_token_latency": 0.8718070238828659, "p90_per_token_latency": 0.07052694590421603, "p99_per_token_latency": 0.19175863699585777, "avg_latency": 13490.14784723948, "median_latency": 10904.660940170288, "sd_latency": 10759.461472867813, "min_latency": 53.10511589050293, "max_latency": 55610.99076271057, "p90_latency": 28706.796979904175, "p99_latency": 45658.41965198513, "avg_per_output_token_latency": 148.97623456610614, "median_per_output_token_latency": 60.334928053662296, "sd_per_output_token_latency": 232.28505133364948, "min_per_output_token_latency": 7.44791825612386, "max_per_output_token_latency": 3108.849883079529, "p90_per_output_token_latency": 393.8944477023501, "p99_per_output_token_latency": 1193.081065813697, "avg_input_len": 251.53872564542743, "median_input_len": 109.0, "sd_input_len": 281.6475735479433, "min_input_len": 4.0, "max_input_len": 1024.0, "p90_input_len": 714.0, "p99_input_len": 987.0, "avg_output_len": 225.27520458674311, "median_output_len": 144.0, "sd_output_len": 234.48900674005114, "min_output_len": 3.0, "max_output_len": 1025.0, "p90_output_len": 564.0, "p99_output_len": 948.0, "ClientConnectorError": 0, "TimeoutError": 0, "ContentTypeError": 1, "ClientOSError": 0, "ServerDisconnectedError": 0, "unknown_error": 0}, "dimensions": {"date": "20250328-043623", "backend": "vllm", "model_id": "meta-llama/Llama-2-7b-hf", "tokenizer_id": "meta-llama/Llama-2-7b-hf"}, "config": {"model": "meta-llama/Llama-2-7b-hf", "num_models": 1, "model_server": "vllm", "start_time": {"seconds": 1743136583, "nanos": 238149000}}, "summary_stats": {"stats": [{"request_rate": 200.0, "request_latency": {"mean": 13490.14784723948, "median": 10904.660940170288, "sd": 10759.461472867813, "min": 53.10511589050293, "max": 55610.99076271057, "p90": 28706.796979904175, "p99": 45658.41965198513}, "throughput": {"mean": 35786.07723228514}, "input_length": {"mean": 251.53872564542743, "median": 109.0, "sd": 281.6475735479433, "min": 4.0, "max": 1024.0, "p90": 714.0, "p99": 987.0}, "output_length": {"mean": 225.27520458674311, "median": 144.0, "sd": 234.48900674005114, "min": 3.0, "max": 1025.0, "p90": 564.0, "p99": 948.0}, "tpot": {"mean": 148.97623456610614, "median": 60.334928053662296, "sd": 232.28505133364948, "min": 7.44791825612386, "max": 3108.849883079529, "p90": 393.8944477023501, "p99": 1193.081065813697}, "model_server_metrics": []}]}}
