pre-commit fix and throughput test fixes

louie-tsai · louie-tsai · commit 72d890c3048b · 2025-06-02T14:51:02.000-07:00
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -75,6 +75,7 @@ def results_to_json(latency, throughput, serving):
         }
     )
 
+
 def get_size(bytes, suffix="B"):
     """
     Scale bytes to its proper format
@@ -88,6 +89,7 @@ def get_size(bytes, suffix="B"):
             return f"{bytes:.2f}{unit}{suffix}"
         bytes /= factor
 
+
 if __name__ == "__main__":
     # collect results
     for test_file in results_folder.glob("*.json"):
@@ -168,20 +170,23 @@ def get_size(bytes, suffix="B"):
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)
 
-    from cpuinfo import get_cpu_info
+    import pandas as pd
     import psutil
+    from cpuinfo import get_cpu_info
     from numa import info
-    import pandas as pd
+
     svmem = psutil.virtual_memory()
     numa_size = info.get_num_configured_nodes()
     platform_data = {
-        'CPU Brand': [get_cpu_info()['brand_raw']],
-        'Physical cores': [psutil.cpu_count(logical=False)],
-        'Total cores': [psutil.cpu_count(logical=True)],
-        'Total Memory': [get_size(svmem.total)],
-        'Total NUMA nodes': [numa_size]
+        "CPU Brand": [get_cpu_info()["brand_raw"]],
+        "Physical cores": [psutil.cpu_count(logical=False)],
+        "Total cores": [psutil.cpu_count(logical=True)],
+        "Total Memory": [get_size(svmem.total)],
+        "Total NUMA nodes": [numa_size],
     }
-    platform_results = pd.DataFrame.from_dict(platform_data, orient='index', columns=['Platform Info'])
+    platform_results = pd.DataFrame.from_dict(
+        platform_data, orient="index", columns=["Platform Info"]
+    )
 
     raw_results_json = results_to_json(
         latency_results, throughput_results, serving_results
@@ -228,10 +233,9 @@ def get_size(bytes, suffix="B"):
     throughput_md_table = tabulate(
         throughput_results, headers="keys", tablefmt="pipe", showindex=False
     )
-    platform_md_table = tabulate(platform_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=True)
+    platform_md_table = tabulate(
+        platform_results, headers="keys", tablefmt="pipe", showindex=True
+    )
 
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -272,7 +272,7 @@ run_throughput_tests() {
       fi
     fi
 
-    throughput_command="python3 benchmark_throughput.py \
+    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
       --output-json $RESULTS_FOLDER/${test_name}.json \
       $throughput_args"
 
@@ -329,7 +329,7 @@ run_serving_tests() {
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
-    # check if there is enough resouces to run the test
+    # check if there is enough resources to run the test
     tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
     if [ "$ON_CPU" == "1" ];then
       if [[ $numa_count -lt $tp ]]; then
@@ -455,7 +455,7 @@ main() {
 
   # postprocess benchmarking results
   pip install tabulate pandas
-  ON_CPU=$ON_CPU python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
 
   upload_to_buildkite
 }
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -16,35 +16,19 @@
         }
     },
     {
-        "test_name": "latency_llama70B_tp4",
+        "test_name": "latency_llama8B_tp4",
         "environment_variables": {
             "VLLM_RPC_TIMEOUT": 1000000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 600,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15
-        }
-    },
-    {
-        "test_name": "latency_mixtral8x7B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 1000000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 600,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15
+            "num_iters_warmup": 5,
+            "num_iters": 15
         }
     }
 ]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@@ -45,7 +45,7 @@
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "True",
+	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy"
@@ -77,7 +77,7 @@
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
-	    "enable_chunked_prefill": "True",
+	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
             "disable_log_requests": "",
             "load_format": "dummy"
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@@ -17,37 +17,20 @@
         }
     },
     {
-        "test_name": "throughput_llama70B_tp4",
+        "test_name": "throughput_llama8B_tp4",
         "environment_variables": {
             "VLLM_RPC_TIMEOUT": 1000000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 600,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
         },
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "backend": "vllm"
         }
-    },
-    {
-        "test_name": "throughput_mixtral8x7B_tp2",
-        "environment_variables": {
-            "VLLM_RPC_TIMEOUT": 1000000,
-	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 600,
-	    "VLLM_CPU_KVCACHE_SPACE": 40
-        },
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
     }
 ]