Add instructions to run benchmarks #480

Merged · 5 commits · Mar 18, 2025
358 changes: 358 additions & 0 deletions benchmark/Inference_Extension_Benchmark.ipynb

Large diffs are not rendered by default.

104 changes: 104 additions & 0 deletions benchmark/README.md
@@ -0,0 +1,104 @@
# Benchmark

This user guide shows how to run benchmarks against a vLLM deployment, using both the Gateway API
inference extension and a plain Kubernetes Service as the load-balancing strategy. The
benchmark uses the [Latency Profile Generator](https://github.com/AI-Hypercomputer/inference-benchmark) (LPG)
tool to generate load and collect results.

## Prerequisites

### Deploy the inference extension and sample model server

Follow this user guide https://gateway-api-inference-extension.sigs.k8s.io/guides/ to deploy the
sample vLLM application and the inference extension.
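Before benchmarking, it is worth a quick sanity check that the sample resources are up. A minimal sketch, assuming the guide created the `my-pool` vLLM Deployment and the `inference-gateway` Gateway used by the commands below:

```bash
# Verify the sample model server and the gateway are ready before benchmarking.
kubectl get deployment my-pool
kubectl get gateway inference-gateway
```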

### [Optional] Scale the sample vLLM deployment

You are more likely to see the benefits of the inference extension when there are a decent number of replicas over which it can make optimal routing decisions.

```bash
kubectl scale deployment my-pool --replicas=8
```

### Expose the model server via a k8s service

As the baseline, also expose the vLLM deployment as a regular Kubernetes Service by applying the manifest:

```bash
kubectl apply -f ./manifests/ModelServerService.yaml
```

## Run benchmark

### Run benchmark using the inference extension as the load balancing strategy

1. Get the gateway IP:

```bash
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
echo "Update the <gateway-ip> in ./manifests/BenchmarkInferenceExtension.yaml to: $IP"
```

2. Update the `<gateway-ip>` in `./manifests/BenchmarkInferenceExtension.yaml` to the IP
   of the gateway (or substitute it with `sed`, as sketched after this list). Feel free to adjust other parameters such as `request_rates` as well.

3. Start the benchmark tool: `kubectl apply -f ./manifests/BenchmarkInferenceExtension.yaml`

4. Wait for the benchmark to finish and download the results. Use the `benchmark_id` environment variable
   to specify what this benchmark is for. In this case, the result is for the `inference-extension`. You
   can use any id you like.

```bash
benchmark_id='inference-extension' ./download-benchmark-results.bash
```

5. After the script finishes, you should see the benchmark results under the `./output/default-run/inference-extension/results/json` folder.
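
If you prefer not to edit the manifest by hand, steps 1-3 can be combined. A minimal sketch, assuming GNU `sed` and the manifest path used above:

```bash
# Fetch the gateway IP, substitute the <gateway-ip> placeholder in place, and start the benchmark tool.
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
sed -i "s/<gateway-ip>/${IP}/" ./manifests/BenchmarkInferenceExtension.yaml
kubectl apply -f ./manifests/BenchmarkInferenceExtension.yaml
```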

### Run benchmark using k8s service as the load balancing strategy

1. Get the service IP:

```bash
IP=$(kubectl get service/my-pool-service -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
echo "Update the <svc-ip> in ./manifests/BenchmarkK8sService.yaml to: $IP"
```

2. Update the `<svc-ip>` in `./manifests/BenchmarkK8sService.yaml` to the IP
   of the service. Feel free to adjust other parameters such as `request_rates` as well.

3. Start the benchmark tool: `kubectl apply -f ./manifests/BenchmarkK8sService.yaml`

4. Wait for the benchmark to finish and download the results.

```bash
benchmark_id='k8s-svc' ./download-benchmark-results.bash
```

5. After the script finishes, you should see the benchmark results under the `./output/default-run/k8s-svc/results/json` folder.
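
If you want to check on a run before downloading, the benchmark container logs an `LPG_FINISHED` marker when it is done; this is the same marker `download-benchmark-results.bash` waits for. A quick check, assuming the default namespace:

```bash
# Prints the marker line once the run has completed; otherwise reports it is still running.
kubectl logs deployment/benchmark-tool | grep LPG_FINISHED || echo "benchmark still running"
```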

### Tips

* You can set the `run_id="runX"` environment variable when running the `./download-benchmark-results.bash` script.
This is useful when you run benchmarks multiple times and want to group the results accordingly.
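
  For example, to keep a second round of runs separate from the first (the ids here are just examples):

  ```bash
  run_id='run2' benchmark_id='inference-extension' ./download-benchmark-results.bash
  ```

  The results then land under `./output/run2/inference-extension/results/json`.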

## Analyze the results

This section shows how to run the Jupyter notebook, for example from VS Code.

1. Create a Python virtual environment.

```bash
python3 -m venv .venv
source .venv/bin/activate
```

2. Install the dependencies.

```bash
pip install -r requirements.txt
```

3. Open the notebook `Inference_Extension_Benchmark.ipynb` and run each cell. At the end you should
   see a bar chart like the one below:

![Benchmark results bar chart](image.png)
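
If you prefer to explore the data outside the notebook, here is a minimal sketch of loading the downloaded JSON files with pandas. The metric key names (`request_rate`, `avg_latency_ms`) are placeholders; check the actual keys in your result files and in the notebook:

```python
import glob
import json

import pandas as pd

rows = []
# Each benchmark_id directory (e.g. inference-extension, k8s-svc) holds one JSON file per request rate.
for path in glob.glob("./output/default-run/*/results/json/*.json"):
    with open(path) as f:
        data = json.load(f)
    rows.append({
        "benchmark_id": path.split("/")[3],           # directory name used as the data label
        "request_rate": data.get("request_rate"),      # placeholder key, adjust to your files
        "avg_latency_ms": data.get("avg_latency_ms"),  # placeholder key, adjust to your files
    })

df = pd.DataFrame(rows).sort_values(["benchmark_id", "request_rate"])
print(df)
```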
29 changes: 29 additions & 0 deletions benchmark/download-benchmark-results.bash
@@ -0,0 +1,29 @@
#!/bin/bash

# Downloads the benchmark result files from the benchmark tool pod.
download_benchmark_results() {
  # Wait until the benchmark tool logs the completion marker.
  until kubectl logs deployment/benchmark-tool -n ${namespace} | grep -q "LPG_FINISHED"; do sleep 30; done
  benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
  echo "Downloading JSON results from pod ${benchmark_pod}"
  # Remove the prompt dataset so it is not picked up as a result file.
  kubectl exec ${benchmark_pod} -n ${namespace} -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json
  for f in $(kubectl exec ${benchmark_pod} -n ${namespace} -- /bin/sh -c 'ls -1' | grep json); do
    echo "Downloading json file ${f}"
    kubectl cp -n ${namespace} ${benchmark_pod}:${f} ${benchmark_output_dir}/results/json/${f}
  done
}

# Env vars to be passed when calling this script.
# The id of the benchmark, used to identify what the benchmark is for.
# It determines the filepath where results are saved, which the Jupyter notebook later uses to assign
# the benchmark_id as data labels for plotting.
benchmark_id=${benchmark_id:-"inference-extension"}
# run_id can be used to group different runs of the same benchmarks for comparison.
run_id=${run_id:-"default-run"}
namespace=${namespace:-"default"}
output_dir=${output_dir:-'output'}

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id}

echo "Saving benchmark results to ${benchmark_output_dir}/results/json/"
download_benchmark_results
Binary file added benchmark/image.png
60 changes: 60 additions & 0 deletions benchmark/manifests/BenchmarkInferenceExtension.yaml
@@ -0,0 +1,60 @@
apiVersion: apps/v1
kind: Deployment
Contributor:

@achandrasekar How would one start another run? Should we use a Job here instead, something that runs to completion?

Contributor Author:

I thought about this as well. A Deployment is convenient in that it keeps the pod running so we can download the result files from it; otherwise we would need to set up persistent storage such as S3 or GCS, and not every user has access to those. This also aligns with the user guide of the LPG tool.

Reviewer:

You can give users the option to export the results to S3 or GCS in the Job.

Reviewer:

I think the pod/job/files stay around after completion, so we should still be able to download the results?

Contributor Author:

> You can give users the option to export the results to S3 or GCS in the Job.

I took the approach that requires minimal dependencies. Yes, using a persistent volume such as S3 works as well, but it requires additional configuration. We can add that option later.

> I think the pod/job/files stay around after completion, so we should still be able to download the results?

You will need some persistent volume for that.

I updated the download-benchmark-result.sh script to tear down the deployment after it downloads the results.
metadata:
  labels:
    app: benchmark-tool
  name: benchmark-tool
spec:
  replicas: 1
  selector:
    matchLabels:
      app: benchmark-tool
  template:
    metadata:
      labels:
        app: benchmark-tool
    spec:
      containers:
      - image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438'
        imagePullPolicy: Always
        name: benchmark-tool
        command:
        - bash
        - -c
        - ./latency_throughput_curve.sh
        env:
        - name: IP
          value: '<gateway-ip>'
          # value: 'envoy-default-inference-gateway-6454a873.envoy-gateway-system.svc.cluster.local'
        - name: REQUEST_RATES
          value: '40,80,120,160,200'
        - name: BENCHMARK_TIME_SECONDS
          value: '60'
        - name: TOKENIZER
          value: 'meta-llama/Llama-2-7b-hf'
        - name: MODELS
          value: 'meta-llama/Llama-2-7b-hf'
        - name: BACKEND
          value: vllm
        - name: PORT
          value: "8081"
        - name: INPUT_LENGTH
          value: "1024"
        - name: OUTPUT_LENGTH
          value: '2048'
        - name: FILE_PREFIX
          value: benchmark
        - name: PROMPT_DATASET_FILE
          value: ShareGPT_V3_unfiltered_cleaned_split.json
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              key: token
              name: hf-token
        resources:
          limits:
            cpu: "2"
            memory: 20Gi
          requests:
            cpu: "2"
            memory: 20Gi
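
Both benchmark manifests read the Hugging Face token from a Kubernetes Secret named `hf-token` (see the `secretKeyRef` above). If the getting-started guide did not already create it in your cluster, a minimal sketch, assuming your token is exported as `HF_TOKEN`:

```bash
# Create the hf-token Secret referenced by the benchmark Deployments.
kubectl create secret generic hf-token --from-literal=token=${HF_TOKEN}
```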
59 changes: 59 additions & 0 deletions benchmark/manifests/BenchmarkK8sService.yaml
@@ -0,0 +1,59 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: benchmark-tool
  name: benchmark-tool
spec:
  replicas: 1
  selector:
    matchLabels:
      app: benchmark-tool
  template:
    metadata:
      labels:
        app: benchmark-tool
    spec:
      containers:
      - image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438'
        imagePullPolicy: Always
        name: benchmark-tool
        command:
        - bash
        - -c
        - ./latency_throughput_curve.sh
        env:
        - name: IP
          value: 'my-pool-service.default.svc.cluster.local'
        - name: REQUEST_RATES
          value: '40,80,120,160,200'
        - name: BENCHMARK_TIME_SECONDS
          value: '60'
        - name: TOKENIZER
          value: 'meta-llama/Llama-2-7b-hf'
        - name: MODELS
          value: 'meta-llama/Llama-2-7b-hf'
        - name: BACKEND
          value: vllm
        - name: PORT
          value: "8081"
        - name: INPUT_LENGTH
          value: "1024"
        - name: OUTPUT_LENGTH
          value: '2048'
        - name: FILE_PREFIX
          value: benchmark
        - name: PROMPT_DATASET_FILE
          value: ShareGPT_V3_unfiltered_cleaned_split.json
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              key: token
              name: hf-token
        resources:
          limits:
            cpu: "2"
            memory: 20Gi
          requests:
            cpu: "2"
            memory: 20Gi
12 changes: 12 additions & 0 deletions benchmark/manifests/ModelServerService.yaml
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
  name: my-pool-service
spec:
  ports:
    - port: 8081
      protocol: TCP
      targetPort: 8000
  selector:
    app: my-pool
  type: LoadBalancer
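
Because the Service is of type `LoadBalancer`, the `.status.loadBalancer.ingress[0].ip` queried in the benchmark steps is only populated once the cloud provider has provisioned an address, which can take a minute or two. A quick way to watch for it:

```bash
# Wait until EXTERNAL-IP changes from <pending> to a real address.
kubectl get service my-pool-service -w
```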
3 changes: 3 additions & 0 deletions benchmark/requirements.txt
@@ -0,0 +1,3 @@
pandas
numpy
matplotlib