Add instructions to run benchmarks #480

Merged · 5 commits · Mar 18, 2025
1 change: 1 addition & 0 deletions benchmark/README.md
@@ -0,0 +1 @@
This folder contains resources to run performance benchmarks. Please follow the benchmark guide at https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark.
358 changes: 358 additions & 0 deletions benchmark/benchmark.ipynb

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions benchmark/download-benchmark-results.bash
@@ -0,0 +1,30 @@
#!/bin/bash

# Downloads the benchmark result files from the benchmark tool pod.
download_benchmark_results() {
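  # Wait for the LPG tool to log "LPG_FINISHED", then copy each JSON result file out of the pod.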
until kubectl logs deployment/benchmark-tool -n ${namespace} | grep -q -m 1 "LPG_FINISHED"; do sleep 30; done
benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
echo "Downloading JSON results from pod ${benchmark_pod}"
kubectl exec ${benchmark_pod} -n ${namespace} -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json
for f in $(kubectl exec ${benchmark_pod} -n ${namespace} -- ls -1 | grep json); do
echo "Downloading json file ${f}"
kubectl cp -n ${namespace} ${benchmark_pod}:$f ${benchmark_output_dir}/results/json/$f;
done
}

# Env vars to be passed when calling this script.
# The id of the benchmark. This identifies what the benchmark is for and determines
# the filepath where the results are saved, which the Jupyter notebook later uses to
# assign the benchmark_id as data labels for plotting.
benchmark_id=${benchmark_id:-"inference-extension"}
# run_id can be used to group different runs of the same benchmarks for comparison.
run_id=${run_id:-"default-run"}
namespace=${namespace:-"default"}
output_dir=${output_dir:-'output'}
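
# Example invocation (illustrative; adjust the values to your setup):
#   benchmark_id='k8s-svc' run_id='run1' namespace='default' ./download-benchmark-results.bash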

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id}

echo "Saving benchmark results to ${benchmark_output_dir}/results/json/"
download_benchmark_results
kubectl delete -f ${SCRIPT_DIR}/../config/manifests/benchmark/benchmark.yaml
3 changes: 3 additions & 0 deletions benchmark/requirements.txt
@@ -0,0 +1,3 @@
pandas
numpy
matplotlib
60 changes: 60 additions & 0 deletions config/manifests/benchmark/benchmark.yaml
@@ -0,0 +1,60 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: benchmark-tool
name: benchmark-tool
spec:
replicas: 1
selector:
matchLabels:
app: benchmark-tool
template:
metadata:
labels:
app: benchmark-tool
spec:
containers:
# The following image was built from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/07628c9fe01b748f5a4cc9e5c2ee4234aaf47699
- image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438'
imagePullPolicy: Always
name: benchmark-tool
command:
- bash
- -c
- ./latency_throughput_curve.sh
env:
- name: IP
value: '<target-ip>'
- name: REQUEST_RATES
value: '10,20,30'
- name: BENCHMARK_TIME_SECONDS
value: '60'
- name: TOKENIZER
value: 'meta-llama/Llama-2-7b-hf'
- name: MODELS
value: 'meta-llama/Llama-2-7b-hf'
- name: BACKEND
value: vllm
- name: PORT
value: "8081"
- name: INPUT_LENGTH
value: "1024"
- name: OUTPUT_LENGTH
value: '2048'
- name: FILE_PREFIX
value: benchmark
- name: PROMPT_DATASET_FILE
value: ShareGPT_V3_unfiltered_cleaned_split.json
- name: HF_TOKEN
valueFrom:
secretKeyRef:
key: token
name: hf-token
resources:
limits:
cpu: "2"
memory: 20Gi
requests:
cpu: "2"
memory: 20Gi
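
Note that this manifest reads the Hugging Face token from a Secret named `hf-token` with key `token`. If you did not already create it while following the prerequisite guide, one way to create it (assuming your token is in the `HF_TOKEN` environment variable) is:

```bash
kubectl create secret generic hf-token --from-literal=token=${HF_TOKEN}
```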
12 changes: 12 additions & 0 deletions config/manifests/benchmark/model-server-service.yaml
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: my-pool-service
spec:
ports:
- port: 8081
protocol: TCP
targetPort: 8000
selector:
app: my-pool
type: LoadBalancer
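
This Service manifest is an alternative to the `kubectl expose` command used in the benchmark guide below; it assumes your model server pods carry the `app: my-pool` label, so adjust the selector to match your deployment before applying it:

```bash
kubectl apply -f ./config/manifests/benchmark/model-server-service.yaml
```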
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -59,6 +59,8 @@ nav:
- Adapter Rollout: guides/adapter-rollout.md
- Metrics: guides/metrics.md
- Implementer's Guide: guides/implementers.md
- Performance:
- Benchmark: performance/benchmark/index.md
- Reference:
- API Reference: reference/spec.md
- API Types:
98 changes: 98 additions & 0 deletions site-src/performance/benchmark/index.md
@@ -0,0 +1,98 @@
# Benchmark

This user guide shows how to run benchmarks against a vLLM deployment, using both the Gateway API
inference extension and a plain Kubernetes Service as the load-balancing strategy. The
benchmark uses the [Latency Profile Generator](https://github.com/AI-Hypercomputer/inference-benchmark) (LPG)
tool to generate load and collect results.

## Prerequisites

### Deploy the inference extension and sample model server

Follow [this user guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/) to deploy the
sample vLLM application and the inference extension.

### [Optional] Scale the sample vLLM deployment

You are more likely to see the benefits of the inference extension when there is a decent number of replicas, since more replicas give the extension room to make optimal routing decisions.

```bash
kubectl scale --replicas=8 -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
```

### Expose the model server via a k8s service

As a baseline, let's also expose the vLLM deployment as a k8s service:

```bash
kubectl expose -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --port=8081 --target-port=8000 --type=LoadBalancer
```
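
Before benchmarking this baseline, you can check that the Service has been assigned an external IP. The service name is whatever `kubectl expose` derived from the deployment manifest, so the simplest check is to list all services and wait for the `EXTERNAL-IP` column to populate:

```bash
kubectl get services --watch
```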

## Run benchmark

The LPG benchmark tool works by sending traffic to the specified target IP and port and collecting results. Follow the steps below to run a single benchmark. You can deploy multiple LPG instances if you want to run benchmarks in parallel against different targets.

1. Check out the repo.

```bash
git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
cd gateway-api-inference-extension
```

1. Get the target IP. Examples below show how to get the IP of a gateway or a LoadBalancer k8s service.

```bash
# Get gateway IP
GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
# Get LoadBalancer k8s service IP. Replace <service-name> with the name of the
# Service created by the kubectl expose command above.
SVC_IP=$(kubectl get service/<service-name> -o jsonpath='{.status.loadBalancer.ingress[0].ip}')

echo $GW_IP
echo $SVC_IP
```

1. Update the `<target-ip>` in `./config/manifests/benchmark/benchmark.yaml` to your target IP. Feel free to adjust other parameters such as `REQUEST_RATES` as well. For a complete list of LPG configurations, please refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark).

1. Start the benchmark tool. `kubectl apply -f ./config/manifests/benchmark/benchmark.yaml`

1. Wait for the benchmark to finish and download the results. Use the `benchmark_id` environment variable
to specify what this benchmark is for, for instance `inference-extension` or `k8s-svc`. When the LPG tool finishes benchmarking, it prints a log line `LPG_FINISHED`;
the script below watches for that log line and then starts downloading the results (see the combined sketch after this list for benchmarking both targets).

```bash
benchmark_id='my-benchmark' ./benchmark/download-benchmark-results.bash
```

1. After the script finishes, you should see the benchmark results in the `./benchmark/output/default-run/my-benchmark/results/json` folder.
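
Putting the steps above together, here is a minimal sketch of how you might benchmark both targets for a side-by-side comparison. It assumes `GW_IP` and `SVC_IP` were captured as shown in step 2, and uses `sed` to fill in `<target-ip>` instead of editing the manifest by hand; the shared `run_id` groups both results for the notebook.

```bash
# Benchmark the inference extension (gateway) target.
sed "s/<target-ip>/${GW_IP}/" ./config/manifests/benchmark/benchmark.yaml | kubectl apply -f -
benchmark_id='inference-extension' run_id='run1' ./benchmark/download-benchmark-results.bash

# Benchmark the plain k8s Service baseline.
sed "s/<target-ip>/${SVC_IP}/" ./config/manifests/benchmark/benchmark.yaml | kubectl apply -f -
benchmark_id='k8s-svc' run_id='run1' ./benchmark/download-benchmark-results.bash
```

The download script deletes the benchmark Deployment when it finishes, so the second `kubectl apply` starts a clean run.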

### Tips

* You can set the `run_id="runX"` environment variable when running the `./download-benchmark-results.bash` script.
This is useful when you run benchmarks multiple times to get more statistically meaningful results and want to group them accordingly.
* Adjust `REQUEST_RATES` to values that best suit your benchmark environment.

### Advanced Benchmark Configurations

Please refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark) for a detailed list of configuration knobs.

## Analyze the results

This guide shows how to run the Jupyter notebook using VS Code.

1. Create a python virtual environment.

```bash
python3 -m venv .venv
source .venv/bin/activate
```

1. Install the dependencies.

```bash
pip install -r ./benchmark/requirements.txt
```

1. Open the notebook `./benchmark/benchmark.ipynb` and run each cell. At the end you should
see a bar chart like the one below:

![Benchmark results bar chart](example-bar-chart.png)
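
If you prefer classic Jupyter to VS Code, a minimal alternative (assuming you are still inside the virtual environment created above) is:

```bash
pip install notebook
jupyter notebook ./benchmark/benchmark.ipynb
```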