diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..ffd3ee7b --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1 @@ +This folder contains resources to run performance benchmarks. Pls follow the benchmark guide here https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark. \ No newline at end of file diff --git a/benchmark/benchmark.ipynb b/benchmark/benchmark.ipynb new file mode 100644 index 00000000..993279cb --- /dev/null +++ b/benchmark/benchmark.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "executionInfo": { + "elapsed": 391, + "status": "ok", + "timestamp": 1741734317446, + "user": { + "displayName": "Cong Liu", + "userId": "18222691451061354557" + }, + "user_tz": 420 + }, + "id": "ziJD5zt0c1Rt" + }, + "outputs": [], + "source": [ + "#@title Configuration. Edit this before running the rest.\n", + "\n", + "OUTPUT_DIR='output'\n", + "RUN_ID='example-run'\n", + "# Path to the benchmark dir under `gateway-api-inference-extension/benchmark`\n", + "BENCHMARK_DIR =\"./\"\n", + "# A regex to match the model name, which matches the output file name.\n", + "MODEL_MATCHER='.*llama.*'" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "executionInfo": { + "elapsed": 33, + "status": "ok", + "timestamp": 1741735749209, + "user": { + "displayName": "Cong Liu", + "userId": "18222691451061354557" + }, + "user_tz": 420 + }, + "id": "dB7xALgLawN-" + }, + "outputs": [], + "source": [ + "#@title Plot Helper\n", + "import os\n", + "import pandas as pd\n", + "import re\n", + "import json\n", + "from collections import OrderedDict\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import math\n", + "import logging\n", + "level = logging.INFO\n", + "logger = logging.getLogger(__name__)\n", + "logger.setLevel(level)\n", + "handler = logging.StreamHandler() # This sends output to the console\n", + "handler.setLevel(level) # Set handler level\n", + "logger.addHandler(handler)\n", + "\n", + "title_fontsize = 18\n", + "axis_label_fontsize = 18\n", + "legend_fontsize = 16\n", + "tick_label_fontsize = 14\n", + "\n", + "# Encapsulates some basic information needed to plot metrics.\n", + "class XY:\n", + " def __init__(self, x: str, y: str, x_label=None, y_label=None):\n", + " self.x = x\n", + " self.y = y\n", + " self.x_label = x if x_label is None else x_label\n", + " self.y_label = y if y_label is None else y_label\n", + "\n", + "NUM_PLOTS_PER_ROW = 4\n", + "# The arguments need to match the metric name fields generated by the benchmark tool.\n", + "CORE_METRICS = [\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'output_tokens_per_min'),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_per_output_token_latency\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_latency\"),\n", + "]\n", + "SANITY_CHECK_METRICS = [\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'benchmark_time'),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'throughput_rps'),\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'total_input_tokens'),\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'total_output_token'),\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'avg_input_len'),\n", + " XY(x = 'request_rate', x_label = 'QPS', y = 'avg_output_len'),\n", + "]\n", + "\n", + 
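"# INTERACTIVE_PLOT is used below as the default for plot_bar()'s `interactive` argument but is not\n",
+    "# defined anywhere else in this notebook; defining it here (False is an assumed default) avoids a\n",
+    "# NameError when the cell runs.\n",
+    "INTERACTIVE_PLOT = False\n",
+    "\n",
+    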
"class Label:\n", + " def __init__(self, name, alias=None):\n", + " self.name = name\n", + " self.alias = name if alias is None else alias\n", + "\n", + "ALL_METRICS = CORE_METRICS + SANITY_CHECK_METRICS\n", + "\n", + "class Plotter:\n", + " def __init__(self, run_id, labels=None, metrics=CORE_METRICS, num_plots_per_row=5, interactive=False, annotate=False, output_dir=OUTPUT_DIR):\n", + " self.run_id = run_id\n", + " self.labels = labels\n", + " self.metrics = metrics\n", + " self.num_plots_per_row = num_plots_per_row\n", + " self.interactive = interactive\n", + " self.annotate = annotate\n", + " self.output_dir = output_dir\n", + "\n", + " def withRunId(self, run_id):\n", + " return Plotter(run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", + "\n", + " def withLabels(self, labels):\n", + " return Plotter(self.run_id, labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", + "\n", + " def withMetrics(self, metrics):\n", + " return Plotter(self.run_id, self.labels, metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", + "\n", + " def withOutputDir(self, output_dir):\n", + " return Plotter(self.run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, output_dir)\n", + "\n", + " def plot_bar(self):\n", + " data = load_data(self.labels, self.run_id, self.output_dir)\n", + " groups = group_data(data, self.metrics)\n", + " logger.debug(\"Plotting run id...\")\n", + " plot_bar(self.labels, groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + "\n", + "def filepaths(root_dir):\n", + " \"\"\"\n", + " Recursively reads files within a directory and returns a list of file paths.\n", + " \"\"\"\n", + "\n", + " filepaths = []\n", + " for dirpath, dirnames, filenames in os.walk(root_dir):\n", + " for filename in filenames:\n", + " filepath = os.path.join(dirpath, filename)\n", + " filepaths.append(filepath)\n", + " return filepaths\n", + "\n", + "def flatten_server_metrics(server_metrics):\n", + " \"\"\"\n", + " Flattens the server metrics json to a single level.\n", + " \"\"\"\n", + " flattend = {}\n", + " for k, v in server_metrics.items():\n", + " if isinstance(v, dict):\n", + " for k2, v2 in v.items():\n", + " flattend[k + \".\" + k2] = v2\n", + "\n", + " return flattend\n", + "\n", + "def load_data(labels, run_id, output_dir=OUTPUT_DIR):\n", + " data_path =f\"{BENCHMARK_DIR}/{output_dir}/{run_id}\"\n", + " records = []\n", + " logger.debug(f\"Loading data for {data_path}\")\n", + " for file in filepaths(data_path):\n", + " for label in labels:\n", + " regex = f\".*\\/{label.name}\\/results/json/{MODEL_MATCHER}.json\"\n", + " logger.debug(f\"matching file {file} for regex {regex} and label {label}\")\n", + " if re.match(regex, file):\n", + " logger.debug(f\"found match file {file} for regex {regex} and label {label}\")\n", + " with open(file, 'r') as f:\n", + " raw_data = json.load(f)\n", + " sample_data = {\n", + " 'file_name': f.name,\n", + " 'label': label.alias,\n", + " **raw_data.get(\"metrics\",{}),\n", + " **flatten_server_metrics(raw_data.get(\"metrics\",{}).get(\"server_metrics\", {})),\n", + " }\n", + " sample_data['request_rate'] = sample_data['request_rate'] * raw_data['config']['num_models']\n", + " records.append(sample_data)\n", + " all_data = pd.DataFrame.from_records(records, index='file_name') if len(records) > 0 else pd.DataFrame()\n", + " return all_data\n", + "\n", + 
"def group_data(all_data, metrics=CORE_METRICS):\n", + " try:\n", + " data = all_data.sort_values(by=['request_rate'], ascending=True).copy().dropna()\n", + " except:\n", + " # print(\"No data found\")\n", + " return None\n", + "\n", + " # Ensure there is exactly one benchmark result per label and x-axis for each\n", + " # metric.\n", + " x_axes = set()\n", + " for m in metrics:\n", + " x_axes.add(m.x)\n", + "\n", + " for x in x_axes:\n", + " sizes = data.groupby(by=['label', x], dropna=True).size()\n", + " for index, v in sizes.items():\n", + " if v > 1:\n", + " label, _ = index\n", + " # print(f\"Multiple benchmark results for the same label ({label}), and x-axis ({x}). {index}: {v}. Please use more selective file filters.\")\n", + " # raise ValueError(f\"Multiple benchmark results for the same label ({label}), and x-axis ({x}). Please use more selective file filters.\")\n", + "\n", + " # Group by label.\n", + " groups = data.groupby(by=['label'],sort=True)\n", + " return groups\n", + "\n", + "def init_plot(metrics, num_plots_per_row=NUM_PLOTS_PER_ROW):\n", + " num_plots_per_row = min(num_plots_per_row, len(metrics))\n", + " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", + " fig, axes = plt.subplots(nrows=row_count, ncols=num_plots_per_row, figsize=(20, 5*row_count), tight_layout=True)\n", + " if row_count == 1 and num_plots_per_row == 1:\n", + " axes = [axes]\n", + " return fig, axes\n", + "\n", + "def plot_metrics(metrics, plot_func, num_plots_per_row=NUM_PLOTS_PER_ROW, fig=None, axes=None):\n", + " \"\"\"\n", + " plot_func: a function in the form of def plot_func(ax:~matplotlib.axes.Axes , m: XY):\n", + " \"\"\"\n", + " logger.debug(f'Plotting metrics: {metrics}')\n", + " num_plots_per_row = min(num_plots_per_row, len(metrics))\n", + " if fig is None or axes is None:\n", + " logger.debug(f'Creating new figure and axes')\n", + " fig, axes = init_plot(metrics, num_plots_per_row)\n", + " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", + " for i, m in enumerate(metrics):\n", + " row = math.floor(i/num_plots_per_row)\n", + " col = i%num_plots_per_row\n", + " if row_count == 1:\n", + " curAx = axes[col]\n", + " else:\n", + " curAx = axes[row, col]\n", + " plot_func(curAx, m)\n", + " return fig, axes\n", + "\n", + "def plot_bar(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=INTERACTIVE_PLOT, annotate=False):\n", + " labels = [label.alias for label in labels]\n", + " logger.debug(f'Prnting bar chart for {labels}')\n", + " logger.debug(f'groups: {groups}')\n", + " dataframes = []\n", + " for label in labels:\n", + " try:\n", + " dataframes.append(groups.get_group((label,)))\n", + " except:\n", + " logger.debug(f\"No data found for label {label}\")\n", + " continue\n", + " y_columns = [m.y for m in metrics]\n", + " logger.debug(f'y_columns: {y_columns}')\n", + " logger.debug(f'dataframes: {dataframes}')\n", + "\n", + " # 1. Combine all request rates\n", + " all_request_rates = set()\n", + " for df in dataframes:\n", + " all_request_rates.update(df['request_rate'].astype(int))\n", + " all_request_rates = sorted(list(all_request_rates))\n", + "\n", + " # 2. 
Prepare data for plotting: Create a nested dictionary\n", + " plot_data = {y_col: {label: {} for label in labels} for y_col in y_columns}\n", + "\n", + " for i, df in enumerate(dataframes):\n", + " label = labels[i]\n", + " df_dict = df.set_index('request_rate').to_dict()\n", + " for y_col in y_columns:\n", + " for request_rate in all_request_rates:\n", + " plot_data[y_col][label][request_rate] = df_dict.get(y_col, {}).get(request_rate, np.nan)\n", + "\n", + " logger.debug(f'Plot_data: {plot_data}')\n", + "\n", + " # 3. Plotting\n", + " def plot_func(curAx, m):\n", + " num_request_rates = len(all_request_rates)\n", + " num_labels = len(labels)\n", + " x = np.arange(num_request_rates) # the label locations (x-axis positions)\n", + " width = 0.4 / num_labels # width of the bars\n", + "\n", + " for i, label in enumerate(labels):\n", + " bar_x = x - (width*num_labels)/2 + i*width + width/2\n", + " #Extract y-values to plot\n", + " y_values = [plot_data[m.y][label][rr] for rr in all_request_rates]\n", + "\n", + " rects = curAx.bar(bar_x, y_values, width, label=label)\n", + " if annotate:\n", + " for rect, val in zip(rects, y_values):\n", + " if not np.isnan(val):\n", + " height = rect.get_height()\n", + " curAx.annotate(f'{val:.2f}',\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " xytext=(0, 3), # 3 points vertical offset\n", + " textcoords=\"offset points\",\n", + " ha='center', va='bottom')\n", + " # Add labels, title, and legend\n", + " curAx.set_xlabel(m.x_label, fontsize=axis_label_fontsize)\n", + " curAx.set_ylabel(m.y_label, fontsize=axis_label_fontsize)\n", + " curAx.set_xticks(x)\n", + " curAx.set_xticklabels(all_request_rates)\n", + " curAx.tick_params(axis='both', labelsize=tick_label_fontsize)\n", + " curAx.legend(fontsize=legend_fontsize, loc='upper left', frameon=True, framealpha=0.8, edgecolor='black')\n", + " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", + " fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", + " plt.show()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "height": 1000 + }, + "executionInfo": { + "elapsed": 2232, + "status": "ok", + "timestamp": 1741735855456, + "user": { + "displayName": "Cong Liu", + "userId": "18222691451061354557" + }, + "user_tz": 420 + }, + "id": "HbGEAOucb_Jn", + "outputId": "faf0304b-92f4-4fa7-ae71-83b8bd987e70" + }, + "outputs": [], + "source": [ + "#@title Plot Result\n", + "\n", + "pl = Plotter(run_id=RUN_ID, labels=[Label('inference-extension'),Label('k8s-svc')], output_dir=OUTPUT_DIR)\n", + "pl.plot_bar()" + ] + } + ], + "metadata": { + "colab": { + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/benchmark/download-benchmark-results.bash b/benchmark/download-benchmark-results.bash new file mode 100755 index 00000000..333fc6cc --- /dev/null +++ b/benchmark/download-benchmark-results.bash @@ -0,0 +1,30 @@ +#!/bin/bash + +# Downloads the benchmark result files from the benchmark tool pod. 
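+# How it works: poll the benchmark-tool deployment logs every 30s until the LPG tool prints
+# "LPG_FINISHED", then find the pod, remove the ShareGPT dataset file inside it (so it does not
+# match the json file listing below), and copy each remaining .json result file out via `kubectl cp`.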
+download_benchmark_results() { + until echo $(kubectl logs deployment/benchmark-tool -n ${namespace}) | grep -q -m 1 "LPG_FINISHED"; do sleep 30 ; done; + benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n ${namespace} -o jsonpath="{.items[0].metadata.name}") + echo "Downloading JSON results from pod ${benchmark_pod}" + kubectl exec ${benchmark_pod} -n ${namespace} -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json + for f in $(kubectl exec ${benchmark_pod} -n ${namespace} -- /bin/sh -c ls -l | grep json); do + echo "Downloading json file ${f}" + kubectl cp -n ${namespace} ${benchmark_pod}:$f ${benchmark_output_dir}/results/json/$f; + done +} + +# Env vars to be passed when calling this script. +# The id of the benchmark. This is needed to identify what the benchmark is for. +# It decides the filepath to save the results, which later is used by the jupyter notebook to assign +# the benchmark_id as data labels for plotting. +benchmark_id=${benchmark_id:-"inference-extension"} +# run_id can be used to group different runs of the same benchmarks for comparison. +run_id=${run_id:-"default-run"} +namespace=${namespace:-"default"} +output_dir=${output_dir:-'output'} + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id} + +echo "Saving benchmark results to ${benchmark_output_dir}/results/json/" +download_benchmark_results +kubectl delete -f ${SCRIPT_DIR}/../config/manifests/benchmark/benchmark.yaml \ No newline at end of file diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt new file mode 100644 index 00000000..44974cf4 --- /dev/null +++ b/benchmark/requirements.txt @@ -0,0 +1,3 @@ +pandas +numpy +matplotlib \ No newline at end of file diff --git a/config/manifests/benchmark/benchmark.yaml b/config/manifests/benchmark/benchmark.yaml new file mode 100644 index 00000000..a47b4617 --- /dev/null +++ b/config/manifests/benchmark/benchmark.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # The following image was built from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/07628c9fe01b748f5a4cc9e5c2ee4234aaf47699 + - image: 'us-docker.pkg.dev/cloud-tpu-images/inference/inference-benchmark@sha256:1c100b0cc949c7df7a2db814ae349c790f034b4b373aaad145e77e815e838438' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: '10,20,30' + - name: BENCHMARK_TIME_SECONDS + value: '60' + - name: TOKENIZER + value: 'meta-llama/Llama-2-7b-hf' + - name: MODELS + value: 'meta-llama/Llama-2-7b-hf' + - name: BACKEND + value: vllm + - name: PORT + value: "8081" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '2048' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: ShareGPT_V3_unfiltered_cleaned_split.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi diff --git a/config/manifests/benchmark/model-server-service.yaml b/config/manifests/benchmark/model-server-service.yaml new file mode 100644 index 00000000..014054cf --- /dev/null +++ 
b/config/manifests/benchmark/model-server-service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: my-pool-service +spec: + ports: + - port: 8081 + protocol: TCP + targetPort: 8000 + selector: + app: my-pool + type: LoadBalancer diff --git a/mkdocs.yml b/mkdocs.yml index 8cd3f3fb..fc4c9438 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,6 +59,8 @@ nav: - Adapter Rollout: guides/adapter-rollout.md - Metrics: guides/metrics.md - Implementer's Guide: guides/implementers.md + - Performance: + - Benchmark: performance/benchmark/index.md - Reference: - API Reference: reference/spec.md - API Types: diff --git a/site-src/performance/benchmark/example-bar-chart.png b/site-src/performance/benchmark/example-bar-chart.png new file mode 100644 index 00000000..54dc6589 Binary files /dev/null and b/site-src/performance/benchmark/example-bar-chart.png differ diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md new file mode 100644 index 00000000..445729a6 --- /dev/null +++ b/site-src/performance/benchmark/index.md @@ -0,0 +1,98 @@
+# Benchmark
+
+This user guide shows how to run benchmarks against a vLLM deployment, using both the Gateway API
+inference extension and a plain Kubernetes Service as the load balancing strategy. The
+benchmark uses the [Latency Profile Generator](https://github.com/AI-Hypercomputer/inference-benchmark) (LPG)
+tool to generate load and collect results.
+
+## Prerequisites
+
+### Deploy the inference extension and sample model server
+
+Follow [this user guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/) to deploy the
+sample vLLM application and the inference extension.
+
+### [Optional] Scale the sample vLLM deployment
+
+You are more likely to see the benefits of the inference extension when there is a decent number of
+replicas, since that gives it room to make optimal routing decisions.
+
+```bash
+kubectl scale --replicas=8 -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
+```
+
+### Expose the model server via a k8s service
+
+As the baseline, let's also expose the vLLM deployment as a k8s service:
+
+```bash
+kubectl expose -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --port=8081 --target-port=8000 --type=LoadBalancer
+```
+
+Alternatively, you can apply the `./config/manifests/benchmark/model-server-service.yaml` manifest in this
+repo, as long as its selector matches the labels on your vLLM pods.
+
+## Run benchmark
+
+The LPG benchmark tool works by sending traffic to the specified target IP and port and collecting results.
+Follow the steps below to run a single benchmark. You can deploy multiple LPG instances if you want to run
+benchmarks in parallel against different targets.
+
+1. Check out the repo.
+
+    ```bash
+    git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
+    cd gateway-api-inference-extension
+    ```
+
+1. Get the target IP. The examples below show how to get the IP of the gateway and of the LoadBalancer k8s service.
+
+    ```bash
+    # Get the gateway IP
+    GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+    # Get the LoadBalancer k8s service IP. Replace <service-name> with the name of the Service created
+    # by the `kubectl expose` command above (it defaults to the name of the vLLM deployment).
+    SVC_IP=$(kubectl get service/<service-name> -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+    echo $GW_IP
+    echo $SVC_IP
+    ```
+
+1. Update the `IP` environment variable in `./config/manifests/benchmark/benchmark.yaml` to your target IP.
+    Feel free to adjust other parameters such as `REQUEST_RATES` as well.
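+
+    For example, one way to set the target IP in place is a quick `sed` one-liner (a sketch: it assumes GNU
+    `sed`, that `$GW_IP` holds the IP captured above, and that the `IP` entry is the only empty `value: ''`
+    field in the manifest):
+
+    ```bash
+    # Point the benchmark tool at the gateway; use "$SVC_IP" instead to benchmark the k8s Service baseline.
+    sed -i "s#value: ''#value: '${GW_IP}'#" ./config/manifests/benchmark/benchmark.yaml
+    ```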
+
+    For a complete list of LPG configurations, please refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark).
+
+1. Start the benchmark tool: `kubectl apply -f ./config/manifests/benchmark/benchmark.yaml`
+
+1. Wait for the benchmark to finish and download the results. Use the `benchmark_id` environment variable
+to specify what this benchmark is for, for instance `inference-extension` or `k8s-svc`. When the LPG tool
+finishes benchmarking, it prints a log line `LPG_FINISHED`; the script below watches for that log line and
+then starts downloading the results.
+
+    ```bash
+    benchmark_id='my-benchmark' ./benchmark/download-benchmark-results.bash
+    ```
+
+1. After the script finishes, you should see the benchmark results under the
+`./benchmark/output/default-run/my-benchmark/results/json` folder.
+
+### Tips
+
+* You can set the `run_id="runX"` environment variable when running the `./download-benchmark-results.bash` script.
+This is useful when you run benchmarks multiple times to get more statistically meaningful results and want to
+group the results accordingly.
+* Update `REQUEST_RATES` to values that best suit your benchmark environment.
+
+### Advanced Benchmark Configurations
+
+Please refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark) for a detailed list of configuration knobs.
+
+## Analyze the results
+
+This section shows how to run the Jupyter notebook using VS Code.
+
+1. Create a Python virtual environment.
+
+    ```bash
+    python3 -m venv .venv
+    source .venv/bin/activate
+    ```
+
+1. Install the dependencies.
+
+    ```bash
+    pip install -r ./benchmark/requirements.txt
+    ```
+
+1. Open the notebook `./benchmark/benchmark.ipynb` and run each cell. At the end you should
+   see a bar chart like the one below:
+
+    ![example bar chart](example-bar-chart.png)
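+
+The notebook's final cell plots the labels `inference-extension` and `k8s-svc` against each other. A possible
+workflow for producing both result sets (a sketch, assuming you update and re-apply `benchmark.yaml` with the
+other target IP between runs) is to download each run with the matching `benchmark_id` and a shared `run_id`,
+which the notebook's `RUN_ID` should then point to:
+
+```bash
+# After running the benchmark against the gateway IP (steps above), download that run:
+benchmark_id='inference-extension' run_id='example-run' ./benchmark/download-benchmark-results.bash
+# Then set the IP in benchmark.yaml to $SVC_IP, re-apply it, and download that run under a different label:
+benchmark_id='k8s-svc' run_id='example-run' ./benchmark/download-benchmark-results.bash
+```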