kubernetes-sigs · k8s-ci-robot · Mar 27, 2025 · Mar 26, 2025
diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
@@ -5,12 +5,12 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl
 
 ## Install
 
-To install an InferencePool named `vllm-llama2-7b`  that selects from endpoints with label `app: vllm-llama2-7b` and listening on port `8000`, you can run the following command:
+To install an InferencePool named `vllm-llama3-8b-instruct` that selects endpoints with the label `app: vllm-llama3-8b-instruct` listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install vllm-llama2-7b ./config/charts/inferencepool \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   --set inferencePool.targetPortNumber=8000
 ```
 
@@ -19,9 +19,9 @@ where `inferencePool.targetPortNumber` is the pod that vllm backends served on a
 To install via the latest published chart in staging  (--version v0 indicates latest dev version), you can run the following command:
 
 ```txt
-$ helm install vllm-llama2-7b \
-  --set inferencePool.name=vllm-llama2-7b \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama2-7b \
+$ helm install vllm-llama3-8b-instruct \
+  --set inferencePool.name=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
   --set inferencePool.targetPortNumber=8000 \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```

diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
@@ -12,4 +12,4 @@ inferencePool:
   targetPortNumber: 8000
   # modelServers: # REQUIRED
     # matchLabels: 
-    #   app: vllm-llama2-7b
+    #   app: vllm-llama3-8b-instruct
diff --git a/config/manifests/benchmark/benchmark.yaml b/config/manifests/benchmark/benchmark.yaml
@@ -31,9 +31,9 @@ spec:
         - name: BENCHMARK_TIME_SECONDS
           value: '60'
         - name: TOKENIZER
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
         - name: MODELS
-          value: 'meta-llama/Llama-2-7b-hf'
+          value: 'meta-llama/Llama-3.1-8B-Instruct'
         - name: BACKEND
           value: vllm
         - name: PORT

diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml
@@ -99,7 +99,7 @@ spec:
     - backendRefs:
       - group: ""
         kind: Service
-        name: vllm-llama2-7b-epp
+        name: vllm-llama3-8b-instruct-epp
         port: 9002
       processingMode:
         allowModeOverride: true

diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml
@@ -3,12 +3,12 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-sample
 spec:
-  modelName: tweet-summary
-  criticality: Critical
+  modelName: food-review
+  criticality: Standard
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
   targetModels:
-  - name: tweet-summary-1
+  - name: food-review-1
     weight: 100
 
 ---
@@ -17,10 +17,10 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-base-model
 spec:
-  modelName: meta-llama/Llama-2-7b-hf
+  modelName: meta-llama/Llama-3.1-8B-Instruct
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
 
 ---
 apiVersion: inference.networking.x-k8s.io/v1alpha2
@@ -31,4 +31,4 @@ spec:
   modelName: Qwen/Qwen2.5-1.5B-Instruct
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b
+    name: vllm-llama3-8b-instruct
diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool.yaml
@@ -2,22 +2,22 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   labels:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b
+    app: vllm-llama3-8b-instruct
   extensionRef:
-    name: vllm-llama2-7b-epp
+    name: vllm-llama3-8b-instruct-epp
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
   namespace: default
 spec:
   selector:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
   ports:
     - protocol: TCP
       port: 9002
@@ -27,27 +27,27 @@ spec:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b-epp
+  name: vllm-llama3-8b-instruct-epp
   namespace: default
   labels:
-    app: vllm-llama2-7b-epp
+    app: vllm-llama3-8b-instruct-epp
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-llama2-7b-epp
+      app: vllm-llama3-8b-instruct-epp
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b-epp
+        app: vllm-llama3-8b-instruct-epp
     spec:
       containers:
       - name: epp
         image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
         imagePullPolicy: Always
         args:
         - -poolName
-        - "vllm-llama2-7b"
+        - "vllm-llama3-8b-instruct"
         - -v
         - "4"
         - --zap-encoder

diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml
@@ -1,16 +1,16 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
     spec:
       containers:
         - name: lora
@@ -26,8 +26,8 @@ spec:
           - "--max-loras"
           - "4"
           - "--lora-modules"
-          - '{"name": "tweet-summary-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
-          - '{"name": "tweet-summary-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+          - '{"name": "food-review-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+          - '{"name": "food-review-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
           env:
             - name: PORT
               value: "8000"
@@ -108,10 +108,10 @@ metadata:
 data:
   configmap.yaml: |
       vLLMLoRAConfig:
-        name: vllm-llama2-7b
+        name: vllm-llama3-8b-instruct
         port: 8000
         ensureExist:
           models:
           - base-model: Qwen/Qwen2.5-1.5B
-            id: tweet-summary-1
+            id: food-review-1
             source: SriSanth2345/Qwen-1.5B-Tweet-Generations
diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
@@ -1,37 +1,34 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama2-7b
+  name: vllm-llama3-8b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama2-7b
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-llama2-7b
+        app: vllm-llama3-8b-instruct
     spec:
       containers:
-        - name: lora
+        - name: vllm
           image: "vllm/vllm-openai:latest"
           imagePullPolicy: Always
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args:
           - "--model"
-          - "meta-llama/Llama-2-7b-hf"
+          - "meta-llama/Llama-3.1-8B-Instruct"
           - "--tensor-parallel-size"
           - "1"
           - "--port"
           - "8000"
           - "--enable-lora"
           - "--max-loras"
-          - "4"
+          - "2"
           - "--max-cpu-loras"
           - "12"
-          - "--lora-modules"
-          - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
-          - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
           env:
             # Enabling LoRA support temporarily disables automatic v1, we want to force it on
             # until 0.8.3 vLLM is released.
@@ -238,20 +235,24 @@ spec:
           emptyDir: {}
         - name: config-volume
           configMap:
-            name: vllm-llama2-7b-adapters
+            name: vllm-llama3-8b-instruct-adapters
 ---
+# LoRA adapter list referenced by the config-volume volume of the Deployment above.
+# Adapter ids must match the targetModels names used by the sample InferenceModel.
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama2-7b-adapters
+  name: vllm-llama3-8b-instruct-adapters
 data:
   configmap.yaml: |
       vLLMLoRAConfig:
-        name: vllm-llama2-7b
+        name: vllm-llama3-8b-instruct
         port: 8000
         ensureExist:
           models:
-          - base-model: meta-llama/Llama-2-7b-hf
-            id: tweet-summary-1
-            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-  
+          - base-model: meta-llama/Llama-3.1-8B-Instruct
+            id: food-review-1
+            source: Kawon/llama3.1-food-finetune_v14_r8
+          - base-model: meta-llama/Llama-3.1-8B-Instruct
+            id: cad-fabricator
+            source: redcathode/fabricator
diff --git a/hack/test-e2e.sh b/hack/test-e2e.sh
@@ -124,14 +124,14 @@ if [[ "$CURL_POD" == "true" ]]; then
     while [ $SECONDS -lt $end ]; do
         kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \
             -H 'Content-Type: application/json' \
-            -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+            -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
         sleep 5
     done
 else
     while [ $SECONDS -lt $end ]; do
         curl -i "$IP:$PORT/v1/completions" \
             -H 'Content-Type: application/json' \
-            -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+            -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
         sleep 5
     done
 fi
diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go
@@ -97,7 +97,7 @@ func TestPool(t *testing.T) {
 
 func TestModel(t *testing.T) {
 	chatModel := "chat"
-	tsModel := "tweet-summary"
+	tsModel := "food-review"
 	model1ts := testutil.MakeInferenceModel("model1").
 		CreationTimestamp(metav1.Unix(1000, 0)).
 		ModelName(tsModel).ObjRef()
@@ -126,7 +126,7 @@ func TestModel(t *testing.T) {
 		wantModels     []*v1alpha2.InferenceModel
 	}{
 		{
-			name: "Add model1 with tweet-summary as modelName",
+			name: "Add model1 with food-review as modelName",
 			op: func(ds Datastore) bool {
 				return ds.ModelSetIfOlder(model1ts)
 			},
@@ -161,7 +161,7 @@ func TestModel(t *testing.T) {
 			wantModels:   []*v1alpha2.InferenceModel{model2ts},
 		},
 		{
-			name:           "Set model1 with the tweet-summary modelName, both models should exist",
+			name:           "Set model1 with the food-review modelName, both models should exist",
 			existingModels: []*v1alpha2.InferenceModel{model2chat},
 			op: func(ds Datastore) bool {
 				return ds.ModelSetIfOlder(model1ts)
@@ -170,7 +170,7 @@ func TestModel(t *testing.T) {
 			wantModels:   []*v1alpha2.InferenceModel{model2chat, model1ts},
 		},
 		{
-			name:           "Set model1 with the tweet-summary modelName, both models should exist",
+			name:           "Set model1 with the food-review modelName, both models should exist",
 			existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts},
 			op: func(ds Datastore) bool {
 				return ds.ModelSetIfOlder(model1ts)

diff --git a/pkg/epp/handlers/response.go b/pkg/epp/handlers/response.go
@@ -127,7 +127,7 @@ func (s *Server) HandleResponseHeaders(
     "id": "cmpl-573498d260f2423f9e42817bbba3743a",
     "object": "text_completion",
     "created": 1732563765,
-    "model": "meta-llama/Llama-2-7b-hf",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
     "choices": [
         {
             "index": 0,
@@ -217,7 +217,7 @@ func (s *Server) HandleStreaming(
 }
 
 // Example message if "stream_options": {"include_usage": "true"} is included in the request:
-// data: {"id":"...","object":"text_completion","created":1739400043,"model":"tweet-summary-0","choices":[],
+// data: {"id":"...","object":"text_completion","created":1739400043,"model":"food-review-0","choices":[],
 // "usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 //
 // data: [DONE]

diff --git a/pkg/epp/handlers/response_test.go b/pkg/epp/handlers/response_test.go
@@ -31,7 +31,7 @@ const (
 		"id": "cmpl-573498d260f2423f9e42817bbba3743a",
 		"object": "text_completion",
 		"created": 1732563765,
-		"model": "meta-llama/Llama-2-7b-hf",
+		"model": "meta-llama/Llama-3.1-8B-Instruct",
 		"choices": [
 			{
 				"index": 0,
@@ -50,10 +50,10 @@ const (
 	}
 	`
 
-	streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":null}
+	streamingBodyWithoutUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":null}
 	`
 
-	streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"tweet-summary-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+	streamingBodyWithUsage = `data: {"id":"cmpl-41764c93-f9d2-4f31-be08-3ba04fa25394","object":"text_completion","created":1740002445,"model":"food-review-0","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 data: [DONE]
 	`
 )

diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
@@ -18,7 +18,7 @@ Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version.
 
 
 ```bash
-   kubectl edit configmap vllm-llama2-7b-adapters
+   kubectl edit configmap vllm-llama3-8b-instruct-adapters
 ```
 
 Change the ConfigMap to match the following (note the new entry under models):
@@ -27,19 +27,19 @@ Change the ConfigMap to match the following (note the new entry under models):
         apiVersion: v1
         kind: ConfigMap
         metadata:
-        name: vllm-llama2-7b-adapters
+        name: vllm-llama3-8b-instruct-adapters
         data:
         configmap.yaml: |
              vLLMLoRAConfig:
-                name: vllm-llama2-7b-adapters
+                name: vllm-llama3-8b-instruct-adapters
                 port: 8000
                 ensureExist:
                     models:
-                    - base-model: meta-llama/Llama-2-7b-hf
-                      id: tweet-summary-1
+                    - base-model: meta-llama/Llama-3.1-8B-Instruct
+                      id: food-review-1
                       source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-                    - base-model: meta-llama/Llama-2-7b-hf
-                      id: tweet-summary-2
+                    - base-model: meta-llama/Llama-3.1-8B-Instruct
+                      id: food-review-2
                       source: mahimairaja/tweet-summarization-llama-2-finetuned
 ```
 
@@ -48,11 +48,11 @@ The new adapter version is applied to the model servers live, without requiring
 
 ### Direct traffic to the new adapter version
 
-Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for tweet-summary model will be sent to the new ***tweet-summary-2*** adapter.
+Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for the food-review model will be sent to the new ***food-review-2*** adapter.
 
 
 ```bash
-   kubectl edit inferencemodel tweet-summary
+   kubectl edit inferencemodel inferencemodel-sample
 ```
 
 Change the targetModels list in InferenceModel to match the following:
@@ -64,14 +64,14 @@ kind: InferenceModel
 metadata:
   name: inferencemodel-sample
 spec:
-  modelName: tweet-summary
+  modelName: food-review
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct
   targetModels:
-  - name: tweet-summary-1
+  - name: food-review-1
     weight: 90
-  - name: tweet-summary-2
+  - name: food-review-2
     weight: 10
 
 ```
@@ -86,7 +86,7 @@ IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].va
 2. Send a few requests as follows:
 ```bash
 curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "tweet-summary",
+"model": "food-review",
 "prompt": "Write as if you were a critic: San Francisco",
 "max_tokens": 100,
 "temperature": 0
@@ -100,9 +100,9 @@ Modify the InferenceModel to direct 100% of the traffic to the latest version of
 
 ```yaml
 model:
-    name: tweet-summary
+    name: food-review
     targetModels:
-    targetModelName: tweet-summary-2
+    targetModelName: food-review-2
             weight: 100
 ```
 
@@ -120,13 +120,13 @@ Unload the older versions from the servers by updating the LoRA syncer ConfigMap
                 port: 8000
                 ensureExist:
                     models:
-                    - base-model: meta-llama/Llama-2-7b-hf
-                      id: tweet-summary-2
+                    - base-model: meta-llama/Llama-3.1-8B-Instruct
+                      id: food-review-2
                       source: mahimairaja/tweet-summarization-llama-2-finetuned
                 ensureNotExist:
                     models:
-                    - base-model: meta-llama/Llama-2-7b-hf
-                      id: tweet-summary-1
+                    - base-model: meta-llama/Llama-3.1-8B-Instruct
+                      id: food-review-1
                       source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
 ```
 

diff --git a/site-src/guides/index.md b/site-src/guides/index.md
@@ -17,7 +17,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    Two options are supported for running the model server:
 
    1. GPU-based model server.  
-      Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf).
+      Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
 
    1. CPU-based model server (not using GPUs).  
       The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).  
@@ -27,11 +27,11 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 === "GPU-Based Model Server"
 
       For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
-      Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
+      Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model.
       
       Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
       ```bash
-      kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
+      kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models
       kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
       ```
 
@@ -59,7 +59,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 
 ### Deploy InferenceModel
 
-   Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1`
+   Deploy the sample InferenceModel which is configured to load balance traffic between the `food-review-0` and `food-review-1`
    [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
@@ -116,7 +116,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    PORT=8081
 
    curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-   "model": "tweet-summary",
+   "model": "food-review",
    "prompt": "Write as if you were a critic: San Francisco",
    "max_tokens": 100,
    "temperature": 0

diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
@@ -29,7 +29,7 @@ If you want to include usage metrics for vLLM model server streaming request, se
 
 ```
 curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "tweet-summary",
+"model": "food-review",
 "prompt": "whats your fav movie?",
 "max_tokens": 10,
 "temperature": 0,

diff --git a/test/e2e/epp/README.md b/test/e2e/epp/README.md
@@ -10,7 +10,7 @@ The end-to-end tests are designed to validate end-to-end Gateway API Inference E
 
 - [Go](https://golang.org/doc/install) installed on your machine.
 - [Make](https://www.gnu.org/software/make/manual/make.html) installed to run the end-to-end test target.
-- A Hugging Face Hub token with access to the [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model.
+- A Hugging Face Hub token with access to the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model.
 
 ## Running the End-to-End Tests
 
@@ -34,5 +34,5 @@ Follow these steps to run the end-to-end tests:
    make test-e2e
    ```
 
-   The test suite prints details for each step. Note that the `vllm-llama2-7b-pool` model server deployment
+   The test suite prints details for each step. Note that the `vllm-llama3-8b-instruct` model server deployment
    may take several minutes to report an `Available=True` status due to the time required for bootstrapping.
diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go
@@ -57,15 +57,15 @@ const (
 	// TODO [danehans]: Must be "default" until https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/227 is fixed
 	nsName = "default"
 	// modelServerName is the name of the model server test resources.
-	modelServerName = "vllm-llama2-7b"
+	modelServerName = "vllm-llama3-8b-instruct"
 	// modelName is the test model name.
-	modelName = "tweet-summary"
+	modelName = "food-review"
 	// envoyName is the name of the envoy proxy test resources.
 	envoyName = "envoy"
 	// envoyPort is the listener port number of the test envoy proxy.
 	envoyPort = "8081"
 	// inferExtName is the name of the inference extension test resources.
-	inferExtName = "vllm-llama2-7b-epp"
+	inferExtName = "vllm-llama3-8b-instruct-epp"
 	// clientManifest is the manifest for the client test resources.
 	clientManifest = "../../testdata/client.yaml"
 	// modelServerSecretManifest is the manifest for the model server secret resource.

diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
@@ -1198,42 +1198,42 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 		data: [DONE]`,
 							),
 							EndOfStream: false},
@@ -1300,7 +1300,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 											EndOfStream: false,
 										},
 									},
@@ -1316,7 +1316,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 											EndOfStream: false,
 										},
 									},
@@ -1332,7 +1332,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 											EndOfStream: false,
 										},
 									},
@@ -1348,7 +1348,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 											EndOfStream: false,
 										},
 									},
@@ -1364,7 +1364,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
+											Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`),
 											EndOfStream: false,
 										},
 									},
@@ -1380,7 +1380,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 								BodyMutation: &extProcPb.BodyMutation{
 									Mutation: &extProcPb.BodyMutation_StreamedResponse{
 										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+											Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 		data: [DONE]`,
 											),
 											EndOfStream: false,
@@ -1507,7 +1507,7 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*bac
 
 	// TODO: this should be consistent with the inference pool
 	podLabels := map[string]string{
-		"app": "vllm-llama2-7b-pool",
+		"app": "vllm-llama3-8b-instruct-pool",
 	}
 
 	for pod := range podAndMetrics {
@@ -1602,7 +1602,7 @@ func BeforeSuite() func() {
 	// Init runtime.
 	ctrl.SetLogger(logger)
 
-	mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama2-7b-pool"))
+	mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama3-8b-instruct-pool"))
 	if err != nil {
 		logutil.Fatal(logger, err, "Failed to create controller manager")
 	}
@@ -1615,7 +1615,7 @@ func BeforeSuite() func() {
 	serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{}
 	pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond)
 	// Adjust from defaults
-	serverRunner.PoolName = "vllm-llama2-7b-pool"
+	serverRunner.PoolName = "vllm-llama3-8b-instruct-pool"
 	serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf)
 	serverRunner.SecureServing = false
 

diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml
@@ -100,7 +100,7 @@ data:
                           grpc_service:
                             envoy_grpc:
                               cluster_name: ext_proc
-                              authority: vllm-llama2-7b-epp.default:9002
+                              authority: vllm-llama3-8b-instruct-epp.default:9002
                             timeout: 10s
                           processing_mode:
                             request_header_mode: SEND
@@ -194,7 +194,7 @@ data:
                   - endpoint:
                       address:
                         socket_address:
-                          address: vllm-llama2-7b-epp.default
+                          address: vllm-llama3-8b-instruct-epp.default
                           port_value: 9002
                     health_status: HEALTHY
                     load_balancing_weight: 1

diff --git a/test/testdata/inferencepool-with-model-hermetic.yaml b/test/testdata/inferencepool-with-model-hermetic.yaml
@@ -1,12 +1,12 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  name: vllm-llama2-7b-pool
+  name: vllm-llama3-8b-instruct-pool
   namespace: default
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama2-7b-pool
+    app: vllm-llama3-8b-instruct-pool
   extensionRef:
     name: epp
 ---
@@ -19,7 +19,7 @@ spec:
   modelName: sql-lora
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
   - name: sql-lora-1fdg2
     weight: 100
@@ -32,7 +32,7 @@ metadata:
 spec:
   modelName: sql-lora-sheddable
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
   - name: sql-lora-1fdg3
     weight: 100
@@ -46,7 +46,7 @@ spec:
   modelName: my-model
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
   targetModels:
   - name: my-model-12345
     weight: 100    
@@ -60,4 +60,4 @@ spec:
   modelName: direct-model
   criticality: Critical
   poolRef:
-    name: vllm-llama2-7b-pool
+    name: vllm-llama3-8b-instruct-pool
diff --git a/tools/dynamic-lora-sidecar/deployment.yaml b/tools/dynamic-lora-sidecar/deployment.yaml
@@ -32,7 +32,7 @@ spec:
             nvidia.com/gpu : 1
         command: ["/bin/sh", "-c"]
         args:
-        - vllm serve meta-llama/Llama-2-7b-hf
+        - vllm serve meta-llama/Llama-3.1-8B-Instruct
         - --host=0.0.0.0
         - --port=8000
         - --tensor-parallel-size=1
@@ -111,17 +111,17 @@ data:
         port: modelServerPort
         ensureExist:
           models:
-          - base-model: meta-llama/Llama-2-7b-hf
+          - base-model: meta-llama/Llama-3.1-8B-Instruct
             id: sql-lora-v1
             source: yard1/llama-2-7b-sql-lora-test
-          - base-model: meta-llama/Llama-2-7b-hf
+          - base-model: meta-llama/Llama-3.1-8B-Instruct
             id: sql-lora-v3
             source: yard1/llama-2-7b-sql-lora-test
-          - base-model: meta-llama/Llama-2-7b-hf
+          - base-model: meta-llama/Llama-3.1-8B-Instruct
             id: sql-lora-v4
             source: yard1/llama-2-7b-sql-lora-test
         ensureNotExist:
           models:
-          - base-model: meta-llama/Llama-2-7b-hf
+          - base-model: meta-llama/Llama-3.1-8B-Instruct
             id: sql-lora-v2
             source: yard1/llama-2-7b-sql-lora-test
diff --git a/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py b/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
@@ -12,17 +12,17 @@
         "ensureExist": {
             "models": [
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v1",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v3",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "already_exists",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
@@ -31,17 +31,17 @@
         "ensureNotExist": {
             "models": [
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v2",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "sql-lora-v3",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
                 {
-                    "base-model": "meta-llama/Llama-2-7b-hf",
+                    "base-model": "meta-llama/Llama-3.1-8B-Instruct",
                     "id": "to_remove",
                     "source": "yard1/llama-2-7b-sql-lora-test",
                 },
@@ -67,7 +67,7 @@
                 "object": "model",
                 "created": 1729693000,
                 "owned_by": "vllm",
-                "root": "meta-llama/Llama-2-7b-hf",
+                "root": "meta-llama/Llama-3.1-8B-Instruct",
                 "parent": None,
                 "max_model_len": 4096,
             },