added cpu based example #436

Merged · 2 commits · Mar 4, 2025
config/manifests/ext_proc.yaml (6 changes: 3 additions & 3 deletions)
@@ -44,11 +44,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  labels:
  name: vllm-llama2-7b-pool
  name: my-pool
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama2-7b-pool
    app: my-pool
  extensionRef:
    name: inference-gateway-ext-proc
---
@@ -75,7 +75,7 @@ spec:
        imagePullPolicy: Always
        args:
        - -poolName
        - "vllm-llama2-7b-pool"
        - "my-pool"
        - -v
        - "3"
        - -grpcPort
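The rename only works end to end if the InferencePool name, its selector, and the ext-proc `-poolName` argument all agree. A minimal sanity check after applying this manifest, assuming the InferencePool CRD is installed and the ext-proc Deployment keeps the `inference-gateway-ext-proc` name used by the `extensionRef` above:

```bash
# Confirm the InferencePool exists under its new name
kubectl get inferencepool my-pool -o yaml

# Confirm the ext-proc container was started with the matching -poolName argument
kubectl get deployment inference-gateway-ext-proc \
  -o jsonpath='{.spec.template.spec.containers[0].args}'
```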
config/manifests/inferencemodel.yaml (2 changes: 1 addition & 1 deletion)
@@ -6,7 +6,7 @@ spec:
  modelName: tweet-summary
  criticality: Critical
  poolRef:
    name: vllm-llama2-7b-pool
    name: my-pool
  targetModels:
  - name: tweet-summary-1
    weight: 100
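Once this InferenceModel is applied, a request whose body sets `model: tweet-summary` should be routed to `my-pool` and rewritten to the weighted target model `tweet-summary-1`. A rough smoke test, assuming the gateway address and port have been exported as the shell variables `IP` and `PORT` (as in the try-it-out step of the guide):

```bash
# Send a completion request through the gateway using the InferenceModel's modelName
curl -i ${IP}:${PORT}/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "tweet-summary",
        "prompt": "Write as if you were a critic: San Francisco",
        "max_tokens": 100,
        "temperature": 0
      }'
```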
config/manifests/vllm/cpu-deployment.yaml (101 changes: 101 additions & 0 deletions)
@@ -0,0 +1,101 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: my-pool
  template:
    metadata:
      labels:
        app: my-pool
    spec:
      containers:
      - name: lora
        image: "seedjeffwan/vllm-cpu-env:bb392af4-20250203"
        imagePullPolicy: Always
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - "--model"
        - "Qwen/Qwen2.5-1.5B-Instruct"
        - "--port"
        - "8000"
        - "--enable-lora"
        - "--lora-modules"
        - '{"name": "tweet-summary-0", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}'
        - '{"name": "tweet-summary-1", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}'
        env:
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
          value: "true"
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        livenessProbe:
          failureThreshold: 240
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        readinessProbe:
          failureThreshold: 600
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        volumeMounts:
        - mountPath: /data
          name: data
        - mountPath: /dev/shm
          name: shm
        - name: adapters
          mountPath: "/adapters"
      initContainers:
      - name: adapter-loader
        image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
        command: ["python"]
        args:
        - ./pull_adapters.py
        - --adapter
        - ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
        - --duplicate-count
        - "4"
        env:
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        - name: HF_HOME
          value: /adapters
        volumeMounts:
        - name: adapters
          mountPath: "/adapters"
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
      - name: data
        emptyDir: {}
      - name: shm
        emptyDir:
          medium: Memory
      - name: adapters
        emptyDir: {}
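CPU inference is slow to warm up, which is why the probes above allow such long failure windows. A rough way to confirm the deployment is serving and that the adapter-loader init container registered the LoRA modules, assuming the `my-pool` Deployment name from this manifest:

```bash
# Wait for the CPU pods to pass their readiness probes (model load can take several minutes)
kubectl rollout status deployment/my-pool

# Port-forward one pod and list the served models; the tweet-summary LoRA adapters should appear
kubectl port-forward deployment/my-pool 8000:8000 &
curl -s localhost:8000/v1/models
```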
config/manifests/vllm/deployment.yaml → config/manifests/vllm/gpu-deployment.yaml (renamed)
@@ -1,16 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
  name: my-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool
      app: my-pool
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool
        app: my-pool
    spec:
      containers:
      - name: lora
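The InferencePool selects backend pods by label, so the relabelled GPU deployment only joins the pool if its pod labels match the `selector` in `ext_proc.yaml`. A quick cross-check, assuming both manifests have been applied and the InferencePool CRD is installed:

```bash
# Pods created by the renamed deployment should carry the new label
kubectl get pods -l app=my-pool

# The InferencePool selector should reference the same label
kubectl get inferencepool my-pool -o jsonpath='{.spec.selector}'
```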
site-src/guides/index.md (31 changes: 26 additions & 5 deletions)
@@ -5,19 +5,40 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
## **Prerequisites**
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
- A cluster with:
  - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
    you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
  - 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/deployment.yaml` as needed.
  - Support for services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running).
    For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).

## **Steps**

### Deploy Sample Model Server

This quickstart guide contains two options for setting up a model server:

1. GPU-based model server.
Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf).

1. CPU-based model server (not using GPUs).
Requirements: a Hugging Face access token that grants access to the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).

Choose one of these options and follow the steps below. Please do not deploy both, as the deployments have the same name and will override each other.

#### GPU-Based Model Server

For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/deployment.yaml` as needed.
Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
```bash
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/deployment.yaml
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
```

#### CPU-Based Model Server

Create a Hugging Face secret to download the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). Ensure that the token grants access to this model.
Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
```bash
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Qwen
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml
```

### Install the Inference Extension CRDs
@@ -49,7 +70,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml
```
> **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.***
> **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.***

Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
```bash
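# (The diff is truncated here.) A typical check looks roughly like the following,
# assuming the Gateway created by gateway.yaml is named inference-gateway:
kubectl get gateway inference-gateway
```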