Fixes to the adapter rollouts guide #338

Merged: 3 commits, Feb 18, 2025

1 change: 1 addition & 0 deletions mkdocs.yml
@@ -56,6 +56,7 @@ nav:
  - Guides:
    - User Guides:
      - Getting started: guides/index.md
      - Adapter Rollout: guides/adapter-rollout.md
      - Implementer's Guide: guides/implementers.md
  - Reference:
    - API Reference: reference/spec.md
11 changes: 1 addition & 10 deletions pkg/manifests/inferencemodel.yaml
@@ -1,21 +1,12 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: InferenceModel
metadata:
  labels:
    app.kubernetes.io/name: api
    app.kubernetes.io/managed-by: kustomize
  name: inferencemodel-sample
spec:
  modelName: tweet-summary
  criticality: Critical
  poolRef:
    # this is the default val:
    group: inference.networking.x-k8s.io
    # this is the default val:
    kind: InferencePool
    name: vllm-llama2-7b-pool
  targetModels:
  - name: tweet-summary-0
    weight: 50
  - name: tweet-summary-1
    weight: 50
    weight: 100
145 changes: 0 additions & 145 deletions pkg/manifests/vllm/deployment-with-syncer.yaml

This file was deleted.

49 changes: 35 additions & 14 deletions pkg/manifests/vllm/deployment.yaml
@@ -1,16 +1,3 @@
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -39,7 +26,7 @@ spec:
- "8000"
- "--enable-lora"
- "--max-loras"
- "4"
- "2"
- "--max-cpu-loras"
- "12"
- "--lora-modules"
@@ -53,6 +40,8 @@ spec:
              secretKeyRef:
                name: hf-token
                key: token
          - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
            value: "true"
        ports:
          - containerPort: 8000
            name: http
@@ -89,6 +78,19 @@ spec:
              name: shm
            - name: adapters
              mountPath: "/adapters"
      initContainers:
        - name: lora-adapter-syncer
          tty: true
          stdin: true
          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
          restartPolicy: Always
          imagePullPolicy: Always
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
            - name: config-volume
              mountPath: /config
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
@@ -100,3 +102,22 @@ spec:
            medium: Memory
        - name: adapters
          emptyDir: {}
        - name: config-volume
          configMap:
            name: vllm-llama2-7b-adapters
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vllm-llama2-7b-adapters
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: vllm-llama2-7b
      port: 8000
      ensureExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: tweet-summary-1
            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
133 changes: 133 additions & 0 deletions site-src/guides/adapter-rollout.md
@@ -0,0 +1,133 @@
# Adapter Rollout

The goal of this guide is to demonstrate how to roll out a new adapter version.

## **Prerequisites**

Follow the steps in the [main guide](index.md).


## **Safely rollout v2 adapter**
Contributor:

Suggested change:

    ## **Safely rollout v2 adapter**
    ## Safely rollout v2 adapter

Contributor Author:

This one is correct, I want the level 2 header to be bold.

Contributor:

Aren't headers bold by default? Anyway, not a big deal.

### Load the new adapter version to the model servers

This guide leverages the LoRA syncer sidecar to dynamically manage adapters within a vLLM deployment, enabling users to add or remove them through a shared ConfigMap.


Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version.


```bash
kubectl edit configmap vllm-llama2-7b-adapters
```

Change the ConfigMap to match the following (note the new entry under `models`):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: vllm-llama2-7b-adapters
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: vllm-llama2-7b-adapters
      port: 8000
      ensureExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: tweet-summary-1
            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
          - base-model: meta-llama/Llama-2-7b-hf
            id: tweet-summary-2
            source: mahimairaja/tweet-summarization-llama-2-finetuned
```

The new adapter version is applied to the model servers live, without requiring a restart.
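
To confirm the new adapter was loaded, you can query a model server's OpenAI-compatible `/v1/models` endpoint, which lists the base model and any loaded LoRA adapters. This is a minimal sketch; the `app=vllm-llama2-7b-pool` pod label is an assumption based on the manifests above, so adjust it to match your deployment:

```bash
# Port-forward to one of the vLLM pods (label is an assumption; adjust as needed).
POD=$(kubectl get pod -l app=vllm-llama2-7b-pool -o jsonpath='{.items[0].metadata.name}')
kubectl port-forward "pod/${POD}" 8000:8000 &
sleep 2

# The new adapter should appear once the syncer has applied the updated ConfigMap.
curl -s localhost:8000/v1/models | grep tweet-summary-2
```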

### Direct traffic to the new adapter version

Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of the traffic for the `tweet-summary` model will be sent to the new ***tweet-summary-2*** adapter.
```bash
kubectl edit inferencemodel inferencemodel-sample
```

Change the `targetModels` list in the InferenceModel to match the following:


```yaml
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: InferenceModel
metadata:
  name: inferencemodel-sample
spec:
  modelName: tweet-summary
  criticality: Critical
  poolRef:
    name: vllm-llama2-7b-pool
  targetModels:
  - name: tweet-summary-1
    weight: 90
  - name: tweet-summary-2
    weight: 10

```

The above configuration means that, on average, one in every ten requests will be sent to the new version. Try it out:

1. Get the gateway IP:
```bash
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081
```

2. Send a few requests as follows:
```bash
curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
"model": "tweet-summary",
"prompt": "Write as if you were a critic: San Francisco",
"max_tokens": 100,
"temperature": 0
}'
```
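
To get a rough sense of the split, you can send a batch of requests and tally which adapter each response reports. This is only a sketch; it assumes the completion response's `model` field echoes the adapter that served the request:

```bash
# Send 20 requests and count how often each adapter shows up in the responses.
for i in $(seq 1 20); do
  curl -s ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
    "model": "tweet-summary",
    "prompt": "Write as if you were a critic: San Francisco",
    "max_tokens": 10,
    "temperature": 0
  }' | grep -o '"model":[^,]*'
done | sort | uniq -c
```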

### Finish the rollout


Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter.

```yaml
spec:
  modelName: tweet-summary
  targetModels:
  - name: tweet-summary-2
    weight: 100
```

Unload the older version from the model servers by updating the LoRA syncer ConfigMap to list it under the `ensureNotExist` list:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: vllm-llama2-7b-adapters
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: vllm-llama2-7b-adapters
      port: 8000
      ensureExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: tweet-summary-2
            source: mahimairaja/tweet-summarization-llama-2-finetuned
      ensureNotExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: tweet-summary-1
            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
```

With this, all requests should be served by the new adapter version.
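
You can repeat the earlier `/v1/models` check to confirm the old adapter has been unloaded. A short sketch, with the same pod-label assumption as before:

```bash
POD=$(kubectl get pod -l app=vllm-llama2-7b-pool -o jsonpath='{.items[0].metadata.name}')
kubectl port-forward "pod/${POD}" 8000:8000 &
sleep 2

# tweet-summary-1 should no longer be listed; tweet-summary-2 should remain.
curl -s localhost:8000/v1/models | grep tweet-summary
```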
93 changes: 0 additions & 93 deletions site-src/guides/dynamic-lora.md

This file was deleted.

33 changes: 17 additions & 16 deletions site-src/guides/index.md
@@ -2,16 +2,16 @@

This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running!

### Requirements
## **Prerequisites**
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
- A cluster with:
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.

### Steps
## **Steps**

1. **Deploy Sample Model Server**
### Deploy Sample Model Server

Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
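
For example, the secret can be created from an existing Hugging Face token. This is a minimal sketch, assuming the token is exported as `HF_TOKEN`; the secret name `hf-token` and key `token` match what the sample deployment references:

```bash
# Create the secret the vLLM deployment reads its Hugging Face token from.
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN
```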
@@ -20,30 +20,29 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
```

### Install the Inference Extension CRDs



1. **Install the Inference Extension CRDs:**

```sh
```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.1.0/manifests.yaml

1. **Deploy InferenceModel**
### Deploy InferenceModel

Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1`
[LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml
```
1. **Update Envoy Gateway Config to enable Patch Policy**

### Update Envoy Gateway Config to enable Patch Policy

Our custom LLM Gateway ext-proc is patched into the existing Envoy Gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway ConfigMap. To do this, simply run:
```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/enable_patch_policy.yaml
kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system
```
Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again.
1. **Deploy Gateway**

### Deploy Gateway

```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml
@@ -56,26 +55,28 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
NAME CLASS ADDRESS PROGRAMMED AGE
inference-gateway inference-gateway <MY_ADDRESS> True 22s
```
1. **Deploy the Inference Extension and InferencePool**
### Deploy the Inference Extension and InferencePool

```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml
```
1. **Deploy Envoy Gateway Custom Policies**
### Deploy Envoy Gateway Custom Policies

```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml
```
> **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
1. **OPTIONALLY**: Apply Traffic Policy

### **OPTIONALLY**: Apply Traffic Policy

For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors.

```bash
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml
```
1. **Try it out**

### Try it out

Wait until the gateway is ready.
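
One way to wait is to poll the Gateway's `Programmed` condition with `kubectl wait`. This is a sketch, assuming the Gateway is named `inference-gateway` as in the manifests above:

```bash
# Block until the Gateway reports Programmed=True (or the timeout expires).
kubectl wait --for=condition=Programmed gateway/inference-gateway --timeout=120s
```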

@@ -89,4 +90,4 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
"max_tokens": 100,
"temperature": 0
}'
```
```