kubernetes-sigs · k8s-ci-robot · Feb 18, 2025 · Feb 15, 2025 · Feb 15, 2025 · Feb 15, 2025
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -56,6 +56,7 @@ nav:
   - Guides:
     - User Guides:
       - Getting started: guides/index.md
+      - Adapter Rollout: guides/adapter-rollout.md
     - Implementer's Guide: guides/implementers.md
   - Reference:
     - API Reference: reference/spec.md

diff --git a/pkg/manifests/inferencemodel.yaml b/pkg/manifests/inferencemodel.yaml
@@ -1,21 +1,12 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: InferenceModel
 metadata:
-  labels:
-    app.kubernetes.io/name: api
-    app.kubernetes.io/managed-by: kustomize
   name: inferencemodel-sample
 spec:
   modelName: tweet-summary
   criticality: Critical
   poolRef:
-    # this is the default val:
-    group: inference.networking.x-k8s.io
-    # this is the default val:
-    kind: InferencePool
     name: vllm-llama2-7b-pool
   targetModels:
-  - name: tweet-summary-0
-    weight: 50
   - name: tweet-summary-1
-    weight: 50
+    weight: 100
diff --git a/pkg/manifests/vllm/deployment-with-syncer.yaml b/pkg/manifests/vllm/deployment-with-syncer.yaml
diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml
@@ -1,16 +1,3 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-llama2-7b-pool
-spec:
-  selector:
-    app: vllm-llama2-7b-pool
-  ports:
-  - protocol: TCP
-    port: 8000
-    targetPort: 8000
-  type: ClusterIP
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -39,7 +26,7 @@ spec:
           - "8000"
           - "--enable-lora"
           - "--max-loras"
-          - "4"
+          - "2"
           - "--max-cpu-loras"
           - "12"
           - "--lora-modules"
@@ -53,6 +40,8 @@ spec:
                 secretKeyRef:
                   name: hf-token
                   key: token
+            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+              value: "true"
           ports:
             - containerPort: 8000
               name: http
@@ -89,6 +78,19 @@ spec:
               name: shm
             - name: adapters
               mountPath: "/adapters"
+      initContainers:
+        - name: lora-adapter-syncer
+          tty: true
+          stdin: true 
+          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
+          restartPolicy: Always
+          imagePullPolicy: Always
+          env: 
+            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
+              value: "/config/configmap.yaml"
+          volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
+          - name: config-volume
+            mountPath:  /config
       restartPolicy: Always
       schedulerName: default-scheduler
       terminationGracePeriodSeconds: 30
@@ -100,3 +102,22 @@ spec:
             medium: Memory
         - name: adapters
           emptyDir: {}
+        - name: config-volume
+          configMap:
+            name: vllm-llama2-7b-adapters
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama2-7b-adapters
+data:
+  configmap.yaml: |
+      vLLMLoRAConfig:
+        name: vllm-llama2-7b
+        port: 8000
+        ensureExist:
+          models:
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-1
+            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+
diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
@@ -0,0 +1,133 @@
+# Adapter Rollout
+
+The goal of this guide is to demonstrate how to roll out a new adapter version.
+
+## **Prerequisites**
+
+Follow the steps in the [main guide](index.md).
+
+
+## Safely roll out v2 adapter
+
+### Load the new adapter version to the model servers
+
+This guide leverages the LoRA syncer sidecar to dynamically manage adapters within a vLLM deployment, enabling users to add or remove them through a shared ConfigMap.
+
+
+Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version.
+
+
+```bash
+   kubectl edit configmap vllm-llama2-7b-adapters
+```
+
+Change the ConfigMap to match the following (note the new entry under models):
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama2-7b-adapters
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: vllm-llama2-7b
+      port: 8000
+      ensureExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-1
+          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-2
+          source: mahimairaja/tweet-summarization-llama-2-finetuned
+```
+
+The new adapter version is applied to the model servers live, without requiring a restart.
+
+
+### Direct traffic to the new adapter version
+
+Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for the tweet-summary model will be sent to the new ***tweet-summary-2*** adapter.
+
+
+```bash
+   kubectl edit inferencemodel inferencemodel-sample
+```
+
+Change the targetModels list in InferenceModel to match the following:
+
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferenceModel
+metadata:
+  name: inferencemodel-sample
+spec:
+  modelName: tweet-summary
+  criticality: Critical
+  poolRef:
+    name: vllm-llama2-7b-pool
+  targetModels:
+  - name: tweet-summary-1
+    weight: 90
+  - name: tweet-summary-2
+    weight: 10
+
+```
+
+The above configuration means one in every ten requests should be sent to the new version. Try it out:
+
+1. Get the gateway IP:
+```bash
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081
+```
+
+2. Send a few requests as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "tweet-summary",
+"prompt": "Write as if you were a critic: San Francisco",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
+
+### Finish the rollout
+
+
+Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter.
+
+```yaml
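+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferenceModel
+metadata:
+  name: inferencemodel-sample
+spec:
+  modelName: tweet-summary
+  criticality: Critical
+  poolRef:
+    name: vllm-llama2-7b-pool
+  targetModels:
+  - name: tweet-summary-2
+    weight: 100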
+```
+
+Unload the older versions from the servers by updating the LoRA syncer ConfigMap to list the older version under the `ensureNotExist` list:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama2-7b-adapters
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: vllm-llama2-7b
+      port: 8000
+      ensureExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-2
+          source: mahimairaja/tweet-summarization-llama-2-finetuned
+      ensureNotExist:
+        models:
+        - base-model: meta-llama/Llama-2-7b-hf
+          id: tweet-summary-1
+          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+```
+
+With this, all requests should be served by the new adapter version.