Lora syncer docs #320

Merged (13 commits) on Feb 14, 2025
32 changes: 32 additions & 0 deletions Makefile
@@ -26,11 +26,16 @@ PLATFORMS ?= linux/amd64
DOCKER_BUILDX_CMD ?= docker buildx
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
IMAGE_BUILD_EXTRA_OPTS ?=
SYNCER_IMAGE_BUILD_EXTRA_OPTS ?=
IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
IMAGE_NAME := epp
IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)

SYNCER_IMAGE_NAME := lora-syncer
SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME)
SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG)

BASE_IMAGE ?= gcr.io/distroless/base-debian10
BUILDER_IMAGE ?= golang:1.23-alpine
ifdef GO_VERSION
@@ -39,9 +44,11 @@ endif

ifdef EXTRA_TAG
IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
endif
ifdef IMAGE_EXTRA_TAG
IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG)
endif

# The name of the kind cluster to use for the "kind-load" target.
@@ -171,6 +178,31 @@ image-load: image-build
image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)

##@ Lora Syncer

.PHONY: syncer-image-local-build
syncer-image-local-build:
BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
$(MAKE) syncer-image-build PUSH=$(PUSH)
$(DOCKER_BUILDX_CMD) rm $$BUILDER

.PHONY: syncer-image-local-push
syncer-image-local-push: PUSH=--push
syncer-image-local-push: syncer-image-local-build

.PHONY: syncer-image-build
syncer-image-build:
cd $(CURDIR)/tools/dynamic-lora-sidecar && $(IMAGE_BUILD_CMD) -t $(SYNCER_IMAGE_TAG) \
--platform=$(PLATFORMS) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
$(PUSH) \
$(SYNCER_IMAGE_BUILD_EXTRA_OPTS) ./

.PHONY: syncer-image-push
syncer-image-push: PUSH=--push
syncer-image-push: syncer-image-build

##@ Docs

.PHONY: build-docs
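The new targets mirror the existing EPP image targets, so they can be exercised the same way. A minimal sketch of how they might be invoked locally, assuming a registry you can push to (`ghcr.io/example` below is a placeholder and not part of this change):

```sh
# Build the lora-syncer image with buildx for the configured PLATFORMS.
make syncer-image-build IMAGE_REGISTRY=ghcr.io/example

# Build and push in one step; the *-push targets set PUSH=--push before delegating.
make syncer-image-push IMAGE_REGISTRY=ghcr.io/example
```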
8 changes: 8 additions & 0 deletions cloudbuild.yaml
@@ -12,6 +12,14 @@ steps:
    - GIT_TAG=$_GIT_TAG
    - EXTRA_TAG=$_PULL_BASE_REF
    - DOCKER_BUILDX_CMD=/buildx-entrypoint
  - name: lora-adapter-syncer
    entrypoint: make
    args:
    - syncer-image-push
    env:
    - GIT_TAG=$_GIT_TAG
    - EXTRA_TAG=$_PULL_BASE_REF
    - DOCKER_BUILDX_CMD=/buildx-entrypoint
substitutions:
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
# can be used as a substitution
145 changes: 145 additions & 0 deletions pkg/manifests/vllm/deployment-with-syncer.yaml
@@ -0,0 +1,145 @@
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool
    spec:
      containers:
      - name: lora
        image: "vllm/vllm-openai:latest"
        imagePullPolicy: Always
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - "--model"
        - "meta-llama/Llama-2-7b-hf"
        - "--tensor-parallel-size"
        - "1"
        - "--port"
        - "8000"
        - "--enable-lora"
        - "--max-loras"
        - "4"
        - "--max-cpu-loras"
        - "12"
        - "--lora-modules"
        - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
        - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
        env:
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
          value: "true"
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        livenessProbe:
          failureThreshold: 240
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        readinessProbe:
          failureThreshold: 600
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          limits:
            nvidia.com/gpu: 1
          requests:
            nvidia.com/gpu: 1
        volumeMounts:
        - mountPath: /data
          name: data
        - mountPath: /dev/shm
          name: shm
        - name: adapters
          mountPath: "/adapters"
      initContainers:
      - name: lora-adapter-syncer
        tty: true
        stdin: true
        image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be
        restartPolicy: Always
        imagePullPolicy: Always
        env:
        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
          value: "/config/configmap.yaml"
        volumeMounts: # DO NOT USE subPath
        - name: config-volume
          mountPath: /config
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
      - name: data
        emptyDir: {}
      - name: shm
        emptyDir:
          medium: Memory
      - name: adapters
        emptyDir: {}
      - name: config-volume
        configMap:
          name: dynamic-lora-config

---

apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      port: 8000
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-0
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-1
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
      ensureNotExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-2
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
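For a quick sanity check after applying this manifest, the syncer sidecar's logs show whether it is reconciling the adapters listed in `dynamic-lora-config`. A hedged sketch (the local file path is an assumption about where this manifest sits in your checkout; the deployment and container names come from the manifest above):

```sh
# Deploy the model server, LoRA syncer sidecar, and its ConfigMap.
kubectl apply -f pkg/manifests/vllm/deployment-with-syncer.yaml

# Follow the sidecar logs to watch it reconcile ensureExist/ensureNotExist.
kubectl logs deployment/vllm-llama2-7b-pool -c lora-adapter-syncer -f
```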
37 changes: 2 additions & 35 deletions pkg/manifests/vllm/deployment.yaml
@@ -43,18 +43,8 @@ spec:
- "--max-cpu-loras"
- "12"
- "--lora-modules"
- "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
- "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
- 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
- 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
- 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
- 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
- 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
- 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
- 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
- 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
- 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
- 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
env:
- name: PORT
value: "8000"
@@ -99,29 +89,6 @@ spec:
name: shm
- name: adapters
mountPath: "/adapters"
initContainers:
- name: adapter-loader
image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
command: ["python"]
args:
- ./pull_adapters.py
- --adapter
- yard1/llama-2-7b-sql-lora-test
- --adapter
- vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
- --duplicate-count
- "5"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: token
- name: HF_HOME
value: /adapters
volumeMounts:
- name: adapters
mountPath: "/adapters"
restartPolicy: Always
schedulerName: default-scheduler
terminationGracePeriodSeconds: 30
95 changes: 95 additions & 0 deletions site-src/guides/dynamic-lora.md
@@ -0,0 +1,95 @@
# Getting started with Gateway API Inference Extension with dynamic LoRA updates on vLLM

The goal of this guide is to get a single InferencePool running with vLLM and demonstrate dynamic LoRA adapter updates.

### Requirements
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
- A cluster with:
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.

### Steps

1. **Deploy the sample vLLM model server with dynamic LoRA updates enabled and the LoRA syncer sidecar**
   Apply the [sample vLLM deployment with dynamic LoRA adapters enabled, the LoRA syncer sidecar, and its ConfigMap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml), as shown in the sketch below.
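
A minimal sketch of this step, following the same `kubectl apply` pattern as the quickstart. The raw URL is derived from the link above and may need adjusting if the manifest path changes:

```sh
# Hugging Face token with access to Llama2, as in the general setup guide.
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN

# Deploy the vLLM server with the LoRA syncer sidecar and its ConfigMap.
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml
```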

The rest of the steps are the same as in the [general setup guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md).


### Safely roll out a v2 adapter

1. Update the LoRA ConfigMap to add the new adapter, ***tweet-summary-2***, to `ensureExist` (a verification sketch follows these steps):

``` yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      port: 8000
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-0
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-1
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-2
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
```

2. Configure a canary rollout with a traffic split using LLMService. In this example, 40% of the traffic for the tweet-summary model will be sent to the new ***tweet-summary-2*** adapter.

``` yaml
model:
  name: tweet-summary
  targetModels:
  - targetModelName: tweet-summary-0
    weight: 20
  - targetModelName: tweet-summary-1
    weight: 40
  - targetModelName: tweet-summary-2
    weight: 40
```

3. Finish the rollout by shifting 100% of the traffic to the new adapter.
```yaml
model:
  name: tweet-summary
  targetModels:
  - targetModelName: tweet-summary-2
    weight: 100
```

4. Remove the old adapter versions from the dynamic LoRA ConfigMap by moving them to `ensureNotExist`.
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      port: 8000
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-2
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
      ensureNotExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-1
          source: gs://[HUGGING FACE PATH]
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-0
          source: gs://[HUGGING FACE PATH]
```
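
To verify each stage of the rollout, one option is to query the vLLM server's OpenAI-compatible `/v1/models` endpoint, which lists the adapters currently registered with the server. A sketch, assuming the Service name from the sidecar deployment manifest and that `jq` is available locally (both assumptions, not part of this guide):

```sh
# Port-forward the model server Service locally.
kubectl port-forward svc/vllm-llama2-7b-pool 8000:8000

# In another terminal: list the served models and adapters.
# After step 1 this should include tweet-summary-0, -1, and -2;
# after step 4 only tweet-summary-2 should remain.
curl -s localhost:8000/v1/models | jq -r '.data[].id'
```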
4 changes: 4 additions & 0 deletions site-src/guides/index.md
@@ -19,6 +19,10 @@ This quickstart guide is intended for engineers familiar with k8s and model servers
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
```




1. **Install the Inference Extension CRDs:**

```sh