Skip to content

Commit 9efc096

Browse files
committed
Add makefile and cloudbuild file to build and push lora-syncer
Signed-off-by: Kunjan <[email protected]>
1 parent cd2fc96 commit 9efc096

File tree

5 files changed

+336
-64
lines changed

5 files changed

+336
-64
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
# Service fronting the vLLM model-server pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
  type: ClusterIP
---
# vLLM deployment with dynamic LoRA updating enabled and a syncer sidecar
# that reconciles loaded adapters against the dynamic-lora-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool
    spec:
      containers:
        - name: lora
          image: "vllm/vllm-openai:latest"
          imagePullPolicy: Always
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model"
            - "meta-llama/Llama-2-7b-hf"
            - "--tensor-parallel-size"
            - "1"
            - "--port"
            - "8000"
            - "--enable-lora"
            - "--max-loras"
            - "4"
            - "--max-cpu-loras"
            - "12"
            # Statically registered adapters; paths point into the shared
            # /adapters emptyDir populated by the adapter-loader init container.
            - "--lora-modules"
            - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
            - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
            - 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
            - 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
            - 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
            - 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
            - 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
            - 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
            - 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
            - 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
            - 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
            - 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
          env:
            - name: PORT
              value: "8000"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
            # Required so the syncer sidecar can load/unload adapters at runtime.
            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
              value: "true"
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          livenessProbe:
            failureThreshold: 240
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          readinessProbe:
            failureThreshold: 600
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: adapters
              mountPath: "/adapters"
      initContainers:
        # Downloads the adapters into the shared emptyDir before vLLM starts.
        - name: adapter-loader
          image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
          command: ["python"]
          args:
            - ./pull_adapters.py
            - --adapter
            - yard1/llama-2-7b-sql-lora-test
            - --adapter
            - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
            - --duplicate-count
            - "5"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
            - name: HF_HOME
              value: /adapters
          volumeMounts:
            - name: adapters
              mountPath: "/adapters"
        # Native sidecar (init container with restartPolicy: Always) that keeps
        # the server's adapters in sync with the mounted ConfigMap.
        # <SIDECAR_IMAGE> is a placeholder substituted at deploy time.
        - name: lora-adapter-syncer
          tty: true
          stdin: true
          image: <SIDECAR_IMAGE>
          restartPolicy: Always
          imagePullPolicy: Always
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts: # DO NOT USE subPath
            - name: config-volume
              mountPath: /config
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: adapters
          emptyDir: {}
        - name: config-volume
          configMap:
            name: dynamic-lora-config

---

# Desired adapter state consumed by the lora-adapter-syncer sidecar.
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      host: modelServerHost
      name: sql-loras-llama
      port: modelServerPort
      ensureExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v1
            source: yard1/llama-2-7b-sql-lora-test
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v3
            source: yard1/llama-2-7b-sql-lora-test
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v4
            source: yard1/llama-2-7b-sql-lora-test
      ensureNotExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v2
            source: yard1/llama-2-7b-sql-lora-test

site-src/guides/dynamic-lora.md

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm
2+
3+
The goal of this guide is to get a single InferencePool running with vLLM and to demonstrate dynamic LoRA updating!
4+
5+
### Requirements
6+
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
7+
- A cluster with:
8+
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
9+
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
10+
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
11+
12+
### Steps
13+
14+
1. **Deploy a sample vLLM model server with dynamic LoRA updates enabled and the LoRA syncer sidecar**
15+
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml)
16+
17+
The rest of the steps are the same as in the [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md)
18+
19+
20+
### Safely rollout v2 adapter
21+
22+
1. Update lora configmap
23+
24+
``` yaml
25+
26+
apiVersion: v1
27+
kind: ConfigMap
28+
metadata:
29+
name: dynamic-lora-config
30+
data:
31+
configmap.yaml: |
32+
vLLMLoRAConfig:
33+
ensureExist:
34+
models:
35+
        - id: tweet-summary-v1
          source: /adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1
        - id: tweet-summary-v2
          source: /adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2
39+
```
40+
41+
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
42+
43+
``` yaml
44+
model:
  name: chatbot
  targetModels:
  - targetModelName: chatbot-v1
    weight: 90
  - targetModelName: chatbot-v2
    weight: 10
51+
```
52+
53+
3. Finish rollout by setting the traffic to the new version 100%.
54+
```yaml
55+
model:
  name: chatbot
  targetModels:
  - targetModelName: chatbot-v2
    weight: 100
60+
```
61+
62+
4. Remove v1 from dynamic lora configmap.
63+
```yaml
64+
apiVersion: v1
65+
kind: ConfigMap
66+
metadata:
67+
name: dynamic-lora-config
68+
data:
69+
configmap.yaml: |
70+
vLLMLoRAConfig:
71+
ensureExist:
72+
models:
73+
- id: chatbot-v2
74+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
75+
ensureNotExist: # Explicitly unregisters the adapter from model servers
76+
models:
77+
- id: chatbot-v1
78+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
79+
```

site-src/guides/index.md

-64
Original file line numberDiff line numberDiff line change
@@ -19,70 +19,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
1919
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
2020
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
2121
```
22-
**OPTIONALLY**: Enable Dynamic loading of Lora adapters.
23-
24-
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/tools/dynamic-lora-sidecar/deployment.yaml)
25-
26-
***Safely rollout v2 adapter***
27-
28-
1. Update lora configmap
29-
30-
``` yaml
31-
32-
apiVersion: v1
33-
kind: ConfigMap
34-
metadata:
35-
name: dynamic-lora-config
36-
data:
37-
configmap.yaml: |
38-
vLLMLoRAConfig:
39-
ensureExist:
40-
models:
41-
- id: chatbot-v1
42-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
43-
- id: chatbot-v2
44-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
45-
```
46-
47-
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
48-
49-
``` yaml
50-
model:
51-
name: chatbot
52-
targetModels:
53-
targetModelName: chatbot-v1
54-
weight: 90
55-
targetModelName: chatbot-v2
56-
weight: 10
57-
```
58-
59-
3. Finish rollout by setting the traffic to the new version 100%.
60-
```yaml
61-
model:
62-
name: chatbot
63-
targetModels:
64-
targetModelName: chatbot-v2
65-
weight: 100
66-
```
67-
68-
4. Remove v1 from dynamic lora configmap.
69-
```yaml
70-
apiVersion: v1
71-
kind: ConfigMap
72-
metadata:
73-
name: dynamic-lora-config
74-
data:
75-
configmap.yaml: |
76-
vLLMLoRAConfig:
77-
ensureExist:
78-
models:
79-
- id: chatbot-v2
80-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
81-
ensureNotExist: # Explicitly unregisters the adapter from model servers
82-
models:
83-
- id: chatbot-v1
84-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
85-
```
8622

8723

8824

tools/dynamic-lora-sidecar/Makefile

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
IMAGE_NAME := lora-syncer
2+
IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway
3+
IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
4+
5+
GIT_TAG ?= $(shell git describe --tags --dirty --always)
6+
EXTRA_TAG ?= $(if $(_PULL_BASE_REF),$(_PULL_BASE_REF),main)
7+
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
8+
EXTRA_IMAGE_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
9+
10+
11+
PLATFORMS ?= linux/amd64
12+
13+
14+
DOCKER_BUILDX_CMD ?= docker buildx
15+
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
16+
IMAGE_BUILD_EXTRA_OPTS ?=
17+
18+
# --- Targets ---
# Build (and push when PUSH=--push) using a throwaway local buildx builder.
# The whole recipe is one shell invocation: the original set BUILDER via
# $(shell ...) on its own recipe line, but each recipe line runs in its own
# shell, so $$BUILDER was empty in the `buildx rm` line and the temporary
# builder leaked. The exit status of the build is preserved past cleanup.
.PHONY: image-local-build
image-local-build:
	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use); \
	$(MAKE) image-build PUSH=$(PUSH); \
	status=$$?; \
	$(DOCKER_BUILDX_CMD) rm $$BUILDER; \
	exit $$status
# Convenience wrapper: build with the local builder and push the result
# (sets PUSH=--push for the image-local-build chain).
.PHONY: image-local-push
image-local-push: PUSH=--push
image-local-push: image-local-build
# Build the container image for $(PLATFORMS), tagging both the git-derived
# tag and the branch tag (EXTRA_IMAGE_TAG was previously only referenced by
# `clean`, so the branch tag was never actually created). Pass PUSH=--push
# to publish.
# NOTE(review): BASE_IMAGE / BUILDER_IMAGE are not defined in this Makefile;
# presumably CI (cloudbuild) supplies them — confirm, otherwise the
# build-args are passed empty.
.PHONY: image-build
image-build:
	$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
		-t $(EXTRA_IMAGE_TAG) \
		--platform=$(PLATFORMS) \
		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
		$(PUSH) \
		$(IMAGE_BUILD_EXTRA_OPTS) ./
# Build and push in one step (sets PUSH=--push for the image-build recipe).
.PHONY: image-push
image-push: PUSH=--push
image-push: image-build
# Run the syncer locally, mounting ./config as /config inside the container
# and running as the image's non-root user "appuser".
.PHONY: run
run:
	docker run -v $(CURDIR)/config:/config -u appuser $(IMAGE_TAG)
# Remove the locally built image tags; tolerate tags that don't exist.
.PHONY: clean
clean:
	docker rmi $(IMAGE_TAG) $(EXTRA_IMAGE_TAG) 2>/dev/null || true
# Remove dangling (untagged) images. The original wrote
# `$(docker images ...)`, which Make expands as an undefined — hence empty —
# Make variable, so no image IDs ever reached `docker rmi` and the `|| true`
# hid the no-op. The command substitution must be escaped as $$(...) so the
# shell performs it; the emptiness check avoids calling rmi with no args.
.PHONY: clean-dangling
clean-dangling:
	@images=$$(docker images -f "dangling=true" -q); \
	if [ -n "$$images" ]; then docker rmi $$images; fi
# Discover and run the Python unit tests for the syncer.
.PHONY: test
test:
	python -m unittest discover

# Default meta-target: run the tests, then build the image.
.PHONY: all
all: test image-build

0 commit comments

Comments
 (0)