diff --git a/Makefile b/Makefile
index b7654ed7..1d8fc531 100644
--- a/Makefile
+++ b/Makefile
@@ -26,11 +26,16 @@ PLATFORMS ?= linux/amd64
 DOCKER_BUILDX_CMD ?= docker buildx
 IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
 IMAGE_BUILD_EXTRA_OPTS ?=
+SYNCER_IMAGE_BUILD_EXTRA_OPTS ?=
 IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
 IMAGE_NAME := epp
 IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
 IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
+SYNCER_IMAGE_NAME := lora-syncer
+SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME)
+SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG)
+
 BASE_IMAGE ?= gcr.io/distroless/base-debian10
 BUILDER_IMAGE ?= golang:1.23-alpine
 ifdef GO_VERSION
@@ -39,9 +44,11 @@ endif
 
 ifdef EXTRA_TAG
 IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
+SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
 endif
 
 ifdef IMAGE_EXTRA_TAG
 IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
+SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG)
 endif
 
 # The name of the kind cluster to use for the "kind-load" target.
@@ -171,6 +178,31 @@ image-load: image-build
 image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
 	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
 
+##@ Lora Syncer
+
+.PHONY: syncer-image-local-build
+syncer-image-local-build:
+	BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
+	$(MAKE) syncer-image-build PUSH=$(PUSH)
+	$(DOCKER_BUILDX_CMD) rm $$BUILDER
+
+.PHONY: syncer-image-local-push
+syncer-image-local-push: PUSH=--push
+syncer-image-local-push: syncer-image-local-build
+
+.PHONY: syncer-image-build
+syncer-image-build:
+	cd $(CURDIR)/tools/dynamic-lora-sidecar && $(IMAGE_BUILD_CMD) -t $(SYNCER_IMAGE_TAG) \
+		--platform=$(PLATFORMS) \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
+		$(PUSH) \
+		$(SYNCER_IMAGE_BUILD_EXTRA_OPTS) ./
+
+.PHONY: syncer-image-push
+syncer-image-push: PUSH=--push
+syncer-image-push: syncer-image-build
+
 ##@ Docs
 
 .PHONY: build-docs
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 2da147f4..40e45923 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -12,6 +12,14 @@ steps:
       - GIT_TAG=$_GIT_TAG
       - EXTRA_TAG=$_PULL_BASE_REF
       - DOCKER_BUILDX_CMD=/buildx-entrypoint
+  - name: lora-adapter-syncer
+    entrypoint: make
+    args:
+      - syncer-image-push
+    env:
+      - GIT_TAG=$_GIT_TAG
+      - EXTRA_TAG=$_PULL_BASE_REF
+      - DOCKER_BUILDX_CMD=/buildx-entrypoint
 substitutions:
   # _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
   # can be used as a substitution
diff --git a/pkg/manifests/vllm/deployment-with-syncer.yaml b/pkg/manifests/vllm/deployment-with-syncer.yaml
new file mode 100644
index 00000000..d6110f4b
--- /dev/null
+++ b/pkg/manifests/vllm/deployment-with-syncer.yaml
@@ -0,0 +1,145 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama2-7b-pool
+spec:
+  selector:
+    app: vllm-llama2-7b-pool
+  ports:
+    - protocol: TCP
+      port: 8000
+      targetPort: 8000
+  type: ClusterIP
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-llama2-7b-pool
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: vllm-llama2-7b-pool
+  template:
+    metadata:
+      labels:
+        app: vllm-llama2-7b-pool
+    spec:
+      containers:
+        - name: lora
+          image: "vllm/vllm-openai:latest"
+          imagePullPolicy: Always
+          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+          args:
+            - "--model"
+            - "meta-llama/Llama-2-7b-hf"
+            - "--tensor-parallel-size"
+            - "1"
+            - "--port"
+            - "8000"
+ - "--enable-lora" + - "--max-loras" + - "4" + - "--max-cpu-loras" + - "12" + - "--lora-modules" + - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + env: + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" + ports: + - containerPort: 8000 + name: http + protocol: TCP + livenessProbe: + failureThreshold: 240 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 600 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + volumeMounts: + - mountPath: /data + name: data + - mountPath: /dev/shm + name: shm + - name: adapters + mountPath: "/adapters" + initContainers: + - name: lora-adapter-syncer + tty: true + stdin: true + image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be + restartPolicy: Always + imagePullPolicy: Always + env: + - name: DYNAMIC_LORA_ROLLOUT_CONFIG + value: "/config/configmap.yaml" + volumeMounts: # DO NOT USE subPath + - name: config-volume + mountPath: /config + restartPolicy: Always + schedulerName: default-scheduler + terminationGracePeriodSeconds: 30 + volumes: + - name: data + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: dynamic-lora-config + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: dynamic-lora-config +data: + configmap.yaml: | + vLLMLoRAConfig: + name: sql-loras-llama + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-0 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + ensureNotExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-2 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm \ No newline at end of file diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml index 4af0891d..1d115f4d 100644 --- a/pkg/manifests/vllm/deployment.yaml +++ b/pkg/manifests/vllm/deployment.yaml @@ -43,18 +43,8 @@ spec: - "--max-cpu-loras" - "12" - "--lora-modules" - - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/" - - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403" - - 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0' - - 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1' - - 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2' - - 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3' - - 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4' - - 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0' - - 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1' - - 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2' - - 
diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml
index 4af0891d..1d115f4d 100644
--- a/pkg/manifests/vllm/deployment.yaml
+++ b/pkg/manifests/vllm/deployment.yaml
@@ -43,18 +43,8 @@ spec:
         - "--max-cpu-loras"
         - "12"
         - "--lora-modules"
-        - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
-        - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
-        - 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
-        - 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
-        - 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
-        - 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
-        - 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
-        - 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
-        - 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
-        - 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
-        - 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
-        - 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
+        - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
+        - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
         env:
         - name: PORT
           value: "8000"
@@ -99,29 +89,6 @@ spec:
           name: shm
         - name: adapters
           mountPath: "/adapters"
-      initContainers:
-        - name: adapter-loader
-          image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
-          command: ["python"]
-          args:
-          - ./pull_adapters.py
-          - --adapter
-          - yard1/llama-2-7b-sql-lora-test
-          - --adapter
-          - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
-          - --duplicate-count
-          - "5"
-          env:
-          - name: HF_TOKEN
-            valueFrom:
-              secretKeyRef:
-                name: hf-token
-                key: token
-          - name: HF_HOME
-            value: /adapters
-          volumeMounts:
-          - name: adapters
-            mountPath: "/adapters"
       restartPolicy: Always
       schedulerName: default-scheduler
       terminationGracePeriodSeconds: 30
diff --git a/site-src/guides/dynamic-lora.md b/site-src/guides/dynamic-lora.md
new file mode 100644
index 00000000..ef3c2b0f
--- /dev/null
+++ b/site-src/guides/dynamic-lora.md
@@ -0,0 +1,93 @@
+# Getting started with Gateway API Inference Extension with dynamic LoRA updates on vLLM
+
+The goal of this guide is to get a single InferencePool running with vLLM and demonstrate dynamic LoRA updating!
+
+### Requirements
+ - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
+ - A cluster with:
+   - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running.)
+     For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
+   - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
+
+### Steps
+
+1. **Deploy the sample vLLM model server with dynamic LoRA updates enabled and the LoRA syncer sidecar**
+
+   [Redeploy the vLLM deployment with dynamic LoRA adapters enabled, plus the LoRA syncer sidecar and its ConfigMap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/deployment-with-syncer.yaml)
+
+The rest of the steps are the same as in the [general setup guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md).
+
+### Safely roll out a v2 adapter
+
+1. Update the LoRA syncer ConfigMap to make the new adapter version available on the model servers.
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: dynamic-lora-config
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: sql-loras-llama
+      port: 8000
+      ensureExist:
+        models:
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-0
+            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-1
+            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-2
+            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+```
+
+2. Configure a canary rollout with a traffic split using LLMService. In this example, 40% of traffic for the tweet-summary model will be sent to the ***tweet-summary-2*** adapter.
+
+```yaml
+model:
+  name: tweet-summary
+  targetModels:
+    - targetModelName: tweet-summary-0
+      weight: 20
+    - targetModelName: tweet-summary-1
+      weight: 40
+    - targetModelName: tweet-summary-2
+      weight: 40
+```
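+
+Before shifting more traffic, it can help to confirm that the new adapter is actually registered on the model servers. A minimal check (a sketch; the pod name is a placeholder, and it relies on vLLM's `/v1/models` endpoint, which lists loaded LoRA adapters alongside the base model):
+
+```bash
+kubectl port-forward pod/vllm-llama2-7b-pool-<pod-hash> 8000:8000
+curl -s http://localhost:8000/v1/models | grep tweet-summary-2
+```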
+
+3. Finish the rollout by sending 100% of traffic to the new version.
+
+```yaml
+model:
+  name: tweet-summary
+  targetModels:
+    - targetModelName: tweet-summary-2
+      weight: 100
+```
+
+4. Remove the v1 adapters from the dynamic LoRA ConfigMap.
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: dynamic-lora-config
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: sql-loras-llama
+      port: 8000
+      ensureExist:
+        models:
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-2
+            source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+      ensureNotExist:
+        models:
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-1
+            source: gs://[HUGGING FACE PATH]
+          - base-model: meta-llama/Llama-2-7b-hf
+            id: tweet-summary-0
+            source: gs://[HUGGING FACE PATH]
+```
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index e4cbec6f..2cc971c6 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -19,6 +19,10 @@ This quickstart guide is intended for engineers familiar with k8s and model servers
    kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
    ```
+
+
+
+
 1. **Install the Inference Extension CRDs:**
 
    ```sh
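To exercise the new Make targets end to end, something like the following should build the syncer image and push it to a personal registry (a sketch; the variable overrides rely on the `?=` defaults defined in the Makefile above, and `<your-registry>` is a placeholder):

```bash
# Build the LoRA syncer image for $(PLATFORMS) and push it to a personal registry
make syncer-image-local-push \
  SYNCER_IMAGE_REPO=<your-registry>/lora-syncer \
  GIT_TAG=dev
```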