Lora syncer docs #320

Merged (13 commits) on Feb 14, 2025
32 changes: 32 additions & 0 deletions Makefile
@@ -26,11 +26,16 @@ PLATFORMS ?= linux/amd64
DOCKER_BUILDX_CMD ?= docker buildx
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
IMAGE_BUILD_EXTRA_OPTS ?=
SYNCER_IMAGE_BUILD_EXTRA_OPTS ?=
IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
IMAGE_NAME := epp
IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)

SYNCER_IMAGE_NAME := lora-syncer
SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME)
SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG)

BASE_IMAGE ?= gcr.io/distroless/base-debian10
BUILDER_IMAGE ?= golang:1.23-alpine
ifdef GO_VERSION
@@ -39,9 +44,11 @@ endif

ifdef EXTRA_TAG
IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
endif
ifdef IMAGE_EXTRA_TAG
IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG)
endif

# The name of the kind cluster to use for the "kind-load" target.
@@ -171,6 +178,31 @@ image-load: image-build
image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)

##@ Lora Syncer

.PHONY: syncer-image-local-build
syncer-image-local-build:
BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
$(MAKE) syncer-image-build PUSH=$(PUSH)
$(DOCKER_BUILDX_CMD) rm $$BUILDER

.PHONY: syncer-image-local-push
syncer-image-local-push: PUSH=--push
syncer-image-local-push: syncer-image-local-build

.PHONY: syncer-image-build
syncer-image-build:
cd $(CURDIR)/tools/dynamic-lora-sidecar && $(IMAGE_BUILD_CMD) -t $(SYNCER_IMAGE_TAG) \
--platform=$(PLATFORMS) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
$(PUSH) \
$(SYNCER_IMAGE_BUILD_EXTRA_OPTS) ./

.PHONY: syncer-image-push
syncer-image-push: PUSH=--push
syncer-image-push: syncer-image-build

##@ Docs

.PHONY: build-docs
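The new targets mirror the existing EPP image targets, so they can be exercised the same way. A minimal sketch of how they might be invoked locally, assuming a registry you can push to (`ghcr.io/example` below is a placeholder and not part of this change):

```sh
# Build the lora-syncer image with buildx for the configured PLATFORMS.
make syncer-image-build IMAGE_REGISTRY=ghcr.io/example

# Build and push in one step; the *-push targets set PUSH=--push before delegating.
make syncer-image-push IMAGE_REGISTRY=ghcr.io/example
```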
8 changes: 8 additions & 0 deletions cloudbuild.yaml
@@ -12,6 +12,14 @@ steps:
    - GIT_TAG=$_GIT_TAG
    - EXTRA_TAG=$_PULL_BASE_REF
    - DOCKER_BUILDX_CMD=/buildx-entrypoint
  - name: lora-adapter-syncer
    entrypoint: make
    args:
    - syncer-image-push
    env:
    - GIT_TAG=$_GIT_TAG
    - EXTRA_TAG=$_PULL_BASE_REF
    - DOCKER_BUILDX_CMD=/buildx-entrypoint
substitutions:
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
# can be used as a substitution
145 changes: 145 additions & 0 deletions pkg/manifests/vllm/deployment-with-syncer.yaml
@@ -0,0 +1,145 @@
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool
    spec:
      containers:
      - name: lora
        image: "vllm/vllm-openai:latest"
        imagePullPolicy: Always
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - "--model"
        - "meta-llama/Llama-2-7b-hf"
        - "--tensor-parallel-size"
        - "1"
        - "--port"
        - "8000"
        - "--enable-lora"
        - "--max-loras"
        - "4"
        - "--max-cpu-loras"
        - "12"
        - "--lora-modules"
        - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
        - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
        env:
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
          value: "true"
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        livenessProbe:
          failureThreshold: 240
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        readinessProbe:
          failureThreshold: 600
          httpGet:
            path: /health
            port: http
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          limits:
            nvidia.com/gpu: 1
          requests:
            nvidia.com/gpu: 1
        volumeMounts:
        - mountPath: /data
          name: data
        - mountPath: /dev/shm
          name: shm
        - name: adapters
          mountPath: "/adapters"
      initContainers:
      - name: lora-adapter-syncer
        tty: true
        stdin: true
        image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be
        restartPolicy: Always
        imagePullPolicy: Always
        env:
        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
          value: "/config/configmap.yaml"
        volumeMounts: # DO NOT USE subPath
        - name: config-volume
          mountPath: /config
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
      - name: data
        emptyDir: {}
      - name: shm
        emptyDir:
          medium: Memory
      - name: adapters
        emptyDir: {}
      - name: config-volume
        configMap:
          name: dynamic-lora-config

---

apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      port: 8000
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-0
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-1
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
      ensureNotExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-2
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
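For a quick sanity check after applying this manifest, the syncer sidecar's logs show whether it is reconciling the adapters listed in `dynamic-lora-config`. A hedged sketch (the local file path is an assumption about where this manifest sits in your checkout; the deployment and container names come from the manifest above):

```sh
# Deploy the model server, LoRA syncer sidecar, and its ConfigMap.
kubectl apply -f pkg/manifests/vllm/deployment-with-syncer.yaml

# Follow the sidecar logs to watch it reconcile ensureExist/ensureNotExist.
kubectl logs deployment/vllm-llama2-7b-pool -c lora-adapter-syncer -f
```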
37 changes: 2 additions & 35 deletions pkg/manifests/vllm/deployment.yaml
@@ -43,18 +43,8 @@ spec:
- "--max-cpu-loras"
- "12"
- "--lora-modules"
- "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
- "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
- 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
- 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
- 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
- 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
- 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
- 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
- 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
- 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
- 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
- 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
env:
- name: PORT
value: "8000"
@@ -99,29 +89,6 @@ spec:
name: shm
- name: adapters
mountPath: "/adapters"
initContainers:
- name: adapter-loader
image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
command: ["python"]
args:
- ./pull_adapters.py
- --adapter
- yard1/llama-2-7b-sql-lora-test
- --adapter
- vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
- --duplicate-count
- "5"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: token
- name: HF_HOME
value: /adapters
volumeMounts:
- name: adapters
mountPath: "/adapters"
restartPolicy: Always
schedulerName: default-scheduler
terminationGracePeriodSeconds: 30
95 changes: 95 additions & 0 deletions site-src/guides/dynamic-lora.md
@@ -0,0 +1,95 @@
# Getting started with Gateway API Inference Extension with dynamic LoRA updates on vLLM

The goal of this guide is to get a single InferencePool running with vLLM and demonstrate dynamic LoRA adapter updates.

### Requirements
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
- A cluster with:
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.

### Steps

1. **Deploy the sample vLLM model server with dynamic LoRA updates enabled and the LoRA syncer sidecar**
   Apply the [sample vLLM deployment with dynamic LoRA adapters enabled, the LoRA syncer sidecar, and its ConfigMap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml), as shown in the sketch below.
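
A minimal sketch of this step, following the same `kubectl apply` pattern as the quickstart. The raw URL is derived from the link above and may need adjusting if the manifest path changes:

```sh
# Hugging Face token with access to Llama2, as in the general setup guide.
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN

# Deploy the vLLM server with the LoRA syncer sidecar and its ConfigMap.
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml
```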

The rest of the steps are the same as in the [general setup guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md).


### Safely roll out a v2 adapter

1. Update the LoRA ConfigMap to add the new adapter, ***tweet-summary-2***, to `ensureExist` (a verification sketch follows these steps):

``` yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      port: 8000
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-0
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-1
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-2
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
```

2. Configure a canary rollout with a traffic split using LLMService. In this example, 40% of the traffic for the tweet-summary model will be sent to the new ***tweet-summary-2*** adapter.

``` yaml
model:
  name: tweet-summary
  targetModels:
  - targetModelName: tweet-summary-0
    weight: 20
  - targetModelName: tweet-summary-1
    weight: 40
  - targetModelName: tweet-summary-2
    weight: 40
```

3. Finish the rollout by shifting 100% of the traffic to the new adapter.
```yaml
model:
  name: tweet-summary
  targetModels:
  - targetModelName: tweet-summary-2
    weight: 100
```

4. Remove the old adapter versions from the dynamic LoRA ConfigMap by moving them to `ensureNotExist`.
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      name: sql-loras-llama
      port: 8000
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-2
          source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
      ensureNotExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-1
          source: gs://[HUGGING FACE PATH]
        - base-model: meta-llama/Llama-2-7b-hf
          id: tweet-summary-0
          source: gs://[HUGGING FACE PATH]
```
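
To verify each stage of the rollout, one option is to query the vLLM server's OpenAI-compatible `/v1/models` endpoint, which lists the adapters currently registered with the server. A sketch, assuming the Service name from the sidecar deployment manifest and that `jq` is available locally (both assumptions, not part of this guide):

```sh
# Port-forward the model server Service locally.
kubectl port-forward svc/vllm-llama2-7b-pool 8000:8000

# In another terminal: list the served models and adapters.
# After step 1 this should include tweet-summary-0, -1, and -2;
# after step 4 only tweet-summary-2 should remain.
curl -s localhost:8000/v1/models | jq -r '.data[].id'
```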
4 changes: 4 additions & 0 deletions site-src/guides/index.md
@@ -19,6 +19,10 @@ This quickstart guide is intended for engineers familiar with k8s and model servers
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
```




1. **Install the Inference Extension CRDs:**

```sh