Skip to content

Commit 88c20f1

Browse files
coolkp, ahg-g, and danehans
authored
Lora syncer docs (#320)
* Integrate dynamic-lora-sidecar into main guide and add makefile, cloudbuild to build and publish lora-syncer image Signed-off-by: Kunjan <[email protected]> * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan <[email protected]> * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan <[email protected]> * Update site-src/guides/dynamic-lora.md Co-authored-by: Abdullah Gharaibeh <[email protected]> * Update site-src/guides/dynamic-lora.md Co-authored-by: Abdullah Gharaibeh <[email protected]> * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan <[email protected]> * Adds image-load and kind-load Make targets (#288) Signed-off-by: Daneyon Hansen <[email protected]> * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan <[email protected]> * Add build targets for lora syncer Signed-off-by: Kunjan <[email protected]> * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review --------- Signed-off-by: Kunjan <[email protected]> Signed-off-by: Daneyon Hansen <[email protected]> Co-authored-by: Abdullah Gharaibeh <[email protected]> Co-authored-by: Daneyon Hansen <[email protected]>
1 parent 8233946 commit 88c20f1

File tree

6 files changed

+284
-35
lines changed

6 files changed

+284
-35
lines changed

Makefile

+32
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,16 @@ PLATFORMS ?= linux/amd64
2626
DOCKER_BUILDX_CMD ?= docker buildx
2727
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
2828
IMAGE_BUILD_EXTRA_OPTS ?=
29+
SYNCER_IMAGE_BUILD_EXTRA_OPTS ?=
2930
IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
3031
IMAGE_NAME := epp
3132
IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
3233
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
3334

35+
SYNCER_IMAGE_NAME := lora-syncer
36+
SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME)
37+
SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG)
38+
3439
BASE_IMAGE ?= gcr.io/distroless/base-debian10
3540
BUILDER_IMAGE ?= golang:1.23-alpine
3641
ifdef GO_VERSION
@@ -39,9 +44,11 @@ endif
3944

4045
ifdef EXTRA_TAG
4146
IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
47+
SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
4248
endif
4349
ifdef IMAGE_EXTRA_TAG
4450
IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
51+
SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG)
4552
endif
4653

4754
# The name of the kind cluster to use for the "kind-load" target.
@@ -171,6 +178,31 @@ image-load: image-build
171178
image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
172179
kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
173180

181+
##@ Lora Syncer
182+
183+
.PHONY: syncer-image-local-build
184+
syncer-image-local-build:
185+
BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
186+
$(MAKE) syncer-image-build PUSH=$(PUSH)
187+
$(DOCKER_BUILDX_CMD) rm $$BUILDER
188+
189+
.PHONY: syncer-image-local-push
190+
syncer-image-local-push: PUSH=--push
191+
syncer-image-local-push: syncer-image-local-build
192+
193+
.PHONY: syncer-image-build
194+
syncer-image-build:
195+
cd $(CURDIR)/tools/dynamic-lora-sidecar && $(IMAGE_BUILD_CMD) -t $(SYNCER_IMAGE_TAG) \
196+
--platform=$(PLATFORMS) \
197+
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
198+
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
199+
$(PUSH) \
200+
$(SYNCER_IMAGE_BUILD_EXTRA_OPTS) ./
201+
202+
.PHONY: syncer-image-push
203+
syncer-image-push: PUSH=--push
204+
syncer-image-push: syncer-image-build
205+
174206
##@ Docs
175207

176208
.PHONY: build-docs

cloudbuild.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ steps:
1212
- GIT_TAG=$_GIT_TAG
1313
- EXTRA_TAG=$_PULL_BASE_REF
1414
- DOCKER_BUILDX_CMD=/buildx-entrypoint
15+
- name: lora-adapter-syncer
16+
entrypoint: make
17+
args:
18+
- syncer-image-push
19+
env:
20+
- GIT_TAG=$_GIT_TAG
21+
- EXTRA_TAG=$_PULL_BASE_REF
22+
- DOCKER_BUILDX_CMD=/buildx-entrypoint
1523
substitutions:
1624
# _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and
1725
# can be used as a substitution
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: vllm-llama2-7b-pool
5+
spec:
6+
selector:
7+
app: vllm-llama2-7b-pool
8+
ports:
9+
- protocol: TCP
10+
port: 8000
11+
targetPort: 8000
12+
type: ClusterIP
13+
---
14+
apiVersion: apps/v1
15+
kind: Deployment
16+
metadata:
17+
name: vllm-llama2-7b-pool
18+
spec:
19+
replicas: 3
20+
selector:
21+
matchLabels:
22+
app: vllm-llama2-7b-pool
23+
template:
24+
metadata:
25+
labels:
26+
app: vllm-llama2-7b-pool
27+
spec:
28+
containers:
29+
- name: lora
30+
image: "vllm/vllm-openai:latest"
31+
imagePullPolicy: Always
32+
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
33+
args:
34+
- "--model"
35+
- "meta-llama/Llama-2-7b-hf"
36+
- "--tensor-parallel-size"
37+
- "1"
38+
- "--port"
39+
- "8000"
40+
- "--enable-lora"
41+
- "--max-loras"
42+
- "4"
43+
- "--max-cpu-loras"
44+
- "12"
45+
- "--lora-modules"
46+
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
47+
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
48+
env:
49+
- name: PORT
50+
value: "8000"
51+
- name: HUGGING_FACE_HUB_TOKEN
52+
valueFrom:
53+
secretKeyRef:
54+
name: hf-token
55+
key: token
56+
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
57+
value: "true"
58+
ports:
59+
- containerPort: 8000
60+
name: http
61+
protocol: TCP
62+
livenessProbe:
63+
failureThreshold: 240
64+
httpGet:
65+
path: /health
66+
port: http
67+
scheme: HTTP
68+
initialDelaySeconds: 5
69+
periodSeconds: 5
70+
successThreshold: 1
71+
timeoutSeconds: 1
72+
readinessProbe:
73+
failureThreshold: 600
74+
httpGet:
75+
path: /health
76+
port: http
77+
scheme: HTTP
78+
initialDelaySeconds: 5
79+
periodSeconds: 5
80+
successThreshold: 1
81+
timeoutSeconds: 1
82+
resources:
83+
limits:
84+
nvidia.com/gpu: 1
85+
requests:
86+
nvidia.com/gpu: 1
87+
volumeMounts:
88+
- mountPath: /data
89+
name: data
90+
- mountPath: /dev/shm
91+
name: shm
92+
- name: adapters
93+
mountPath: "/adapters"
94+
initContainers:
95+
- name: lora-adapter-syncer
96+
tty: true
97+
stdin: true
98+
image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be
99+
restartPolicy: Always
100+
imagePullPolicy: Always
101+
env:
102+
- name: DYNAMIC_LORA_ROLLOUT_CONFIG
103+
value: "/config/configmap.yaml"
104+
volumeMounts: # DO NOT USE subPath
105+
- name: config-volume
106+
mountPath: /config
107+
restartPolicy: Always
108+
schedulerName: default-scheduler
109+
terminationGracePeriodSeconds: 30
110+
volumes:
111+
- name: data
112+
emptyDir: {}
113+
- name: shm
114+
emptyDir:
115+
medium: Memory
116+
- name: adapters
117+
emptyDir: {}
118+
- name: config-volume
119+
configMap:
120+
name: dynamic-lora-config
121+
122+
---
123+
124+
apiVersion: v1
125+
kind: ConfigMap
126+
metadata:
127+
name: dynamic-lora-config
128+
data:
129+
configmap.yaml: |
130+
vLLMLoRAConfig:
131+
name: sql-loras-llama
132+
port: 8000
133+
ensureExist:
134+
models:
135+
- base-model: meta-llama/Llama-2-7b-hf
136+
id: tweet-summary-0
137+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
138+
- base-model: meta-llama/Llama-2-7b-hf
139+
id: tweet-summary-1
140+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
141+
ensureNotExist:
142+
models:
143+
- base-model: meta-llama/Llama-2-7b-hf
144+
id: tweet-summary-2
145+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm

pkg/manifests/vllm/deployment.yaml

+2-35
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,8 @@ spec:
4343
- "--max-cpu-loras"
4444
- "12"
4545
- "--lora-modules"
46-
- "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
47-
- "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
48-
- 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
49-
- 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
50-
- 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
51-
- 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
52-
- 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
53-
- 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
54-
- 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
55-
- 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
56-
- 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
57-
- 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
46+
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
47+
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
5848
env:
5949
- name: PORT
6050
value: "8000"
@@ -99,29 +89,6 @@ spec:
9989
name: shm
10090
- name: adapters
10191
mountPath: "/adapters"
102-
initContainers:
103-
- name: adapter-loader
104-
image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
105-
command: ["python"]
106-
args:
107-
- ./pull_adapters.py
108-
- --adapter
109-
- yard1/llama-2-7b-sql-lora-test
110-
- --adapter
111-
- vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
112-
- --duplicate-count
113-
- "5"
114-
env:
115-
- name: HF_TOKEN
116-
valueFrom:
117-
secretKeyRef:
118-
name: hf-token
119-
key: token
120-
- name: HF_HOME
121-
value: /adapters
122-
volumeMounts:
123-
- name: adapters
124-
mountPath: "/adapters"
12592
restartPolicy: Always
12693
schedulerName: default-scheduler
12794
terminationGracePeriodSeconds: 30

site-src/guides/dynamic-lora.md

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm
2+
3+
The goal of this guide is to get a single InferencePool running with vLLM and demonstrate use of dynamic lora updating!
4+
5+
### Requirements
6+
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
7+
- A cluster with:
8+
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
9+
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
10+
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
11+
12+
### Steps
13+
14+
1. **Deploy a sample vLLM model server with dynamic LoRA updates enabled and the dynamic LoRA syncer sidecar**
15+
[Redeploy the vLLM deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml)
16+
17+
The rest of the steps are the same as in the [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md)
18+
19+
20+
### Safely rollout v2 adapter
21+
22+
1. Update the LoRA syncer ConfigMap to make the new adapter version available on the model servers.
23+
24+
```yaml
25+
apiVersion: v1
26+
kind: ConfigMap
27+
metadata:
28+
name: dynamic-lora-config
29+
data:
30+
configmap.yaml: |
31+
vLLMLoRAConfig:
32+
name: sql-loras-llama
33+
port: 8000
34+
ensureExist:
35+
models:
36+
- base-model: meta-llama/Llama-2-7b-hf
37+
id: tweet-summary-0
38+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
39+
- base-model: meta-llama/Llama-2-7b-hf
40+
id: tweet-summary-1
41+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
42+
- base-model: meta-llama/Llama-2-7b-hf
43+
id: tweet-summary-2
44+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
45+
2. Configure a canary rollout with a traffic split using LLMService. In this example, 40% of traffic for the tweet-summary model will be sent to the ***tweet-summary-2*** adapter.
46+
47+
```yaml
48+
model:
49+
name: tweet-summary
50+
targetModels:
51+
- targetModelName: tweet-summary-0
52+
  weight: 20
53+
- targetModelName: tweet-summary-1
54+
  weight: 40
55+
- targetModelName: tweet-summary-2
56+
  weight: 40
57+
58+
```
59+
60+
3. Finish the rollout by shifting 100% of the traffic to the new adapter version.
61+
```yaml
62+
model:
63+
name: tweet-summary
64+
targetModels:
65+
- targetModelName: tweet-summary-2
66+
  weight: 100
67+
```
68+
69+
4. Remove v1 from dynamic lora configmap.
70+
```yaml
71+
apiVersion: v1
72+
kind: ConfigMap
73+
metadata:
74+
name: dynamic-lora-config
75+
data:
76+
configmap.yaml: |
77+
vLLMLoRAConfig:
78+
name: sql-loras-llama
79+
port: 8000
80+
ensureExist:
81+
models:
82+
- base-model: meta-llama/Llama-2-7b-hf
83+
id: tweet-summary-2
84+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
85+
ensureNotExist:
86+
models:
87+
- base-model: meta-llama/Llama-2-7b-hf
88+
id: tweet-summary-1
89+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
90+
- base-model: meta-llama/Llama-2-7b-hf
91+
id: tweet-summary-0
92+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
93+
```

site-src/guides/index.md

+4
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
1919
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
2020
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
2121
```
22+
23+
24+
25+
2226
1. **Install the Inference Extension CRDs:**
2327

2428
```sh

0 commit comments

Comments
 (0)