Skip to content

Commit 9efc096

Browse files
committed
Add makefile and cloudbuild file to build and push lora-syncer
Signed-off-by: Kunjan <[email protected]>
1 parent cd2fc96 commit 9efc096

File tree

5 files changed

+336
-64
lines changed

5 files changed

+336
-64
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
# Service fronting the vLLM model-server pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
  type: ClusterIP
---
# vLLM deployment with dynamic LoRA updating enabled and a syncer sidecar
# that reconciles loaded adapters against the dynamic-lora-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool
    spec:
      containers:
        - name: lora
          image: "vllm/vllm-openai:latest"
          imagePullPolicy: Always
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model"
            - "meta-llama/Llama-2-7b-hf"
            - "--tensor-parallel-size"
            - "1"
            - "--port"
            - "8000"
            - "--enable-lora"
            - "--max-loras"
            - "4"
            - "--max-cpu-loras"
            - "12"
            # Statically registered adapters; paths point into the shared
            # /adapters emptyDir populated by the adapter-loader init container.
            - "--lora-modules"
            - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
            - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
            - 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
            - 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
            - 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
            - 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
            - 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
            - 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
            - 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
            - 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
            - 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
            - 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
          env:
            - name: PORT
              value: "8000"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
            # Required so the syncer sidecar can load/unload adapters at runtime.
            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
              value: "true"
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          livenessProbe:
            failureThreshold: 240
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          readinessProbe:
            failureThreshold: 600
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: adapters
              mountPath: "/adapters"
      initContainers:
        # Downloads the adapters into the shared emptyDir before vLLM starts.
        - name: adapter-loader
          image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
          command: ["python"]
          args:
            - ./pull_adapters.py
            - --adapter
            - yard1/llama-2-7b-sql-lora-test
            - --adapter
            - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
            - --duplicate-count
            - "5"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
            - name: HF_HOME
              value: /adapters
          volumeMounts:
            - name: adapters
              mountPath: "/adapters"
        # Native sidecar (init container with restartPolicy: Always) that keeps
        # the server's adapters in sync with the mounted ConfigMap.
        # <SIDECAR_IMAGE> is a placeholder substituted at deploy time.
        - name: lora-adapter-syncer
          tty: true
          stdin: true
          image: <SIDECAR_IMAGE>
          restartPolicy: Always
          imagePullPolicy: Always
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts: # DO NOT USE subPath
            - name: config-volume
              mountPath: /config
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: adapters
          emptyDir: {}
        - name: config-volume
          configMap:
            name: dynamic-lora-config

---

# Desired adapter state consumed by the lora-adapter-syncer sidecar.
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      host: modelServerHost
      name: sql-loras-llama
      port: modelServerPort
      ensureExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v1
            source: yard1/llama-2-7b-sql-lora-test
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v3
            source: yard1/llama-2-7b-sql-lora-test
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v4
            source: yard1/llama-2-7b-sql-lora-test
      ensureNotExist:
        models:
          - base-model: meta-llama/Llama-2-7b-hf
            id: sql-lora-v2
            source: yard1/llama-2-7b-sql-lora-test

site-src/guides/dynamic-lora.md

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm
2+
3+
The goal of this guide is to get a single InferencePool running with vLLM and to demonstrate dynamic LoRA updating!
4+
5+
### Requirements
6+
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
7+
- A cluster with:
8+
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
9+
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
10+
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
11+
12+
### Steps
13+
14+
1. **Deploy a sample vLLM model server with dynamic LoRA updates enabled and the LoRA syncer sidecar**
15+
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml)
16+
17+
The rest of the steps are the same as in the [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md)
18+
19+
20+
### Safely rollout v2 adapter
21+
22+
1. Update lora configmap
23+
24+
``` yaml
25+
26+
apiVersion: v1
27+
kind: ConfigMap
28+
metadata:
29+
name: dynamic-lora-config
30+
data:
31+
configmap.yaml: |
32+
vLLMLoRAConfig:
33+
ensureExist:
34+
models:
35+
        - id: tweet-summary-v1
          source: /adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1
        - id: tweet-summary-v2
          source: /adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2
39+
```
40+
41+
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
42+
43+
``` yaml
44+
model:
  name: chatbot
  targetModels:
  - targetModelName: chatbot-v1
    weight: 90
  - targetModelName: chatbot-v2
    weight: 10
51+
```
52+
53+
3. Finish rollout by setting the traffic to the new version 100%.
54+
```yaml
55+
model:
  name: chatbot
  targetModels:
  - targetModelName: chatbot-v2
    weight: 100
60+
```
61+
62+
4. Remove v1 from dynamic lora configmap.
63+
```yaml
64+
apiVersion: v1
65+
kind: ConfigMap
66+
metadata:
67+
name: dynamic-lora-config
68+
data:
69+
configmap.yaml: |
70+
vLLMLoRAConfig:
71+
ensureExist:
72+
models:
73+
- id: chatbot-v2
74+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
75+
ensureNotExist: # Explicitly unregisters the adapter from model servers
76+
models:
77+
- id: chatbot-v1
78+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
79+
```

site-src/guides/index.md

-64
Original file line numberDiff line numberDiff line change
@@ -19,70 +19,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
1919
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
2020
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
2121
```
22-
**OPTIONALLY**: Enable Dynamic loading of Lora adapters.
23-
24-
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/tools/dynamic-lora-sidecar/deployment.yaml)
25-
26-
***Safely rollout v2 adapter***
27-
28-
1. Update lora configmap
29-
30-
``` yaml
31-
32-
apiVersion: v1
33-
kind: ConfigMap
34-
metadata:
35-
name: dynamic-lora-config
36-
data:
37-
configmap.yaml: |
38-
vLLMLoRAConfig:
39-
ensureExist:
40-
models:
41-
- id: chatbot-v1
42-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
43-
- id: chatbot-v2
44-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
45-
```
46-
47-
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
48-
49-
``` yaml
50-
model:
51-
name: chatbot
52-
targetModels:
53-
targetModelName: chatbot-v1
54-
weight: 90
55-
targetModelName: chatbot-v2
56-
weight: 10
57-
```
58-
59-
3. Finish rollout by setting the traffic to the new version 100%.
60-
```yaml
61-
model:
62-
name: chatbot
63-
targetModels:
64-
targetModelName: chatbot-v2
65-
weight: 100
66-
```
67-
68-
4. Remove v1 from dynamic lora configmap.
69-
```yaml
70-
apiVersion: v1
71-
kind: ConfigMap
72-
metadata:
73-
name: dynamic-lora-config
74-
data:
75-
configmap.yaml: |
76-
vLLMLoRAConfig:
77-
ensureExist:
78-
models:
79-
- id: chatbot-v2
80-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
81-
ensureNotExist: # Explicitly unregisters the adapter from model servers
82-
models:
83-
- id: chatbot-v1
84-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
85-
```
8622

8723

8824

tools/dynamic-lora-sidecar/Makefile

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
IMAGE_NAME := lora-syncer
2+
IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway
3+
IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
4+
5+
GIT_TAG ?= $(shell git describe --tags --dirty --always)
6+
EXTRA_TAG ?= $(if $(_PULL_BASE_REF),$(_PULL_BASE_REF),main)
7+
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
8+
EXTRA_IMAGE_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
9+
10+
11+
PLATFORMS ?= linux/amd64
12+
13+
14+
DOCKER_BUILDX_CMD ?= docker buildx
15+
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
16+
IMAGE_BUILD_EXTRA_OPTS ?=
17+
18+
# --- Targets ---
# Build (and push when PUSH=--push) using a throwaway local buildx builder.
# The whole recipe is one shell invocation: the original set BUILDER via
# $(shell ...) on its own recipe line, but each recipe line runs in its own
# shell, so $$BUILDER was empty in the `buildx rm` line and the temporary
# builder leaked. The exit status of the build is preserved past cleanup.
.PHONY: image-local-build
image-local-build:
	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use); \
	$(MAKE) image-build PUSH=$(PUSH); \
	status=$$?; \
	$(DOCKER_BUILDX_CMD) rm $$BUILDER; \
	exit $$status
# Convenience wrapper: build with the local builder and push the result
# (sets PUSH=--push for the image-local-build chain).
.PHONY: image-local-push
image-local-push: PUSH=--push
image-local-push: image-local-build
# Build the container image for $(PLATFORMS), tagging both the git-derived
# tag and the branch tag (EXTRA_IMAGE_TAG was previously only referenced by
# `clean`, so the branch tag was never actually created). Pass PUSH=--push
# to publish.
# NOTE(review): BASE_IMAGE / BUILDER_IMAGE are not defined in this Makefile;
# presumably CI (cloudbuild) supplies them — confirm, otherwise the
# build-args are passed empty.
.PHONY: image-build
image-build:
	$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
		-t $(EXTRA_IMAGE_TAG) \
		--platform=$(PLATFORMS) \
		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
		$(PUSH) \
		$(IMAGE_BUILD_EXTRA_OPTS) ./
# Build and push in one step (sets PUSH=--push for the image-build recipe).
.PHONY: image-push
image-push: PUSH=--push
image-push: image-build
# Run the syncer locally, mounting ./config as /config inside the container
# and running as the image's non-root user "appuser".
.PHONY: run
run:
	docker run -v $(CURDIR)/config:/config -u appuser $(IMAGE_TAG)
# Remove the locally built image tags; tolerate tags that don't exist.
.PHONY: clean
clean:
	docker rmi $(IMAGE_TAG) $(EXTRA_IMAGE_TAG) 2>/dev/null || true
# Remove dangling (untagged) images. The original wrote
# `$(docker images ...)`, which Make expands as an undefined — hence empty —
# Make variable, so no image IDs ever reached `docker rmi` and the `|| true`
# hid the no-op. The command substitution must be escaped as $$(...) so the
# shell performs it; the emptiness check avoids calling rmi with no args.
.PHONY: clean-dangling
clean-dangling:
	@images=$$(docker images -f "dangling=true" -q); \
	if [ -n "$$images" ]; then docker rmi $$images; fi
# Discover and run the Python unit tests for the syncer.
.PHONY: test
test:
	python -m unittest discover

# Default meta-target: run the tests, then build the image.
.PHONY: all
all: test image-build

0 commit comments

Comments
 (0)