Skip to content

Commit e43f5e9

Browse files
committed
Add makefile and cloudbuild file to build and push lora-syncer
Signed-off-by: Kunjan <[email protected]>
1 parent cd2fc96 commit e43f5e9

File tree

5 files changed

+334
-64
lines changed

5 files changed

+334
-64
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
# ClusterIP Service exposing the vLLM model-server pods on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
  type: ClusterIP
13+
---
14+
apiVersion: apps/v1
15+
kind: Deployment
16+
metadata:
17+
name: vllm-llama2-7b-pool
18+
spec:
19+
replicas: 3
20+
selector:
21+
matchLabels:
22+
app: vllm-llama2-7b-pool
23+
template:
24+
metadata:
25+
labels:
26+
app: vllm-llama2-7b-pool
27+
spec:
28+
containers:
29+
- name: lora
30+
image: "vllm/vllm-openai:latest"
31+
imagePullPolicy: Always
32+
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
33+
args:
34+
- "--model"
35+
- "meta-llama/Llama-2-7b-hf"
36+
- "--tensor-parallel-size"
37+
- "1"
38+
- "--port"
39+
- "8000"
40+
- "--enable-lora"
41+
- "--max-loras"
42+
- "4"
43+
- "--max-cpu-loras"
44+
- "12"
45+
- "--lora-modules"
46+
- "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
47+
- "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
48+
- 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
49+
- 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
50+
- 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
51+
- 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
52+
- 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
53+
- 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
54+
- 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
55+
- 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
56+
- 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
57+
- 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
58+
env:
59+
- name: PORT
60+
value: "8000"
61+
- name: HUGGING_FACE_HUB_TOKEN
62+
valueFrom:
63+
secretKeyRef:
64+
name: hf-token
65+
key: token
66+
ports:
67+
- containerPort: 8000
68+
name: http
69+
protocol: TCP
70+
livenessProbe:
71+
failureThreshold: 240
72+
httpGet:
73+
path: /health
74+
port: http
75+
scheme: HTTP
76+
initialDelaySeconds: 5
77+
periodSeconds: 5
78+
successThreshold: 1
79+
timeoutSeconds: 1
80+
readinessProbe:
81+
failureThreshold: 600
82+
httpGet:
83+
path: /health
84+
port: http
85+
scheme: HTTP
86+
initialDelaySeconds: 5
87+
periodSeconds: 5
88+
successThreshold: 1
89+
timeoutSeconds: 1
90+
resources:
91+
limits:
92+
nvidia.com/gpu: 1
93+
requests:
94+
nvidia.com/gpu: 1
95+
volumeMounts:
96+
- mountPath: /data
97+
name: data
98+
- mountPath: /dev/shm
99+
name: shm
100+
- name: adapters
101+
mountPath: "/adapters"
102+
initContainers:
103+
- name: adapter-loader
104+
image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
105+
command: ["python"]
106+
args:
107+
- ./pull_adapters.py
108+
- --adapter
109+
- yard1/llama-2-7b-sql-lora-test
110+
- --adapter
111+
- vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
112+
- --duplicate-count
113+
- "5"
114+
env:
115+
- name: HF_TOKEN
116+
valueFrom:
117+
secretKeyRef:
118+
name: hf-token
119+
key: token
120+
- name: HF_HOME
121+
value: /adapters
122+
volumeMounts:
123+
- name: adapters
124+
mountPath: "/adapters"
125+
- name: lora-adapter-syncer
126+
tty: true
127+
stdin: true
128+
image: <SIDECAR_IMAGE>
129+
restartPolicy: Always
130+
imagePullPolicy: Always
131+
env:
132+
- name: DYNAMIC_LORA_ROLLOUT_CONFIG
133+
value: "/config/configmap.yaml"
134+
volumeMounts: # DO NOT USE subPath
135+
- name: config-volume
136+
mountPath: /config
137+
restartPolicy: Always
138+
schedulerName: default-scheduler
139+
terminationGracePeriodSeconds: 30
140+
volumes:
141+
- name: data
142+
emptyDir: {}
143+
- name: shm
144+
emptyDir:
145+
medium: Memory
146+
- name: adapters
147+
emptyDir: {}
148+
- name: config-volume
149+
configMap:
150+
name: dynamic-lora-config
151+
152+
---
153+
154+
apiVersion: v1
155+
kind: ConfigMap
156+
metadata:
157+
name: dynamic-lora-config
158+
data:
159+
configmap.yaml: |
160+
vLLMLoRAConfig:
161+
host: modelServerHost
162+
name: sql-loras-llama
163+
port: modelServerPort
164+
ensureExist:
165+
models:
166+
- base-model: meta-llama/Llama-2-7b-hf
167+
id: sql-lora-v1
168+
source: yard1/llama-2-7b-sql-lora-test
169+
- base-model: meta-llama/Llama-2-7b-hf
170+
id: sql-lora-v3
171+
source: yard1/llama-2-7b-sql-lora-test
172+
- base-model: meta-llama/Llama-2-7b-hf
173+
id: sql-lora-v4
174+
source: yard1/llama-2-7b-sql-lora-test
175+
ensureNotExist:
176+
models:
177+
- base-model: meta-llama/Llama-2-7b-hf
178+
id: sql-lora-v2
179+
source: yard1/llama-2-7b-sql-lora-test

site-src/guides/dynamic-lora.md

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm
2+
3+
The goal of this guide is to get a single InferencePool running with vLLM and to demonstrate dynamic LoRA updating!
4+
5+
### Requirements
6+
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
7+
- A cluster with:
8+
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
9+
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
10+
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
11+
12+
### Steps
13+
14+
1. **Deploy a sample vLLM model server with dynamic LoRA updates enabled and the dynamic LoRA syncer sidecar**
15+
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml)
16+
17+
The rest of the steps are the same as in the [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md)
18+
19+
20+
### Safely rollout v2 adapter
21+
22+
1. Update lora configmap
23+
24+
``` yaml
25+
26+
apiVersion: v1
27+
kind: ConfigMap
28+
metadata:
29+
name: dynamic-lora-config
30+
data:
31+
configmap.yaml: |
32+
vLLMLoRAConfig:
33+
ensureExist:
34+
models:
35+
- id: chatbot-v1
36+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
37+
- id: chatbot-v2
38+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
39+
```
40+
41+
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
42+
43+
``` yaml
44+
model:
45+
name: chatbot
46+
targetModels:
47+
targetModelName: chatbot-v1
48+
weight: 90
49+
targetModelName: chatbot-v2
50+
weight: 10
51+
```
52+
53+
3. Finish rollout by setting the traffic to the new version 100%.
54+
```yaml
model:
  name: chatbot
  targetModels:
  - targetModelName: chatbot-v2
    weight: 100
```
61+
62+
4. Remove v1 from dynamic lora configmap.
63+
```yaml
64+
apiVersion: v1
65+
kind: ConfigMap
66+
metadata:
67+
name: dynamic-lora-config
68+
data:
69+
configmap.yaml: |
70+
vLLMLoRAConfig:
71+
ensureExist:
72+
models:
73+
- id: chatbot-v2
74+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
75+
ensureNotExist: # Explicitly unregisters the adapter from model servers
76+
models:
77+
- id: chatbot-v1
78+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
79+
```

site-src/guides/index.md

-64
Original file line numberDiff line numberDiff line change
@@ -19,70 +19,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
1919
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
2020
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
2121
```
22-
**OPTIONALLY**: Enable Dynamic loading of Lora adapters.
23-
24-
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/tools/dynamic-lora-sidecar/deployment.yaml)
25-
26-
***Safely rollout v2 adapter***
27-
28-
1. Update lora configmap
29-
30-
``` yaml
31-
32-
apiVersion: v1
33-
kind: ConfigMap
34-
metadata:
35-
name: dynamic-lora-config
36-
data:
37-
configmap.yaml: |
38-
vLLMLoRAConfig:
39-
ensureExist:
40-
models:
41-
- id: chatbot-v1
42-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
43-
- id: chatbot-v2
44-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
45-
```
46-
47-
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
48-
49-
``` yaml
50-
model:
51-
name: chatbot
52-
targetModels:
53-
targetModelName: chatbot-v1
54-
weight: 90
55-
targetModelName: chatbot-v2
56-
weight: 10
57-
```
58-
59-
3. Finish rollout by setting the traffic to the new version 100%.
60-
```yaml
61-
model:
62-
name: chatbot
63-
targetModels:
64-
targetModelName: chatbot-v2
65-
weight: 100
66-
```
67-
68-
4. Remove v1 from dynamic lora configmap.
69-
```yaml
70-
apiVersion: v1
71-
kind: ConfigMap
72-
metadata:
73-
name: dynamic-lora-config
74-
data:
75-
configmap.yaml: |
76-
vLLMLoRAConfig:
77-
ensureExist:
78-
models:
79-
- id: chatbot-v2
80-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
81-
ensureNotExist: # Explicitly unregisters the adapter from model servers
82-
models:
83-
- id: chatbot-v1
84-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
85-
```
8622

8723

8824

tools/dynamic-lora-sidecar/Makefile

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
IMAGE_NAME := lora-syncer
2+
IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/llm-instance-gateway
3+
IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
4+
5+
GIT_TAG ?= $(shell git describe --tags --dirty --always)
6+
EXTRA_TAG ?= $(if $(_PULL_BASE_REF),$(_PULL_BASE_REF),main)
7+
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
8+
EXTRA_IMAGE_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
9+
10+
11+
PLATFORMS ?= linux/amd64
12+
13+
14+
DOCKER_BUILDX_CMD ?= docker buildx
15+
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
16+
IMAGE_BUILD_EXTRA_OPTS ?=
17+
18+
# --- Targets ---
19+
.PHONY: image-local-build
20+
image-local-build:
21+
BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
22+
$(MAKE) image-build PUSH=$(PUSH)
23+
$(DOCKER_BUILDX_CMD) rm $$BUILDER
24+
25+
.PHONY: image-local-push
26+
image-local-push: PUSH=--push
27+
image-local-push: image-local-build
28+
29+
.PHONY: image-build
30+
image-build:
31+
$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
32+
--platform=$(PLATFORMS) \
33+
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
34+
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
35+
$(PUSH) \
36+
$(IMAGE_BUILD_EXTRA_OPTS) ./
37+
38+
.PHONY: image-push
39+
image-push: PUSH=--push
40+
image-push: image-build
41+
42+
.PHONY: run
43+
run:
44+
docker run -v $(CURDIR)/config:/config -u appuser $(IMAGE_TAG) # Use the user name
45+
46+
.PHONY: clean
47+
clean:
48+
docker rmi $(IMAGE_TAG) $(EXTRA_IMAGE_TAG) 2>/dev/null || true
49+
50+
.PHONY: clean-dangling
51+
clean-dangling:
52+
docker rmi $(docker images -f "dangling=true" -q) 2>/dev/null || true
53+
54+
.PHONY: test
55+
test:
56+
python -m unittest discover
57+
58+
.PHONY: all
59+
all: test image-build

0 commit comments

Comments
 (0)