Skip to content

Commit 310c5a8

Browse files
committed
Add makefile and cloudbuild file to build and push lora-syncer
Signed-off-by: Kunjan <[email protected]>
1 parent cd2fc96 commit 310c5a8

File tree

6 files changed

+325
-99
lines changed

6 files changed

+325
-99
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: vllm-llama2-7b-pool
5+
spec:
6+
selector:
7+
app: vllm-llama2-7b-pool
8+
ports:
9+
- protocol: TCP
10+
port: 8000
11+
targetPort: 8000
12+
type: ClusterIP
13+
---
14+
apiVersion: apps/v1
15+
kind: Deployment
16+
metadata:
17+
name: vllm-llama2-7b-pool
18+
spec:
19+
replicas: 3
20+
selector:
21+
matchLabels:
22+
app: vllm-llama2-7b-pool
23+
template:
24+
metadata:
25+
labels:
26+
app: vllm-llama2-7b-pool
27+
spec:
28+
containers:
29+
- name: lora
30+
image: "vllm/vllm-openai:latest"
31+
imagePullPolicy: Always
32+
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
33+
args:
34+
- "--model"
35+
- "meta-llama/Llama-2-7b-hf"
36+
- "--tensor-parallel-size"
37+
- "1"
38+
- "--port"
39+
- "8000"
40+
- "--enable-lora"
41+
- "--max-loras"
42+
- "4"
43+
- "--max-cpu-loras"
44+
- "12"
45+
- "--lora-modules"
46+
- '{"name": "sql-lora-0", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
47+
- '{"name": "sql-lora-1", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
48+
- '{"name": "sql-lora-2", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
49+
- '{"name": "sql-lora-3", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
50+
- '{"name": "sql-lora-4", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
51+
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
52+
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
53+
- '{"name": "tweet-summary-2", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
54+
- '{"name": "tweet-summary-3", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
55+
- '{"name": "tweet-summary-4", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
56+
- '{"name": "sql-lora", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
57+
- '{"name": "tweet-summary", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
58+
env:
59+
- name: PORT
60+
value: "8000"
61+
- name: HUGGING_FACE_HUB_TOKEN
62+
valueFrom:
63+
secretKeyRef:
64+
name: hf-token
65+
key: token
66+
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
67+
value: "true"
68+
ports:
69+
- containerPort: 8000
70+
name: http
71+
protocol: TCP
72+
livenessProbe:
73+
failureThreshold: 240
74+
httpGet:
75+
path: /health
76+
port: http
77+
scheme: HTTP
78+
initialDelaySeconds: 5
79+
periodSeconds: 5
80+
successThreshold: 1
81+
timeoutSeconds: 1
82+
readinessProbe:
83+
failureThreshold: 600
84+
httpGet:
85+
path: /health
86+
port: http
87+
scheme: HTTP
88+
initialDelaySeconds: 5
89+
periodSeconds: 5
90+
successThreshold: 1
91+
timeoutSeconds: 1
92+
resources:
93+
limits:
94+
nvidia.com/gpu: 1
95+
requests:
96+
nvidia.com/gpu: 1
97+
volumeMounts:
98+
- mountPath: /data
99+
name: data
100+
- mountPath: /dev/shm
101+
name: shm
102+
- name: adapters
103+
mountPath: "/adapters"
104+
initContainers:
105+
- name: lora-adapter-syncer
106+
tty: true
107+
stdin: true
108+
image: <SIDECAR_IMAGE> # Replace with the built lora-syncer sidecar image
109+
restartPolicy: Always
110+
imagePullPolicy: Always
111+
env:
112+
- name: DYNAMIC_LORA_ROLLOUT_CONFIG
113+
value: "/config/configmap.yaml"
114+
volumeMounts: # DO NOT USE subPath
115+
- name: config-volume
116+
mountPath: /config
117+
restartPolicy: Always
118+
schedulerName: default-scheduler
119+
terminationGracePeriodSeconds: 30
120+
volumes:
121+
- name: data
122+
emptyDir: {}
123+
- name: shm
124+
emptyDir:
125+
medium: Memory
126+
- name: adapters
127+
emptyDir: {}
128+
- name: config-volume
129+
configMap:
130+
name: dynamic-lora-config
131+
132+
---
133+
134+
apiVersion: v1
135+
kind: ConfigMap
136+
metadata:
137+
name: dynamic-lora-config
138+
data:
139+
configmap.yaml: |
140+
vLLMLoRAConfig:
141+
name: sql-loras-llama
142+
port: 8000
143+
ensureExist:
144+
models:
145+
- base-model: meta-llama/Llama-2-7b-hf
146+
id: sql-lora-v1
147+
source: yard1/llama-2-7b-sql-lora-test
148+
- base-model: meta-llama/Llama-2-7b-hf
149+
id: sql-lora-v3
150+
source: yard1/llama-2-7b-sql-lora-test
151+
- base-model: meta-llama/Llama-2-7b-hf
152+
id: sql-lora-v4
153+
source: yard1/llama-2-7b-sql-lora-test
154+
ensureNotExist:
155+
models:
156+
- base-model: meta-llama/Llama-2-7b-hf
157+
id: sql-lora-v2
158+
source: yard1/llama-2-7b-sql-lora-test

pkg/manifests/vllm/deployment.yaml

+12-35
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,18 @@ spec:
4343
- "--max-cpu-loras"
4444
- "12"
4545
- "--lora-modules"
46-
- "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
47-
- "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
48-
- 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0'
49-
- 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1'
50-
- 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2'
51-
- 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3'
52-
- 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4'
53-
- 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0'
54-
- 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1'
55-
- 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2'
56-
- 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3'
57-
- 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4'
46+
- '{"name": "sql-lora-0", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
47+
- '{"name": "sql-lora-1", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
48+
- '{"name": "sql-lora-2", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
49+
- '{"name": "sql-lora-3", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
50+
- '{"name": "sql-lora-4", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
51+
- '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
52+
- '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
53+
- '{"name": "tweet-summary-2", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
54+
- '{"name": "tweet-summary-3", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
55+
- '{"name": "tweet-summary-4", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
56+
- '{"name": "sql-lora", "path": "yard1/llama-2-7b-sql-lora-test", "base_model_name": "llama-2"}'
57+
- '{"name": "tweet-summary", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}'
5858
env:
5959
- name: PORT
6060
value: "8000"
@@ -99,29 +99,6 @@ spec:
9999
name: shm
100100
- name: adapters
101101
mountPath: "/adapters"
102-
initContainers:
103-
- name: adapter-loader
104-
image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
105-
command: ["python"]
106-
args:
107-
- ./pull_adapters.py
108-
- --adapter
109-
- yard1/llama-2-7b-sql-lora-test
110-
- --adapter
111-
- vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
112-
- --duplicate-count
113-
- "5"
114-
env:
115-
- name: HF_TOKEN
116-
valueFrom:
117-
secretKeyRef:
118-
name: hf-token
119-
key: token
120-
- name: HF_HOME
121-
value: /adapters
122-
volumeMounts:
123-
- name: adapters
124-
mountPath: "/adapters"
125102
restartPolicy: Always
126103
schedulerName: default-scheduler
127104
terminationGracePeriodSeconds: 30

site-src/guides/dynamic-lora.md

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Getting started with Gateway API Inference Extension with dynamic LoRA updates on vLLM
2+
3+
The goal of this guide is to get a single InferencePool running with vLLM and demonstrate the use of dynamic LoRA updating!
4+
5+
### Requirements
6+
- Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
7+
- A cluster with:
8+
- Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind,
9+
you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
10+
- 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed.
11+
12+
### Steps
13+
14+
1. **Deploy sample vLLM model server with dynamic LoRA update enabled and a dynamic LoRA syncer sidecar**
15+
[Deploy a sample vLLM deployment with dynamic LoRA adapters enabled, plus the LoRA syncer sidecar and ConfigMap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml)
16+
17+
The rest of the steps are the same as the [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md)
18+
19+
20+
### Safely rollout v2 adapter
21+
22+
1. Update lora configmap
23+
24+
``` yaml
25+
26+
apiVersion: v1
27+
kind: ConfigMap
28+
metadata:
29+
name: dynamic-lora-config
30+
data:
31+
configmap.yaml: |
32+
vLLMLoRAConfig:
33+
ensureExist:
34+
models:
35+
- id: tweet-summary-v1
36+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
37+
- id: tweet-summary-v2
38+
source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
39+
```
40+
41+
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
42+
43+
``` yaml
44+
model:
45+
name: chatbot
46+
targetModels:
47+
- targetModelName: chatbot-v1
48+
  weight: 90
49+
- targetModelName: chatbot-v2
50+
  weight: 10
51+
```
52+
53+
3. Finish rollout by setting the traffic to the new version 100%.
54+
```yaml
55+
model:
56+
name: chatbot
57+
targetModels:
58+
- targetModelName: chatbot-v2
59+
  weight: 100
60+
```
61+
62+
4. Remove v1 from dynamic lora configmap.
63+
```yaml
64+
apiVersion: v1
65+
kind: ConfigMap
66+
metadata:
67+
name: dynamic-lora-config
68+
data:
69+
configmap.yaml: |
70+
vLLMLoRAConfig:
71+
ensureExist:
72+
models:
73+
- id: chatbot-v2
74+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
75+
ensureNotExist: # Explicitly unregisters the adapter from model servers
76+
models:
77+
- id: chatbot-v1
78+
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
79+
```

site-src/guides/index.md

-64
Original file line numberDiff line numberDiff line change
@@ -19,70 +19,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
1919
kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
2020
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml
2121
```
22-
**OPTIONALLY**: Enable Dynamic loading of Lora adapters.
23-
24-
[Deploy sample vllm deployment with Dynamic lora adapter enabled and Lora syncer sidecar](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/tools/dynamic-lora-sidecar/deployment.yaml)
25-
26-
***Safely rollout v2 adapter***
27-
28-
1. Update lora configmap
29-
30-
``` yaml
31-
32-
apiVersion: v1
33-
kind: ConfigMap
34-
metadata:
35-
name: dynamic-lora-config
36-
data:
37-
configmap.yaml: |
38-
vLLMLoRAConfig:
39-
ensureExist:
40-
models:
41-
- id: chatbot-v1
42-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
43-
- id: chatbot-v2
44-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
45-
```
46-
47-
2. Configure a canary rollout with traffic split using LLMService. In this example, 10% of traffic to the chatbot model will be sent to v2.
48-
49-
``` yaml
50-
model:
51-
name: chatbot
52-
targetModels:
53-
targetModelName: chatbot-v1
54-
weight: 90
55-
targetModelName: chatbot-v2
56-
weight: 10
57-
```
58-
59-
3. Finish rollout by setting the traffic to the new version 100%.
60-
```yaml
61-
model:
62-
name: chatbot
63-
targetModels:
64-
targetModelName: chatbot-v2
65-
weight: 100
66-
```
67-
68-
4. Remove v1 from dynamic lora configmap.
69-
```yaml
70-
apiVersion: v1
71-
kind: ConfigMap
72-
metadata:
73-
name: dynamic-lora-config
74-
data:
75-
configmap.yaml: |
76-
vLLMLoRAConfig:
77-
ensureExist:
78-
models:
79-
- id: chatbot-v2
80-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v2
81-
ensureNotExist: # Explicitly unregisters the adapter from model servers
82-
models:
83-
- id: chatbot-v1
84-
source: gs://[TEAM-A-MODELS-BUCKET]/chatbot-v1
85-
```
8622

8723

8824

0 commit comments

Comments
 (0)