forked from kubernetes-sigs/gateway-api-inference-extension
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvllm-lora-deployment.yaml
138 lines (136 loc) · 4.19 KB
/
vllm-lora-deployment.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
---
# ClusterIP Service that fronts the pods labelled `app: vllm-llama2-7b-pool`
# and forwards TCP port 8000 to port 8000 on those pods.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
  # Pinned explicitly: the Deployment in this file sets `namespace: default`,
  # and a Service only selects pods that live in its own namespace. Without
  # this, applying the file from a context whose current namespace is not
  # `default` would create a Service with no endpoints.
  namespace: default
spec:
  type: ClusterIP
  selector:
    app: vllm-llama2-7b-pool
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
---
# Deployment running 3 replicas of the vLLM OpenAI-compatible API server for
# meta-llama/Llama-2-7b-hf with LoRA serving enabled. An init container runs
# first and shares the `adapters` emptyDir with the server container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
  namespace: default
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool
    spec:
      containers:
        - name: lora
          # NOTE(review): `latest` + `imagePullPolicy: Always` means replicas
          # started at different times may run different vLLM builds; consider
          # pinning a version tag or digest.
          image: "vllm/vllm-openai:latest"
          imagePullPolicy: Always
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model"
            - "meta-llama/Llama-2-7b-hf"
            - "--tensor-parallel-size"
            - "1"
            - "--port"
            - "8000"
            - "--enable-lora"
            - "--max-loras"
            - "4"
            - "--max-cpu-loras"
            - "12"
            # Each entry after --lora-modules is `<served-name>=<path>`; every
            # path lives on the `adapters` volume mounted at /adapters.
            - "--lora-modules"
            # The first two paths look like the Hugging Face hub cache layout
            # under HF_HOME=/adapters (set on the init container) — presumably
            # populated by adapter-loader; verify against that image.
            - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
            - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
            # The `_0`..`_4` copies presumably correspond to the init
            # container's `--duplicate-count 5` — TODO confirm.
            - "sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0"
            - "sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1"
            - "sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2"
            - "sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3"
            - "sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4"
            - "tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0"
            - "tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1"
            - "tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2"
            - "tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3"
            - "tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4"
          env:
            - name: PORT
              value: "8000"
            # Hugging Face token read from the `hf-token` Secret (key `token`).
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          # Startup budget: 240 failures x 5s period = 20 minutes for the
          # server to answer /health before the kubelet restarts the
          # container. Liveness and readiness checks do not run until this
          # probe has succeeded, so the long allowance lives only here.
          # Raise failureThreshold if model download/load can take longer.
          startupProbe:
            failureThreshold: 240
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          # After startup, restart a server that fails /health 5 times in a
          # row (about 25s) instead of leaving it hung for 20 minutes.
          livenessProbe:
            failureThreshold: 5
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          # After startup, stop routing traffic to a pod that fails /health
          # 3 times in a row (about 15s) instead of after 50 minutes.
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: adapters
              mountPath: "/adapters"
      initContainers:
        # Runs pull_adapters.py for the two adapter repos before the server
        # container starts; output is shared through the `adapters` volume.
        - name: adapter-loader
          image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
          command: ["python"]
          args:
            - ./pull_adapters.py
            - --adapter
            - yard1/llama-2-7b-sql-lora-test
            - --adapter
            - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
            - --duplicate-count
            - "5"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
            # Points the Hugging Face cache at the shared adapters volume.
            - name: HF_HOME
              value: /adapters
          volumeMounts:
            - name: adapters
              mountPath: "/adapters"
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
        - name: data
          emptyDir: {}
        # RAM-backed emptyDir mounted at /dev/shm in the server container.
        - name: shm
          emptyDir:
            medium: Memory
        # Scratch space the init container fills and the server reads.
        - name: adapters
          emptyDir: {}