forked from kubernetes-sigs/gateway-api-inference-extension
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path vllm-lora-deployment.yaml
124 lines (124 loc) · 4.01 KB
/
vllm-lora-deployment.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
---
# vLLM deployment serving meta-llama/Llama-2-7b-hf with multiple LoRA
# adapters. An init container downloads the adapters into a shared
# emptyDir volume ("/adapters") before the server container starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
  namespace: default
spec:
  replicas: 6
  selector:
    matchLabels:
      app: vllm
  template:
    metadata:
      labels:
        app: vllm
    spec:
      containers:
        - name: lora
          image: "ghcr.io/tomatillo-and-multiverse/vllm:demo"
          imagePullPolicy: Always
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model"
            - "meta-llama/Llama-2-7b-hf"
            - "--tensor-parallel-size"
            - "1"
            - "--port"
            - "8000"
            - "--disable-log-requests"
            - "--enable-lora"
            # At most 4 LoRA adapters resident concurrently; up to 12
            # cached on CPU for swapping.
            - "--max-loras"
            - "4"
            - "--max-cpu-loras"
            - "12"
            # name=path pairs. Paths must match the layout produced by
            # the adapter-loader init container below (HF_HOME=/adapters,
            # duplicate-count 5 creates the _0.._4 copies).
            - "--lora-modules"
            - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/"
            - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403"
            - "sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0"
            - "sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1"
            - "sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2"
            - "sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3"
            - "sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4"
            - "tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0"
            - "tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1"
            - "tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2"
            - "tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3"
            - "tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4"
          env:
            - name: PORT
              value: "8000"
            # Token for pulling the gated base model from Hugging Face.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          # Generous failure thresholds (240/600 probes at 5s period)
          # tolerate the long model download/load before /health is up.
          livenessProbe:
            failureThreshold: 240
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          readinessProbe:
            failureThreshold: 600
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: data
              mountPath: /data
            # /dev/shm backed by memory for tensor-parallel IPC.
            - name: shm
              mountPath: /dev/shm
            # Shared with the adapter-loader init container.
            - name: adapters
              mountPath: /adapters
      initContainers:
        # Pulls both adapter repos and duplicates each 5x into
        # /adapters before the server starts.
        - name: adapter-loader
          image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo
          command: ["python"]
          args:
            - ./pull_adapters.py
            - --adapter
            - yard1/llama-2-7b-sql-lora-test
            - --adapter
            - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
            - --duplicate-count
            - "5"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
            # Redirect the HF cache into the shared volume so the main
            # container can reference the snapshot paths above.
            - name: HF_HOME
              value: /adapters
          volumeMounts:
            - name: adapters
              mountPath: /adapters
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: adapters
          emptyDir: {}