# cpu-deployment.yaml
# Forked from kubernetes-sigs/gateway-api-inference-extension.
# Deployment: vLLM OpenAI-compatible API server running on CPU, serving
# Qwen/Qwen2.5-1.5B-Instruct with LoRA enabled, plus a lora-adapter-syncer
# sidecar that reads its adapter list from the "vllm-qwen-adapters" ConfigMap.
#
# NOTE(review): the Deployment name and "app" label still say "vllm-llama2-7b"
# although the served model is Qwen. They are left unchanged because other
# manifests presumably select on this label -- confirm before renaming.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b
  template:
    metadata:
      labels:
        app: vllm-llama2-7b
    spec:
      containers:
        - name: lora
          # Formal images can be found in
          # https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
          image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.7.2"
          imagePullPolicy: Always
          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
          args:
            - "--model"
            - "Qwen/Qwen2.5-1.5B-Instruct"
            - "--port"
            - "8000"
            - "--enable-lora"
            - "--max-loras"
            - "4"
            # Adapters registered at server start; each entry is one JSON
            # object passed as a separate argument to --lora-modules.
            - "--lora-modules"
            - '{"name": "tweet-summary-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
            - '{"name": "tweet-summary-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
          env:
            - name: PORT
              value: "8000"
            # Presumably what lets the sidecar load/unload adapters on the
            # running server -- see the vLLM LoRA documentation.
            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
              value: "true"
            - name: VLLM_CPU_KVCACHE_SPACE
              value: "4"
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          # The startup probe owns the model-load budget: 240 failures x 5s
          # = 20 minutes, the same window the liveness probe used to allow.
          # Liveness and readiness checks do not run until it succeeds.
          startupProbe:
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            initialDelaySeconds: 5
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 240
            timeoutSeconds: 1
          # Steady state: restart the container after ~1 minute of continuous
          # health-check failures (previously 20 minutes, because this probe
          # also had to cover startup).
          # NOTE(review): raise timeoutSeconds if /health is slow to answer
          # while all CPUs are busy with inference.
          livenessProbe:
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 12
            timeoutSeconds: 1
          # Steady state: stop routing traffic after ~15 seconds of failures
          # (previously up to 50 minutes with failureThreshold: 600).
          readinessProbe:
            httpGet:
              path: /health
              port: http
              scheme: HTTP
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 3
            timeoutSeconds: 1
          # Requests equal limits for both CPU and memory.
          resources:
            limits:
              cpu: "12"
              memory: "9000Mi"
            requests:
              cpu: "12"
              memory: "9000Mi"
          volumeMounts:
            - mountPath: /data
              name: data
            - mountPath: /dev/shm
              name: shm
            - name: adapters
              mountPath: "/adapters"
      initContainers:
        # restartPolicy: Always on an init container declares it as a sidecar
        # that keeps running next to the main container (needs a cluster with
        # the SidecarContainers feature).
        - name: lora-adapter-syncer
          tty: true
          stdin: true
          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
          restartPolicy: Always
          imagePullPolicy: Always
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
          volumeMounts:
            - name: config-volume
              mountPath: /config
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: adapters
          emptyDir: {}
        - name: config-volume
          configMap:
            name: vllm-qwen-adapters
---
# ConfigMap referenced by the Deployment's "config-volume" volume. The
# lora-adapter-syncer sidecar mounts it at /config and is pointed at
# /config/configmap.yaml through DYNAMIC_LORA_ROLLOUT_CONFIG.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vllm-qwen-adapters
data:
  # The value below is a literal block scalar: everything under the key is
  # handed to the consumer as one string holding an embedded YAML document.
  configmap.yaml: |
    vLLMLoRAConfig:
      name: vllm-llama2-7b
      port: 8000
      ensureExist:
        models:
          - base-model: Qwen/Qwen2.5-1.5B
            id: tweet-summary-1
            source: SriSanth2345/Qwen-1.5B-Tweet-Generations