# deployment.yaml
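# Deploys a vLLM server for meta-llama/Llama-2-7b-hf together with a
# dynamic LoRA adapter syncer sidecar, plus a ClusterIP Service and the
# ConfigMap that drives adapter loading/unloading.
# Replace <SIDECAR_IMAGE> with the lora-adapter-syncer image before applying.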
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: LLaMA2_7B
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      shareProcessNamespace: true
      containers:
      - name: inference-server
        image: vllm/vllm-openai:v0.6.3.post1
        resources:
          requests:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu: 1
          limits:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu: 1
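        # Invoke `vllm serve` directly rather than via `/bin/sh -c`: with a
        # shell wrapper, only the first args entry is executed and the
        # remaining flags are silently dropped as positional parameters.
        # --enable-lora and --max-loras allow adapters to be attached at runtime.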
        command: ["vllm", "serve", "meta-llama/Llama-2-7b-hf"]
        args:
        - --host=0.0.0.0
        - --port=8000
        - --tensor-parallel-size=1
        - --swap-space=16
        - --gpu-memory-utilization=0.95
        - --max-model-len=2048
        - --max-num-batched-tokens=4096
        - --disable-log-stats
        - --enable-lora
        - --max-loras=5
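        # HF_TOKEN is read from the `hf-token` Secret (created beforehand);
        # VLLM_ALLOW_RUNTIME_LORA_UPDATING must be "true" or vLLM rejects
        # runtime LoRA load/unload requests from the sidecar.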
        env:
        - name: DEPLOY_SOURCE
          value: UI_NATIVE_MODEL
        - name: MODEL_ID
          value: "Llama2-7B"
        - name: AIP_STORAGE_URI
          value: "gs://vertex-model-garden-public-us/llama2/llama2-7b-hf"
        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
          value: "true"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token # The name of your Kubernetes Secret
              key: token     # The specific key within the Secret
        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
          value: "/config/configmap.yaml"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
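      # lora-adapter-syncer runs as a native sidecar: an init container with
      # restartPolicy: Always (requires the sidecar-containers feature, on by
      # default since Kubernetes 1.29). It reconciles the adapters listed in
      # the file named by DYNAMIC_LORA_ROLLOUT_CONFIG against the server.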
      initContainers:
      - name: lora-adapter-syncer
        tty: true
        stdin: true
        image: <SIDECAR_IMAGE>
        restartPolicy: Always
        imagePullPolicy: Always
        env:
        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
          value: "/config/configmap.yaml"
        volumeMounts: # Do NOT use subPath: subPath mounts never receive ConfigMap updates
        - name: config-volume
          mountPath: /config
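      # dshm backs /dev/shm with a memory-based emptyDir; the container
      # runtime's default /dev/shm is typically too small for vLLM's
      # shared-memory IPC. config-volume surfaces the ConfigMap defined below.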
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: config-volume
        configMap:
          name: dynamic-lora-config
---
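# ClusterIP Service exposing the vLLM OpenAI-compatible API on port 8000
# inside the cluster.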
apiVersion: v1
kind: Service
metadata:
  name: llama-service
spec:
  selector:
    app: llama-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
---
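# Adapter rollout config consumed by the lora-adapter-syncer sidecar: as the
# field names indicate, adapters under ensureExist are loaded onto the server
# and adapters under ensureNotExist are unloaded. modelServerHost and
# modelServerPort are placeholders for the model server's address.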
apiVersion: v1
kind: ConfigMap
metadata:
  name: dynamic-lora-config
data:
  configmap.yaml: |
    vLLMLoRAConfig:
      host: modelServerHost
      name: sql-loras-llama
      port: modelServerPort
      ensureExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: sql-lora-v1
          source: yard1/llama-2-7b-sql-lora-test
        - base-model: meta-llama/Llama-2-7b-hf
          id: sql-lora-v3
          source: yard1/llama-2-7b-sql-lora-test
        - base-model: meta-llama/Llama-2-7b-hf
          id: sql-lora-v4
          source: yard1/llama-2-7b-sql-lora-test
      ensureNotExist:
        models:
        - base-model: meta-llama/Llama-2-7b-hf
          id: sql-lora-v2
          source: yard1/llama-2-7b-sql-lora-test
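# Example usage (a sketch; <YOUR_HF_TOKEN> is a placeholder for a Hugging Face
# access token with access to meta-llama/Llama-2-7b-hf):
#   kubectl create secret generic hf-token --from-literal=token=<YOUR_HF_TOKEN>
#   kubectl apply -f deployment.yaml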