# llmservice.yaml
# Source: fork of kubernetes-sigs/gateway-api-inference-extension
# LLMServerPool named vllm-llama2-7b-pool. Its modelServerSelector matches on
# the label app=vllm-llama2-7b-pool.
# NOTE(review): targetPort 8000 is presumably the port the selected model
# servers listen on -- confirm against the LLMServerPool CRD.
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMServerPool
metadata:
  # The pool carries no labels of its own. Written as an explicit empty map
  # because a bare `labels:` key parses as null.
  labels: {}
  name: vllm-llama2-7b-pool
spec:
  targetPort: 8000
  modelServerSelector:
    app: vllm-llama2-7b-pool
---
# LLMService named llmservice-sample. It lists thirteen models; every entry
# has exactly one target model with the same name at weight 100 and the same
# objective value of 50. poolRef names the LLMServerPool vllm-llama2-7b-pool.
# NOTE(review): the unit of the objective value (presumably milliseconds per
# output token) is not stated in this file -- confirm against the CRD.
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMService
metadata:
  labels:
    app.kubernetes.io/name: api
    app.kubernetes.io/managed-by: kustomize
  name: llmservice-sample
spec:
  models:
    # sql-lora and its numbered variants (0-4).
    - name: sql-lora
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: sql-lora
          weight: 100
    - name: sql-lora-0
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: sql-lora-0
          weight: 100
    - name: sql-lora-1
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: sql-lora-1
          weight: 100
    - name: sql-lora-2
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: sql-lora-2
          weight: 100
    - name: sql-lora-3
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: sql-lora-3
          weight: 100
    - name: sql-lora-4
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: sql-lora-4
          weight: 100
    # tweet-summary and its numbered variants (0-4).
    - name: tweet-summary
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: tweet-summary
          weight: 100
    - name: tweet-summary-0
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: tweet-summary-0
          weight: 100
    - name: tweet-summary-1
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: tweet-summary-1
          weight: 100
    - name: tweet-summary-2
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: tweet-summary-2
          weight: 100
    - name: tweet-summary-3
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: tweet-summary-3
          weight: 100
    - name: tweet-summary-4
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: tweet-summary-4
          weight: 100
    # NOTE(review): presumably the base model the entries above are adapters
    # for -- confirm with the model-server deployment.
    - name: meta-llama/Llama-2-7b-hf
      objective:
        desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
      targetModels:
        - name: meta-llama/Llama-2-7b-hf
          weight: 100
  # The name here matches metadata.name of the LLMServerPool vllm-llama2-7b-pool.
  poolRef:
    - kind: LLMServerPool
      name: vllm-llama2-7b-pool