Skip to content

Commit 83f701b

Browse files
authored
Manifest updates (#81)
* squashed modify filter for LoRA affinity modify filter for LoRA affinity * update llm service and llm server pool yaml, readme * remove unused method from metrics.go * add flowchart image * update size flowchart image * remove image name * update queueingThresholdLoRA to 50 * rollback filter related changes * rollback filter related changes in docs * addressing comments * addressing comments
1 parent ca47aa2 commit 83f701b

File tree

6 files changed

+158
-45
lines changed

6 files changed

+158
-45
lines changed

Diff for: examples/poc/manifests/llmservice.yaml

+86-9
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
11
apiVersion: inference.networking.x-k8s.io/v1alpha1
2+
kind: LLMServerPool
3+
metadata:
4+
labels:
5+
name: vllm-llama2-7b-pool
6+
spec:
7+
targetPort: 8000
8+
modelServerSelector:
9+
"app": "vllm-llama2-7b-pool"
10+
---
11+
apiVersion: inference.networking.x-k8s.io/v1alpha1
212
kind: LLMService
313
metadata:
414
labels:
@@ -7,17 +17,84 @@ metadata:
717
name: llmservice-sample
818
spec:
919
models:
10-
- name: sql-code-assist
11-
- name: npc-bot
20+
- name: sql-lora
21+
objective:
22+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
23+
targetModels:
24+
- name: sql-lora
25+
weight: 100
26+
- name: sql-lora-0
27+
objective:
28+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
29+
targetModels:
30+
- name: sql-lora-0
31+
weight: 100
32+
- name: sql-lora-1
33+
objective:
34+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
35+
targetModels:
36+
- name: sql-lora-1
37+
weight: 100
38+
- name: sql-lora-2
39+
objective:
40+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
41+
targetModels:
42+
- name: sql-lora-2
43+
weight: 100
44+
- name: sql-lora-3
45+
objective:
46+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
47+
targetModels:
48+
- name: sql-lora-3
49+
weight: 100
50+
- name: sql-lora-4
51+
objective:
52+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
53+
targetModels:
54+
- name: sql-lora-4
55+
weight: 100
56+
- name: tweet-summary
57+
objective:
58+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
59+
targetModels:
60+
- name: tweet-summary
61+
weight: 100
62+
- name: tweet-summary-0
63+
objective:
64+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
65+
targetModels:
66+
- name: tweet-summary-0
67+
weight: 100
68+
- name: tweet-summary-1
69+
objective:
70+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
71+
targetModels:
72+
- name: tweet-summary-1
73+
weight: 100
74+
- name: tweet-summary-2
75+
objective:
76+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
77+
targetModels:
78+
- name: tweet-summary-2
79+
weight: 100
80+
- name: tweet-summary-3
81+
objective:
82+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
83+
targetModels:
84+
- name: tweet-summary-3
85+
weight: 100
86+
- name: tweet-summary-4
87+
objective:
88+
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
89+
targetModels:
90+
- name: tweet-summary-4
91+
weight: 100
92+
- name: meta-llama/Llama-2-7b-hf
1293
objective:
1394
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
1495
targetModels:
15-
- name: npc-bot-v1
16-
weight: 50
17-
- name: npc-bot-v2
18-
weight: 50
96+
- name: meta-llama/Llama-2-7b-hf
97+
weight: 100
1998
poolRef:
2099
- kind: LLMServerPool
21-
name: test-pool
22-
- name: gemini-pool
23-
kind: LLMServerPool
100+
name: vllm-llama2-7b-pool

Diff for: examples/poc/manifests/vllm/vllm-lora-deployment.yaml

+19-4
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,32 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: vllm-llama2-7b-pool
5+
spec:
6+
selector:
7+
app: vllm-llama2-7b-pool
8+
ports:
9+
- protocol: TCP
10+
port: 8000
11+
targetPort: 8000
12+
type: ClusterIP
13+
14+
---
15+
116
apiVersion: apps/v1
217
kind: Deployment
318
metadata:
4-
name: vllm
19+
name: vllm-llama2-7b-pool
520
namespace: default
621
spec:
7-
replicas: 6
22+
replicas: 3
823
selector:
924
matchLabels:
10-
app: vllm
25+
app: vllm-llama2-7b-pool
1126
template:
1227
metadata:
1328
labels:
14-
app: vllm
29+
app: vllm-llama2-7b-pool
1530
spec:
1631
containers:
1732
- name: lora

Diff for: examples/poc/manifests/vllm/vllm-lora-service.yaml

-12
This file was deleted.

Diff for: pkg/manifests/ext_proc.yaml

+47-14
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,77 @@
1+
kind: ClusterRole
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
metadata:
4+
name: pod-read
5+
rules:
6+
- apiGroups: ["inference.networking.x-k8s.io"]
7+
resources: ["llmservices"]
8+
verbs: ["get", "watch", "list"]
9+
- apiGroups: [""]
10+
resources: ["pods"]
11+
verbs: ["get", "watch", "list"]
12+
- apiGroups: ["inference.networking.x-k8s.io"]
13+
resources: ["llmserverpools"]
14+
verbs: ["get", "watch", "list"]
15+
- apiGroups: ["discovery.k8s.io"]
16+
resources: ["endpointslices"]
17+
verbs: ["get", "watch", "list"]
18+
---
19+
kind: ClusterRoleBinding
20+
apiVersion: rbac.authorization.k8s.io/v1
21+
metadata:
22+
name: pod-read-binding
23+
subjects:
24+
- kind: ServiceAccount
25+
name: default
26+
namespace: default
27+
roleRef:
28+
kind: ClusterRole
29+
name: pod-read
30+
---
31+
132
apiVersion: apps/v1
233
kind: Deployment
334
metadata:
4-
name: instance-gateway-ext-proc
35+
name: inference-gateway-ext-proc
536
namespace: default
637
labels:
7-
app: instance-gateway-ext-proc
38+
app: inference-gateway-ext-proc
839
spec:
940
replicas: 1
1041
selector:
1142
matchLabels:
12-
app: instance-gateway-ext-proc
43+
app: inference-gateway-ext-proc
1344
template:
1445
metadata:
1546
labels:
16-
app: instance-gateway-ext-proc
47+
app: inference-gateway-ext-proc
1748
spec:
1849
containers:
19-
- name: instance-gateway-ext-proc
50+
- name: inference-gateway-ext-proc
2051
# TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args.
2152
image: <BUILT-IMAGE>
2253
args:
23-
# TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/12) Remove this once ext proc can dynamically reconcile on LLMServerPool.
24-
- -pods
25-
- "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84"
26-
- -podIPs
27-
- "10.24.11.6:8000,10.24.5.7:8000"
54+
- -serverPoolName
55+
- "vllm-llama2-7b-pool"
56+
- -v
57+
- "3"
58+
- -serviceName
59+
- "vllm-llama2-7b-pool"
2860
ports:
2961
- containerPort: 9002
62+
3063
- name: curl
3164
image: curlimages/curl
3265
command: ["sleep", "3600"]
3366
---
3467
apiVersion: v1
3568
kind: Service
3669
metadata:
37-
name: instance-gateway-ext-proc
70+
name: inference-gateway-ext-proc
3871
namespace: default
3972
spec:
4073
selector:
41-
app: instance-gateway-ext-proc
74+
app: inference-gateway-ext-proc
4275
ports:
4376
- protocol: TCP
4477
port: 9002
@@ -55,12 +88,12 @@ spec:
5588
- backendRefs:
5689
- group: ""
5790
kind: Service
58-
name: instance-gateway-ext-proc
91+
name: inference-gateway-ext-proc
5992
port: 9002
6093
processingMode:
6194
request:
6295
body: Buffered
63-
response:
96+
response: {}
6497
# The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
6598
# The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
6699
messageTimeout: 1000s

Diff for: pkg/manifests/gateway.yaml

+4-4
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
apiVersion: gateway.networking.k8s.io/v1
44
kind: Gateway
55
metadata:
6-
name: <GATEWAY-NAME>
6+
name: inference-gateway
77
spec:
8-
gatewayClassName: <GATEWAY-NAME>
8+
gatewayClassName: inference-gateway
99
listeners:
1010
- name: http
1111
protocol: HTTP
@@ -17,7 +17,7 @@ spec:
1717
apiVersion: gateway.networking.k8s.io/v1
1818
kind: GatewayClass
1919
metadata:
20-
name: <GATEWAY-NAME>
20+
name: inference-gateway
2121
spec:
2222
controllerName: gateway.envoyproxy.io/gatewayclass-controller
2323
---
@@ -38,7 +38,7 @@ metadata:
3838
name: llm-route
3939
spec:
4040
parentRefs:
41-
- name: <GATEWAY-NAME>
41+
- name: inference-gateway
4242
sectionName: llm-gw
4343
rules:
4444
- backendRefs:

Diff for: pkg/manifests/patch_policy.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ spec:
77
targetRef:
88
group: gateway.networking.k8s.io
99
kind: Gateway
10-
name: <GATEWAY-NAME>
10+
name: inference-gateway
1111
type: JSONPatch
1212
jsonPatches:
1313
# Necessary to create a cluster of the type: ORIGINAL_DST to allow for
@@ -36,7 +36,7 @@ spec:
3636
max_requests: 40000
3737

3838
- type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
39-
name: default/<GATEWAY-NAME>/llm-gw
39+
name: default/inference-gateway/llm-gw
4040
operation:
4141
op: replace
4242
path: "/virtual_hosts/0/routes/0/route/cluster"

0 commit comments

Comments
 (0)