
Manifest updates #81


Merged
merged 12 commits on Dec 10, 2024
95 changes: 86 additions & 9 deletions examples/poc/manifests/llmservice.yaml
@@ -1,4 +1,14 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMServerPool
metadata:
labels:
name: vllm-llama2-7b-pool
spec:
targetPort: 8000
modelServerSelector:
"app": "vllm-llama2-7b-pool"
---
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMService
metadata:
labels:
@@ -7,17 +17,84 @@ metadata:
name: llmservice-sample
spec:
models:
- name: sql-code-assist
- name: npc-bot
- name: sql-lora
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: sql-lora
weight: 100
- name: sql-lora-0
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: sql-lora-0
weight: 100
- name: sql-lora-1
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: sql-lora-1
weight: 100
- name: sql-lora-2
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: sql-lora-2
weight: 100
- name: sql-lora-3
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: sql-lora-3
weight: 100
- name: sql-lora-4
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: sql-lora-4
weight: 100
- name: tweet-summary
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: tweet-summary
weight: 100
- name: tweet-summary-0
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: tweet-summary-0
weight: 100
- name: tweet-summary-1
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: tweet-summary-1
weight: 100
- name: tweet-summary-2
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: tweet-summary-2
weight: 100
- name: tweet-summary-3
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: tweet-summary-3
weight: 100
- name: tweet-summary-4
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: tweet-summary-4
weight: 100
- name: meta-llama/Llama-2-7b-hf
objective:
desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
targetModels:
- name: npc-bot-v1
weight: 50
- name: npc-bot-v2
weight: 50
- name: meta-llama/Llama-2-7b-hf
weight: 100
poolRef:
- kind: LLMServerPool
name: test-pool
- name: gemini-pool
kind: LLMServerPool
name: vllm-llama2-7b-pool
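
Editorial note (not part of the PR): condensed for readability, the updated resources have the following shape, using only kinds, fields, and names that appear in the diff above; the comments are illustrative interpretations, not text from the PR.

apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMServerPool
metadata:
  name: vllm-llama2-7b-pool
spec:
  targetPort: 8000                  # port the backing model servers listen on
  modelServerSelector:
    "app": "vllm-llama2-7b-pool"    # selects the vLLM pods that serve this pool
---
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMService
metadata:
  name: llmservice-sample
spec:
  models:
  - name: sql-lora                  # model name a client puts in its request
    objective:
      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
    targetModels:
    - name: sql-lora                # backing (LoRA) model on the servers
      weight: 100                   # weights presumably split traffic across targetModels
  poolRef:
  - kind: LLMServerPool
    name: vllm-llama2-7b-pool       # all models above are served from this pool
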
23 changes: 19 additions & 4 deletions examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -1,17 +1,32 @@
apiVersion: v1
kind: Service
metadata:
name: vllm-llama2-7b-pool
spec:
selector:
app: vllm-llama2-7b-pool
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP

---

apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm
name: vllm-llama2-7b-pool
namespace: default
spec:
replicas: 6
replicas: 3
selector:
matchLabels:
app: vllm
app: vllm-llama2-7b-pool
template:
metadata:
labels:
app: vllm
app: vllm-llama2-7b-pool
spec:
containers:
- name: lora
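
Editorial note (not part of the PR): the rename from "vllm" to "vllm-llama2-7b-pool" has to be applied consistently in the Service selector, the Deployment selector, and the pod template labels, and it is the same label the LLMServerPool's modelServerSelector in llmservice.yaml points at. A condensed sketch of the wiring introduced above:

apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool
spec:
  selector:
    app: vllm-llama2-7b-pool        # must match the Deployment's pod template label below
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama2-7b-pool
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm-llama2-7b-pool      # pod selector for the Deployment itself
  template:
    metadata:
      labels:
        app: vllm-llama2-7b-pool    # also matched by the LLMServerPool's modelServerSelector
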
12 changes: 0 additions & 12 deletions examples/poc/manifests/vllm/vllm-lora-service.yaml

This file was deleted.

61 changes: 47 additions & 14 deletions pkg/manifests/ext_proc.yaml
@@ -1,44 +1,77 @@
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["llmservices"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["llmserverpools"]
verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["get", "watch", "list"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read-binding
subjects:
- kind: ServiceAccount
name: default
namespace: default
roleRef:
kind: ClusterRole
name: pod-read
---

apiVersion: apps/v1
kind: Deployment
metadata:
name: instance-gateway-ext-proc
name: inference-gateway-ext-proc
namespace: default
labels:
app: instance-gateway-ext-proc
app: inference-gateway-ext-proc
spec:
replicas: 1
selector:
matchLabels:
app: instance-gateway-ext-proc
app: inference-gateway-ext-proc
template:
metadata:
labels:
app: instance-gateway-ext-proc
app: inference-gateway-ext-proc
spec:
containers:
- name: instance-gateway-ext-proc
- name: inference-gateway-ext-proc
# TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args.
image: <BUILT-IMAGE>
args:
# TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/12) Remove this once ext proc can dynamically reconcile on LLMServerPool.
- -pods
- "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84"
- -podIPs
- "10.24.11.6:8000,10.24.5.7:8000"
- -serverPoolName
- "vllm-llama2-7b-pool"
- -v
- "3"
- -serviceName
- "vllm-llama2-7b-pool"
ports:
- containerPort: 9002

- name: curl
image: curlimages/curl
command: ["sleep", "3600"]
---
apiVersion: v1
kind: Service
metadata:
name: instance-gateway-ext-proc
name: inference-gateway-ext-proc
namespace: default
spec:
selector:
app: instance-gateway-ext-proc
app: inference-gateway-ext-proc
ports:
- protocol: TCP
port: 9002
@@ -55,12 +88,12 @@ spec:
- backendRefs:
- group: ""
kind: Service
name: instance-gateway-ext-proc
name: inference-gateway-ext-proc
port: 9002
processingMode:
request:
body: Buffered
response:
response: {}
# The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
# The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
messageTimeout: 1000s
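
Editorial note (not part of the PR): read together, the new ext-proc arguments above replace the hard-coded pod name and pod IP lists with a reference to the server pool; the annotations below are interpretations, not text from the PR.

args:
- -serverPoolName               # LLMServerPool this ext-proc instance serves
- "vllm-llama2-7b-pool"
- -v                            # log verbosity
- "3"
- -serviceName                  # presumably the Service added in vllm-lora-deployment.yaml above
- "vllm-llama2-7b-pool"
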
8 changes: 4 additions & 4 deletions pkg/manifests/gateway.yaml
@@ -3,9 +3,9 @@
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: <GATEWAY-NAME>
name: inference-gateway
spec:
gatewayClassName: <GATEWAY-NAME>
gatewayClassName: inference-gateway
listeners:
- name: http
protocol: HTTP
@@ -17,7 +17,7 @@ spec:
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
name: <GATEWAY-NAME>
name: inference-gateway
spec:
controllerName: gateway.envoyproxy.io/gatewayclass-controller
---
@@ -38,7 +38,7 @@ metadata:
name: llm-route
spec:
parentRefs:
- name: <GATEWAY-NAME>
- name: inference-gateway
sectionName: llm-gw
rules:
- backendRefs:
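
Editorial note (not part of the PR): replacing the <GATEWAY-NAME> placeholder with the concrete name inference-gateway ties the Gateway API objects in this file together; a condensed sketch of the hunks above, with fields outside the shown hunks omitted.

kind: Gateway
metadata:
  name: inference-gateway
spec:
  gatewayClassName: inference-gateway   # names the GatewayClass below
  listeners:
  - name: http
    protocol: HTTP
    # (remaining listener fields fall outside the hunks shown)
---
kind: GatewayClass
metadata:
  name: inference-gateway
spec:
  controllerName: gateway.envoyproxy.io/gatewayclass-controller
---
# llm-route is presumably an HTTPRoute; its kind line falls outside the hunks shown
metadata:
  name: llm-route
spec:
  parentRefs:
  - name: inference-gateway             # attaches the route to the Gateway above
    sectionName: llm-gw                 # targets a specific listener on that Gateway
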
4 changes: 2 additions & 2 deletions pkg/manifests/patch_policy.yaml
@@ -7,7 +7,7 @@ spec:
targetRef:
group: gateway.networking.k8s.io
kind: Gateway
name: <GATEWAY-NAME>
name: inference-gateway
type: JSONPatch
jsonPatches:
# Necessary to create a cluster of the type: ORIGINAL_DST to allow for
@@ -36,7 +36,7 @@ spec:
max_requests: 40000

- type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
name: default/<GATEWAY-NAME>/llm-gw
name: default/inference-gateway/llm-gw
operation:
op: replace
path: "/virtual_hosts/0/routes/0/route/cluster"
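
Editorial note (not part of the PR): the patch comment above says an ORIGINAL_DST cluster is needed so the ext-proc can steer each request to a specific model server. Purely as an illustration of what such a cluster looks like in Envoy terms: the cluster name and header name below are assumptions (the actual patch body falls outside the hunks shown), while the field names are standard Envoy cluster configuration fields.

name: original_destination_cluster          # hypothetical name, not taken from this PR
type: ORIGINAL_DST
lb_policy: CLUSTER_PROVIDED                 # load-balancing policy required for ORIGINAL_DST clusters
original_dst_lb_config:
  use_http_header: true                     # choose the upstream endpoint from a request header
  http_header_name: x-gateway-destination-endpoint   # header name is an assumption, not from this PR
circuit_breakers:
  thresholds:
  - max_requests: 40000                     # mirrors the connection-limit value visible in the hunk above
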