diff --git a/config/manifests/ext_proc.yaml b/config/manifests/inferencepool.yaml similarity index 86% rename from config/manifests/ext_proc.yaml rename to config/manifests/inferencepool.yaml index d70467ee..64008639 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/inferencepool.yaml @@ -1,81 +1,53 @@ -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read -rules: -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencemodels"] - verbs: ["get", "watch", "list"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencepools"] - verbs: ["get", "watch", "list"] -- apiGroups: ["discovery.k8s.io"] - resources: ["endpointslices"] - verbs: ["get", "watch", "list"] -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: pod-read-binding -subjects: -- kind: ServiceAccount - name: default - namespace: default -roleRef: - kind: ClusterRole - name: pod-read ---- apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: labels: - name: my-pool + name: vllm-llama2-7b spec: targetPortNumber: 8000 selector: - app: my-pool + app: vllm-llama2-7b extensionRef: - name: inference-gateway-ext-proc + name: vllm-llama2-7b-epp +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama2-7b-epp + namespace: default +spec: + selector: + app: vllm-llama2-7b-epp + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + type: ClusterIP --- apiVersion: apps/v1 kind: Deployment metadata: - name: inference-gateway-ext-proc + name: vllm-llama2-7b-epp namespace: default labels: - app: inference-gateway-ext-proc + app: vllm-llama2-7b-epp spec: replicas: 1 selector: matchLabels: - app: inference-gateway-ext-proc + app: vllm-llama2-7b-epp template: metadata: labels: - app: inference-gateway-ext-proc + app: vllm-llama2-7b-epp spec: containers: - - name: inference-gateway-ext-proc + - name: epp image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main imagePullPolicy: Always args: - -poolName - - "my-pool" + - "vllm-llama2-7b" - -v - "4" - -grpcPort @@ -103,16 +75,44 @@ spec: initialDelaySeconds: 5 periodSeconds: 10 --- -apiVersion: v1 -kind: Service +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: inference-gateway-ext-proc + name: pod-read +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read-binding +subjects: +- kind: ServiceAccount + name: default namespace: default -spec: - selector: - app: inference-gateway-ext-proc - ports: - - protocol: TCP - port: 9002 - targetPort: 9002 - type: ClusterIP +roleRef: + kind: ClusterRole + name: pod-read diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml index a0925c83..68dfd18d 100644 --- a/config/manifests/vllm/cpu-deployment.yaml +++ b/config/manifests/vllm/cpu-deployment.yaml @@ -1,16 +1,16 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: my-pool + name: vllm-llama2-7b spec: replicas: 3 selector: matchLabels: - app: my-pool + app: vllm-llama2-7b template: metadata: labels: - app: my-pool + app: vllm-llama2-7b spec: containers: - name: lora diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml index d16a46a4..cdc4d82c 100644 --- a/config/manifests/vllm/gpu-deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -1,16 +1,16 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: my-pool + name: vllm-llama2-7b spec: replicas: 3 selector: matchLabels: - app: my-pool + app: vllm-llama2-7b template: metadata: labels: - app: my-pool + app: vllm-llama2-7b spec: containers: - name: lora diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 94f5c9c1..d6ff8459 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -80,10 +80,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv NAME CLASS ADDRESS PROGRAMMED AGE inference-gateway inference-gateway True 22s ``` -### Deploy the Inference Extension and InferencePool +### Deploy the InferencePool and Extension ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/ext_proc.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml ``` ### Deploy Envoy Gateway Custom Policies @@ -134,4 +134,4 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found kubectl delete secret hf-token --ignore-not-found - ``` \ No newline at end of file + ``` diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index bc7dc87a..92521bf7 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -57,7 +57,7 @@ const ( // TODO [danehans]: Must be "default" until https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/227 is fixed nsName = "default" // modelServerName is the name of the model server test resources. - modelServerName = "my-pool" + modelServerName = "vllm-llama2-7b" // modelName is the test model name. modelName = "tweet-summary" // envoyName is the name of the envoy proxy test resources. @@ -65,7 +65,7 @@ const ( // envoyPort is the listener port number of the test envoy proxy. envoyPort = "8081" // inferExtName is the name of the inference extension test resources. - inferExtName = "inference-gateway-ext-proc" + inferExtName = "vllm-llama2-7b-epp" // clientManifest is the manifest for the client test resources. clientManifest = "../../testdata/client.yaml" // modelServerSecretManifest is the manifest for the model server secret resource. @@ -75,7 +75,7 @@ const ( // inferModelManifest is the manifest for the inference model CRD. inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../../config/manifests/ext_proc.yaml" + inferExtManifest = "../../../config/manifests/inferencepool.yaml" // envoyManifest is the manifest for the envoy proxy test resources. envoyManifest = "../../testdata/envoy.yaml" // modelServerManifestFilepathEnvVar is the env var that holds absolute path to the manifest for the model server test resource. diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml index ffb8add7..2598428c 100644 --- a/test/testdata/envoy.yaml +++ b/test/testdata/envoy.yaml @@ -100,7 +100,7 @@ data: grpc_service: envoy_grpc: cluster_name: ext_proc - authority: inference-gateway-ext-proc.default:9002 + authority: vllm-llama2-7b-epp.default:9002 timeout: 10s processing_mode: request_header_mode: SEND @@ -194,7 +194,7 @@ data: - endpoint: address: socket_address: - address: inference-gateway-ext-proc.default + address: vllm-llama2-7b-epp.default port_value: 9002 health_status: HEALTHY load_balancing_weight: 1