From 6bc07c6836892dbdf96322e5b66d6c1f57a4a93e Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 14:33:50 -0700 Subject: [PATCH 01/49] Create resources.yaml for kgateway --- .../manifests/gateway/kgateway/resources.yaml | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 config/manifests/gateway/kgateway/resources.yaml diff --git a/config/manifests/gateway/kgateway/resources.yaml b/config/manifests/gateway/kgateway/resources.yaml new file mode 100644 index 00000000..2856a6d2 --- /dev/null +++ b/config/manifests/gateway/kgateway/resources.yaml @@ -0,0 +1,40 @@ +# Requires Kgateway 2.0.0 or greater. +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: inference-gateway +spec: + gatewayClassName: kgateway + listeners: + - name: http + protocol: HTTP + port: 8080 + - name: llm-gw + protocol: HTTP + port: 8081 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + sectionName: llm-gw + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: vllm-llama2-7b + port: 8000 + weight: 1 + matches: + - path: + type: PathPrefix + value: / + timeouts: + backendRequest: 24h + request: 24h From 63d7c403de61ed3ba4c5b8d0c73a69ed7ae0b123 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 14:35:29 -0700 Subject: [PATCH 02/49] Update getting started guide for KGateway --- site-src/guides/index.md | 124 ++++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 33 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index bcea5f9b..5637a8ee 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -3,7 +3,6 @@ This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). 
The goal of this guide is to get a first, single InferencePool up and running! ## **Prerequisites** - - Envoy Gateway [v1.3.0](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - A cluster with: - Support for services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). @@ -56,55 +55,114 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml ``` - + ### Deploy InferenceModel Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. + ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml ``` -### Update Envoy Gateway Config to enable Patch Policy** +### Deploy Inference Gateway - Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml - kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system - ``` - Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. + Choose one of the following options to deploy an Inference Gateway. 
-### Deploy Gateway +=== "Envoy Gateway" - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml - ``` - > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** + 1. Requirements - Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: - ```bash - $ kubectl get gateway inference-gateway - NAME CLASS ADDRESS PROGRAMMED AGE - inference-gateway inference-gateway True 22s - ``` -### Deploy the InferencePool and Extension + - Envoy Gateway [v1.3.0](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher. - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml - ``` -### Deploy Envoy Gateway Custom Policies + 1. Update Envoy Gateway Config to enable Patch Policy - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml - ``` - > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. - -### **OPTIONALLY**: Apply Traffic Policy + Our custom LLM Gateway ext-proc is patched into the existing Envoy Gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the + Envoy Gateway config map. 
To do this, apply the following manifest and restart Envoy Gateway: + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml + kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system + ``` + + Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. + + 1. Deploy GatewayClass, Gateway, Backend, and HTTPRoute resources + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml + ``` + + > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway inference-gateway True 22s + ``` - For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. + 1. Deploy Envoy Gateway Custom Policies + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml + ``` + + > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. + + 1. Apply Traffic Policy (Optional) + + For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. 
+ + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml + ``` + +=== "Kgateway" + + [Kgateway](https://kgateway.dev/) v2.0.0 adds support for inference extension as a **technical preview**. This means do not + run Kgateway with inference extension in production environments. Refer to [Issue 10411](https://github.com/kgateway-dev/kgateway/issues/10411) + for the list of caveats, supported features, etc. + + 1. Requirements + + - [Helm](https://helm.sh/docs/intro/install/) installed. + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + + 2. Install Kgateway CRDs + + ```bash + helm upgrade -i --create-namespace --namespace kgateway-system --version v2.0.0-main kgateway-crds https://github.com/danehans/toolbox/raw/refs/heads/main/charts/338661f3be-kgateway-crds-1.0.1-dev.tgz + ``` + + 3. Install Kgateway + + ```bash + helm upgrade --install kgateway "https://github.com/danehans/toolbox/raw/refs/heads/main/charts/338661f3be-kgateway-1.0.1-dev.tgz" \ + -n kgateway-system \ + --set image.registry=danehans \ + --set image.pullPolicy=Always \ + --set inferenceExtension.enabled="true" \ + --version 1.0.1-dev + ``` + + 4. 
Deploy Gateway and HTTPRoute resources + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/resources.yaml + ``` + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway kgateway True 22s + ``` + +### Deploy the InferencePool and Extension ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml ``` ### Try it out From 048189af7cf50787d1c281d002021e4519284713 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 14:58:25 -0700 Subject: [PATCH 03/49] Replace Envoy Gateway user guide with GKE user guide --- site-src/guides/index.md | 49 ++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 5637a8ee..f26f0038 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -69,32 +69,33 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Choose one of the following options to deploy an Inference Gateway. -=== "Envoy Gateway" +=== "GKE" - 1. Requirements - - - Envoy Gateway [v1.3.0](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher. - - 1. Update Envoy Gateway Config to enable Patch Policy - - Our custom LLM Gateway ext-proc is patched into the existing Envoy Gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the - Envoy Gateway config map. To do this, apply the following manifest and restart Envoy Gateway: + 1. 
Enable the Gateway API ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml - kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system + gcloud container clusters update <CLUSTER_NAME> \ --location=<CLUSTER_LOCATION> \ --gateway-api=standard ``` - Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. + 1. Create the proxy-only subnet + A proxy-only subnet provides a set of IP addresses that Google uses to run Envoy proxies on your behalf. + ``` + gcloud compute networks subnets create proxy-only-subnet \ + --purpose=REGIONAL_MANAGED_PROXY \ + --role=ACTIVE \ + --region=<REGION> \ + --network=<VPC_NETWORK_NAME> \ + --range=<CIDR_RANGE> + ``` - 1. Deploy GatewayClass, Gateway, Backend, and HTTPRoute resources + 1. Deploy Gateway, HTTPRoute and HealthCheckPolicy resources ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/resources.yaml ``` - > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** - Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: ```bash $ kubectl get gateway inference-gateway @@ -102,22 +103,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv inference-gateway inference-gateway True 22s ``` - 1.
Deploy Envoy Gateway Custom Policies - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml - ``` - - > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. - - 1. Apply Traffic Policy (Optional) - - For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml - ``` - === "Kgateway" [Kgateway](https://kgateway.dev/) v2.0.0 adds support for inference extension as a **technical preview**. This means do not From a679070d895361507f7130cdb41f2ff3550f4b4d Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 15:00:02 -0700 Subject: [PATCH 04/49] Create resources.yaml for GKE Gateway --- config/manifests/gateway/gke/resources.yaml | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 config/manifests/gateway/gke/resources.yaml diff --git a/config/manifests/gateway/gke/resources.yaml b/config/manifests/gateway/gke/resources.yaml new file mode 100644 index 00000000..b4461660 --- /dev/null +++ b/config/manifests/gateway/gke/resources.yaml @@ -0,0 +1,46 @@ +kind: Gateway +apiVersion: gateway.networking.k8s.io/v1beta1 +metadata: + name: e2e-inference-gateway +spec: + gatewayClassName: gke-l7-regional-external-managed + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + kinds: + - kind: HTTPRoute + namespaces: + from: All +--- +apiVersion: gateway.networking.k8s.io/v1beta1 +kind: HTTPRoute +metadata: + name: mytest-inference-httproute +spec: + parentRefs: + - name: e2e-inference-gateway + kind: Gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + name: vllm-llama2-7b + kind: InferencePool +--- +kind: HealthCheckPolicy 
+apiVersion: networking.gke.io/v1 +metadata: + name: health-check-policy + namespace: default +spec: + targetRef: + group: "inference.networking.x-k8s.io" + kind: InferencePool + name: vllm-llama2-7b + default: + config: + type: HTTP + httpHealthCheck: + requestPath: /health + port: 8000 From a627ea735e255876dfffa6dae9c7c688000a9c87 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 15:01:08 -0700 Subject: [PATCH 05/49] Delete config/manifests/gateway/enable_patch_policy.yaml --- .../gateway/enable_patch_policy.yaml | 27 ------------------- 1 file changed, 27 deletions(-) delete mode 100644 config/manifests/gateway/enable_patch_policy.yaml diff --git a/config/manifests/gateway/enable_patch_policy.yaml b/config/manifests/gateway/enable_patch_policy.yaml deleted file mode 100644 index 1e9818a1..00000000 --- a/config/manifests/gateway/enable_patch_policy.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: envoy-gateway-system -data: -# This manifest's main purpose is to set `enabledEnvoyPatchPolicy` to `true`. -# This only needs to be ran once on your cluster (unless you'd like to change anything. i.e. enabling the admin dash) -# Any field under `admin` is optional, and only for enabling the admin endpoints, for debugging. 
-# Admin Interface: https://www.envoyproxy.io/docs/envoy/latest/operations/admin -# PatchPolicy docs: https://gateway.envoyproxy.io/docs/tasks/extensibility/envoy-patch-policy/#enable-envoypatchpolicy - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - provider: - type: Kubernetes - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true -# admin: -# enablePprof: true -# address: -# host: 127.0.0.1 -# port: 19000 -# enabledDumpConfig: true From 7b490deede6474de748d67eb204f53f91ef0adc5 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 15:01:17 -0700 Subject: [PATCH 06/49] Delete config/manifests/gateway/gateway.yaml --- config/manifests/gateway/gateway.yaml | 50 --------------------------- 1 file changed, 50 deletions(-) delete mode 100644 config/manifests/gateway/gateway.yaml diff --git a/config/manifests/gateway/gateway.yaml b/config/manifests/gateway/gateway.yaml deleted file mode 100644 index 32f5d484..00000000 --- a/config/manifests/gateway/gateway.yaml +++ /dev/null @@ -1,50 +0,0 @@ - ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: inference-gateway -spec: - gatewayClassName: inference-gateway - listeners: - - name: http - protocol: HTTP - port: 8080 - - name: llm-gw - protocol: HTTP - port: 8081 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: inference-gateway -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: Backend -metadata: - name: backend-dummy -spec: - endpoints: - - fqdn: - # Both these values are arbitrary and unused as the PatchPolicy redirects requests. 
- hostname: 'foo.bar.com' - port: 8080 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route -spec: - parentRefs: - - name: inference-gateway - sectionName: llm-gw - rules: - - backendRefs: - - group: gateway.envoyproxy.io - kind: Backend - name: backend-dummy - timeouts: - request: "24h" - backendRequest: "24h" From 9c8d00d204718dc391fb56cb53b811705bde083f Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 15:01:25 -0700 Subject: [PATCH 07/49] Delete config/manifests/gateway/patch_policy.yaml --- config/manifests/gateway/patch_policy.yaml | 123 --------------------- 1 file changed, 123 deletions(-) delete mode 100644 config/manifests/gateway/patch_policy.yaml diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml deleted file mode 100644 index a40c8e27..00000000 --- a/config/manifests/gateway/patch_policy.yaml +++ /dev/null @@ -1,123 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyPatchPolicy -metadata: - name: custom-response-patch-policy - namespace: default -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - type: JSONPatch - jsonPatches: - # Necessary to create a cluster of the type: ORIGINAL_DST to allow for - # direct pod scheduling. Which is heavily utilized in our scheduling. - # Specifically the field `original_dst_lb_config` allows us to enable - # `use_http_header` and `http_header_name`. 
- # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: original_destination_cluster - operation: - op: add - path: "" - value: - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "x-gateway-destination-endpoint" - connect_timeout: 1000s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - circuit_breakers: - thresholds: - - max_connections: 40000 - max_pending_requests: 40000 - max_requests: 40000 - - # This ensures that envoy accepts untrusted certificates. We tried to explicitly - # set TrustChainVerification to ACCEPT_UNSTRUSTED, but that actually didn't work - # and what worked is setting the common_tls_context to empty. - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: "envoyextensionpolicy/default/ext-proc-policy/extproc/0" - operation: - op: add - path: "/transport_socket" - value: - name: "envoy.transport_sockets.tls" - typed_config: - "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" - common_tls_context: {} - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" - name: default/inference-gateway/llm-gw - operation: - op: replace - path: "/virtual_hosts/0/routes/0/route/cluster" - value: original_destination_cluster -# Comment the below to disable full duplex streaming -# NOTE: As of https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/552 -# FULL_DUPLEX_STREAMED is the primary supported protocol for ext-proc. The buffered variant is no longer -# being actively developed, may be missing features/fixes, and will soon be removed. 
- - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: add - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode" - value: FULL_DUPLEX_STREAMED - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: add - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode" - value: SEND - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: add - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode" - value: FULL_DUPLEX_STREAMED - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: replace - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode" - value: SEND - - type: "type.googleapis.com/envoy.config.listener.v3.Listener" - name: "default/inference-gateway/llm-gw" - operation: - op: replace - path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode" - value: SEND ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: ext-proc-policy - namespace: default -spec: - extProc: - - backendRefs: - - group: "" - kind: Service - name: vllm-llama2-7b-epp - port: 9002 - processingMode: - allowModeOverride: true - request: - body: Buffered - response: - # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly. - # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly. 
- messageTimeout: 1000s - backendSettings: - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h - targetRef: - group: gateway.networking.k8s.io - kind: HTTPRoute - name: llm-route From 05199350bcbe5d55b5d1551852663468a2de6b7d Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 15:01:37 -0700 Subject: [PATCH 08/49] Delete config/manifests/gateway/traffic_policy.yaml --- config/manifests/gateway/traffic_policy.yaml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 config/manifests/gateway/traffic_policy.yaml diff --git a/config/manifests/gateway/traffic_policy.yaml b/config/manifests/gateway/traffic_policy.yaml deleted file mode 100644 index e110f173..00000000 --- a/config/manifests/gateway/traffic_policy.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: BackendTrafficPolicy -metadata: - name: high-connection-route-policy -spec: - targetRefs: - - group: gateway.networking.k8s.io - kind: HTTPRoute - name: llm-route - circuitBreaker: - maxConnections: 40000 - maxPendingRequests: 40000 - maxParallelRequests: 40000 - timeout: - tcp: - connectTimeout: 24h \ No newline at end of file From 3e7e74ebb07bdc819f2c8536c5ac073af347d8a3 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 15:51:38 -0700 Subject: [PATCH 09/49] Add http2 appProtocol to EPP service --- config/manifests/inferencepool.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool.yaml index ca2e4a88..96246cbd 100644 --- a/config/manifests/inferencepool.yaml +++ b/config/manifests/inferencepool.yaml @@ -22,6 +22,7 @@ spec: - protocol: TCP port: 9002 targetPort: 9002 + appProtocol: http2 type: ClusterIP --- apiVersion: apps/v1 From a140a3e07a8b20869f44e3293a2c25213cfe185b Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 16:29:46 -0700 Subject: [PATCH 10/49] Add 
user guide for Istio --- site-src/guides/index.md | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index f26f0038..de7c795a 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -80,6 +80,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ``` 1. Create the proxy-only subnet + A proxy-only subnet provides a set of IP addresses that Google uses to run Envoy proxies on your behalf. ``` gcloud compute networks subnets create proxy-only-subnet \ @@ -103,6 +104,32 @@ This quickstart guide is intended for engineers familiar with k8s and model serv inference-gateway inference-gateway True 22s ``` +=== "Istio" + + 1. Install Istio + + Follow the Istio installation guide https://istio.io/latest/docs/setup/install/ + + 1. Deploy Gateway and HTTPRoute + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/resources.yaml + ``` + + 1. Label the gateway + + ```bash + kubectl label gateway inference-gateway istio.io/enable-inference-extproc=true + ``` + + 1. Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway istio True 22s + ``` + === "Kgateway" [Kgateway](https://kgateway.dev/) v2.0.0 adds support for inference extension as a **technical preview**. This means do not @@ -114,13 +141,13 @@ This quickstart guide is intended for engineers familiar with k8s and model serv - [Helm](https://helm.sh/docs/intro/install/) installed. - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. - 2. Install Kgateway CRDs + 1.
Install Kgateway CRDs ```bash helm upgrade -i --create-namespace --namespace kgateway-system --version v2.0.0-main kgateway-crds https://github.com/danehans/toolbox/raw/refs/heads/main/charts/338661f3be-kgateway-crds-1.0.1-dev.tgz ``` - 3. Install Kgateway + 1. Install Kgateway ```bash helm upgrade --install kgateway "https://github.com/danehans/toolbox/raw/refs/heads/main/charts/338661f3be-kgateway-1.0.1-dev.tgz" \ @@ -131,7 +158,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv --version 1.0.1-dev ``` - 4. Deploy Gateway and HTTPRoute resources + 1. Deploy Gateway and HTTPRoute resources ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/resources.yaml From 8a878f89e920aa06dcef89360729021f0209be3f Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 16:36:48 -0700 Subject: [PATCH 11/49] Create resources.yaml for Istio --- config/manifests/gateway/istio/resources.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 config/manifests/gateway/istio/resources.yaml diff --git a/config/manifests/gateway/istio/resources.yaml b/config/manifests/gateway/istio/resources.yaml new file mode 100644 index 00000000..f943bd12 --- /dev/null +++ b/config/manifests/gateway/istio/resources.yaml @@ -0,0 +1,35 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: inference-gateway +spec: + gatewayClassName: istio + listeners: + - allowedRoutes: + namespaces: + from: Same + name: http + port: 80 + protocol: HTTP +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + hostnames: + - foo.example.com + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: vllm-llama2-7b + weight: 1 + matches: + - path: + type: PathPrefix + value: 
/completion From f0b59e458179e0552ba3245615947e256b2303f1 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 16:39:31 -0700 Subject: [PATCH 12/49] Fix GKE gateway name to match the user guide --- config/manifests/gateway/gke/resources.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/manifests/gateway/gke/resources.yaml b/config/manifests/gateway/gke/resources.yaml index b4461660..c371b2b3 100644 --- a/config/manifests/gateway/gke/resources.yaml +++ b/config/manifests/gateway/gke/resources.yaml @@ -1,7 +1,7 @@ kind: Gateway apiVersion: gateway.networking.k8s.io/v1beta1 metadata: - name: e2e-inference-gateway + name: inference-gateway spec: gatewayClassName: gke-l7-regional-external-managed listeners: @@ -20,7 +20,7 @@ metadata: name: mytest-inference-httproute spec: parentRefs: - - name: e2e-inference-gateway + - name: inference-gateway kind: Gateway rules: - backendRefs: From c06cffd20b973e32796c6eb88be9a0b6f2ccd64e Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 25 Mar 2025 16:52:48 -0700 Subject: [PATCH 13/49] Fix cleanup instructions to refer up-to-date YAMLs --- site-src/guides/index.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index de7c795a..ea938ebe 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -198,12 +198,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. please be careful not to delete resources you'd like to keep. 
```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/resources.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/resources.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/resources.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found From 21100f9e08c23680b9fe7a5dc7464b0d7f009f45 Mon 
Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 10:25:20 -0700 Subject: [PATCH 14/49] Allow Istio gateway to use HTTPRoute from all namespaces --- config/manifests/gateway/istio/resources.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/config/manifests/gateway/istio/resources.yaml b/config/manifests/gateway/istio/resources.yaml index f943bd12..2435f522 100644 --- a/config/manifests/gateway/istio/resources.yaml +++ b/config/manifests/gateway/istio/resources.yaml @@ -5,12 +5,14 @@ metadata: spec: gatewayClassName: istio listeners: - - allowedRoutes: - namespaces: - from: Same - name: http + - name: http port: 80 protocol: HTTP + allowedRoutes: + kinds: + - kind: HTTPRoute + namespaces: + from: All --- apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute From d8d4666895a23510ad54dffc20dbc76a91b41d15 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 10:31:14 -0700 Subject: [PATCH 15/49] Update Kgateway port number to 80 --- config/manifests/gateway/kgateway/resources.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/config/manifests/gateway/kgateway/resources.yaml b/config/manifests/gateway/kgateway/resources.yaml index 2856a6d2..fc8dd1da 100644 --- a/config/manifests/gateway/kgateway/resources.yaml +++ b/config/manifests/gateway/kgateway/resources.yaml @@ -7,12 +7,14 @@ metadata: spec: gatewayClassName: kgateway listeners: - - name: http - protocol: HTTP - port: 8080 - - name: llm-gw - protocol: HTTP - port: 8081 + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + kinds: + - kind: HTTPRoute + namespaces: + from: All --- apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute From ee7fa97faaa4b1691f64dd59d25658b1eaafd74a Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 10:34:09 -0700 Subject: [PATCH 16/49] Update gateway port to 80 --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/site-src/guides/index.md b/site-src/guides/index.md index ea938ebe..bfa851c6 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -183,7 +183,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ```bash IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') - PORT=8081 + PORT=80 curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ "model": "tweet-summary", From 59cbe2e7ce2b4bfe6fbd7421f975d05ccd58b6d9 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 11:23:45 -0700 Subject: [PATCH 17/49] Remove the sectionName from Kgateway HTTPRoute --- config/manifests/gateway/kgateway/resources.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/config/manifests/gateway/kgateway/resources.yaml b/config/manifests/gateway/kgateway/resources.yaml index fc8dd1da..b66d47ab 100644 --- a/config/manifests/gateway/kgateway/resources.yaml +++ b/config/manifests/gateway/kgateway/resources.yaml @@ -25,7 +25,6 @@ spec: - group: gateway.networking.k8s.io kind: Gateway name: inference-gateway - sectionName: llm-gw rules: - backendRefs: - group: inference.networking.x-k8s.io From afc64dc4c1d475e823f599d2250228cd0bb0be01 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 13:55:10 -0700 Subject: [PATCH 18/49] Create common httproute YAML --- config/manifests/gateway/httproute.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 config/manifests/gateway/httproute.yaml diff --git a/config/manifests/gateway/httproute.yaml b/config/manifests/gateway/httproute.yaml new file mode 100644 index 00000000..500e26fd --- /dev/null +++ b/config/manifests/gateway/httproute.yaml @@ -0,0 +1,19 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: 
inference.networking.x-k8s.io + kind: InferencePool + name: vllm-llama2-7b + weight: 1 + matches: + - path: + type: PathPrefix + value: / From 8d235f6b59f961702611a861c41e3e5f79e87a7e Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 13:56:00 -0700 Subject: [PATCH 19/49] Create healthcheck.yaml for GKE gateway --- config/manifests/gateway/gke/healthcheck.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 config/manifests/gateway/gke/healthcheck.yaml diff --git a/config/manifests/gateway/gke/healthcheck.yaml b/config/manifests/gateway/gke/healthcheck.yaml new file mode 100644 index 00000000..95f4f2d2 --- /dev/null +++ b/config/manifests/gateway/gke/healthcheck.yaml @@ -0,0 +1,16 @@ +kind: HealthCheckPolicy +apiVersion: networking.gke.io/v1 +metadata: + name: health-check-policy + namespace: default +spec: + targetRef: + group: "inference.networking.x-k8s.io" + kind: InferencePool + name: vllm-llama2-7b + default: + config: + type: HTTP + httpHealthCheck: + requestPath: /health + port: 8000 From 52318b35ed3336a94d2d7abb006ddba08633f69d Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 13:57:13 -0700 Subject: [PATCH 20/49] Separate gateway.yaml for GKE gateway --- config/manifests/gateway/gke/gateway.yaml | 15 +++++++ config/manifests/gateway/gke/resources.yaml | 46 --------------------- 2 files changed, 15 insertions(+), 46 deletions(-) create mode 100644 config/manifests/gateway/gke/gateway.yaml delete mode 100644 config/manifests/gateway/gke/resources.yaml diff --git a/config/manifests/gateway/gke/gateway.yaml b/config/manifests/gateway/gke/gateway.yaml new file mode 100644 index 00000000..3cf5645f --- /dev/null +++ b/config/manifests/gateway/gke/gateway.yaml @@ -0,0 +1,15 @@ +kind: Gateway +apiVersion: gateway.networking.k8s.io/v1beta1 +metadata: + name: inference-gateway +spec: + gatewayClassName: gke-l7-regional-external-managed + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + 
kinds: + - kind: HTTPRoute + namespaces: + from: All diff --git a/config/manifests/gateway/gke/resources.yaml b/config/manifests/gateway/gke/resources.yaml deleted file mode 100644 index c371b2b3..00000000 --- a/config/manifests/gateway/gke/resources.yaml +++ /dev/null @@ -1,46 +0,0 @@ -kind: Gateway -apiVersion: gateway.networking.k8s.io/v1beta1 -metadata: - name: inference-gateway -spec: - gatewayClassName: gke-l7-regional-external-managed - listeners: - - name: http - port: 80 - protocol: HTTP - allowedRoutes: - kinds: - - kind: HTTPRoute - namespaces: - from: All ---- -apiVersion: gateway.networking.k8s.io/v1beta1 -kind: HTTPRoute -metadata: - name: mytest-inference-httproute -spec: - parentRefs: - - name: inference-gateway - kind: Gateway - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - name: vllm-llama2-7b - kind: InferencePool ---- -kind: HealthCheckPolicy -apiVersion: networking.gke.io/v1 -metadata: - name: health-check-policy - namespace: default -spec: - targetRef: - group: "inference.networking.x-k8s.io" - kind: InferencePool - name: vllm-llama2-7b - default: - config: - type: HTTP - httpHealthCheck: - requestPath: /health - port: 8000 From 9343660f330023a2e71cee0c193693d89cf9e7cf Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 13:58:01 -0700 Subject: [PATCH 21/49] Separate gateway.yaml for Istio --- config/manifests/gateway/istio/gateway.yaml | 15 ++++++++ config/manifests/gateway/istio/resources.yaml | 37 ------------------- 2 files changed, 15 insertions(+), 37 deletions(-) create mode 100644 config/manifests/gateway/istio/gateway.yaml delete mode 100644 config/manifests/gateway/istio/resources.yaml diff --git a/config/manifests/gateway/istio/gateway.yaml b/config/manifests/gateway/istio/gateway.yaml new file mode 100644 index 00000000..5376b6b0 --- /dev/null +++ b/config/manifests/gateway/istio/gateway.yaml @@ -0,0 +1,15 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: 
inference-gateway +spec: + gatewayClassName: istio + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + kinds: + - kind: HTTPRoute + namespaces: + from: All diff --git a/config/manifests/gateway/istio/resources.yaml b/config/manifests/gateway/istio/resources.yaml deleted file mode 100644 index 2435f522..00000000 --- a/config/manifests/gateway/istio/resources.yaml +++ /dev/null @@ -1,37 +0,0 @@ -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: inference-gateway -spec: - gatewayClassName: istio - listeners: - - name: http - port: 80 - protocol: HTTP - allowedRoutes: - kinds: - - kind: HTTPRoute - namespaces: - from: All ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route -spec: - hostnames: - - foo.example.com - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - kind: InferencePool - name: vllm-llama2-7b - weight: 1 - matches: - - path: - type: PathPrefix - value: /completion From 8ef12a843f038717bf2361fb943e0ff75b4688af Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 13:58:45 -0700 Subject: [PATCH 22/49] Separate gateway.yaml for Kgateway --- .../manifests/gateway/kgateway/gateway.yaml | 17 ++++++++ .../manifests/gateway/kgateway/resources.yaml | 41 ------------------- 2 files changed, 17 insertions(+), 41 deletions(-) create mode 100644 config/manifests/gateway/kgateway/gateway.yaml delete mode 100644 config/manifests/gateway/kgateway/resources.yaml diff --git a/config/manifests/gateway/kgateway/gateway.yaml b/config/manifests/gateway/kgateway/gateway.yaml new file mode 100644 index 00000000..dccd2889 --- /dev/null +++ b/config/manifests/gateway/kgateway/gateway.yaml @@ -0,0 +1,17 @@ +# Requires Kgateway 2.0.0 or greater. 
+--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: inference-gateway +spec: + gatewayClassName: kgateway + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + kinds: + - kind: HTTPRoute + namespaces: + from: All diff --git a/config/manifests/gateway/kgateway/resources.yaml b/config/manifests/gateway/kgateway/resources.yaml deleted file mode 100644 index b66d47ab..00000000 --- a/config/manifests/gateway/kgateway/resources.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Requires Kgateway 2.0.0 or greater. ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: inference-gateway -spec: - gatewayClassName: kgateway - listeners: - - name: http - port: 80 - protocol: HTTP - allowedRoutes: - kinds: - - kind: HTTPRoute - namespaces: - from: All ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - kind: InferencePool - name: vllm-llama2-7b - port: 8000 - weight: 1 - matches: - - path: - type: PathPrefix - value: / - timeouts: - backendRequest: 24h - request: 24h From 557c44f47512ebdb4ecc71f8e8ea09ec49341c8d Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 14:09:19 -0700 Subject: [PATCH 23/49] Update the user guide to use shared HTTPRoute YAML --- site-src/guides/index.md | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index bfa851c6..97f58a8a 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -91,10 +91,11 @@ This quickstart guide is intended for engineers familiar with k8s and model serv --range= ``` - 1. Deploy Gateway, HTTPRoute and HealthCheckPolicy resources + 1. 
Deploy Gateway and HealthCheckPolicy resources ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/resources.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml ``` Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: @@ -106,14 +107,17 @@ This quickstart guide is intended for engineers familiar with k8s and model serv === "Istio" + Please note that this feature is currently in an experimental phase and is not intended for production use. + The implementation and user experience are subject to changes as we continue to iterate on this project. + 1. Install Istio - Follow the Istio installation guide https://istio.io/latest/docs/setup/install/ + Please follow the [Istio installation guide](https://istio.io/latest/docs/setup/install/). - 1. Deploy Gateway and HTTPRoute + 1. Deploy Gateway ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/resources.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml ``` 1. Label the gateway @@ -158,10 +162,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv --version 1.0.1-dev ``` - 1. Deploy Gateway and HTTPRoute resources + 1. 
Deploy Gateway ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/resources.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml ``` Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: @@ -177,6 +181,12 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml ``` +### Deploy the HTTPRoute + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml + ``` + ### Try it out Wait until the gateway is ready. @@ -198,9 +208,11 @@ This quickstart guide is intended for engineers familiar with k8s and model serv The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. please be careful not to delete resources you'd like to keep. 
```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/resources.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/resources.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/resources.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found From 6d48b5bbef37f77ed43ec4b8dd0e918b1d0199ad Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:18:26 -0700 Subject: [PATCH 24/49] Add EPP DestinationRule for Istio --- 
config/manifests/gateway/istio/destination-rule.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 config/manifests/gateway/istio/destination-rule.yaml diff --git a/config/manifests/gateway/istio/destination-rule.yaml b/config/manifests/gateway/istio/destination-rule.yaml new file mode 100644 index 00000000..a295273c --- /dev/null +++ b/config/manifests/gateway/istio/destination-rule.yaml @@ -0,0 +1,10 @@ +apiVersion: networking.istio.io/v1 +kind: DestinationRule +metadata: + name: epp-insecure-tls +spec: + host: vllm-llama2-7b-epp.default.svc.cluster.local + trafficPolicy: + tls: + mode: SIMPLE + insecureSkipVerify: true From e512145fb3d663851ecb3457e46048c79a228329 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:30:16 -0700 Subject: [PATCH 25/49] Add instructions for bypassing TLS verification for Istio --- site-src/guides/index.md | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 97f58a8a..110235fb 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -65,6 +65,12 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml ``` +### Deploy the InferencePool and Extension + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml + ``` + ### Deploy Inference Gateway Choose one of the following options to deploy an Inference Gateway. @@ -114,6 +120,14 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Please follow the [Istio installation guide](https://istio.io/latest/docs/setup/install/). + 1. 
If you run the Endpoint Picker (EPP) with TLS (with `--secureServing=true`), it is currently using a self-signed certificate + and the gateway cannot successfully validate the CA signature and the SAN. Apply the destination rule to bypass verification as + a temporary workaround. A better TLS implementation is being discussed in https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582. + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml + ``` + 1. Deploy Gateway ```bash @@ -175,12 +189,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv inference-gateway kgateway True 22s ``` -### Deploy the InferencePool and Extension - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml - ``` - ### Deploy the HTTPRoute ```bash @@ -211,6 +219,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found kubectl delete -f 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found From e82e074883ddad31ad6c7ac61319aecef0d41bb0 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:32:59 -0700 Subject: [PATCH 26/49] Update CRDs to the latest v0.2.0 release Co-authored-by: Rob Scott --- site-src/guides/index.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 110235fb..e8d528dd 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -52,8 +52,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Install the Inference Extension CRDs ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.2.0/manifests.yaml ``` ### Deploy InferenceModel From ff8b2a17b951b0b66a5f3ea7e99e8ebac72e1ecb Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:33:53 -0700 Subject: [PATCH 27/49] Update gateway to use the v1 API Co-authored-by: Rob Scott --- config/manifests/gateway/gke/gateway.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/gateway/gke/gateway.yaml b/config/manifests/gateway/gke/gateway.yaml index 3cf5645f..b231e207 100644 --- a/config/manifests/gateway/gke/gateway.yaml +++ b/config/manifests/gateway/gke/gateway.yaml @@ -1,5 +1,5 @@ kind: Gateway -apiVersion: gateway.networking.k8s.io/v1beta1 +apiVersion: 
gateway.networking.k8s.io/v1 metadata: name: inference-gateway spec: From f6f9538cd108648cb0e7742c4443c7eaffa17395 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:34:25 -0700 Subject: [PATCH 28/49] Remove weight from HTTPRoute Co-authored-by: Rob Scott --- config/manifests/gateway/httproute.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/config/manifests/gateway/httproute.yaml b/config/manifests/gateway/httproute.yaml index 500e26fd..5bd8bfb6 100644 --- a/config/manifests/gateway/httproute.yaml +++ b/config/manifests/gateway/httproute.yaml @@ -12,7 +12,6 @@ spec: - group: inference.networking.x-k8s.io kind: InferencePool name: vllm-llama2-7b - weight: 1 matches: - path: type: PathPrefix From efb8c354437d66a577c8385b0c956f6689f09472 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:36:15 -0700 Subject: [PATCH 29/49] Update gateway.yaml Remove allowed routes from GKE gateway YAML --- config/manifests/gateway/gke/gateway.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/config/manifests/gateway/gke/gateway.yaml b/config/manifests/gateway/gke/gateway.yaml index b231e207..942cde5c 100644 --- a/config/manifests/gateway/gke/gateway.yaml +++ b/config/manifests/gateway/gke/gateway.yaml @@ -8,8 +8,3 @@ spec: - name: http port: 80 protocol: HTTP - allowedRoutes: - kinds: - - kind: HTTPRoute - namespaces: - from: All From 5a2677efc737fc6fd4fa2481d8b9bc92fd78279f Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:39:13 -0700 Subject: [PATCH 30/49] Remove allowedRoutes from Istio gateway --- config/manifests/gateway/istio/gateway.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/config/manifests/gateway/istio/gateway.yaml b/config/manifests/gateway/istio/gateway.yaml index 5376b6b0..dd762678 100644 --- a/config/manifests/gateway/istio/gateway.yaml +++ b/config/manifests/gateway/istio/gateway.yaml @@ -8,8 +8,3 @@ spec: - name: http port: 80 protocol: HTTP - allowedRoutes: - kinds: - - kind: HTTPRoute - 
namespaces: - from: All From ce19438ac5471591bb79a26d92c6ff9036052c38 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 26 Mar 2025 17:39:49 -0700 Subject: [PATCH 31/49] Remove allowedRoutes from Kgateway --- config/manifests/gateway/kgateway/gateway.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/config/manifests/gateway/kgateway/gateway.yaml b/config/manifests/gateway/kgateway/gateway.yaml index dccd2889..fb146b75 100644 --- a/config/manifests/gateway/kgateway/gateway.yaml +++ b/config/manifests/gateway/kgateway/gateway.yaml @@ -10,8 +10,3 @@ spec: - name: http port: 80 protocol: HTTP - allowedRoutes: - kinds: - - kind: HTTPRoute - namespaces: - from: All From d4932587c4a6d5cdf3761d098fc90269332030fd Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Thu, 27 Mar 2025 10:32:19 -0700 Subject: [PATCH 32/49] Update latest instructions for installing Istio and addressing some comments --- site-src/guides/index.md | 75 +++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index e8d528dd..31ad86e2 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -51,8 +51,17 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Install the Inference Extension CRDs +=== "Latest Release" + + ```bash + VERSION=v0.2.0 + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml + ``` + +=== "Dev Version" + ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.2.0/manifests.yaml + kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd ``` ### Deploy InferenceModel @@ -76,25 +85,8 @@ This quickstart guide is intended for engineers familiar with k8s and model serv === "GKE" - 1. 
Enable the Gateway API - - ```bash - gcloud container clusters update \ - --location= \ - --gateway-api=standard - ``` - - 1. Create the proxy-only subnet - - A proxy-only subnet provides a set of IP addresses that Google uses to run Envoy proxies on your behalf. - ``` - gcloud compute networks subnets create proxy-only-subnet \ - --purpose=REGIONAL_MANAGED_PROXY \ - --role=ACTIVE \ - --region= \ - --network= \ - --range= - ``` + 1. Enable the Gateway API and configure proxy-only subnets when necessary. See [Deploy Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways) + for detailed instructions. 1. Deploy Gateway and HealthCheckPolicy resources @@ -117,11 +109,24 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 1. Install Istio - Please follow the [Istio installation guide](https://istio.io/latest/docs/setup/install/). + ``` + TAG=1.26-alpha.80c74f7f43482c226f4f4b10b4dda6261b67a71f + # on Linux + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz + tar -xvf istioctl-$TAG-linux-amd64.tar.gz + # on macOS + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-osx.tar.gz + tar -xvf istioctl-$TAG-osx.tar.gz + # on Windows + wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-win.zip + unzip istioctl-$TAG-win.zip + + ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing + ``` 1. If you run the Endpoint Picker (EPP) with TLS (with `--secureServing=true`), it is currently using a self-signed certificate and the gateway cannot successfully validate the CA signature and the SAN. Apply the destination rule to bypass verification as - a temporary workaround. A better TLS implementation is being discussed in https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582. + a temporary workaround. 
A better TLS implementation is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml @@ -213,7 +218,20 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Cleanup The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. - please be careful not to delete resources you'd like to keep. + Please be careful not to delete resources you'd like to keep. + + 1. Uninstall the Inference Pool + + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete secret hf-token --ignore-not-found + ``` + + 1. 
Uninstall the Gateway + ```bash kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found @@ -221,11 +239,12 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found + ``` + + 1. 
Uninstall the CRDs + + ```bash kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found - kubectl delete secret hf-token --ignore-not-found + kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found ``` From 9cb25759f1c6c6db6a8608e14b399baf6b616d2b Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Thu, 27 Mar 2025 10:41:58 -0700 Subject: [PATCH 33/49] Fix indentation for installing CRDs --- site-src/guides/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 31ad86e2..b7821674 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -53,16 +53,16 @@ This quickstart guide is intended for engineers familiar with k8s and model serv === "Latest Release" - ```bash - VERSION=v0.2.0 - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml - ``` + ```bash + VERSION=v0.2.0 + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml + ``` === "Dev Version" - ```bash - kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd - ``` + ```bash + kubectl apply -k 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd + ``` ### Deploy InferenceModel From 35a835fa6431f061360179e8fba0b09be52ac0a4 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 10:39:41 -0700 Subject: [PATCH 34/49] Addressing code review comments --- site-src/guides/index.md | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index e2c54250..d3b462a7 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -1,6 +1,10 @@ # Getting started with Gateway API Inference Extension -This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running! +??? example "Experimental" + + This project is still in an alpha state and breaking changes may occur in the future. + +This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running! ## **Prerequisites** - A cluster with: @@ -124,15 +128,15 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing ``` - 1. If you run the Endpoint Picker (EPP) with TLS (with `--secureServing=true`), it is currently using a self-signed certificate - and the gateway cannot successfully validate the CA signature and the SAN. Apply the destination rule to bypass verification as - a temporary workaround. A better TLS implementation is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). + 1. Deploy Gateway - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml - ``` + ??? note - 1. 
Deploy Gateway + If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true`, it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml + ``` ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml @@ -166,18 +170,14 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 1. Install Kgateway CRDs ```bash - helm upgrade -i --create-namespace --namespace kgateway-system --version v2.0.0-main kgateway-crds https://github.com/danehans/toolbox/raw/refs/heads/main/charts/338661f3be-kgateway-crds-1.0.1-dev.tgz + helm upgrade -i --create-namespace --namespace kgateway-system --version $VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds ``` 1. Install Kgateway ```bash - helm upgrade --install kgateway "https://github.com/danehans/toolbox/raw/refs/heads/main/charts/338661f3be-kgateway-1.0.1-dev.tgz" \ - -n kgateway-system \ - --set image.registry=danehans \ - --set image.pullPolicy=Always \ - --set inferenceExtension.enabled="true" \ - --version 1.0.1-dev + helm upgrade -i --namespace kgateway-system --version $VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway +--set inferenceExtension.enabled=true ``` 1. Deploy Gateway @@ -244,7 +244,5 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 1. 
Uninstall the CRDs ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found ``` From 0a24389fc42b4b0a5a1248541050eb7582d849f4 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 10:46:30 -0700 Subject: [PATCH 35/49] Fix indentation --- site-src/guides/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index d3b462a7..2e1d3ee9 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -128,15 +128,15 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing ``` - 1. Deploy Gateway + ??? note - ??? note + If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true`, it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). - If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true`, it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. 
A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml + ``` - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml - ``` + 1. Deploy Gateway ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml @@ -177,7 +177,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ```bash helm upgrade -i --namespace kgateway-system --version $VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway ---set inferenceExtension.enabled=true + --set inferenceExtension.enabled=true ``` 1. Deploy Gateway From c1b563b6e5caa984f131280fdd3cf8e7e7f24421 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 10:54:18 -0700 Subject: [PATCH 36/49] Update Istio installation instructions --- site-src/guides/index.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 2e1d3ee9..44ed7953 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -111,6 +111,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Please note that this feature is currently in an experimental phase and is not intended for production use. The implementation and user experience are subject to changes as we continue to iterate on this project. + 1. Requirements + + - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed. + 1. 
Install Istio ``` @@ -128,13 +132,11 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing ``` - ??? note - - If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true`, it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). + 1. If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true`, it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml - ``` + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml + ``` 1. 
Deploy Gateway From 6d3642a40c4c78504ee3f452610696df1cf4e7c9 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 10:58:42 -0700 Subject: [PATCH 37/49] Fix indentation --- site-src/guides/index.md | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 44ed7953..4ce33229 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -150,8 +150,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl label gateway llm-gateway istio.io/enable-inference-extproc=true ``` - 1. Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: - + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: ```bash $ kubectl get gateway inference-gateway NAME CLASS ADDRESS PROGRAMMED AGE @@ -224,27 +223,27 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 1. 
Uninstall the Inference Pool - ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found - kubectl delete secret hf-token --ignore-not-found + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete secret hf-token --ignore-not-found ``` 1. 
Uninstall the Gateway - ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml --ignore-not-found - ``` + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml --ignore-not-found + ``` 1. 
Uninstall the CRDs - ```bash - kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found - ``` + ```bash + kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found + ``` From 6a9f91a03209f40551b13fe65448959fb2a1dd02 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 11:00:05 -0700 Subject: [PATCH 38/49] Fix indentation --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 4ce33229..34e9cec6 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -229,7 +229,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found kubectl delete secret hf-token --ignore-not-found - ``` + ``` 1. Uninstall the Gateway From b6d4c7a8b1082b858b3648a80c759aaa1a1a0a8e Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 11:07:16 -0700 Subject: [PATCH 39/49] Add more spacing to the CPU based model instructions --- site-src/guides/index.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 34e9cec6..57e5a231 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -42,11 +42,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv This setup is using the formal `vllm-cpu` image, which according to the documentation can run vLLM on x86 CPU platform. For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica. 
- While it is possible to deploy the model server with less resources, this is not recommended. - For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes and inference requests took unreasonable time. - In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate the better performance we can get. - After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially may even get better response times. - For modifying the allocated resources, adjust the numbers in `./config/manifests/vllm/cpu-deployment.yaml` as needed. + + While it is possible to deploy the model server with fewer resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes, and inference requests took an unreasonably long time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate, the better the performance we can get. + + After running multiple configurations of these values, we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and may get even better response times. To modify the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed. Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
```bash From e9f2298315ef5e208b7f7da88a482d1a497ffc75 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 11:25:06 -0700 Subject: [PATCH 40/49] Removing comments from kgateway --- config/manifests/gateway/kgateway/gateway.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/manifests/gateway/kgateway/gateway.yaml b/config/manifests/gateway/kgateway/gateway.yaml index fb146b75..7bcd08a6 100644 --- a/config/manifests/gateway/kgateway/gateway.yaml +++ b/config/manifests/gateway/kgateway/gateway.yaml @@ -1,5 +1,3 @@ -# Requires Kgateway 2.0.0 or greater. ---- apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: From 484f19f22a8d70f1aa928617314b6a308b3dadf6 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 14:12:44 -0700 Subject: [PATCH 41/49] Add clarification on the EPP secureServing default value. Co-authored-by: Rob Scott --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 57e5a231..cf4b8dad 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -131,7 +131,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ./istioctl install --set tag=$TAG --set hub=gcr.io/istio-testing ``` - 1. If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true`, it is currently using a self-signed certificate. As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). + 1. If you run the Endpoint Picker (EPP) with the `--secureServing` flag set to `true` (the default mode), it is currently using a self-signed certificate. 
As a security measure, Istio does not trust self-signed certificates by default. As a temporary workaround, you can apply the destination rule to bypass TLS verification for EPP. A more secure TLS implementation in EPP is being discussed in [Issue 582](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/582). ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml From d71f29cb524720ff81e679a538313eae4a033610 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 14:28:31 -0700 Subject: [PATCH 42/49] Add instructions for configuring timeout --- site-src/guides/index.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index cf4b8dad..b671461a 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -199,6 +199,28 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute.yaml ``` +### Configure Timeouts + + Given that the default timeouts for the above implementations may be insufficient for most inference workloads, it is recommended to configure a timeout appropriate for your intended use case. + +=== "GKE" + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml + ``` + +=== "Istio" + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute-with-timeout.yaml + ``` + +=== "Kgateway" + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/httproute-with-timeout.yaml + ``` + ### Try it out Wait until the gateway is ready.
From 41fc08323fe0287e8fb85305a4ce84c1ccaacac2 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 14:32:55 -0700 Subject: [PATCH 43/49] Create httproute-with-timeout.yaml --- .../gateway/httproute-with-timeout.yaml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 config/manifests/gateway/httproute-with-timeout.yaml diff --git a/config/manifests/gateway/httproute-with-timeout.yaml b/config/manifests/gateway/httproute-with-timeout.yaml new file mode 100644 index 00000000..060f18c5 --- /dev/null +++ b/config/manifests/gateway/httproute-with-timeout.yaml @@ -0,0 +1,20 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: vllm-llama2-7b + matches: + - path: + type: PathPrefix + value: / + timeouts: + request: 300s From d5fd70fd45fd5a1070f9bf1d71ec755a3d495279 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 14:39:51 -0700 Subject: [PATCH 44/49] Create gcp-backend-policy.yaml --- config/manifests/gateway/gke/gcp-backend-policy.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 config/manifests/gateway/gke/gcp-backend-policy.yaml diff --git a/config/manifests/gateway/gke/gcp-backend-policy.yaml b/config/manifests/gateway/gke/gcp-backend-policy.yaml new file mode 100644 index 00000000..519a5a93 --- /dev/null +++ b/config/manifests/gateway/gke/gcp-backend-policy.yaml @@ -0,0 +1,11 @@ +apiVersion: networking.gke.io/v1 +kind: GCPBackendPolicy +metadata: + name: inferencepool-backend-policy +spec: + targetRef: + group: "inference.networking.x-k8s.io" + kind: InferencePool + name: vllm-llama3-8b-instruct + default: + timeoutSec: 300 From d0ddd165945f2d183c35a523716e050b59191839 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 14:45:25 -0700 Subject: 
[PATCH 45/49] Add cleanup for GCPBackendPolicy --- site-src/guides/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index b671461a..e147f554 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -257,6 +257,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ```bash kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/healthcheck.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/gateway.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/istio/destination-rule.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml --ignore-not-found From e1c0b1d53814a9cf0a2bd9d2d7313af58f4bf07a Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 15:07:27 -0700 Subject: [PATCH 46/49] Remove namespace from destination-rule.yaml --- config/manifests/gateway/istio/destination-rule.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/gateway/istio/destination-rule.yaml b/config/manifests/gateway/istio/destination-rule.yaml index a295273c..f9cd0c3c 100644 --- a/config/manifests/gateway/istio/destination-rule.yaml +++ b/config/manifests/gateway/istio/destination-rule.yaml @@ -3,7 +3,7 @@ kind: DestinationRule metadata: name: epp-insecure-tls spec: - 
host: vllm-llama2-7b-epp.default.svc.cluster.local + host: vllm-llama2-7b-epp trafficPolicy: tls: mode: SIMPLE From e4471ec610161b9086d2398bb130e15e5566a7e4 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 15:11:51 -0700 Subject: [PATCH 47/49] Rename inferencepool.yaml to inferencepool-resources.yaml --- .../{inferencepool.yaml => inferencepool-resources.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename config/manifests/{inferencepool.yaml => inferencepool-resources.yaml} (100%) diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool-resources.yaml similarity index 100% rename from config/manifests/inferencepool.yaml rename to config/manifests/inferencepool-resources.yaml From 365d847ccc64331bb09a21a8f026e71da83f978b Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 15:15:22 -0700 Subject: [PATCH 48/49] Rename inferencepool.yaml to inferencepool-resources.yaml --- test/e2e/epp/e2e_suite_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index f9dea1cc..643bbf75 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -75,7 +75,7 @@ const ( // inferModelManifest is the manifest for the inference model CRD. inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../../config/manifests/inferencepool.yaml" + inferExtManifest = "../../../config/manifests/inferencepool-resources.yaml" // envoyManifest is the manifest for the envoy proxy test resources. envoyManifest = "../../testdata/envoy.yaml" // modelServerManifestFilepathEnvVar is the env var that holds absolute path to the manifest for the model server test resource. 
From c82487d6e599e525e69a0ba13b237199e91c266f Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Fri, 28 Mar 2025 15:16:27 -0700 Subject: [PATCH 49/49] Rename inferencepool.yaml to inferencepool-resources.yaml --- site-src/guides/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index e147f554..4548d5cd 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -79,7 +79,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy the InferencePool and Extension ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml ``` ### Deploy Inference Gateway @@ -245,7 +245,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 1. Uninstall the Inference Pool ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found