From 6965c6a21f3679135adeaf5013974d21983b0ed9 Mon Sep 17 00:00:00 2001
From: ahg-g
Date: Tue, 18 Mar 2025 01:03:06 +0000
Subject: [PATCH 1/2] split the extension policy since it is envoy specific

---
 .../manifests/gateway/extension_policy.yaml | 33 +++++++++++++++++++
 config/manifests/inferencepool.yaml         | 33 -------------------
 site-src/guides/index.md                    |  1 +
 3 files changed, 34 insertions(+), 33 deletions(-)
 create mode 100644 config/manifests/gateway/extension_policy.yaml

diff --git a/config/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml
new file mode 100644
index 00000000..69021209
--- /dev/null
+++ b/config/manifests/gateway/extension_policy.yaml
@@ -0,0 +1,33 @@
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+      - group: ""
+        kind: Service
+        name: vllm-llama2-7b-epp
+        port: 9002
+      processingMode:
+        allowModeOverride: true
+        request:
+          body: Buffered
+        response:
+      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
+      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
+      messageTimeout: 1000s
+      backendSettings:
+        circuitBreaker:
+          maxConnections: 40000
+          maxPendingRequests: 40000
+          maxParallelRequests: 40000
+        timeout:
+          tcp:
+            connectTimeout: 24h
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: llm-route
+
diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool.yaml
index 8225bd7c..64008639 100644
--- a/config/manifests/inferencepool.yaml
+++ b/config/manifests/inferencepool.yaml
@@ -75,39 +75,6 @@ spec:
           initialDelaySeconds: 5
           periodSeconds: 10
 ---
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyExtensionPolicy
-metadata:
-  name: ext-proc-policy
-  namespace: default
-spec:
-  extProc:
-    - backendRefs:
-      - group: ""
-        kind: Service
-        name: vllm-llama2-7b-epp
-        port: 9002
-      processingMode:
-        allowModeOverride: true
-        request:
-          body: Buffered
-        response:
-      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
-      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route
----
 kind: ClusterRole
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index d721e73f..5a3ad269 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -88,6 +88,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 ### Deploy Envoy Gateway Custom Policies
 
 ```bash
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml
 ```
 > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
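For anyone following the series, the split-out policy from patch 1 can be applied and sanity-checked roughly as below. This is only a sketch: it assumes Envoy Gateway and its `gateway.envoyproxy.io` CRDs are already installed, the resource and namespace names (`ext-proc-policy`, `default`) come from the manifest above, and the separate `extension_policy.yaml` URL only exists at this point in the series (the follow-up patch folds it back into `patch_policy.yaml`).

```bash
# Sketch, not part of the patch: apply the split-out manifest introduced by PATCH 1/2.
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml

# Confirm the EnvoyExtensionPolicy exists and inspect its status conditions
# (the exact status layout depends on the Envoy Gateway version).
kubectl get envoyextensionpolicy ext-proc-policy -n default -o yaml
```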
From a07beb20a3fcfdc4ab8a0d74fdc3ebc215e8ef36 Mon Sep 17 00:00:00 2001
From: ahg-g
Date: Tue, 18 Mar 2025 01:15:05 +0000
Subject: [PATCH 2/2] merge extension and patch policy into one manifest

---
 .../manifests/gateway/extension_policy.yaml | 33 ------------------
 config/manifests/gateway/patch_policy.yaml  | 34 ++++++++++++++++++-
 site-src/guides/index.md                    |  1 -
 3 files changed, 33 insertions(+), 35 deletions(-)
 delete mode 100644 config/manifests/gateway/extension_policy.yaml

diff --git a/config/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml
deleted file mode 100644
index 69021209..00000000
--- a/config/manifests/gateway/extension_policy.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyExtensionPolicy
-metadata:
-  name: ext-proc-policy
-  namespace: default
-spec:
-  extProc:
-    - backendRefs:
-      - group: ""
-        kind: Service
-        name: vllm-llama2-7b-epp
-        port: 9002
-      processingMode:
-        allowModeOverride: true
-        request:
-          body: Buffered
-        response:
-      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
-      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route
-
diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml
index 3c36ed7a..d293bc82 100644
--- a/config/manifests/gateway/patch_policy.yaml
+++ b/config/manifests/gateway/patch_policy.yaml
@@ -85,4 +85,36 @@ spec:
   #       op: replace
   #       path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode"
   #       value: SEND
-
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+      - group: ""
+        kind: Service
+        name: vllm-llama2-7b-epp
+        port: 9002
+      processingMode:
+        allowModeOverride: true
+        request:
+          body: Buffered
+        response:
+      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
+      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
+      messageTimeout: 1000s
+      backendSettings:
+        circuitBreaker:
+          maxConnections: 40000
+          maxPendingRequests: 40000
+          maxParallelRequests: 40000
+        timeout:
+          tcp:
+            connectTimeout: 24h
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: llm-route
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 5a3ad269..d721e73f 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -88,7 +88,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 ### Deploy Envoy Gateway Custom Policies
 
 ```bash
-kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml
 ```
 > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
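With patch 2 applied, the quickstart is back to a single `kubectl apply` for the Envoy-specific policies, since `patch_policy.yaml` now carries both the patch policy and the `EnvoyExtensionPolicy`. A minimal verification might look like the sketch below; the plural resource names are assumptions based on the usual Envoy Gateway CRD naming, and the commands require those CRDs to be installed.

```bash
# Sketch, not part of the patch: the merged manifest is applied exactly as the guide shows.
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml

# Both policy kinds should now appear in the default namespace.
kubectl get envoypatchpolicies,envoyextensionpolicies -n default
```

Checking the policies' status conditions after applying is also a cheap way to catch the misconfiguration the manifest comment warns about, where undersized circuit-breaker limits surface only as an opaque `ext_proc_gRPC_error_14` in Envoy Gateway.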