From 6965c6a21f3679135adeaf5013974d21983b0ed9 Mon Sep 17 00:00:00 2001
From: ahg-g
Date: Tue, 18 Mar 2025 01:03:06 +0000
Subject: [PATCH 1/2] split the extension policy since it is envoy specific

---
 .../manifests/gateway/extension_policy.yaml | 33 +++++++++++++++++++
 config/manifests/inferencepool.yaml         | 33 -------------------
 site-src/guides/index.md                    |  1 +
 3 files changed, 34 insertions(+), 33 deletions(-)
 create mode 100644 config/manifests/gateway/extension_policy.yaml

diff --git a/config/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml
new file mode 100644
index 00000000..69021209
--- /dev/null
+++ b/config/manifests/gateway/extension_policy.yaml
@@ -0,0 +1,33 @@
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+      - group: ""
+        kind: Service
+        name: vllm-llama2-7b-epp
+        port: 9002
+      processingMode:
+        allowModeOverride: true
+        request:
+          body: Buffered
+        response:
+      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
+      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
+      messageTimeout: 1000s
+      backendSettings:
+        circuitBreaker:
+          maxConnections: 40000
+          maxPendingRequests: 40000
+          maxParallelRequests: 40000
+        timeout:
+          tcp:
+            connectTimeout: 24h
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: llm-route
+
diff --git a/config/manifests/inferencepool.yaml b/config/manifests/inferencepool.yaml
index 8225bd7c..64008639 100644
--- a/config/manifests/inferencepool.yaml
+++ b/config/manifests/inferencepool.yaml
@@ -75,39 +75,6 @@ spec:
           initialDelaySeconds: 5
           periodSeconds: 10
 ---
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyExtensionPolicy
-metadata:
-  name: ext-proc-policy
-  namespace: default
-spec:
-  extProc:
-    - backendRefs:
-      - group: ""
-        kind: Service
-        name: vllm-llama2-7b-epp
-        port: 9002
-      processingMode:
-        allowModeOverride: true
-        request:
-          body: Buffered
-        response:
-      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
-      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route
----
 kind: ClusterRole
 apiVersion: rbac.authorization.k8s.io/v1
 metadata:
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index d721e73f..5a3ad269 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -88,6 +88,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 ### Deploy Envoy Gateway Custom Policies
 
 ```bash
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml
 ```
 > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
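For anyone following the series, the split-out policy from patch 1 can be applied and sanity-checked roughly as below. This is only a sketch: it assumes Envoy Gateway and its `gateway.envoyproxy.io` CRDs are already installed, the resource and namespace names (`ext-proc-policy`, `default`) come from the manifest above, and the separate `extension_policy.yaml` URL only exists at this point in the series (the follow-up patch folds it back into `patch_policy.yaml`).

```bash
# Sketch, not part of the patch: apply the split-out manifest introduced by PATCH 1/2.
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml

# Confirm the EnvoyExtensionPolicy exists and inspect its status conditions
# (the exact status layout depends on the Envoy Gateway version).
kubectl get envoyextensionpolicy ext-proc-policy -n default -o yaml
```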
From a07beb20a3fcfdc4ab8a0d74fdc3ebc215e8ef36 Mon Sep 17 00:00:00 2001
From: ahg-g
Date: Tue, 18 Mar 2025 01:15:05 +0000
Subject: [PATCH 2/2] merge extension and patch policy into one manifest

---
 .../manifests/gateway/extension_policy.yaml | 33 ------------------
 config/manifests/gateway/patch_policy.yaml  | 34 ++++++++++++++++++-
 site-src/guides/index.md                    |  1 -
 3 files changed, 33 insertions(+), 35 deletions(-)
 delete mode 100644 config/manifests/gateway/extension_policy.yaml

diff --git a/config/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml
deleted file mode 100644
index 69021209..00000000
--- a/config/manifests/gateway/extension_policy.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-apiVersion: gateway.envoyproxy.io/v1alpha1
-kind: EnvoyExtensionPolicy
-metadata:
-  name: ext-proc-policy
-  namespace: default
-spec:
-  extProc:
-    - backendRefs:
-      - group: ""
-        kind: Service
-        name: vllm-llama2-7b-epp
-        port: 9002
-      processingMode:
-        allowModeOverride: true
-        request:
-          body: Buffered
-        response:
-      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
-      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
-      messageTimeout: 1000s
-      backendSettings:
-        circuitBreaker:
-          maxConnections: 40000
-          maxPendingRequests: 40000
-          maxParallelRequests: 40000
-        timeout:
-          tcp:
-            connectTimeout: 24h
-  targetRef:
-    group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: llm-route
-
diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml
index 3c36ed7a..d293bc82 100644
--- a/config/manifests/gateway/patch_policy.yaml
+++ b/config/manifests/gateway/patch_policy.yaml
@@ -85,4 +85,36 @@ spec:
   #       op: replace
   #       path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode"
   #       value: SEND
-
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+      - group: ""
+        kind: Service
+        name: vllm-llama2-7b-epp
+        port: 9002
+      processingMode:
+        allowModeOverride: true
+        request:
+          body: Buffered
+        response:
+      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
+      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
+      messageTimeout: 1000s
+      backendSettings:
+        circuitBreaker:
+          maxConnections: 40000
+          maxPendingRequests: 40000
+          maxParallelRequests: 40000
+        timeout:
+          tcp:
+            connectTimeout: 24h
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: llm-route
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 5a3ad269..d721e73f 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -88,7 +88,6 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 ### Deploy Envoy Gateway Custom Policies
 
 ```bash
-kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml
 ```
 > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
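With patch 2 applied, the quickstart is back to a single `kubectl apply` for the Envoy-specific policies, since `patch_policy.yaml` now carries both the patch policy and the `EnvoyExtensionPolicy`. A minimal verification might look like the sketch below; the plural resource names are assumptions based on the usual Envoy Gateway CRD naming, and the commands require those CRDs to be installed.

```bash
# Sketch, not part of the patch: the merged manifest is applied exactly as the guide shows.
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml

# Both policy kinds should now appear in the default namespace.
kubectl get envoypatchpolicies,envoyextensionpolicies -n default
```

Checking the policies' status conditions after applying is also a cheap way to catch the misconfiguration the manifest comment warns about, where undersized circuit-breaker limits surface only as an opaque `ext_proc_gRPC_error_14` in Envoy Gateway.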