diff --git a/examples/poc/manifests/gateway/enable_patch_policy.yaml b/examples/poc/manifests/gateway/enable_patch_policy.yaml deleted file mode 100644 index c1d00e9a..00000000 --- a/examples/poc/manifests/gateway/enable_patch_policy.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: envoy-gateway-system -data: -# This manifest's main purpose is to set `enabledEnvoyPatchPolicy` to `true`. -# Any field under `admin` is optional, and only for enabling the admin endpoints, for debugging. -# Admin Interface: https://www.envoyproxy.io/docs/envoy/latest/operations/admin -# PatchPolicy docs: https://gateway.envoyproxy.io/docs/tasks/extensibility/envoy-patch-policy/#enable-envoypatchpolicy - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - provider: - type: Kubernetes - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true -# admin: -# enablePprof: true -# address: -# host: 127.0.0.1 -# port: 19000 -# enabledDumpConfig: true diff --git a/examples/poc/manifests/gateway/ext_proc.yaml b/examples/poc/manifests/gateway/ext_proc.yaml deleted file mode 100644 index 6112fa9e..00000000 --- a/examples/poc/manifests/gateway/ext_proc.yaml +++ /dev/null @@ -1,69 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: instance-gateway-ext-proc - namespace: default - labels: - app: instance-gateway-ext-proc -spec: - replicas: 1 - selector: - matchLabels: - app: instance-gateway-ext-proc - template: - metadata: - labels: - app: instance-gateway-ext-proc - spec: - containers: - - name: instance-gateway-ext-proc - image: ghcr.io/tomatillo-and-multiverse/ext-proc:demo - args: - #TODO: specify label selector and dynamically update pods - - -pods - - "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84" - - -podIPs - - "10.24.11.6:8000,10.24.5.7:8000" - - -enable-fairness - - "false" - ports: - - containerPort: 9002 - - name: curl - image: curlimages/curl - command: ["sleep", "3600"] ---- -apiVersion: v1 -kind: Service -metadata: - name: instance-gateway-ext-proc - namespace: default -spec: - selector: - app: instance-gateway-ext-proc - ports: - - protocol: TCP - port: 9002 - targetPort: 9002 - type: ClusterIP ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: ext-proc-policy - namespace: default -spec: - extProc: - - backendRefs: - - group: "" - kind: Service - name: instance-gateway-ext-proc - port: 9002 - processingMode: - request: - body: Buffered - response: - messageTimeout: 5s - targetRef: - group: gateway.networking.k8s.io - kind: HTTPRoute - name: llm-route \ No newline at end of file diff --git a/examples/poc/manifests/gateway/gateway.yaml b/examples/poc/manifests/gateway/gateway.yaml deleted file mode 100644 index b964f911..00000000 --- a/examples/poc/manifests/gateway/gateway.yaml +++ /dev/null @@ -1,47 +0,0 @@ - ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: -spec: - gatewayClassName: - listeners: - - name: http - protocol: HTTP - port: 8080 - - name: llm-gw - protocol: HTTP - port: 8081 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller ---- -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: Backend -metadata: - name: backend-dummy -spec: - endpoints: - - fqdn: - # Both these values are arbitrary and unused as the PatchPolicy redirects requests. - hostname: 'foo.bar.com' - port: 8080 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route -spec: - parentRefs: - - name: inference-gateway - sectionName: llm-gw - rules: - - backendRefs: - - group: gateway.envoyproxy.io - kind: Backend - name: backend-dummy \ No newline at end of file diff --git a/examples/poc/manifests/gateway/patch_policy.yaml b/examples/poc/manifests/gateway/patch_policy.yaml deleted file mode 100644 index b7681954..00000000 --- a/examples/poc/manifests/gateway/patch_policy.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyPatchPolicy -metadata: - name: custom-response-patch-policy - namespace: default -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: - type: JSONPatch - jsonPatches: - # Necessary to create a cluster of the type: ORIGINAL_DST to allow for - # direct pod scheduling. Which is heavily utilized in our scheduling. - # Specifically the field `original_dst_lb_config` allows us to enable - # `use_http_header` and `http_header_name`. - # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: original_destination_cluster - operation: - op: add - path: "" - value: - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "target-pod" - connect_timeout: 6s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" - name: default//llm-gw - operation: - op: replace - path: "/virtual_hosts/1/routes/0/route/cluster" - value: original_destination_cluster \ No newline at end of file diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go index 7041f8b8..aa705438 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -38,6 +38,9 @@ func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.Pr } // HandleResponseBody parses response body to update information such as number of completion tokens. +// NOTE: The current implementation only supports Buffered mode, which is not enabled by default. To +// use it, you need to configure EnvoyExtensionPolicy to have response body in Buffered mode. +// https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto#envoy-v3-api-msg-extensions-filters-http-ext-proc-v3-processingmode // Example response /* { diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 40adce79..2ea1e4f0 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -94,7 +94,6 @@ spec: request: body: Buffered response: - body: Buffered # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly. # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly. messageTimeout: 1000s