Skip to content

Commit 614199e

Browse files
authored
Update default target-pod and inject it into response metadata (#270)
* Update default target-pod and inject it into response metadata * Addressing comments round 1 * Update the endpoint picker proposal * define the behavior when the two values differ
1 parent 5d32bf3 commit 614199e

File tree

9 files changed

+63
-37
lines changed

9 files changed

+63
-37
lines changed

Diff for: docs/proposals/003-endpoint-picker-protocol/README.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ The EPP MUST implement the Envoy
1212
[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor)protocol.
1313

1414
For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint, via
15-
adding the `target-pod` HTTP header in the request, or otherwise return an error.
15+
adding the `x-gateway-destination-endpoint` HTTP header in the request and as an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response, or otherwise return an error. The EPP MUST NOT set two different values in the header and the response metadata.
16+
Setting different values leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define which one takes precedence.
1617

1718
## Model Server Protocol
1819

@@ -62,4 +63,4 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro
6263
Requests will be queued if the model server has reached MaxActiveAdapter and cannot load the
6364
requested adapter. Example: `"max_lora": "8"`.
6465
* `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU
65-
memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"`
66+
memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"`

Diff for: pkg/ext-proc/handlers/request.go

+15-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
1010
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
11+
"google.golang.org/protobuf/types/known/structpb"
1112
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
1213
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling"
1314
logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging"
@@ -81,11 +82,11 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
8182
reqCtx.RequestSize = len(v.RequestBody.Body)
8283
reqCtx.TargetPod = targetPod
8384

84-
// Insert "target-pod" to instruct Envoy to route requests to the specified target pod.
85+
// Insert target endpoint to instruct Envoy to route requests to the specified target pod.
8586
headers := []*configPb.HeaderValueOption{
8687
{
8788
Header: &configPb.HeaderValue{
88-
Key: s.targetPodHeader,
89+
Key: s.targetEndpointKey,
8990
RawValue: []byte(targetPod.Address),
9091
},
9192
},
@@ -104,6 +105,9 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
104105
}
105106

106107
resp := &extProcPb.ProcessingResponse{
108+
// The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header
109+
// and as an unstructured ext-proc response metadata key/value pair. This enables different integration
110+
// options for gateway providers.
107111
Response: &extProcPb.ProcessingResponse_RequestBody{
108112
RequestBody: &extProcPb.BodyResponse{
109113
Response: &extProcPb.CommonResponse{
@@ -118,6 +122,15 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
118122
},
119123
},
120124
},
125+
DynamicMetadata: &structpb.Struct{
126+
Fields: map[string]*structpb.Value{
127+
s.targetEndpointKey: {
128+
Kind: &structpb.Value_StringValue{
129+
StringValue: targetPod.Address,
130+
},
131+
},
132+
},
133+
},
121134
}
122135
return resp, nil
123136
}

Diff for: pkg/ext-proc/handlers/server.go

+7-7
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@ import (
1616
klog "k8s.io/klog/v2"
1717
)
1818

19-
func NewServer(pp PodProvider, scheduler Scheduler, targetPodHeader string, datastore ModelDataStore) *Server {
19+
func NewServer(pp PodProvider, scheduler Scheduler, targetEndpointKey string, datastore ModelDataStore) *Server {
2020
return &Server{
21-
scheduler: scheduler,
22-
podProvider: pp,
23-
targetPodHeader: targetPodHeader,
24-
datastore: datastore,
21+
scheduler: scheduler,
22+
podProvider: pp,
23+
targetEndpointKey: targetEndpointKey,
24+
datastore: datastore,
2525
}
2626
}
2727

@@ -32,8 +32,8 @@ type Server struct {
3232
podProvider PodProvider
3333
// The key of the header and dynamic-metadata entry that carry the target pod address. This value needs to match Envoy
3434
// configuration.
35-
targetPodHeader string
36-
datastore ModelDataStore
35+
targetEndpointKey string
36+
datastore ModelDataStore
3737
}
3838

3939
type Scheduler interface {

Diff for: pkg/ext-proc/main.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ var (
4141
"The port used for gRPC liveness and readiness probes")
4242
metricsPort = flag.Int(
4343
"metricsPort", 9090, "The metrics port")
44-
targetPodHeader = flag.String(
45-
"targetPodHeader",
46-
runserver.DefaultTargetPodHeader,
44+
targetEndpointKey = flag.String(
45+
"targetEndpointKey",
46+
runserver.DefaultTargetEndpointKey,
4747
"Header key used by Envoy to route to the appropriate pod. This must match Envoy configuration.")
4848
poolName = flag.String(
4949
"poolName",
@@ -103,7 +103,7 @@ func main() {
103103

104104
serverRunner := &runserver.ExtProcServerRunner{
105105
GrpcPort: *grpcPort,
106-
TargetPodHeader: *targetPodHeader,
106+
TargetEndpointKey: *targetEndpointKey,
107107
PoolName: *poolName,
108108
PoolNamespace: *poolNamespace,
109109
ServiceName: *serviceName,

Diff for: pkg/ext-proc/server/runserver.go

+11-11
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import (
2020
// ExtProcServerRunner provides methods to manage an external process server.
2121
type ExtProcServerRunner struct {
2222
GrpcPort int
23-
TargetPodHeader string
23+
TargetEndpointKey string
2424
PoolName string
2525
PoolNamespace string
2626
ServiceName string
@@ -35,20 +35,20 @@ type ExtProcServerRunner struct {
3535

3636
// Default values for CLI flags in main
3737
const (
38-
DefaultGrpcPort = 9002 // default for --grpcPort
39-
DefaultTargetPodHeader = "target-pod" // default for --targetPodHeader
40-
DefaultPoolName = "" // required but no default
41-
DefaultPoolNamespace = "default" // default for --poolNamespace
42-
DefaultServiceName = "" // required but no default
43-
DefaultZone = "" // default for --zone
44-
DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval
45-
DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval
38+
DefaultGrpcPort = 9002 // default for --grpcPort
39+
DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey
40+
DefaultPoolName = "" // required but no default
41+
DefaultPoolNamespace = "default" // default for --poolNamespace
42+
DefaultServiceName = "" // required but no default
43+
DefaultZone = "" // default for --zone
44+
DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval
45+
DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval
4646
)
4747

4848
func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
4949
return &ExtProcServerRunner{
5050
GrpcPort: DefaultGrpcPort,
51-
TargetPodHeader: DefaultTargetPodHeader,
51+
TargetEndpointKey: DefaultTargetEndpointKey,
5252
PoolName: DefaultPoolName,
5353
PoolNamespace: DefaultPoolNamespace,
5454
ServiceName: DefaultServiceName,
@@ -130,7 +130,7 @@ func (r *ExtProcServerRunner) Start(
130130
// Register ext_proc handlers
131131
extProcPb.RegisterExternalProcessorServer(
132132
svr,
133-
handlers.NewServer(pp, scheduling.NewScheduler(pp), r.TargetPodHeader, r.Datastore),
133+
handlers.NewServer(pp, scheduling.NewScheduler(pp), r.TargetEndpointKey, r.Datastore),
134134
)
135135

136136
// Blocking and will return when shutdown is complete.

Diff for: pkg/manifests/gateway/patch_policy.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ spec:
2525
type: ORIGINAL_DST
2626
original_dst_lb_config:
2727
use_http_header: true
28-
http_header_name: "target-pod"
28+
http_header_name: "x-gateway-destination-endpoint"
2929
connect_timeout: 1000s
3030
lb_policy: CLUSTER_PROVIDED
3131
dns_lookup_family: V4_ONLY
@@ -40,4 +40,4 @@ spec:
4040
operation:
4141
op: replace
4242
path: "/virtual_hosts/0/routes/0/route/cluster"
43-
value: original_destination_cluster
43+
value: original_destination_cluster

Diff for: pkg/manifests/vllm/deployment.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,4 @@ spec:
132132
emptyDir:
133133
medium: Memory
134134
- name: adapters
135-
emptyDir: {}
135+
emptyDir: {}

Diff for: test/integration/hermetic_test.go

+19-7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"google.golang.org/grpc"
2222
"google.golang.org/grpc/credentials/insecure"
2323
"google.golang.org/protobuf/testing/protocmp"
24+
"google.golang.org/protobuf/types/known/structpb"
2425
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
2526
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
2627
runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server"
@@ -111,7 +112,7 @@ func SKIPTestHandleRequestBody(t *testing.T) {
111112
wantHeaders: []*configPb.HeaderValueOption{
112113
{
113114
Header: &configPb.HeaderValue{
114-
Key: runserver.DefaultTargetPodHeader,
115+
Key: runserver.DefaultTargetEndpointKey,
115116
RawValue: []byte("address-1"),
116117
},
117118
},
@@ -162,11 +163,12 @@ func SKIPTestHandleRequestBody(t *testing.T) {
162163

163164
func TestKubeInferenceModelRequest(t *testing.T) {
164165
tests := []struct {
165-
name string
166-
req *extProcPb.ProcessingRequest
167-
wantHeaders []*configPb.HeaderValueOption
168-
wantBody []byte
169-
wantErr bool
166+
name string
167+
req *extProcPb.ProcessingRequest
168+
wantHeaders []*configPb.HeaderValueOption
169+
wantMetadata *structpb.Struct
170+
wantBody []byte
171+
wantErr bool
170172
}{
171173
{
172174
name: "success",
@@ -176,7 +178,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
176178
wantHeaders: []*configPb.HeaderValueOption{
177179
{
178180
Header: &configPb.HeaderValue{
179-
Key: runserver.DefaultTargetPodHeader,
181+
Key: runserver.DefaultTargetEndpointKey,
180182
RawValue: []byte("address-1"),
181183
},
182184
},
@@ -187,6 +189,15 @@ func TestKubeInferenceModelRequest(t *testing.T) {
187189
},
188190
},
189191
},
192+
wantMetadata: &structpb.Struct{
193+
Fields: map[string]*structpb.Value{
194+
runserver.DefaultTargetEndpointKey: {
195+
Kind: &structpb.Value_StringValue{
196+
StringValue: "address-1",
197+
},
198+
},
199+
},
200+
},
190201
wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"),
191202
wantErr: false,
192203
},
@@ -249,6 +260,7 @@ func TestKubeInferenceModelRequest(t *testing.T) {
249260
},
250261
},
251262
},
263+
DynamicMetadata: test.wantMetadata,
252264
}
253265
res, err := sendRequest(t, client, test.req)
254266

Diff for: test/testdata/envoy.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ data:
158158
max_requests: 40000
159159
original_dst_lb_config:
160160
use_http_header: true
161-
http_header_name: target-pod
161+
http_header_name: x-gateway-destination-endpoint
162162
- name: ext_proc
163163
type: STRICT_DNS
164164
connect_timeout: 86400s

0 commit comments

Comments
 (0)