9
9
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
10
10
klog "k8s.io/klog/v2"
11
11
12
+ "inference.networking.x-k8s.io/llm-instance-gateway/api/v1alpha1"
13
+ "inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/backend"
12
14
"inference.networking.x-k8s.io/llm-instance-gateway/pkg/ext-proc/scheduling"
13
15
)
14
16
@@ -33,25 +35,38 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
33
35
return nil , fmt .Errorf ("model not found in request" )
34
36
}
35
37
klog .V (3 ).Infof ("Model requested: %v" , model )
38
+ modelName := model
39
+
40
+ // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently.
41
+ // This might be a security risk in the future where adapters not registered in the LLMService
42
+ // are able to be requested by using their distinct name.
43
+ modelObj := s .FetchModelData (model )
44
+ if modelObj != nil && len (modelObj .TargetModels ) > 0 {
45
+ modelName = backend .RandomWeightedDraw (modelObj )
46
+ if modelName == "" {
47
+ return nil , fmt .Errorf ("Error getting target model name for model %v" , modelObj .Name )
48
+ }
49
+ }
50
+ klog .Infof ("Model is null %v" , modelObj == nil )
36
51
llmReq := & scheduling.LLMRequest {
37
- Model : model ,
38
- // For now use the model as the target model.
39
- // TODO: Once the API is approved, read the "LLMUseCase" configuration and apply traffic split.
40
- TargetModels : map [string ]int {model : 100 },
41
- ResolvedTargetModel : model ,
42
- // TODO: Read from LLMService CRD.
43
- Critical : true ,
52
+ Model : model ,
53
+ ResolvedTargetModel : modelName ,
54
+ Critical : backend .ModelHasObjective (modelObj ),
44
55
}
45
56
klog .V (3 ).Infof ("LLM Request: %+v" , llmReq )
46
57
58
+ requestBody := v .RequestBody .Body
59
+ var err error
47
60
// Update target models in the body.
48
- rb ["model" ] = llmReq .ResolvedTargetModel
49
- updatedBody , err := json .Marshal (rb )
50
- if err != nil {
51
- klog .Errorf ("Error marshaling request body: %v" , err )
52
- return nil , fmt .Errorf ("error marshaling request body: %v" , err )
61
+ if llmReq .Model != llmReq .ResolvedTargetModel {
62
+ rb ["model" ] = llmReq .ResolvedTargetModel
63
+ requestBody , err = json .Marshal (rb )
64
+ if err != nil {
65
+ klog .Errorf ("Error marshaling request body: %v" , err )
66
+ return nil , fmt .Errorf ("error marshaling request body: %v" , err )
67
+ }
68
+ klog .V (3 ).Infof ("Updated body: %v" , string (requestBody ))
53
69
}
54
- klog .V (3 ).Infof ("Updated body: %v" , string (updatedBody ))
55
70
56
71
targetPod , err := s .scheduler .Schedule (llmReq )
57
72
if err != nil {
@@ -75,7 +90,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
75
90
{
76
91
Header : & configPb.HeaderValue {
77
92
Key : "Content-Length" ,
78
- RawValue : []byte (strconv .Itoa (len (updatedBody ))),
93
+ RawValue : []byte (strconv .Itoa (len (requestBody ))),
79
94
},
80
95
},
81
96
}
@@ -93,7 +108,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
93
108
},
94
109
BodyMutation : & extProcPb.BodyMutation {
95
110
Mutation : & extProcPb.BodyMutation_Body {
96
- Body : updatedBody ,
111
+ Body : requestBody ,
97
112
},
98
113
},
99
114
},
@@ -103,6 +118,22 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces
103
118
return resp , nil
104
119
}
105
120
121
+ func (s * Server ) FetchModelData (modelName string ) (returnModel * v1alpha1.Model ) {
122
+ s .datastore .LLMServices .Range (func (k , v any ) bool {
123
+ service := v .(* v1alpha1.LLMService )
124
+ klog .Infof ("Service name: %v" , service .Name )
125
+ for _ , model := range service .Spec .Models {
126
+ if model .Name == modelName {
127
+ returnModel = & model
128
+ // We want to stop iterating, return false.
129
+ return false
130
+ }
131
+ }
132
+ return true
133
+ })
134
+ return
135
+ }
136
+
106
137
func HandleRequestHeaders (reqCtx * RequestContext , req * extProcPb.ProcessingRequest ) * extProcPb.ProcessingResponse {
107
138
klog .V (3 ).Info ("--- In RequestHeaders processing ..." )
108
139
r := req .Request
0 commit comments