Enhancements to LLM Instance Gateway: Scheduling Logic and Documentation Updates #78

Merged (17 commits, Dec 10, 2024)

Changes from 8 commits
Binary file added docs/schedular-flowchart.png
95 changes: 86 additions & 9 deletions examples/poc/manifests/llmservice.yaml
@@ -1,4 +1,14 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: LLMServerPool
+metadata:
+  labels:
+  name: vllm-llama2-7b-pool
+spec:
+  targetPort: 8000
+  modelServerSelector:
+    "app": "vllm-llama2-7b-pool"
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMService
metadata:
  labels:
@@ -7,17 +17,84 @@ metadata:
  name: llmservice-sample
spec:
  models:
-  - name: sql-code-assist
-  - name: npc-bot
+  - name: sql-lora
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora
+      weight: 100
+  - name: sql-lora-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-0
+      weight: 100
+  - name: sql-lora-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-1
+      weight: 100
+  - name: sql-lora-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-2
+      weight: 100
+  - name: sql-lora-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-3
+      weight: 100
+  - name: sql-lora-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-4
+      weight: 100
+  - name: tweet-summary
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary
+      weight: 100
+  - name: tweet-summary-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-0
+      weight: 100
+  - name: tweet-summary-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-1
+      weight: 100
+  - name: tweet-summary-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-2
+      weight: 100
+  - name: tweet-summary-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-3
+      weight: 100
+  - name: tweet-summary-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-4
+      weight: 100
+  - name: meta-llama/Llama-2-7b-hf
    objective:
      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
    targetModels:
-    - name: npc-bot-v1
-      weight: 50
-    - name: npc-bot-v2
-      weight: 50
+    - name: meta-llama/Llama-2-7b-hf
+      weight: 100
  poolRef:
  - kind: LLMServerPool
-    name: test-pool
-  - name: gemini-pool
-    kind: LLMServerPool
+    name: vllm-llama2-7b-pool
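
Each `models` entry above maps a client-facing model name to one or more `targetModels`, with traffic split by `weight`: every new entry routes 100% of requests to a single target, while the removed `npc-bot` entry split traffic 50/50 between `npc-bot-v1` and `npc-bot-v2`. A minimal sketch of weight-proportional selection follows; the `TargetModel` type and `pickTargetModel` helper are illustrative only, not the gateway's actual routing code:

```go
package main

import (
	"fmt"
	"math/rand"
)

// TargetModel mirrors a targetModels entry from the manifest (illustrative type).
type TargetModel struct {
	Name   string
	Weight int
}

// pickTargetModel chooses a target with probability proportional to its weight.
func pickTargetModel(targets []TargetModel, r *rand.Rand) string {
	total := 0
	for _, t := range targets {
		total += t.Weight
	}
	n := r.Intn(total) // uniform draw in [0, total)
	for _, t := range targets {
		n -= t.Weight
		if n < 0 {
			return t.Name
		}
	}
	return targets[len(targets)-1].Name // not reached when all weights are positive
}

func main() {
	r := rand.New(rand.NewSource(1))
	targets := []TargetModel{{Name: "npc-bot-v1", Weight: 50}, {Name: "npc-bot-v2", Weight: 50}}
	fmt.Println(pickTargetModel(targets, r)) // prints one of the two targets
}
```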
23 changes: 19 additions & 4 deletions examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -1,17 +1,32 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama2-7b-pool
+spec:
+  selector:
+    app: vllm-llama2-7b-pool
+  ports:
+    - protocol: TCP
+      port: 8000
+      targetPort: 8000
+  type: LoadBalancer
+
+---
+
apiVersion: apps/v1
kind: Deployment
metadata:
-  name: vllm
+  name: vllm-llama2-7b-pool
  namespace: default
spec:
-  replicas: 6
+  replicas: 3
  selector:
    matchLabels:
-      app: vllm
+      app: vllm-llama2-7b-pool
  template:
    metadata:
      labels:
-        app: vllm
+        app: vllm-llama2-7b-pool
    spec:
      containers:
      - name: lora
12 changes: 0 additions & 12 deletions examples/poc/manifests/vllm/vllm-lora-service.yaml

This file was deleted.

27 changes: 19 additions & 8 deletions pkg/README.md
@@ -7,9 +7,14 @@ The current manifests rely on Envoy Gateway [v1.2.1](https://gateway.envoyproxy.

1. **Deploy Sample vLLM Application**

-   A sample vLLM deployment with the proper protocol to work with LLM Instance Gateway can be found [here](https://github.com/kubernetes-sigs/llm-instance-gateway/blob/6f9869d6595d2d0f8e6febcbec0f348cb44a3012/examples/poc/manifests/samples/vllm-lora-deployment.yaml#L18).
+   A sample vLLM deployment with the proper protocol to work with LLM Instance Gateway can be found [here](https://github.com/kubernetes-sigs/llm-instance-gateway/tree/main/examples/poc/manifests/vllm/vllm-lora-deployment.yaml#L18).

-1. **Update Envoy Gateway Config to enable Patch Policy**
+2. **Deploy LLM Service and LLMServerPool**
> **Contributor:** if you keep them as "1.", then they will automatically be set as a sequence
>
> **Contributor:** please revert to the "1." format so that the list numbers are automatically set.
+   You can find a sample LLM service and LLMServerPool configuration, based on the vLLM deployments mentioned above, [here](https://github.com/kubernetes-sigs/llm-instance-gateway/tree/main/examples/poc/manifests/llmservice.yaml).
+
+3. **Update Envoy Gateway Config to enable Patch Policy**

Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run:

```bash
...
```

@@ -20,26 +25,25 @@ The current manifests rely on Envoy Gateway [v1.2.1](https://gateway.envoyproxy.
Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again.


-1. **Deploy Gateway**
+4. **Deploy Gateway**

```bash
kubectl apply -f ./manifests/gateway.yaml
```

-1. **Deploy Ext-Proc**
+5. **Deploy Ext-Proc**

```bash
kubectl apply -f ./manifests/ext_proc.yaml
kubectl apply -f ./manifests/patch_policy.yaml
```
**NOTE**: Ensure the `instance-gateway-ext-proc` deployment is updated with the pod names and internal IP addresses of the vLLM replicas. This step is crucial for correct routing of requests based on headers. It won't be needed once ext-proc reads the pods dynamically.

-1. **Try it out**
+6. **Try it out**

Wait until the gateway is ready.

```bash
-IP=$(kubectl get gateway/llm-gateway -o jsonpath='{.status.addresses[0].value}')
+IP=$(kubectl get gateway/instance-gateway -o jsonpath='{.status.addresses[0].value}')
PORT=8081

curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
@@ -48,4 +52,11 @@ The current manifests rely on Envoy Gateway [v1.2.1](https://gateway.envoyproxy.
"max_tokens": 100,
"temperature": 0
}'
-```
+```


## Scheduling Package in Ext Proc
The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request.
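
For intuition, here is a minimal sketch of that filter-chain shape. The types `PodMetrics`, `LLMRequest`, and `filterFunc` below are simplified stand-ins, not the package's real definitions, and a real scheduler would also need fallback behavior when a step leaves no candidates:

```go
package scheduling

// PodMetrics is a simplified stand-in for the package's per-pod metrics type.
type PodMetrics struct {
	Name             string
	WaitingQueueSize int
	KVCacheUsage     float64
}

// LLMRequest is a simplified stand-in for the request type seen by filters.
type LLMRequest struct {
	ResolvedTargetModel string
	Critical            bool
}

// filterFunc narrows a candidate pod set based on metrics or heuristics.
type filterFunc func(req *LLMRequest, pods []*PodMetrics) ([]*PodMetrics, error)

// applyFilters runs the filters in order, feeding each one the survivors of
// the previous step, and returns the final candidate set.
func applyFilters(req *LLMRequest, pods []*PodMetrics, filters []filterFunc) ([]*PodMetrics, error) {
	candidates := pods
	for _, f := range filters {
		next, err := f(req, candidates)
		if err != nil {
			return nil, err
		}
		candidates = next
	}
	return candidates, nil
}
```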

# Flowchart
<img src="../docs/schedular-flowchart.png" alt="Scheduling Algorithm" width="400" />
53 changes: 41 additions & 12 deletions pkg/ext-proc/handlers/response.go
@@ -12,24 +12,53 @@ func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.Pr
  h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders)
  klog.V(3).Infof("Headers before: %+v\n", h)

-  resp := &extProcPb.ProcessingResponse{
-    Response: &extProcPb.ProcessingResponse_ResponseHeaders{
-      ResponseHeaders: &extProcPb.HeadersResponse{
-        Response: &extProcPb.CommonResponse{
-          HeaderMutation: &extProcPb.HeaderMutation{
-            SetHeaders: []*configPb.HeaderValueOption{
-              {
-                Header: &configPb.HeaderValue{
-                  // This is for debugging purpose only.
-                  Key:      "x-went-into-resp-headers",
-                  RawValue: []byte("true"),
+  var resp *extProcPb.ProcessingResponse
+  if reqCtx.TargetPod != nil {
> **Contributor:** Why do we need to set the targetPod in the response header?
>
> also, if we must, then you can do this:
>
>     headers := []*configPb.HeaderValueOption{
>       {
>         Header: &configPb.HeaderValue{
>           // This is for debugging purpose only.
>           Key:      "x-went-into-resp-headers",
>           RawValue: []byte("true"),
>         },
>       },
>     }
>
>     if reqCtx.TargetPod != nil {
>       headers = append(headers, &configPb.HeaderValueOption{
>         Header: &configPb.HeaderValue{
>           Key:      "x-target-pod",
>           RawValue: []byte(targetpod.Name),
>         },
>       })
>     }
>
>     resp = &extProcPb.ProcessingResponse{
>       .....
>       SetHeaders: headers
>     }

> **Contributor (Author):** This is purely for debug purposes, not really needed. I thought it might be useful to the user.

> **Contributor:** ok, can you refactor the code as suggested above please?

> **Contributor (Author):** let me remove it for now; this change is unrelated to the main goal of this PR.
+    resp = &extProcPb.ProcessingResponse{
+      Response: &extProcPb.ProcessingResponse_ResponseHeaders{
+        ResponseHeaders: &extProcPb.HeadersResponse{
+          Response: &extProcPb.CommonResponse{
+            HeaderMutation: &extProcPb.HeaderMutation{
+              SetHeaders: []*configPb.HeaderValueOption{
+                {
+                  Header: &configPb.HeaderValue{
+                    // This is for debugging purpose only.
+                    Key:      "x-went-into-resp-headers",
+                    RawValue: []byte("true"),
+                  },
+                },
+                {
+                  Header: &configPb.HeaderValue{
+                    Key:      "target-pod",
+                    RawValue: []byte(reqCtx.TargetPod.Address),
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    }
+  } else {
+    resp = &extProcPb.ProcessingResponse{
+      Response: &extProcPb.ProcessingResponse_ResponseHeaders{
+        ResponseHeaders: &extProcPb.HeadersResponse{
+          Response: &extProcPb.CommonResponse{
+            HeaderMutation: &extProcPb.HeaderMutation{
+              SetHeaders: []*configPb.HeaderValueOption{
+                {
+                  Header: &configPb.HeaderValue{
+                    // This is for debugging purpose only.
+                    Key:      "x-went-into-resp-headers",
+                    RawValue: []byte("true"),
+                  },
+                },
+              },
+            },
+          },
+        },
+      },
+    }
+  }
  return resp, nil
}
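
For reference, a sketch of the refactor suggested in the review thread above: build the header slice first, then construct the response once. It assumes the `extProcPb`/`configPb` types used in this diff and is not the code that was ultimately merged (the author opted to drop the extra header instead):

```go
func buildResponseHeaders(reqCtx *RequestContext) *extProcPb.ProcessingResponse {
	// Always set the debug header.
	headers := []*configPb.HeaderValueOption{
		{
			Header: &configPb.HeaderValue{
				// This is for debugging purpose only.
				Key:      "x-went-into-resp-headers",
				RawValue: []byte("true"),
			},
		},
	}
	// Only add the target-pod header when a pod was actually selected.
	if reqCtx.TargetPod != nil {
		headers = append(headers, &configPb.HeaderValueOption{
			Header: &configPb.HeaderValue{
				Key:      "target-pod",
				RawValue: []byte(reqCtx.TargetPod.Address),
			},
		})
	}
	return &extProcPb.ProcessingResponse{
		Response: &extProcPb.ProcessingResponse_ResponseHeaders{
			ResponseHeaders: &extProcPb.HeadersResponse{
				Response: &extProcPb.CommonResponse{
					HeaderMutation: &extProcPb.HeaderMutation{SetHeaders: headers},
				},
			},
		},
	}
}
```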
22 changes: 22 additions & 0 deletions pkg/ext-proc/scheduling/filter.go
@@ -121,6 +121,17 @@ func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac
  return filtered, nil
}

+// lowQueuingFilterFunc filters pods that have queue size less than the threshold.
+func lowQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
+  filtered := []*backend.PodMetrics{}
+  for _, pod := range pods {
+    if pod.WaitingQueueSize < queueingThresholdLoRA {
+      filtered = append(filtered, pod)
+    }
+  }
+  return filtered, nil
+}

// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
// (max-min) by the number of pods, and finds the pods that fall into the first range.
// The intuition is that if there are multiple pods that share similar KV cache in the low range, we
@@ -159,6 +170,17 @@ func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
  return ok || len(pod.ActiveModels) < pod.MaxActiveModels
}

+// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested.
+func loRAAffinityPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+  _, ok := pod.ActiveModels[req.ResolvedTargetModel]
+  return ok
+}
+
+// minLoRAPredicate is a filter function to check whether a pod has room to load the adapter.
+func minLoRAPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
+  return len(pod.ActiveModels) < pod.MaxActiveModels
+}
+
func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
  return req.Critical
}
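
Together, `loRAAffinityPredicate` and `minLoRAPredicate` decompose the existing `lowLoRACostPredicate` (adapter affinity OR a free adapter slot) into conditions the scheduler can apply separately. A per-pod predicate like these can be lifted into a pod-list filter with a small adapter; here is a sketch using the same `LLMRequest` and `backend.PodMetrics` types as the functions above (the names `podPredicate` and `toFilterFunc` are illustrative, not necessarily the package's actual API):

```go
// podPredicate is a per-pod boolean check (sketch only).
type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool

// toFilterFunc wraps a predicate so it can run as a list filter,
// keeping only the pods for which the predicate holds.
func toFilterFunc(pred podPredicate) func(*LLMRequest, []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
	return func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
		filtered := []*backend.PodMetrics{}
		for _, pod := range pods {
			if pred(req, pod) {
				filtered = append(filtered, pod)
			}
		}
		return filtered, nil
	}
}
```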