Enhancements to LLM Instance Gateway: Scheduling Logic and Documentation Updates #78

Merged Dec 10, 2024 · 17 commits · Changes from all commits
Binary file added docs/schedular-flowchart.png
2 changes: 1 addition & 1 deletion examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -121,4 +121,4 @@ spec:
emptyDir:
medium: Memory
- name: adapters
emptyDir: {}
19 changes: 15 additions & 4 deletions pkg/README.md
@@ -7,7 +7,12 @@ The current manifests rely on Envoy Gateway [v1.2.1](https://gateway.envoyproxy.

1. **Deploy Sample vLLM Application**

- A sample vLLM deployment with the proper protocol to work with LLM Instance Gateway can be found [here](https://github.com/kubernetes-sigs/llm-instance-gateway/blob/6f9869d6595d2d0f8e6febcbec0f348cb44a3012/examples/poc/manifests/samples/vllm-lora-deployment.yaml#L18).
+ A sample vLLM deployment with the proper protocol to work with LLM Instance Gateway can be found [here](https://github.com/kubernetes-sigs/llm-instance-gateway/tree/main/examples/poc/manifests/vllm/vllm-lora-deployment.yaml#L18).

1. **Deploy LLM Service and LLMServerPool**

You can find a sample LLM service and LLMServerPool configuration, based on the vLLM deployments mentioned above, [here](https://github.com/kubernetes-sigs/llm-instance-gateway/tree/main/examples/poc/manifests/llmservice.yaml).


1. **Update Envoy Gateway Config to enable Patch Policy**

@@ -32,14 +37,13 @@ The current manifests rely on Envoy Gateway [v1.2.1](https://gateway.envoyproxy.
kubectl apply -f ./manifests/ext_proc.yaml
kubectl apply -f ./manifests/patch_policy.yaml
```
- **NOTE**: Ensure the `instance-gateway-ext-proc` deployment is updated with the pod names and internal IP addresses of the vLLM replicas. This step is crucial for the correct routing of requests based on headers. This won't be needed once we make ext proc dynamically read the pods.

1. **Try it out**

Wait until the gateway is ready.

```bash
- IP=$(kubectl get gateway/llm-gateway -o jsonpath='{.status.addresses[0].value}')
+ IP=$(kubectl get gateway/instance-gateway -o jsonpath='{.status.addresses[0].value}')
PORT=8081

curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
Expand All @@ -48,4 +52,11 @@ The current manifests rely on Envoy Gateway [v1.2.1](https://gateway.envoyproxy.
"max_tokens": 100,
"temperature": 0
}'
```


## Scheduling Package in Ext Proc
The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request.
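For orientation, the filters compose into a small decision tree. Below is a minimal, self-contained sketch of that pattern; the `filter` field names mirror `pkg/ext-proc/scheduling/scheduler.go`, but the stub types and the exact fallback behavior are illustrative assumptions rather than the repository's code.

```go
package scheduling

import "errors"

// Stub types standing in for the real LLMRequest and backend.PodMetrics, so
// this sketch compiles on its own; the real definitions live in pkg/ext-proc.
type LLMRequest struct{ Critical bool }
type PodMetrics struct{ WaitingQueueSize int }

type filterFunc func(req *LLMRequest, pods []*PodMetrics) ([]*PodMetrics, error)

// filter is one node of the decision tree: run the filter function, then
// descend into the branch that matches the outcome.
type filter struct {
	name                   string
	filter                 filterFunc
	nextOnSuccess          *filter
	nextOnFailure          *filter
	nextOnSuccessOrFailure *filter // used when both outcomes share a branch
}

// Filter applies this node and recurses until it reaches a leaf; the pods
// that survive the walk are the scheduling candidates for the request.
func (f *filter) Filter(req *LLMRequest, pods []*PodMetrics) ([]*PodMetrics, error) {
	filtered, err := f.filter(req, pods)
	success := err == nil && len(filtered) > 0

	next := f.nextOnSuccessOrFailure
	if next == nil {
		if success {
			next = f.nextOnSuccess
		} else {
			next = f.nextOnFailure
		}
	}
	if next == nil { // leaf node: the walk ends here
		if success {
			return filtered, nil
		}
		return nil, errors.New("no pods passed filter " + f.name)
	}
	if success {
		return next.Filter(req, filtered)
	}
	// On failure, let the fallback branch reconsider the original pod list.
	return next.Filter(req, pods)
}
```

The `scheduler.go` diff below wires concrete predicates (queue depth, LoRA affinity, KV cache) into trees of exactly this shape.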

### Flowchart
<img src="../docs/schedular-flowchart.png" alt="Scheduling Algorithm" width="400" />
19 changes: 18 additions & 1 deletion pkg/ext-proc/scheduling/filter.go
@@ -121,6 +121,10 @@ func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac
return filtered, nil
}

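// lowQueueingPodPredicate returns true when the pod's waiting queue is below
// queueingThresholdLoRA, i.e. the pod is lightly loaded enough that LoRA
// affinity can take priority over queue length.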
func lowQueueingPodPredicate(_ *LLMRequest, pod *backend.PodMetrics) bool {
return pod.WaitingQueueSize < queueingThresholdLoRA
}

// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
// (max-min) by the number of pods, and finds the pods that fall into the first range.
// The intuition is that if there are multiple pods that share similar KV cache in the low range, we
@@ -153,12 +157,25 @@ func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*bac
type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool

// We consider serving an adapter low cost if the adapter is active in the model server, or the
- // model server has room to load the adapter
+ // model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by spreading the
+ // load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod.
+ // This gave good performance in our initial benchmarking results in the scenario where # of lora slots > # of lora adapters.
func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
> **Contributor:** Leave this comment here, but it doesn't need to be addressed in this PR. We can potentially refactor this predicate to prefer affinity first, then fall back to canAcceptNewLoRA if no affinity is found; that should let us consolidate much of the decision tree. It will of course need some benchmarking to see the impact.

_, ok := pod.ActiveModels[req.ResolvedTargetModel]
return ok || len(pod.ActiveModels) < pod.MaxActiveModels
}

// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the requested LoRA adapter.
func loRAAffinityPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
_, ok := pod.ActiveModels[req.ResolvedTargetModel]
return ok
}

// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter.
func canAcceptNewLoraPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
return len(pod.ActiveModels) < pod.MaxActiveModels
}

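// criticalRequestPredicate returns true if the request is marked critical.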
func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool {
return req.Critical
}
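An aside on `leastKVCacheFilterFunc` above: its comment describes a range-bucketing heuristic, i.e. split the observed [min, max] KV-cache range into `len(pods)` equal slices and keep only the pods in the lowest slice. Below is a standalone sketch of that arithmetic, operating on a plain slice of usage values (the real function works on `*backend.PodMetrics`; this is an illustration, not the repository's code).

```go
// leastKVCacheBucket returns the indices whose KV cache usage falls into the
// lowest (max-min)/n slice of the observed range.
func leastKVCacheBucket(usage []float64) []int {
	if len(usage) == 0 {
		return nil
	}
	lo, hi := usage[0], usage[0]
	for _, u := range usage {
		if u < lo {
			lo = u
		}
		if u > hi {
			hi = u
		}
	}
	bucket := (hi - lo) / float64(len(usage)) // width of one slice
	var keep []int
	for i, u := range usage {
		if u <= lo+bucket {
			keep = append(keep, i)
		}
	}
	return keep
}
```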
42 changes: 36 additions & 6 deletions pkg/ext-proc/scheduling/scheduler.go
@@ -16,7 +16,11 @@ const (
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
kvCacheThreshold = 0.8
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
- queueThreshold = 5
+ queueThresholdCritical = 5
// TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/16) Make this configurable.
// The threshold below which the number of queued requests is considered low, letting us prioritize LoRA affinity.
// The value of 50 was arrived at heuristically based on experiments.
queueingThresholdLoRA = 50
)

var (
@@ -27,9 +31,8 @@ var (
nextOnFailure: sheddableRequestFilter,
}

- // lowLatencyFilter tries to minimize the latency. The heuristic is to pick a server with lower
- // cost to load an adapter and has low KV cache, which typically yields lower latency.
- lowLatencyFilter = &filter{
+ // queueLoRAAndKVCacheFilter applies the least-queuing -> low-cost-LoRA -> least-KV-cache filter chain.
+ queueLoRAAndKVCacheFilter = &filter{
name: "least queuing",
> **Contributor:** update the name?
>
> **Author:** I changed the filter names to be more descriptive.

filter: leastQueuingFilterFunc,
nextOnSuccessOrFailure: &filter{
Expand All @@ -42,13 +45,39 @@ var (
},
}

// queueAndKVCacheFilter applies the least-queuing filter followed by the least-KV-cache filter.
queueAndKVCacheFilter = &filter{
name: "least queuing",
filter: leastQueuingFilterFunc,
nextOnSuccessOrFailure: &filter{
name: "least KV cache percent",
filter: leastKVCacheFilterFunc,
},
}

lowLatencyFilter = &filter{
name: "low queueing filter",
filter: toFilterFunc(lowQueueingPodPredicate),
nextOnSuccess: &filter{
name: "affinity LoRA",
filter: toFilterFunc(loRAAffinityPredicate),
> **Contributor:** Why not use lowLoRACostPredicate with nextOnSuccessOrFailure: queueAndKVCacheFilter instead of doing loRAAffinityPredicate and canAcceptNewLoraPredicate separately?
>
> **Author:** lowLoRACostPredicate picks pods that satisfy either canAcceptNewLoraPredicate or loRAAffinityPredicate. For stronger affinity we want to pick only pods that satisfy loRAAffinityPredicate, and fall back to canAcceptNewLoraPredicate only if no such pod is present.
>
> **Contributor:** Why not do that for the other branch too, then?
>
> **Author (@kaushikmitr, Dec 10, 2024):** lowLoRACostPredicate ensures weak affinity by spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to a single pod. This gave good performance in our initial benchmarking in the scenario where the number of LoRA slots exceeds the number of LoRA adapters. loRAAffinityPredicate, on the other hand, ensures strong affinity, i.e. it pins requests to a single pod with that adapter. Depending on the scenario, one or the other might be better.
>
> **Contributor:** Can we document this reasoning, please?
>
> **Author:** I added a comment to lowLoRACostPredicate with the reasoning, like we have in leastKVCacheFilterFunc.

nextOnSuccess: queueAndKVCacheFilter,
nextOnFailure: &filter{
name: "can accept LoRA Adapter",
filter: toFilterFunc(canAcceptNewLoraPredicate),
nextOnSuccessOrFailure: queueAndKVCacheFilter,
> **Contributor:** I think if we replace queueAndKVCacheFilter here and four lines above with queueLoRAAndKVCacheFilter, the effect should be the same. queueLoRAAndKVCacheFilter adds the low-cost-LoRA filter in between, but given the pods are already filtered by LoRA affinity, it should be a no-op. This would simplify the code, at the cost of potentially more confusion from the no-op step. It's up to you.
>
> **Author:** I agree, but I also think that would make it more confusing, and queueAndKVCacheFilter is something we might need in the future. For example, when a request does not need a LoRA adapter, we can apply queueAndKVCacheFilter directly instead of checking for LoRA affinity.

},
},
nextOnFailure: queueLoRAAndKVCacheFilter,
}

sheddableRequestFilter = &filter{
// When there is at least one model server that's not queuing requests, and still has KV
// cache below a certain threshold, we consider this model server has capacity to handle
// a sheddable request without impacting critical requests.
name: "has capacity for sheddable requests",
- filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold, kvCacheThreshold)),
- nextOnSuccess: lowLatencyFilter,
+ filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(queueThresholdCritical, kvCacheThreshold)),
+ nextOnSuccess: queueLoRAAndKVCacheFilter,
// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
// request to make room for critical requests.
nextOnFailure: &filter{
@@ -62,6 +91,7 @@
)

func NewScheduler(pmp PodMetricsProvider) *Scheduler {

return &Scheduler{
podMetricsProvider: pmp,
filter: defaultFilter,