kubernetes-sigs · kfswain · May 15, 2025 · nirrozenbaum · May 18, 2025 · nirrozenbaum
diff --git a/docs/proposals/0683-epp-architecture-proposal/README.md b/docs/proposals/0683-epp-architecture-proposal/README.md
@@ -86,12 +86,17 @@ Due to the possibility of this becoming a bit of a dumping ground. The API will
 
 The flow controller will consume resource regime data, and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing).
 
-#### Scheduling Layer
+#### Scheduling Subsystem
 
-As the Scheduling Layer is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values.
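+The Scheduling Subsystem is intended to be a pluggable framework: each scheduling profile composes PreSchedule, Filter, Scorer, Picker, and PostSchedule plugins.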
+
+As the Scheduling Subsystem is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values.
 
 The Scheduler will define a strong interface API, so that new scheduling algos may be plugged & dark-launched to test in production traffic without impacting said traffic. Extension is expected to adhere to the [Scheduler Subsystem definition](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/603)
 
+
+<img src="./images/epp_arch.svg" alt="Scheduling Algorithm" width="1000" />
+
 ### `Non-extensible`
 
 #### Ext-Proc Server

diff --git a/docs/proposals/0683-epp-architecture-proposal/images/examples/example.yaml b/docs/proposals/0683-epp-architecture-proposal/images/examples/example.yaml
@@ -0,0 +1,34 @@
+# names are egregiously long, but attempt to describe the custom logic within the name
+profileSelection: disagg-token-length
+schedulingResult: log-shadowbox-label-pd-result 
+profiles:
+  prefill:
+    preschedule:
+      - decode-prefix-cache-check
+    filter:
+      - is-prefill
+      - has-required-accelerator
+    score:
+      - prefix-cache: 3
+      - latency-scorer: 2
+    selection:
+      - best-score
+    postschedule:
+      - log-full-scores
+  decode:
+    filter:
+      - is-decode
+    score:
+      - prefix-cache: 3
+      - kv-cache-util: 5
+    selection:
+      - random-top-3
+  shadowbox-decode:
+    filter:
+      - is-decode
+      - is-tpu
+    score:
+      - prefix-cache-v2: 4
+      - kv-cache-util: 1
+    selection:
+      - random-top-3
diff --git a/docs/proposals/0683-epp-architecture-proposal/images/scheduler_subsystem.svg b/docs/proposals/0683-epp-architecture-proposal/images/scheduler_subsystem.svg
diff --git a/docs/proposals/0683-epp-architecture-proposal/interfaces/interface.go b/docs/proposals/0683-epp-architecture-proposal/interfaces/interface.go
@@ -0,0 +1,123 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+import (
+	"context"
+	"sync"
+
+	scheduling "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// Plugin is the parent type embedded by all the scheduling framework plugin interfaces.
+type Plugin interface {
+	Name() string // Name returns the plugin's name.
+}
+
+type Endpoint interface { // Endpoint is the element type that the PreSchedule, Filter, Scorer, Picker, and PostSchedule plugins receive.
+	GetState() EndpointState // NOTE(review): returned by value, but EndpointState holds a sync.Map, which must not be copied after first use — consider *EndpointState.
+	GetScore() float32 // GetScore returns the endpoint's current score.
+	SetScore(val float32) // SetScore replaces the endpoint's score; presumably called by Scorer plugins — confirm.
+}
+
+type EndpointState struct { // EndpointState is the value returned by Endpoint.GetState; it wraps an unexported sync.Map.
+	// storage only needs to be a sync.Map if we do not plan on snapshotting data.
+	storage sync.Map
+}
+
+type SchedulingResult struct { // SchedulingResult is the value produced by Scheduler.SchedulingResult.
+	results map[string][]Endpoint // presumably the endpoints keyed by scheduling profile name — confirm.
+}
+
+type Scheduler interface { // Scheduler exposes the registered scheduling profiles and interprets their results.
+	Plugin
+	// ProfileSelection selects scheduling profiles through the implemented
+	// logic, and returns a subset of the registered scheduling profiles.
+	ProfileSelection() map[string]SchedulingProfile
+
+	// SchedulingProfiles lists all of the scheduling profiles registered
+	// with the scheduler.
+	SchedulingProfiles() map[string]SchedulingProfile
+
+	// SchedulingResult takes the output of the scheduling cycle(s) (presumably
+	// keyed by profile name) and turns it into the result consumed by request
+	// control. For example: suppose you have 2 profiles, a ShadowBoxing profile
+	// and a Production profile. SchedulingResult would know to simply log the
+	// result of the ShadowBoxing profile, and do nothing else with it.
+	SchedulingResult(map[string][]Endpoint) SchedulingResult
+}
+
+// SchedulingProfile is an interface used to describe a profile that will
+// run for a given scheduling cycle.
+type SchedulingProfile interface {
+	Plugin
+	// PreSchedulePlugins are optional, and will be run at the start of a
+	// scheduling cycle. This should be scoped to any foundational work needed
+	// that is custom to this scheduling profile.
+	PreSchedulePlugins() []PreSchedule
+	// Filters lists all Filter plugins associated with this Profile. Filters
+	// are optional.
+	Filters() []Filter
+	// Scorers maps each Scorer plugin associated with this Profile to an int
+	// (presumably its weight — confirm). A valid profile registers at least 1 scorer.
+	Scorers() map[Scorer]int
+	// Selection returns the Picker plugin that picks the endpoint(s).
+	Selection() Picker
+	// PostSchedulePlugins lists all PostSchedule plugins associated with this
+	// Profile. PostSchedulePlugins are run after every scheduling cycle,
+	// and are optional.
+	PostSchedulePlugins() []PostSchedule
+}
+
+// PreSchedule will be run at the start of a scheduling cycle. This should be
+// scoped to any foundational work needed that is custom to this scheduling
+// profile.
+type PreSchedule interface {
+	Plugin
+	PreSchedule(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint)
+}
+
+// Filter runs before any scoring, and removes endpoints that are not fit for
+// selection. The framework will return an error to the client if the endpoints
+// are filtered to zero.
+type Filter interface {
+	Plugin
+	Filter(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) []Endpoint
+}
+
+// Scorer applies a score to each remaining endpoint provided. Scorers SHOULD
+// keep their score values in a normalized range: [0, 1]. Any weighting should
+// be applied at the SchedulingProfile configuration level.
+type Scorer interface {
+	Plugin
+	Score(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) []Endpoint
+}
+
+// Picker selects the endpoint(s) from the provided list of scored endpoints.
+// Picker MUST return at least one endpoint.
+type Picker interface {
+	Plugin
+	Selection(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) []Endpoint
+}
+
+// PostSchedule runs per scheduling cycle, and is part of a scheduling profile.
+// PostSchedule performs any remaining work needed for the scheduling cycle.
+// PostSchedule is not expected to modify any of its parameters.
+type PostSchedule interface {
+	Plugin
+	PostSchedule(ctx context.Context, state scheduling.CycleState, selectedEndpoints []Endpoint)
+}