diff --git a/docs/proposals/0683-epp-architecture-proposal/README.md b/docs/proposals/0683-epp-architecture-proposal/README.md index 48c7720fb..0da2d3705 100644 --- a/docs/proposals/0683-epp-architecture-proposal/README.md +++ b/docs/proposals/0683-epp-architecture-proposal/README.md @@ -86,12 +86,17 @@ Due to the possibility of this becoming a bit of a dumping ground. The API will The flow controller will consume resource regime data, and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing). -#### Scheduling Layer +#### Scheduling Subsystem -As the Scheduling Layer is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values. +The Scheduling Subsystem is intended to be + +As the Scheduling Subsystem is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values. The Scheduler will define a strong interface API, so that new scheduling algos may be plugged & dark-launched to test in production traffic without impacting said traffic. 
Extension is expected to adhere to the [Scheduler Subsystem definition](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/603) + +Scheduling Algorithm + ### `Non-extensible` #### Ext-Proc Server diff --git a/docs/proposals/0683-epp-architecture-proposal/images/examples/example.yaml b/docs/proposals/0683-epp-architecture-proposal/images/examples/example.yaml new file mode 100644 index 000000000..06725a981 --- /dev/null +++ b/docs/proposals/0683-epp-architecture-proposal/images/examples/example.yaml @@ -0,0 +1,34 @@ +#names are egregiously long, but attempting to describe custom logic within a name +profileSelection: disagg-token-length +schedulingResult: log-shadowbox-label-pd-result +profiles: + prefill: + preschedule: + - decode-prefix-cache-check + filter: + - is-prefill + - has-required-accelerator + score: + - prefix-cache: 3 + - latency-scorer: 2 + selection: + - best-score + postschedule: + - log-full-scores + decode: + filter: + - is-decode + score: + - prefix-cache: 3 + - kv-cache-util: 5 + selection: + - random-top-3 + shadowbox-decode: + filter: + - is-decode + - is-tpu + score: + - prefix-cache-v2: 4 + - kv-cache-util: 1 + selection: + - random-top-3 diff --git a/docs/proposals/0683-epp-architecture-proposal/images/scheduler_subsystem.svg b/docs/proposals/0683-epp-architecture-proposal/images/scheduler_subsystem.svg new file mode 100644 index 000000000..0a6676bd5 --- /dev/null +++ b/docs/proposals/0683-epp-architecture-proposal/images/scheduler_subsystem.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/proposals/0683-epp-architecture-proposal/interfaces/interface.go b/docs/proposals/0683-epp-architecture-proposal/interfaces/interface.go new file mode 100644 index 000000000..804aac685 --- /dev/null +++ b/docs/proposals/0683-epp-architecture-proposal/interfaces/interface.go @@ -0,0 +1,123 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package framework + +import ( + "context" + "sync" + + scheduling "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +// Plugin is the parent type for all the scheduling framework plugins. +type Plugin interface { + Name() string +} + +type Endpoint interface { + GetState() EndpointState + GetScore() float32 + SetScore(val float32) +} + +type EndpointState struct { + // only need to use a sync.Map if we do not plan on snapshotting data. NOTE(review): GetState returns EndpointState by value and a sync.Map must not be copied after first use - confirm whether a pointer is intended. + storage sync.Map +} + +type SchedulingResult struct { + results map[string][]Endpoint +} + +type Scheduler interface { + Plugin + // ProfileSelection selects scheduling profiles through the implemented + // logic, and returns a subset of the registered scheduling profiles. + ProfileSelection() map[string]SchedulingProfile + + // SchedulingProfiles lists all of the scheduling profiles registered + // with the scheduler. + SchedulingProfiles() map[string]SchedulingProfile + + // SchedulingResult takes the result(s) of the scheduling cycle(s) + // and makes sense of the data to be consumed by request control. + // For example: suppose you have 2 profiles, a ShadowBoxing Profile & a Production Profile. + // SchedulingResult would know to simply log the result of the ShadowBoxing + // profile, and do nothing else with it. 
+ SchedulingResult(map[string][]Endpoint) SchedulingResult +} + +// SchedulingProfile is an interface used to describe a profile that will +// run for a given scheduling cycle. +type SchedulingProfile interface { + Plugin + // PreSchedulePlugins are optional, and will be run at the start of a + // scheduling cycle. This should be scoped to any foundational work needed + // that is custom to this scheduling profile. + PreSchedulePlugins() []PreSchedule + // Filters lists all Filter plugins associated with this Profile. Filters + // are optional. + Filters() []Filter + // Scorers lists all Scorer plugins associated with this Profile. At + // least 1 scorer must be registered for a profile to be valid. The int value is presumably the scorer's weight (TODO confirm). + Scorers() map[Scorer]int + // Selection returns the Picker that picks the endpoint(s). + Selection() Picker + // PostSchedulePlugins lists all PostSchedule plugins associated with this + // Profile. PostSchedulePlugins are run after every scheduling cycle, + // and are optional. + PostSchedulePlugins() []PostSchedule +} + +// PreSchedule will be run at the start of a scheduling cycle. This should be +// scoped to any foundational work needed that is custom to this scheduling +// profile. +type PreSchedule interface { + Plugin + PreSchedule(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) +} + +// Filter runs before any scoring, and removes endpoints that are not fit for +// selection. The framework will return an error to the client if the endpoints +// are filtered to zero. +type Filter interface { + Plugin + Filter(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) []Endpoint +} + +// Scorer applies a score to each remaining endpoint provided. Scorers SHOULD +// keep their score values in a normalized range: [0-1]. Any weighting should +// be added at the SchedulingProfile configuration level. 
+type Scorer interface { + Plugin + Score(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) []Endpoint +} + +// Picker selects the endpoint(s) from the provided list of scored endpoints. +// Picker MUST return at least one endpoint. +type Picker interface { + Plugin + Selection(ctx context.Context, state scheduling.CycleState, endpoints []Endpoint) []Endpoint +} + +// PostSchedule runs per-scheduling cycle, and is part of a scheduling profile. +// PostSchedule performs any remaining work needed for the scheduling cycle. +// PostSchedule is not expected to change any values of the parameters. +type PostSchedule interface { + Plugin + PostSchedule(ctx context.Context, state scheduling.CycleState, selectedEndpoints []Endpoint) +}