
[v0.1 API Review] Cleaning up optional fields/clearer wording #185


Merged: 11 commits, Jan 21, 2025
39 changes: 22 additions & 17 deletions api/v1alpha1/inferencemodel_types.go
@@ -58,21 +58,25 @@ type InferenceModelList struct {
// creation timestamp, will be selected to remain valid. In the event of a race
// condition, one will be selected at random.
type InferenceModelSpec struct {
// ModelName is the name of the model as the users set in the "model" parameter in the requests.
// The name should be unique among the workloads that reference the same backend pool.
// This is the parameter that will be used to match the request with. In the future, we may
// allow to match on other request parameters. The other approach to support matching
// on other request parameters is to use a different ModelName per HTTPFilter.
// Names can be reserved without implementing an actual model in the pool.
// ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
// ModelNames must be unique for a referencing InferencePool
// (names can be reused for a different pool in the same cluster).
// The modelName with the oldest creation timestamp is retained, and the incoming
// InferenceModel has its Ready status set to false with a corresponding reason.
// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
// Names can be reserved without an underlying model configured in the pool.
// This can be done by specifying a target model and setting the weight to zero;
// an error will be returned specifying that no valid target model is found.
//
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:MaxLength=256
// +kubebuilder:validation:Required
ModelName string `json:"modelName"`

// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
//
// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
// and the proportionality of fairness will be configurable.
Contributor: Now that we don't default the value in the API, this needs to be defaulted by the implementation, so I created #198 to clearly document that the default value will be Sheddable in the reference implementation.

Contributor: Slack thread summary: Add back the default Criticality option, named Standard, and document that this is expected to be the default, although the API doesn't enforce it.

Contributor @ahg-g (Jan 17, 2025): Can we document here that implementations should default this value to Standard when not set?

Collaborator (Author): Added a section to discuss how to handle 'defaulting'. PTAL!
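To make the outcome of this thread concrete, here is a minimal sketch of implementation-side defaulting; `criticalityOrDefault` is a hypothetical helper name and the import path is assumed, since the API deliberately leaves the field unset:

```go
import "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" // import path assumed

// criticalityOrDefault shows how an implementation might apply the
// "default to Standard" convention agreed on above; the API itself
// no longer defaults the Criticality field.
func criticalityOrDefault(m *v1alpha1.InferenceModel) v1alpha1.Criticality {
	if m.Spec.Criticality == nil {
		return v1alpha1.Standard
	}
	return *m.Spec.Criticality
}
```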

// +optional
// +kubebuilder:default="Default"
Criticality *Criticality `json:"criticality,omitempty"`
@@ -83,6 +87,7 @@ type InferenceModelSpec struct {
//
// +optional
// +kubebuilder:validation:MaxItems=10
// +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
TargetModels []TargetModel `json:"targetModels,omitempty"`

// PoolRef is a reference to the inference pool; the pool must exist in the same namespace.
@@ -120,16 +125,19 @@ type PoolObjectReference struct {
}

// Criticality defines how important it is to serve the model compared to other models.
// +kubebuilder:validation:Enum=Critical;Default;Sheddable
// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer) and set no default.
// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
type Criticality string

const (
// Critical defines the highest level of criticality. Requests to this band will be shed last.
Critical Criticality = "Critical"

// Default defines the default criticality level and is more important than Sheddable but less
// Standard defines the base criticality level and is more important than Sheddable but less
// important than Critical. Requests in this band will be shed before critical traffic.
Default Criticality = "Default"
// Most models are expected to fall within this band.
Standard Criticality = "Standard"

// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
// all other bands.
@@ -160,16 +168,13 @@ type TargetModel struct {
// implementation supports. Weight is not a percentage and the sum of
// weights does not need to equal 100.
//
// If only one model is specified and it has a weight greater than 0, 100%
// of the traffic is forwarded to that model. If weight is set to 0, no
// traffic should be forwarded for this model. If unspecified, weight
// defaults to 1.
// If a weight is set for any targetModel, it must be set for all targetModels.
// Conversely, weights are optional so long as no targetModel specifies a weight.
//
// +optional
// +kubebuilder:default=1
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=1000000
Weight int32 `json:"weight,omitempty"`
Weight *int32 `json:"weight,omitempty"`
}
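As an illustration of the proportional semantics above, a short sketch against the TargetModel type defined here (`trafficShare` is a hypothetical helper, not part of this PR); with weights 1 and 3, the two models receive 25% and 75% of traffic:

```go
// trafficShare computes the fraction of traffic each target model receives
// under the proportional-weight semantics documented above. Per the CEL
// rule, weights are all-or-none; if none are set, this returns an empty map
// and the split is left to the implementation.
func trafficShare(models []TargetModel) map[string]float64 {
	var total int32
	for _, m := range models {
		if m.Weight != nil {
			total += *m.Weight
		}
	}
	shares := make(map[string]float64, len(models))
	if total == 0 {
		return shares
	}
	for _, m := range models {
		if m.Weight != nil {
			shares[m.Name] = float64(*m.Weight) / float64(total)
		}
	}
	return shares
}
```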

// InferenceModelStatus defines the observed state of InferenceModel
8 changes: 3 additions & 5 deletions api/v1alpha1/inferencepool_types.go
@@ -44,11 +44,9 @@ type InferencePoolList struct {

// InferencePoolSpec defines the desired state of InferencePool
type InferencePoolSpec struct {
// Selector defines a map of label to watch model server pods
// that should be included in the InferencePool. ModelServers should not
// be with any other Service or InferencePool, that behavior is not supported
// and will result in sub-optimal utilization.
// In some cases, implementations may translate this to a Service selector, so this matches the simple
// Selector defines a map of labels to watch model server pods
// that should be included in the InferencePool.
// In some cases, implementations may translate this field to a Service selector, so this matches the simple
// map used for Service selectors instead of the full Kubernetes LabelSelector type.
//
// +kubebuilder:validation:Required
9 changes: 8 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

35 changes: 20 additions & 15 deletions config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
@@ -56,24 +56,29 @@ spec:
properties:
criticality:
default: Default
description: Criticality defines how important it is to serve the
model compared to other models referencing the same pool.
description: |-
Criticality defines how important it is to serve the model compared to other models referencing the same pool.
Criticality impacts how traffic is handled in resource constrained situations. It handles this by
queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
and the proportionality of fairness will be configurable.
enum:
- Critical
- Default
- Standard
- Sheddable
type: string
modelName:
description: |-
ModelName is the name of the model as the users set in the "model" parameter in the requests.
The name should be unique among the workloads that reference the same backend pool.
This is the parameter that will be used to match the request with. In the future, we may
allow to match on other request parameters. The other approach to support matching
on other request parameters is to use a different ModelName per HTTPFilter.
Names can be reserved without implementing an actual model in the pool.
ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
ModelNames must be unique for a referencing InferencePool
(names can be reused for a different pool in the same cluster).
The modelName with the oldest creation timestamp is retained, and the incoming
InferenceModel has its Ready status set to false with a corresponding reason.
In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
Names can be reserved without an underlying model configured in the pool.
This can be done by specifying a target model and setting the weight to zero;
an error will be returned specifying that no valid target model is found.
maxLength: 253
maxLength: 256
type: string
poolRef:
description: PoolRef is a reference to the inference pool, the pool
@@ -121,7 +126,6 @@ spec:
maxLength: 253
type: string
weight:
default: 1
description: |-
Weight is used to determine the proportion of traffic that should be
sent to this model when multiple target models are specified.
@@ -133,10 +137,8 @@
implementation supports. Weight is not a percentage and the sum of
weights does not need to equal 100.

If only one model is specified and it has a weight greater than 0, 100%
of the traffic is forwarded to that model. If weight is set to 0, no
traffic should be forwarded for this model. If unspecified, weight
defaults to 1.
If a weight is set for any targetModel, it must be set for all targetModels.
Conversely, weights are optional so long as no targetModel specifies a weight.
format: int32
maximum: 1000000
minimum: 0
@@ -146,6 +148,9 @@
type: object
maxItems: 10
type: array
x-kubernetes-validations:
- message: Weights should be set for all models, or none of the models.
rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
required:
- modelName
- poolRef
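To make the all-or-none weight rule concrete, a hedged sketch of one spec the CEL rule accepts and one it rejects, written as Go literals against the v1alpha1 types (`ptr32` is an illustrative helper; `poolRef`, though required, is omitted for brevity):

```go
func ptr32(v int32) *int32 { return &v }

// Accepted: every target model sets a weight.
var valid = v1alpha1.InferenceModelSpec{
	ModelName: "chat",
	TargetModels: []v1alpha1.TargetModel{
		{Name: "chat-v1", Weight: ptr32(90)},
		{Name: "chat-canary", Weight: ptr32(10)},
	},
}

// Rejected by the CEL rule: only one of the two target models sets a weight.
var invalid = v1alpha1.InferenceModelSpec{
	ModelName: "chat",
	TargetModels: []v1alpha1.TargetModel{
		{Name: "chat-v1", Weight: ptr32(90)},
		{Name: "chat-canary"},
	},
}
```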
config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
@@ -58,11 +58,9 @@ spec:
pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
type: string
description: |-
Selector defines a map of label to watch model server pods
that should be included in the InferencePool. ModelServers should not
be with any other Service or InferencePool, that behavior is not supported
and will result in sub-optimal utilization.
In some cases, implementations may translate this to a Service selector, so this matches the simple
Selector defines a map of labels to watch model server pods
that should be included in the InferencePool.
In some cases, implementations may translate this field to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type.
type: object
targetPortNumber:
40 changes: 21 additions & 19 deletions docs/proposals/002-api-proposal/proposal.md
@@ -78,6 +78,7 @@ The API design is based on these axioms:
- This solution should be composable with other Gateway solutions and flexible to fit customer needs
- The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future
- The Gateway should route in a way that does not generate a queue of requests at the model server level
- Model serving differs from web serving in critical ways. One of these is the existence of multiple models behind the same service, which can materially impact behavior depending on the model served, unlike a web service, where mechanisms exist to render implementation changes invisible to an end user

The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling, and the API follows similar logic, which led to the proposal of the **InferencePool**.

@@ -126,27 +127,21 @@

type InferencePoolSpec struct {
// ModelServerSelector uses label selection to watch model server pods
// that should be included in the InferencePool. ModelServers should not
// be with any other Service or InferencePool, that behavior is not supported
// and will result in sub-optimal utilization.
// that should be included in the InferencePool.
ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
}
```
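For illustration, a minimal spec using this selector; the label key and value are placeholders:

```golang
pool := InferencePoolSpec{
	// Any pods carrying this label are included in the pool as model servers.
	ModelServerSelector: map[string]string{"app": "my-model-server"},
}
```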

**InferenceModel**
```golang
// InferenceModel represents a set of Models/Adapters that are multiplexed onto one
// or more Inferencepools. This resource is managed by the "Inference Workload Owner"
// or more InferencePools. This resource is managed by the "Inference Workload Owner"
// persona. The Inference Workload Owner persona is: a team that trains, verifies, and
// leverages a large language model from a model frontend, drives the lifecycle
// and rollout of new versions of those models, and defines the specific
// performance and latency goals for the model. These workloads are
// expected to operate within an InferencePool sharing compute capacity with other
// InferenceModels, defined by the Inference Platform Admin. We allow a user who
// has multiple InferenceModels across multiple pools (with the same config) to
// specify the configuration exactly once, and deploy to many pools
// simultaneously. Enabling a simpler config and single source of truth
// for a given user. InferenceModel ModelNames are unique for a given InferencePool,
// expected to coexist within an InferencePool: sharing compute capacity with other
// InferenceModels, with sharing limitations defined by the Inference Platform Admin.
type InferenceModel struct {
metav1.ObjectMeta
metav1.TypeMeta
@@ -155,28 +150,35 @@
}

type InferenceModelSpec struct {
// The name of the model as the users set in the "model" parameter in the requests.
// The name should be unique among the workloads that reference the same backend pool.
// This is the parameter that will be used to match the request with. In the future, we may
// allow to match on other request parameters. The other approach to support matching on
// on other request parameters is to use a different ModelName per HTTPFilter.
// Names can be reserved without implementing an actual model in the pool.
// The name of the model as it will be set in the "model" parameter for an incoming request.
// ModelNames are expected to be unique for a specific InferencePool
// (names can be reused for a different pool in the same cluster).
// The modelName with the oldest creation timestamp is retained, and the incoming
// InferenceModel has its Ready status set to false with a corresponding reason.
// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
// Names can be reserved without an underlying model configured in the pool.
// This can be done by specifying a target model and setting the weight to zero;
// an error will be returned specifying that no valid target model is found.
ModelName string
// Optional
// Defines how important it is to serve the model compared to other models referencing the same pool.
// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
// and the proportionality of fairness will be configurable.
Criticality *Criticality
// Optional.
// Allow multiple versions of a model for traffic splitting.
// If not specified, the target model name is defaulted to the ModelName parameter.
// Allow multiple versions of a model for traffic splitting.
// If not specified, the target model name is defaulted to the ModelName parameter.
// ModelName is often in reference to a LoRA adapter.
TargetModels []TargetModel
// Reference to the InferencePool that the model registers to. It must exist in the same namespace.
PoolReference *LocalObjectReference
}

// Defines how important it is to serve the model compared to other models.
// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field should ALWAYS be optional (use a pointer), and set no default.
// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
type Criticality string
const (
// Most important. Requests to this band will be shed last.
@@ -200,7 +202,7 @@ type TargetModel struct {
Name string
// Weight is used to determine the percentage of traffic that should be
// sent to this target model when multiple versions of the model are specified.
Weight int
Weight *int
}

// LocalObjectReference identifies an API object within the namespace of the
6 changes: 3 additions & 3 deletions pkg/ext-proc/backend/datastore.go
@@ -91,15 +91,15 @@ func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string {
}
r := rand.New(source)
for _, model := range model.Spec.TargetModels {
weights += model.Weight
weights += *model.Weight
}
klog.V(3).Infof("Weights for Model(%v) total to: %v", model.Name, weights)
randomVal := r.Int31n(weights)
for _, model := range model.Spec.TargetModels {
if randomVal < model.Weight {
if randomVal < *model.Weight {
return model.Name
}
randomVal -= model.Weight
randomVal -= *model.Weight
}
return ""
}
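Note that the dereferences above assume weights are set, while the CEL rule also permits specs where no targetModel sets a weight; a hedged sketch of a nil-safe accessor (`weightOf` is hypothetical, and treating unset as 1 is one possible convention, not something this PR mandates):

```go
// weightOf returns a weight usable for drawing, treating an unset weight
// as 1 so that an all-unset spec yields a uniform split.
func weightOf(m v1alpha1.TargetModel) int32 {
	if m.Weight == nil {
		return 1
	}
	return *m.Weight
}
```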
20 changes: 12 additions & 8 deletions pkg/ext-proc/backend/datastore_test.go
@@ -58,11 +58,11 @@ func TestRandomWeightedDraw(t *testing.T) {
TargetModels: []v1alpha1.TargetModel{
{
Name: "canary",
Weight: 50,
Weight: pointer(50),
},
{
Name: "v1",
Weight: 50,
Weight: pointer(50),
},
},
},
@@ -76,15 +76,15 @@
TargetModels: []v1alpha1.TargetModel{
{
Name: "canary",
Weight: 25,
Weight: pointer(25),
},
{
Name: "v1.1",
Weight: 55,
Weight: pointer(55),
},
{
Name: "v1",
Weight: 50,
Weight: pointer(50),
},
},
},
@@ -98,15 +98,15 @@
TargetModels: []v1alpha1.TargetModel{
{
Name: "canary",
Weight: 20,
Weight: pointer(20),
},
{
Name: "v1.1",
Weight: 20,
Weight: pointer(20),
},
{
Name: "v1",
Weight: 10,
Weight: pointer(10),
},
},
},
@@ -127,3 +127,7 @@
})
}
}

func pointer(v int32) *int32 {
return &v
}
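Go cannot take the address of an integer literal directly, hence the small `pointer` helper above; assuming the module builds with Go 1.18+, a generic equivalent would be:

```go
// ptr returns a pointer to v, for any type rather than just int32.
func ptr[T any](v T) *T {
	return &v
}
```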