Added v1alpha2 api #398

Merged
merged 4 commits on Feb 24, 2025

23 changes: 23 additions & 0 deletions api/v1alpha2/doc.go
@@ -0,0 +1,23 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package v1alpha2 contains API Schema definitions for the
// inference.networking.x-k8s.io API group.
//
// +k8s:openapi-gen=true
// +kubebuilder:object:generate=true
// +groupName=inference.networking.x-k8s.io
package v1alpha2
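
As a quick illustration of what these markers do (a minimal sketch, assuming the module import path sigs.k8s.io/gateway-api-inference-extension and an InferenceModel named "example", neither confirmed by this PR): +kubebuilder:object:generate=true tells controller-gen to emit deepcopy helpers such as DeepCopy for the package's API types.

package main

import (
	"fmt"

	v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

func main() {
	orig := &v1alpha2.InferenceModel{}
	orig.Name = "example"

	// DeepCopy is generated by controller-gen thanks to +kubebuilder:object:generate=true.
	clone := orig.DeepCopy()
	clone.Name = "example-copy"

	fmt.Println(orig.Name, clone.Name) // prints: example example-copy
}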
45 changes: 45 additions & 0 deletions api/v1alpha2/groupversion_info.go
@@ -0,0 +1,45 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package v1alpha2 contains API Schema definitions for the inference.networking.x-k8s.io v1alpha2 API group
// +kubebuilder:object:generate=true
// +groupName=inference.networking.x-k8s.io
package v1alpha2

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is the group version used to register these objects.
GroupVersion = schema.GroupVersion{Group: "inference.networking.x-k8s.io", Version: "v1alpha2"}

// SchemeGroupVersion is an alias to GroupVersion for client-go libraries.
// It is required by pkg/client/informers/externalversions/...
SchemeGroupVersion = GroupVersion

// SchemeBuilder is used to add Go types to the GroupVersionKind scheme.
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

// AddToScheme adds the types in this group-version to the given scheme.
AddToScheme = SchemeBuilder.AddToScheme
)

// Resource is required by pkg/client/listers/...
func Resource(resource string) schema.GroupResource {
return GroupVersion.WithResource(resource).GroupResource()
}
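
For context, a minimal sketch (assumed consumer code, not part of this PR; the import path and object name are illustrative) of how AddToScheme is used to register the v1alpha2 types with a controller-runtime client:

package main

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

func main() {
	// Build a scheme that knows about the v1alpha2 types.
	scheme := runtime.NewScheme()
	if err := v1alpha2.AddToScheme(scheme); err != nil {
		panic(err)
	}

	// Create a client from the current kubeconfig or in-cluster config.
	c, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: scheme})
	if err != nil {
		panic(err)
	}

	// Fetch a (hypothetical) InferenceModel and print its modelName.
	var im v1alpha2.InferenceModel
	key := client.ObjectKey{Namespace: "default", Name: "my-model"}
	if err := c.Get(context.Background(), key, &im); err != nil {
		panic(err)
	}
	fmt.Println("modelName:", im.Spec.ModelName)
}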
235 changes: 235 additions & 0 deletions api/v1alpha2/inferencemodel_types.go
@@ -0,0 +1,235 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1alpha2

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// InferenceModel is the Schema for the InferenceModels API.
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:storageversion
// +genclient
type InferenceModel struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec InferenceModelSpec `json:"spec,omitempty"`
Status InferenceModelStatus `json:"status,omitempty"`
}

// InferenceModelList contains a list of InferenceModel.
//
// +kubebuilder:object:root=true
type InferenceModelList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []InferenceModel `json:"items"`
}

// InferenceModelSpec represents the desired state of a specific model use case. This resource is
// managed by the "Inference Workload Owner" persona.
//
// The Inference Workload Owner persona is someone who trains, verifies, and
// leverages a large language model from a model frontend, drives the lifecycle
// and rollout of new versions of those models, and defines the specific
// performance and latency goals for the model. These workloads are
// expected to operate within an InferencePool sharing compute capacity with other
// InferenceModels, defined by the Inference Platform Admin.
//
// An InferenceModel's modelName (not the ObjectMeta name) must be unique for a given InferencePool;
// if the name is reused, an error will be shown on the status of the
// InferenceModel that attempted to reuse it. The oldest InferenceModel, based on
// creation timestamp, will be selected to remain valid. In the event of a race
// condition, one will be selected at random.
type InferenceModelSpec struct {
// ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
// ModelNames must be unique for a referencing InferencePool
// (names can be reused for a different pool in the same cluster).
// If a modelName is duplicated, the one with the oldest creation timestamp is retained, and the incoming
// InferenceModel has its Ready status set to false with a corresponding reason.
// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
// Names can be reserved without an underlying model configured in the pool.
// This can be done by specifying a target model and setting the weight to zero;
// an error will then be returned specifying that no valid target model is found.
//
// +kubebuilder:validation:MaxLength=256
// +kubebuilder:validation:Required
ModelName string `json:"modelName"`

// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
// Criticality impacts how traffic is handled in resource-constrained situations by
// queuing or rejecting requests of lower criticality. InferenceModels of equivalent Criticality
// fairly share resources over token throughput. In the future, the metric used to calculate fairness,
// and the proportionality of fairness, will be configurable.
//
// No default is set for this field, to allow for future additions of new fields that may form a 'one of' with it.
// Implementations that consume this field may treat an unset value as the 'Standard' range.
// +optional
Criticality *Criticality `json:"criticality,omitempty"`

// TargetModels allow multiple versions of a model for traffic splitting.
// If not specified, the target model name defaults to the modelName parameter.
// The modelName often refers to a LoRA adapter.
//
// +optional
// +kubebuilder:validation:MaxItems=10
// +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
TargetModels []TargetModel `json:"targetModels,omitempty"`

// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
//
// +kubebuilder:validation:Required
PoolRef PoolObjectReference `json:"poolRef"`
}

// PoolObjectReference identifies an API object within the namespace of the
// referrer.
type PoolObjectReference struct {
// Group is the group of the referent.
//
// +optional
// +kubebuilder:default="inference.networking.x-k8s.io"
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
Group string `json:"group,omitempty"`

// Kind is the kind of the referent. For example, "InferencePool".
//
// +optional
// +kubebuilder:default="InferencePool"
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=63
// +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
Kind string `json:"kind,omitempty"`

// Name is the name of the referent.
//
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Required
Name string `json:"name"`
}

// Criticality defines how important it is to serve the model compared to other models.
// Criticality is intentionally a bounded enum, containing only the possibilities that the load balancing
// algorithm needs to support. Any reference to the Criticality field must be optional (use a pointer) and set no default.
// This allows us to union it with a oneOf field in the future should we wish to adjust/extend this behavior.
// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
type Criticality string

const (
// Critical defines the highest level of criticality. Requests to this band will be shed last.
Critical Criticality = "Critical"

// Standard defines the base criticality level and is more important than Sheddable but less
// important than Critical. Requests in this band will be shed before critical traffic.
// Most models are expected to fall within this band.
Standard Criticality = "Standard"

// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
// all other bands.
Sheddable Criticality = "Sheddable"
)

// TargetModel represents a deployed model or a LoRA adapter. The
// Name field is expected to match the name of the LoRA adapter
// (or base model) as it is registered within the model server. Inference
// Gateway assumes that the model exists on the model server and it's the
// responsibility of the user to validate a correct match. Should a model fail
// to exist at request time, the error is processed by the Inference Gateway
// and emitted on the appropriate InferenceModel object.
type TargetModel struct {
// Name is the name of the adapter or base model, as expected by the ModelServer.
//
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Required
Name string `json:"name"`

// Weight is used to determine the proportion of traffic that should be
// sent to this model when multiple target models are specified.
//
// Weight defines the proportion of requests forwarded to the specified
// model. This is computed as weight/(sum of all weights in this
// TargetModels list). For non-zero values, there may be some epsilon from
// the exact proportion defined here depending on the precision an
// implementation supports. Weight is not a percentage and the sum of
// weights does not need to equal 100.
//
// If a weight is set for any targetModel, it must be set for all targetModels.
// Conversely, weights are optional so long as NO targetModel specifies a weight.
//
// +optional
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=1000000
Weight *int32 `json:"weight,omitempty"`
}

// InferenceModelStatus defines the observed state of InferenceModel
type InferenceModelStatus struct {
// Conditions track the state of the InferenceModel.
//
// Known condition types are:
//
// * "Accepted"
//
// +optional
// +listType=map
// +listMapKey=type
// +kubebuilder:validation:MaxItems=8
// +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}}
Conditions []metav1.Condition `json:"conditions,omitempty"`
}

// InferenceModelConditionType is a type of condition for the InferenceModel.
type InferenceModelConditionType string

// InferenceModelConditionReason is the reason for a given InferenceModelConditionType.
type InferenceModelConditionReason string

const (
// ModelConditionAccepted indicates if the model config is accepted, and if not, why.
//
// Possible reasons for this condition to be True are:
//
// * "Accepted"
//
// Possible reasons for this condition to be False are:
//
// * "ModelNameInUse"
//
// Possible reasons for this condition to be Unknown are:
//
// * "Pending"
//
ModelConditionAccepted InferenceModelConditionType = "Accepted"

// ModelReasonAccepted is the desired state. Model conforms to the state of the pool.
ModelReasonAccepted InferenceModelConditionReason = "Accepted"

// ModelReasonNameInUse is used when a given ModelName already exists within the pool.
// Details about naming conflict resolution are on the ModelName field itself.
ModelReasonNameInUse InferenceModelConditionReason = "ModelNameInUse"

// ModelReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceModel.
ModelReasonPending InferenceModelConditionReason = "Pending"
)

func init() {
SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{})
}
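
To tie the fields above together, a hedged sketch constructing an InferenceModel in Go (the pool and model names are hypothetical; the split follows the weight/(sum of weights) rule documented on TargetModel, so weights 1 and 3 yield 25% and 75%):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

func main() {
	// Criticality references must be pointers with no default; unset may be treated as Standard.
	crit := v1alpha2.Standard
	// Weights are all-or-none across the list (enforced by the CEL rule on TargetModels).
	// Traffic splits proportionally: 1/(1+3) = 25% to v1 and 3/(1+3) = 75% to v2.
	wV1, wV2 := int32(1), int32(3)

	im := v1alpha2.InferenceModel{
		ObjectMeta: metav1.ObjectMeta{Name: "chatbot", Namespace: "default"},
		Spec: v1alpha2.InferenceModelSpec{
			ModelName:   "chatbot", // matches the "model" parameter of incoming requests
			Criticality: &crit,
			TargetModels: []v1alpha2.TargetModel{
				{Name: "chatbot-v1", Weight: &wV1}, // hypothetical adapter names
				{Name: "chatbot-v2", Weight: &wV2},
			},
			// Group and Kind default to inference.networking.x-k8s.io / InferencePool.
			PoolRef: v1alpha2.PoolObjectReference{Name: "shared-pool"},
		},
	}

	// Once a controller reconciles, acceptance can be checked via the Accepted condition.
	accepted := meta.IsStatusConditionTrue(im.Status.Conditions, string(v1alpha2.ModelConditionAccepted))
	fmt.Printf("model=%s accepted=%v\n", im.Spec.ModelName, accepted)
}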