Skip to content

Commit cf905a1

Browse files
hzxuzhonghukaushikmitr
authored and committed
Added v1alpha2 api (kubernetes-sigs#398)
* copy api v1alpha1 to v1alpha2 * Add nested status * auto generate * use v1alpha2
1 parent 395af13 commit cf905a1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+3554
-141
lines changed

api/v1alpha2/doc.go

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package v1alpha2 contains API Schema definitions for the
// inference.networking.x-k8s.io API group.
//
// +k8s:openapi-gen=true
// +kubebuilder:object:generate=true
// +groupName=inference.networking.x-k8s.io
package v1alpha2

api/v1alpha2/groupversion_info.go

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
// Package v1alpha2 contains API Schema definitions for the gateway v1alpha2 API group
18+
// +kubebuilder:object:generate=true
19+
// +groupName=inference.networking.x-k8s.io
20+
package v1alpha2
21+
22+
import (
23+
"k8s.io/apimachinery/pkg/runtime/schema"
24+
"sigs.k8s.io/controller-runtime/pkg/scheme"
25+
)
26+
27+
var (
28+
// GroupVersion is group version used to register these objects
29+
GroupVersion = schema.GroupVersion{Group: "inference.networking.x-k8s.io", Version: "v1alpha2"}
30+
31+
// SchemeGroupVersion is alias to GroupVersion for client-go libraries.
32+
// It is required by pkg/client/informers/externalversions/...
33+
SchemeGroupVersion = GroupVersion
34+
35+
// SchemeBuilder is used to add go types to the GroupVersionKind scheme
36+
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
37+
38+
// AddToScheme adds the types in this group-version to the given scheme.
39+
AddToScheme = SchemeBuilder.AddToScheme
40+
)
41+
42+
// Resource is required by pkg/client/listers/...
43+
func Resource(resource string) schema.GroupResource {
44+
return GroupVersion.WithResource(resource).GroupResource()
45+
}

api/v1alpha2/inferencemodel_types.go

+235
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1alpha2
18+
19+
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
)
22+
23+
// InferenceModel is the Schema for the InferenceModels API.
24+
//
25+
// +kubebuilder:object:root=true
26+
// +kubebuilder:subresource:status
27+
// +kubebuilder:storageversion
28+
// +genclient
29+
type InferenceModel struct {
30+
metav1.TypeMeta `json:",inline"`
31+
metav1.ObjectMeta `json:"metadata,omitempty"`
32+
33+
Spec InferenceModelSpec `json:"spec,omitempty"`
34+
Status InferenceModelStatus `json:"status,omitempty"`
35+
}
36+
37+
// InferenceModelList contains a list of InferenceModel.
38+
//
39+
// +kubebuilder:object:root=true
40+
type InferenceModelList struct {
41+
metav1.TypeMeta `json:",inline"`
42+
metav1.ListMeta `json:"metadata,omitempty"`
43+
Items []InferenceModel `json:"items"`
44+
}
45+
46+
// InferenceModelSpec represents the desired state of a specific model use case. This resource is
47+
// managed by the "Inference Workload Owner" persona.
48+
//
49+
// The Inference Workload Owner persona is someone that trains, verifies, and
50+
// leverages a large language model from a model frontend, drives the lifecycle
51+
// and rollout of new versions of those models, and defines the specific
52+
// performance and latency goals for the model. These workloads are
53+
// expected to operate within an InferencePool sharing compute capacity with other
54+
// InferenceModels, defined by the Inference Platform Admin.
55+
//
56+
// InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool,
57+
// if the name is reused, an error will be shown on the status of a
58+
// InferenceModel that attempted to reuse. The oldest InferenceModel, based on
59+
// creation timestamp, will be selected to remain valid. In the event of a race
60+
// condition, one will be selected at random.
61+
type InferenceModelSpec struct {
62+
// ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
63+
// ModelNames must be unique for a referencing InferencePool
64+
// (names can be reused for a different pool in the same cluster).
65+
// The modelName with the oldest creation timestamp is retained, and the incoming
66+
// InferenceModel is sets the Ready status to false with a corresponding reason.
67+
// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
68+
// Names can be reserved without an underlying model configured in the pool.
69+
// This can be done by specifying a target model and setting the weight to zero,
70+
// an error will be returned specifying that no valid target model is found.
71+
//
72+
// +kubebuilder:validation:MaxLength=256
73+
// +kubebuilder:validation:Required
74+
ModelName string `json:"modelName"`
75+
76+
// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
77+
// Criticality impacts how traffic is handled in resource constrained situations. It handles this by
78+
// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
79+
// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
80+
// and the proportionality of fairness will be configurable.
81+
//
82+
// Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field.
83+
// Any implementations that may consume this field may treat an unset value as the 'Standard' range.
84+
// +optional
85+
Criticality *Criticality `json:"criticality,omitempty"`
86+
87+
// TargetModels allow multiple versions of a model for traffic splitting.
88+
// If not specified, the target model name is defaulted to the modelName parameter.
89+
// modelName is often in reference to a LoRA adapter.
90+
//
91+
// +optional
92+
// +kubebuilder:validation:MaxItems=10
93+
// +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
94+
TargetModels []TargetModel `json:"targetModels,omitempty"`
95+
96+
// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
97+
//
98+
// +kubebuilder:validation:Required
99+
PoolRef PoolObjectReference `json:"poolRef"`
100+
}
101+
102+
// PoolObjectReference identifies an API object within the namespace of the
// referrer.
type PoolObjectReference struct {
	// Group is the group of the referent.
	//
	// +optional
	// +kubebuilder:default="inference.networking.x-k8s.io"
	// +kubebuilder:validation:MaxLength=253
	// +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
	Group string `json:"group,omitempty"`

	// Kind is kind of the referent. For example "InferencePool".
	//
	// +optional
	// +kubebuilder:default="InferencePool"
	// +kubebuilder:validation:MinLength=1
	// +kubebuilder:validation:MaxLength=63
	// +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
	Kind string `json:"kind,omitempty"`

	// Name is the name of the referent.
	//
	// +kubebuilder:validation:MinLength=1
	// +kubebuilder:validation:MaxLength=253
	// +kubebuilder:validation:Required
	Name string `json:"name"`
}
129+
130+
// Criticality defines how important it is to serve the model compared to other models.
// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
type Criticality string

const (
	// Critical defines the highest level of criticality. Requests to this band will be shed last.
	Critical Criticality = "Critical"

	// Standard defines the base criticality level and is more important than Sheddable but less
	// important than Critical. Requests in this band will be shed before critical traffic.
	// Most models are expected to fall within this band.
	Standard Criticality = "Standard"

	// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
	// all other bands.
	Sheddable Criticality = "Sheddable"
)
149+
150+
// TargetModel represents a deployed model or a LoRA adapter. The
// Name field is expected to match the name of the LoRA adapter
// (or base model) as it is registered within the model server. Inference
// Gateway assumes that the model exists on the model server and it's the
// responsibility of the user to validate a correct match. Should a model fail
// to exist at request time, the error is processed by the Inference Gateway
// and emitted on the appropriate InferenceModel object.
type TargetModel struct {
	// Name is the name of the adapter or base model, as expected by the ModelServer.
	//
	// +kubebuilder:validation:MaxLength=253
	// +kubebuilder:validation:Required
	Name string `json:"name"`

	// Weight is used to determine the proportion of traffic that should be
	// sent to this model when multiple target models are specified.
	//
	// Weight defines the proportion of requests forwarded to the specified
	// model. This is computed as weight/(sum of all weights in this
	// TargetModels list). For non-zero values, there may be some epsilon from
	// the exact proportion defined here depending on the precision an
	// implementation supports. Weight is not a percentage and the sum of
	// weights does not need to equal 100.
	//
	// If a weight is set for any targetModel, it must be set for all targetModels.
	// Conversely weights are optional, so long as ALL targetModels do not specify a weight.
	//
	// +optional
	// +kubebuilder:validation:Minimum=0
	// +kubebuilder:validation:Maximum=1000000
	Weight *int32 `json:"weight,omitempty"`
}
182+
183+
// InferenceModelStatus defines the observed state of InferenceModel
184+
type InferenceModelStatus struct {
185+
// Conditions track the state of the InferenceModel.
186+
//
187+
// Known condition types are:
188+
//
189+
// * "Accepted"
190+
//
191+
// +optional
192+
// +listType=map
193+
// +listMapKey=type
194+
// +kubebuilder:validation:MaxItems=8
195+
// +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}}
196+
Conditions []metav1.Condition `json:"conditions,omitempty"`
197+
}
198+
199+
// InferenceModelConditionType is a type of condition for the InferenceModel.
type InferenceModelConditionType string

// InferenceModelConditionReason is the reason for a given InferenceModelConditionType.
type InferenceModelConditionReason string

const (
	// ModelConditionAccepted indicates if the model config is accepted, and if not, why.
	//
	// Possible reasons for this condition to be True are:
	//
	// * "Accepted"
	//
	// Possible reasons for this condition to be False are:
	//
	// * "ModelNameInUse"
	//
	// Possible reasons for this condition to be Unknown are:
	//
	// * "Pending"
	//
	ModelConditionAccepted InferenceModelConditionType = "Accepted"

	// ModelReasonAccepted is the desired state. Model conforms to the state of the pool.
	ModelReasonAccepted InferenceModelConditionReason = "Accepted"

	// ModelReasonNameInUse is used when a given ModelName already exists within the pool.
	// Details about naming conflict resolution are on the ModelName field itself.
	ModelReasonNameInUse InferenceModelConditionReason = "ModelNameInUse"

	// ModelReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceModel.
	ModelReasonPending InferenceModelConditionReason = "Pending"
)
232+
233+
func init() {
234+
SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{})
235+
}

0 commit comments

Comments
 (0)