@@ -20,12 +20,32 @@ import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

- // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
+ // InferenceModel is the Schema for the InferenceModels API.
+ //
+ // +kubebuilder:object:root=true
+ // +kubebuilder:subresource:status
+ // +genclient
+ type InferenceModel struct {
+ 	metav1.TypeMeta   `json:",inline"`
+ 	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+ 	Spec   InferenceModelSpec   `json:"spec,omitempty"`
+ 	Status InferenceModelStatus `json:"status,omitempty"`
+ }
+
+ // InferenceModelList contains a list of InferenceModel.
+ //
+ // +kubebuilder:object:root=true
+ type InferenceModelList struct {
+ 	metav1.TypeMeta `json:",inline"`
+ 	metav1.ListMeta `json:"metadata,omitempty"`
+ 	Items []InferenceModel `json:"items"`
+ }

- // InferenceModelSpec represents a specific model use case. This resource is
+ // InferenceModelSpec represents the desired state of a specific model use case. This resource is
// managed by the "Inference Workload Owner" persona.
//
- // The Inference Workload Owner persona is: a team that trains, verifies, and
+ // The Inference Workload Owner persona is someone who trains, verifies, and
// leverages a large language model from a model frontend, drives the lifecycle
// and rollout of new versions of those models, and defines the specific
// performance and latency goals for the model. These workloads are
@@ -38,7 +58,7 @@ import (
// creation timestamp, will be selected to remain valid. In the event of a race
// condition, one will be selected at random.
type InferenceModelSpec struct {
- 	// The name of the model as the users set in the "model" parameter in the requests.
+ 	// ModelName is the name of the model as users set it in the "model" parameter of their requests.
	// The name should be unique among the workloads that reference the same backend pool.
	// This is the parameter that will be used to match the request with. In the future, we may
	// allow to match on other request parameters. The other approach to support matching
@@ -47,22 +67,25 @@ type InferenceModelSpec struct {
	// This can be done by specifying a target model and setting the weight to zero,
	// an error will be returned specifying that no valid target model is found.
	//
- 	// +optional
	// +kubebuilder:validation:MaxLength=253
- 	ModelName string `json:"modelName,omitempty"`
- 	// Defines how important it is to serve the model compared to other models referencing the same pool.
+ 	// +kubebuilder:validation:Required
+ 	ModelName string `json:"modelName"`
+
+ 	// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
	//
	// +optional
	// +kubebuilder:default="Default"
	Criticality *Criticality `json:"criticality,omitempty"`
- 	// Allow multiple versions of a model for traffic splitting.
+
+ 	// TargetModels allow multiple versions of a model for traffic splitting.
	// If not specified, the target model name is defaulted to the modelName parameter.
	// modelName is often in reference to a LoRA adapter.
	//
	// +optional
	// +kubebuilder:validation:MaxItems=10
	TargetModels []TargetModel `json:"targetModels,omitempty"`
- 	// Reference to the inference pool, the pool must exist in the same namespace.
+
+ 	// PoolRef is a reference to the inference pool; the pool must exist in the same namespace.
	//
	// +kubebuilder:validation:Required
	PoolRef PoolObjectReference `json:"poolRef"`
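
As an aside on the matching behavior documented above: the gateway matches the request body's "model" parameter against ModelName. Below is a minimal sketch of that lookup, assuming an OpenAI-style request body; the request shape and field values are illustrative, not part of this diff.

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Hypothetical OpenAI-style completion request; only the "model" field
	// matters for routing, since it is matched against ModelName.
	body := []byte(`{"model": "tweet-summarizer", "prompt": "Summarize: ..."}`)

	var req struct {
		Model string `json:"model"`
	}
	if err := json.Unmarshal(body, &req); err != nil {
		panic(err)
	}
	fmt.Println("route using model name:", req.Model) // "tweet-summarizer"
}
```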
@@ -93,39 +116,54 @@ type PoolObjectReference struct {
	// +kubebuilder:validation:MinLength=1
	// +kubebuilder:validation:MaxLength=253
	// +kubebuilder:validation:Required
- 	Name string `json:"name,omitempty "`
+ 	Name string `json:"name"`
}

- // Defines how important it is to serve the model compared to other models.
+ // Criticality defines how important it is to serve the model compared to other models.
// +kubebuilder:validation:Enum=Critical;Default;Sheddable
type Criticality string

const (
- 	// Most important. Requests to this band will be shed last.
+ 	// Critical defines the highest level of criticality. Requests to this band will be shed last.
	Critical Criticality = "Critical"
- 	// More important than Sheddable, less important than Critical.
- 	// Requests in this band will be shed before critical traffic.
- 	// +kubebuilder:default=Default
+
+ 	// Default defines the default criticality level and is more important than Sheddable but less
+ 	// important than Critical. Requests in this band will be shed before critical traffic.
	Default Criticality = "Default"
- 	// Least important. Requests to this band will be shed before all other bands.
+
+ 	// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
+ 	// all other bands.
	Sheddable Criticality = "Sheddable"
)

// TargetModel represents a deployed model or a LoRA adapter. The
// Name field is expected to match the name of the LoRA adapter
// (or base model) as it is registered within the model server. Inference
- // Gateway assumes that the model exists on the model server and is the
+ // Gateway assumes that the model exists on the model server and it's the
// responsibility of the user to validate a correct match. Should a model fail
- // to exist at request time, the error is processed by the Instance Gateway,
- // and then emitted on the appropriate InferenceModel object.
+ // to exist at request time, the error is processed by the Inference Gateway
+ // and emitted on the appropriate InferenceModel object.
type TargetModel struct {
- 	// The name of the adapter as expected by the ModelServer.
+ 	// Name is the name of the adapter as expected by the ModelServer.
	//
- 	// +optional
	// +kubebuilder:validation:MaxLength=253
- 	Name string `json:"name,omitempty"`
+ 	// +kubebuilder:validation:Required
+ 	Name string `json:"name"`
+
	// Weight is used to determine the proportion of traffic that should be
- 	// sent to this target model when multiple versions of the model are specified.
+ 	// sent to this model when multiple target models are specified.
+ 	//
+ 	// Weight defines the proportion of requests forwarded to the specified
+ 	// model. This is computed as weight/(sum of all weights in this
+ 	// TargetModels list). For non-zero values, there may be some epsilon from
+ 	// the exact proportion defined here depending on the precision an
+ 	// implementation supports. Weight is not a percentage and the sum of
+ 	// weights does not need to equal 100.
+ 	//
+ 	// If only one model is specified and it has a weight greater than 0, 100%
+ 	// of the traffic is forwarded to that model. If weight is set to 0, no
+ 	// traffic should be forwarded for this model. If unspecified, weight
+ 	// defaults to 1.
	//
	// +optional
	// +kubebuilder:default=1
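
To make the weight/(sum of weights) rule above concrete, here is a small arithmetic sketch; the adapter names and weights are made up for illustration.

```go
package main

import "fmt"

func main() {
	// Two hypothetical TargetModels entries with weights 90 and 10.
	weights := map[string]int32{
		"summarizer-lora-v1": 90,
		"summarizer-lora-v2": 10,
	}

	var total int32
	for _, w := range weights {
		total += w
	}

	// Traffic share = weight / (sum of all weights). Weights are not
	// percentages and do not need to sum to 100.
	for name, w := range weights {
		fmt.Printf("%s receives %.1f%% of traffic\n", name, 100*float64(w)/float64(total))
	}
}
```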
@@ -140,28 +178,6 @@ type InferenceModelStatus struct {
	Conditions []metav1.Condition `json:"conditions,omitempty"`
}

- // +kubebuilder:object:root=true
- // +kubebuilder:subresource:status
- // +genclient
-
- // InferenceModel is the Schema for the InferenceModels API
- type InferenceModel struct {
- 	metav1.TypeMeta   `json:",inline"`
- 	metav1.ObjectMeta `json:"metadata,omitempty"`
-
- 	Spec   InferenceModelSpec   `json:"spec,omitempty"`
- 	Status InferenceModelStatus `json:"status,omitempty"`
- }
-
- // +kubebuilder:object:root=true
-
- // InferenceModelList contains a list of InferenceModel
- type InferenceModelList struct {
- 	metav1.TypeMeta `json:",inline"`
- 	metav1.ListMeta `json:"metadata,omitempty"`
- 	Items []InferenceModel `json:"items"`
- }
-
func init() {
	SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{})
}
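
For reference, constructing one of these objects from Go might look like the sketch below. The import path, the ptr helper, and all field values are assumptions for illustration, not taken from this diff; with Weight left unset on both targets, each defaults to 1 and traffic splits evenly.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"

	// Assumed import path for the types defined in this file.
	v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1"
)

func main() {
	model := v1alpha1.InferenceModel{
		ObjectMeta: metav1.ObjectMeta{Name: "tweet-summarizer", Namespace: "default"},
		Spec: v1alpha1.InferenceModelSpec{
			// Required: matched against the "model" parameter on incoming requests.
			ModelName:   "tweet-summarizer",
			Criticality: ptr.To(v1alpha1.Default),
			TargetModels: []v1alpha1.TargetModel{
				// Weight omitted on both entries: each defaults to 1, so
				// traffic is split 50/50 between the two adapters.
				{Name: "tweet-summarizer-lora-v1"},
				{Name: "tweet-summarizer-lora-v2"},
			},
			// Must name an InferencePool in the same namespace.
			PoolRef: v1alpha1.PoolObjectReference{Name: "base-model-pool"},
		},
	}
	fmt.Println(model.Spec.ModelName)
}
```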