@@ -46,9 +46,9 @@ type LLMServiceSpec struct {
46
46
// Model can be in 2 priority classes, Critical and Noncritical.
47
47
// Priority class is implicitly set to Critical by specifying an Objective.
48
48
// Otherwise the Model is considered Noncritical.
49
- Models []Model
49
+ Models []Model `json:"models,omitempty"`
50
50
// PoolRef are references to the backend pools that the LLMService registers to.
51
- PoolRef []corev1.ObjectReference
51
+ PoolRef []corev1.ObjectReference `json:"poolRef,omitempty"`
52
52
}
53
53
54
54
// Model defines the policies for routing the traffic of a use case, this includes performance objectives
@@ -62,17 +62,17 @@ type Model struct {
62
62
// Names can be reserved without implementing an actual model in the pool.
63
63
// This can be done by specifying a target model and setting the weight to zero,
64
64
// an error will be returned specifying that no valid target model is found.
65
- Name string
65
+ Name string `json:"name,omitempty"`
66
66
// Optional
67
67
// LLM Services with an objective have higher priority than services without.
68
68
// IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class.
69
69
// In the face of resource-scarcity. Higher priority requests will be preserved, and lower priority class requests will be rejected.
70
- Objective * Objective
70
+ Objective *Objective `json:"objective,omitempty"`
71
71
// Optional.
72
72
// Allow multiple versions of a model for traffic splitting.
73
73
// If not specified, the target model name is defaulted to the modelName parameter.
74
74
// modelName is often in reference to a LoRA adapter.
75
- TargetModels []TargetModel
75
+ TargetModels []TargetModel `json:"targetModels,omitempty"`
76
76
}
77
77
78
78
// TargetModel represents a deployed model or a LoRA adapter. The
@@ -84,10 +84,10 @@ type Model struct {
84
84
// and then emitted on the appropriate LLMService object.
85
85
type TargetModel struct {
86
86
// The name of the adapter as expected by the ModelServer.
87
- Name string
87
+ Name string `json:"name,omitempty"`
88
88
// Weight is used to determine the percentage of traffic that should be
89
89
// sent to this target model when multiple versions of the model are specified.
90
- Weight int
90
+ Weight int `json:"weight"`
91
91
}
92
92
93
93
// Objective captures the latency SLO of a LLM service.
@@ -100,7 +100,7 @@ type Objective struct {
100
100
// length. Note that this is different from what is known as TPOT (time per output token) which only
101
101
// takes decode time into account.
102
102
// The P95 is calculated over a fixed time window defined at the operator level.
103
- DesiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests * time.Duration
103
+ DesiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests *time.Duration `json:"desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests,omitempty"`
104
104
}
105
105
106
106
// LLMServiceStatus defines the observed state of LLMService
0 commit comments