Skip to content

Commit 404a303

Browse files
committed
Swapping LLMServerPool to reference a label selector field instead of a Service objectRef, and regenerating output. Also updating proposal to reflect this change.
1 parent 0c0a6ed commit 404a303

13 files changed

+382
-92
lines changed

api/v1alpha1/llmserverpool_types.go

+5-6
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ limitations under the License.
1717
package v1alpha1
1818

1919
import (
20-
corev1 "k8s.io/api/core/v1"
2120
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2221
)
2322

@@ -26,11 +25,11 @@ import (
2625
// LLMServerPoolSpec defines the desired state of LLMServerPool
2726
type LLMServerPoolSpec struct {
2827

29-
// ServiceRefs select the distinct services to include in the backend pool.
30-
// NOTE: These services should be consumed by only the llmServerPool they
31-
// are referenced by. Should this behavior be breached, Instance Gateway
32-
// behavior is not guaranteed.
33-
ServiceRefs []corev1.ObjectReference `json:"serviceRefs,omitempty"`
28+
// ModelServerSelector uses label selection to watch model server pods
29+
// that should be included in the LLMServerPool. ModelServers should not
30+
// be shared with any other Service or LLMServerPool; that behavior is not supported
31+
// and will result in sub-optimal utilization.
32+
ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
3433
}
3534

3635
// LLMServerPoolStatus defines the observed state of LLMServerPool

api/v1alpha1/llmservice_types.go

+8-8
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ type LLMServiceSpec struct {
4646
// Model can be in 2 priority classes, Critical and Noncritical.
4747
// Priority class is implicitly set to Critical by specifying an Objective.
4848
// Otherwise the Model is considered Noncritical.
49-
Models []Model
49+
Models []Model `json:"models,omitempty"`
5050
// PoolRef contains references to the backend pools that the LLMService registers to.
51-
PoolRef []corev1.ObjectReference
51+
PoolRef []corev1.ObjectReference `json:"poolRef,omitempty"`
5252
}
5353

5454
// Model defines the policies for routing the traffic of a use case; this includes performance objectives
@@ -62,17 +62,17 @@ type Model struct {
6262
// Names can be reserved without implementing an actual model in the pool.
6363
// This can be done by specifying a target model and setting the weight to zero,
6464
// an error will be returned specifying that no valid target model is found.
65-
Name string
65+
Name string `json:"name,omitempty"`
6666
// Optional
6767
// LLM Services with an objective have higher priority than services without.
6868
// IMPORTANT: By specifying an objective, this places the LLMService in a higher priority class than LLMServices without a defined priority class.
6969
// In the face of resource scarcity, higher priority requests will be preserved, and lower priority class requests will be rejected.
70-
Objective *Objective
70+
Objective *Objective `json:"objective,omitempty"`
7171
// Optional.
7272
// Allow multiple versions of a model for traffic splitting.
7373
// If not specified, the target model name is defaulted to the modelName parameter.
7474
// modelName is often in reference to a LoRA adapter.
75-
TargetModels []TargetModel
75+
TargetModels []TargetModel `json:"targetModels,omitempty"`
7676
}
7777

7878
// TargetModel represents a deployed model or a LoRA adapter. The
@@ -84,10 +84,10 @@ type Model struct {
8484
// and then emitted on the appropriate LLMService object.
8585
type TargetModel struct {
8686
// The name of the adapter as expected by the ModelServer.
87-
Name string
87+
Name string `json:"name,omitempty"`
8888
// Weight is used to determine the percentage of traffic that should be
8989
// sent to this target model when multiple versions of the model are specified.
90-
Weight int
90+
Weight int `json:"weight,omitempty"`
9191
}
9292

9393
// Objective captures the latency SLO of a LLM service.
@@ -100,7 +100,7 @@ type Objective struct {
100100
// length. Note that this is different from what is known as TPOT (time per output token) which only
101101
// takes decode time into account.
102102
// The P95 is calculated over a fixed time window defined at the operator level.
103-
DesiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests *time.Duration
103+
DesiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests *time.Duration `json:"desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests,omitempty"`
104104
}
105105

106106
// LLMServiceStatus defines the observed state of LLMService

api/v1alpha1/zz_generated.deepcopy.go

+11-9
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/api/v1alpha1/llmserverpoolspec.go

+10-10
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/api/v1alpha1/llmservice.go

+3-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/api/v1alpha1/llmservicespec.go

+58
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/api/v1alpha1/model.go

+61
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

client-go/applyconfiguration/api/v1alpha1/objective.go

+42
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)