Skip to content

API Shift/Refactor #93

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ resources:
namespaced: true
domain: x-k8s.io
group: inference
kind: LLMServerPool
kind: InferencePool
path: sigs.k8s.io/llm-instance-gateway/api/v1alpha1
version: v1alpha1
- api:
crdVersion: v1
namespaced: true
domain: x-k8s.io
group: inference
kind: LLMService
kind: InferenceModel
path: sigs.k8s.io/llm-instance-gateway/api/v1alpha1
version: v1alpha1
version: "3"
33 changes: 0 additions & 33 deletions api/Dockerfile

This file was deleted.

167 changes: 167 additions & 0 deletions api/v1alpha1/inferencemodel_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// InferenceModelSpec represents a specific model use case. This resource is
// managed by the "Inference Workload Owner" persona.
//
// The Inference Workload Owner persona is: a team that trains, verifies, and
// leverages a large language model from a model frontend, drives the lifecycle
// and rollout of new versions of those models, and defines the specific
// performance and latency goals for the model. These workloads are
// expected to operate within an InferencePool sharing compute capacity with other
// InferenceModels, defined by the Inference Platform Admin.
//
// InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool.
// If the name is reused, an error will be shown on the status of the
// InferenceModel that attempted to reuse it. The oldest InferenceModel, based on
// creation timestamp, will be selected to remain valid. In the event of a race
// condition, one will be selected at random.
type InferenceModelSpec struct {
// ModelName is the name of the model as users set it in the "model" parameter of their requests.
// The name should be unique among the workloads that reference the same backend pool.
// This is the parameter that requests are matched against. In the future, we may
// allow matching on other request parameters. The other approach to support matching
// on other request parameters is to use a different ModelName per HTTPFilter.
// Names can be reserved without implementing an actual model in the pool.
// This can be done by specifying a target model and setting the weight to zero;
// an error will be returned specifying that no valid target model is found.
//
// +optional
// +kubebuilder:validation:MaxLength=253
ModelName string `json:"modelName,omitempty"`
// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
//
// +optional
// +kubebuilder:default="Default"
Criticality *Criticality `json:"criticality,omitempty"`
// TargetModels allows multiple versions of a model for traffic splitting.
// If not specified, the target model name is defaulted to the modelName parameter.
// modelName is often in reference to a LoRA adapter.
//
// +optional
// +kubebuilder:validation:MaxItems=10
TargetModels []TargetModel `json:"targetModels,omitempty"`
// PoolRef is a reference to the inference pool; the pool must exist in the same namespace.
//
// +kubebuilder:validation:Required
PoolRef *PoolObjectReference `json:"poolRef,omitempty"`
}

// PoolObjectReference identifies an API object within the namespace of the
// referrer.
type PoolObjectReference struct {
// Group is the group of the referent.
//
// +optional
// +kubebuilder:default="inference.networking.x-k8s.io"
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
Group string `json:"group,omitempty"`

// Kind is the kind of the referent. For example "InferencePool".
//
// +optional
// +kubebuilder:default="InferencePool"
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=63
// +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
Kind string `json:"kind,omitempty"`

// Name is the name of the referent.
//
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Required
Name string `json:"name,omitempty"`
}

// Criticality defines how important it is to serve the model compared to other models.
// +kubebuilder:validation:Enum=Critical;Default;Sheddable
type Criticality string

const (
// Most important. Requests to this band will be shed last.
Critical Criticality = "Critical"
// More important than Sheddable, less important than Critical.
// Requests in this band will be shed before critical traffic.
// +kubebuilder:default=Default
Default Criticality = "Default"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't hate this name, but it always feels a bit funny to have to spell out "Default". Would something like "Moderate" or "Normal" work better here? Maybe there are some similar terms we can use as a reference? For example Kubernetes Pods have the following QoS classes:

  • Guaranteed
  • Burstable
  • BestEffort

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JeremyOT if you have thoughts on what other names we can use for "Default". Another suggestion is "Standard"? I can also get behind "Moderate" or "Normal".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moderate sounds good to me here, seems at least better than Default

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually like Default, because we are really trying to communicate that this is what you get when unset. No docs necessary to understand that. Something like Moderate is fine, but I still need to remember that Moderate is the default. Normal is more clear.

I don't think K8s QoS is a great comparison, since there's so much difference between behaviors of each class. It's not just more/less.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about representing these in numbers like priorities instead of abstract names? Everything is relative to each other.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think defining a discrete value is sufficient, easier for users to work with (pre-defined options to select from) and makes provider implementations easier.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can get on board with starting with 3 simple values here, but likely worth revisiting in the future to see if additional granularity would be helpful. @terrytangyuan did you have any specific use cases in mind that would benefit from having additional levels here?

// Least important. Requests to this band will be shed before all other bands.
Sheddable Criticality = "Sheddable"
)

// TargetModel represents a deployed model or a LoRA adapter. The
// Name field is expected to match the name of the LoRA adapter
// (or base model) as it is registered within the model server. Inference
Comment on lines +114 to +116
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if TargetModel is a bit misleading if it also represents an adapter.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does, but this mirrors the Open AI API spec, which just expects a modelName. So we chose not to deviate from that pattern.

LMKWYT

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it may not necessarily be an adapter either

// Gateway assumes that the model exists on the model server, and it is the
// responsibility of the user to validate a correct match. Should a model fail
// to exist at request time, the error is processed by the Instance Gateway,
// and then emitted on the appropriate InferenceModel object.
type TargetModel struct {
// Name is the name of the adapter (or base model) as expected by the model server.
//
// +optional
// +kubebuilder:validation:MaxLength=253
Name string `json:"name,omitempty"`
// Weight is used to determine the proportion of traffic that should be
// sent to this target model when multiple versions of the model are specified.
//
// +optional
// +kubebuilder:default=1
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=1000000
Weight int32 `json:"weight,omitempty"`
}

// InferenceModelStatus defines the observed state of InferenceModel.
type InferenceModelStatus struct {
// Conditions track the state of the InferenceModel.
Conditions []metav1.Condition `json:"conditions,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +genclient

// InferenceModel is the Schema for the InferenceModels API. It embeds the
// Kubernetes type and object metadata alongside its Spec and Status.
type InferenceModel struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec InferenceModelSpec `json:"spec,omitempty"`
Status InferenceModelStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true

// InferenceModelList contains a list of InferenceModel objects.
type InferenceModelList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []InferenceModel `json:"items"`
}

// init registers the InferenceModel and InferenceModelList types with
// SchemeBuilder when the package is loaded.
func init() {
	SchemeBuilder.Register(
		new(InferenceModel),
		new(InferenceModelList),
	)
}
119 changes: 119 additions & 0 deletions api/v1alpha1/inferencepool_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.

// InferencePoolSpec defines the desired state of InferencePool.
type InferencePoolSpec struct {

// Selector is a map of labels used to watch the model server pods
// that should be included in the InferencePool. Model servers should not
// be associated with any other Service or InferencePool; that behavior is not
// supported and will result in sub-optimal utilization.
// In some cases, implementations may translate this to a Service selector, so this matches the simple
// map used for Service selectors instead of the full Kubernetes LabelSelector type.
//
// +kubebuilder:validation:Required
Selector map[LabelKey]LabelValue `json:"selector,omitempty"`

// TargetPortNumber is the port number on which the model servers within the
// pool expect to receive traffic.
// This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort
//
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Maximum=65535
// +kubebuilder:validation:Required
TargetPortNumber int32 `json:"targetPortNumber,omitempty"`
}

// Originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731
// Duplicated so as not to take an unexpected dependency on gw's API.
// The Pattern marker below is backtick-quoted (a raw string), so the literal
// dot in the optional DNS-subdomain prefix is escaped with a single backslash.
//
// LabelKey is the key of a label. This is used for validation
// of maps. This matches the Kubernetes "qualified name" validation that is used for labels.
//
// Valid values include:
//
// * example
// * example.com
// * example.com/path
// * example.com/path.html
//
// Invalid values include:
//
// * example~ - "~" is an invalid character
// * example.com. - cannot start or end with "."
//
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=253
// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$`
type LabelKey string

// LabelValue is the value of a label. This is used for validation
// of maps. This matches the Kubernetes label validation rules:
// * must be 63 characters or less (can be empty),
// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
// * may contain dashes (-), underscores (_), dots (.), and alphanumerics in between.
//
// Valid values include:
//
// * MyValue
// * my.name
// * 123-my-value
//
// +kubebuilder:validation:MinLength=0
// +kubebuilder:validation:MaxLength=63
// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`
type LabelValue string

// InferencePoolStatus defines the observed state of InferencePool.
type InferencePoolStatus struct {

// Conditions track the state of the InferencePool.
Conditions []metav1.Condition `json:"conditions,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +genclient

// InferencePool is the Schema for the InferencePools API. It embeds the
// Kubernetes type and object metadata alongside its Spec and Status.
type InferencePool struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec InferencePoolSpec `json:"spec,omitempty"`
Status InferencePoolStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true

// InferencePoolList contains a list of InferencePool objects.
type InferencePoolList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []InferencePool `json:"items"`
}

// init registers the InferencePool and InferencePoolList types with
// SchemeBuilder when the package is loaded.
func init() {
	SchemeBuilder.Register(
		new(InferencePool),
		new(InferencePoolList),
	)
}
Loading