Skip to content

Commit 90d5af8

Browse files
authored
add autopilotExempt annotation to allow per-AppWrapper opt-out (#203)
1 parent 59597a3 commit 90d5af8

File tree

5 files changed

+20
-8
lines changed

5 files changed

+20
-8
lines changed

api/v1beta2/appwrapper_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ const (
174174
SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
175175
TerminalExitCodesAnnotation = "workload.codeflare.dev.appwrapper/terminalExitCodes"
176176
RetryableExitCodesAnnotation = "workload.codeflare.dev.appwrapper/retryableExitCodes"
177+
AutopilotExemptAnnotation = "workload.codeflare.dev.appwrapper/autopilotExempt"
177178
)
178179

179180
//+kubebuilder:object:root=true

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ func (r *AppWrapperReconciler) getPodStatus(ctx context.Context, aw *workloadv1b
510510
return nil, err
511511
}
512512
summary := &podStatusSummary{expected: pc}
513-
checkUnhealthyNodes := r.Config.Autopilot != nil && r.Config.Autopilot.EvacuateWorkloads
513+
checkUnhealthyNodes := r.Config.Autopilot != nil && r.Config.Autopilot.MigrateImpactedWorkloads && !r.isAutopilotExempt(ctx, aw)
514514

515515
for _, pod := range pods.Items {
516516
switch pod.Status.Phase {
@@ -855,6 +855,17 @@ func (r *AppWrapperReconciler) retryableExitCodes(_ context.Context, aw *workloa
855855
return ans
856856
}
857857

858+
func (r *AppWrapperReconciler) isAutopilotExempt(ctx context.Context, aw *workloadv1beta2.AppWrapper) bool {
859+
if v, ok := aw.Annotations[workloadv1beta2.AutopilotExemptAnnotation]; ok {
860+
if isExempt, err := strconv.ParseBool(v); err == nil {
861+
return isExempt
862+
} else {
863+
log.FromContext(ctx).Error(err, "Malformed autopilotExempt annotation; treating as false", "annotation", v)
864+
}
865+
}
866+
return false
867+
}
868+
858869
func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
859870
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(condition)) {
860871
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{

internal/controller/appwrapper/resource_management.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
252252
}
253253
}
254254

255-
if r.Config.Autopilot != nil && r.Config.Autopilot.InjectAffinity {
255+
if r.Config.Autopilot != nil && r.Config.Autopilot.InjectAntiAffinities && !r.isAutopilotExempt(ctx, aw) {
256256
toAdd := map[string]string{}
257257
for resource, labels := range r.Config.Autopilot.ResourceUnhealthyConfig {
258258
if hasResourceRequest(spec, resource) {

pkg/config/config.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@ type KueueJobReconcillerConfig struct {
4747
}
4848

4949
type AutopilotConfig struct {
50-
InjectAffinity bool `json:"injectAffinity,omitempty"`
51-
EvacuateWorkloads bool `json:"evacuateWorkloads,omitempty"`
52-
ResourceUnhealthyConfig map[string]map[string]string `json:"resourceUnhealthyConfig,omitempty"`
50+
InjectAntiAffinities bool `json:"injectAntiAffinities,omitempty"`
51+
MigrateImpactedWorkloads bool `json:"migrateImpactedWorkloads,omitempty"`
52+
ResourceUnhealthyConfig map[string]map[string]string `json:"resourceUnhealthyConfig,omitempty"`
5353
}
5454

5555
type FaultToleranceConfig struct {
@@ -100,8 +100,8 @@ func NewAppWrapperConfig() *AppWrapperConfig {
100100
LabelKeysToCopy: []string{},
101101
},
102102
Autopilot: &AutopilotConfig{
103-
InjectAffinity: true,
104-
EvacuateWorkloads: true,
103+
InjectAntiAffinities: true,
104+
MigrateImpactedWorkloads: true,
105105
ResourceUnhealthyConfig: map[string]map[string]string{
106106
"nvidia.com/gpu": {"autopilot.ibm.com/gpuhealth": "ERR"},
107107
},

pkg/controller/setup.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ func SetupControllers(mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error
5050
}
5151
}
5252

53-
if awConfig.Autopilot != nil && awConfig.Autopilot.EvacuateWorkloads {
53+
if awConfig.Autopilot != nil && awConfig.Autopilot.MigrateImpactedWorkloads {
5454
if err := (&appwrapper.NodeHealthMonitor{
5555
Client: mgr.GetClient(),
5656
Config: awConfig,

0 commit comments

Comments
 (0)