Skip to content

Configurable automatic deletion of resources for Succeeded AppWrappers #124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/v1beta2/appwrapper_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ const (
RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
DeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration"
DebuggingFailureDeletionDelayDurationAnnotation = "workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration"
SuccessTTLDurationAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
)

//+kubebuilder:object:root=true
Expand Down
45 changes: 44 additions & 1 deletion internal/controller/appwrapper/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,18 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)

// Handle Success
if podStatus.succeeded >= podStatus.expected && (podStatus.pending+podStatus.running+podStatus.failed == 0) {
msg := fmt.Sprintf("%v pods succeeded and no running, pending, or failed pods", podStatus.succeeded)
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.QuotaReserved),
Status: metav1.ConditionFalse,
Reason: string(workloadv1beta2.AppWrapperSucceeded),
Message: fmt.Sprintf("%v pods succeeded and no running, pending, or failed pods", podStatus.succeeded),
Message: msg,
})
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.ResourcesDeployed),
Status: metav1.ConditionTrue,
Reason: string(workloadv1beta2.AppWrapperSucceeded),
Message: msg,
})
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperSucceeded)
}
Expand Down Expand Up @@ -370,6 +377,29 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
Message: "No resources deployed",
})
return ctrl.Result{}, r.Status().Update(ctx, aw)

case workloadv1beta2.AppWrapperSucceeded:
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)) {
deletionDelay := r.timeToLiveAfterSucceededDuration(ctx, aw)
whenSucceeded := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
now := time.Now()
deadline := whenSucceeded.Add(deletionDelay)
if now.Before(deadline) {
return ctrl.Result{RequeueAfter: deadline.Sub(now)}, r.Status().Update(ctx, aw)
}

if !r.deleteComponents(ctx, aw) {
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.ResourcesDeployed),
Status: metav1.ConditionFalse,
Reason: string(workloadv1beta2.AppWrapperSucceeded),
Message: fmt.Sprintf("Time to live after success of %v expired", deletionDelay),
})
return ctrl.Result{}, r.Status().Update(ctx, aw)
}
return ctrl.Result{}, nil
}

return ctrl.Result{}, nil
Expand Down Expand Up @@ -495,6 +525,19 @@ func (r *AppWrapperReconciler) debuggingFailureDeletionDelay(ctx context.Context
return 0 * time.Second
}

func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLDurationAnnotation]; ok {
if duration, err := time.ParseDuration(userPeriod); err == nil {
if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTLCeiling {
return duration
}
} else {
log.FromContext(ctx).Info("Malformed successTTL annotation", "annotation", userPeriod, "error", err)
}
}
return r.Config.FaultTolerance.SuccessTTLCeiling
}

func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(condition)) {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Expand Down
5 changes: 5 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type FaultToleranceConfig struct {
RetryLimit int32 `json:"retryLimit,omitempty"`
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
}

type CertManagementConfig struct {
Expand Down Expand Up @@ -86,6 +87,7 @@ func NewAppWrapperConfig() *AppWrapperConfig {
RetryLimit: 3,
DeletionGracePeriod: 10 * time.Minute,
GracePeriodCeiling: 24 * time.Hour,
SuccessTTLCeiling: 7 * 24 * time.Hour,
},
}
}
Expand All @@ -107,6 +109,9 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling)
}
if config.FaultTolerance.SuccessTTLCeiling <= 0 {
return fmt.Errorf("SuccessTTLCeiling %v is not a positive duration", config.FaultTolerance.SuccessTTLCeiling)
}

return nil
}
Expand Down
1 change: 1 addition & 0 deletions samples/wrapped-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ metadata:
name: sample-job
annotations:
kueue.x-k8s.io/queue-name: user-queue
workload.codeflare.dev.appwrapper/successTTLDuration: "1m"
spec:
components:
- template:
Expand Down
2 changes: 2 additions & 0 deletions site/_pages/arch-controller.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ but will become false when the Framework Controller succeeds at deleting all res
in the Resuming phase.

ResourcesDeployed will be true in the Succeeded state (green), but QuotaReserved will be false.
After a configurable delay, the Framework controller will eventually delete the resources of
Succeeded AppWrappers and ResourcesDeployed will become false.

Any phase may transition to the Terminating phase (not shown) when the AppWrapper is deleted.
During the Terminating phase, QuotaReserved and ResourcesDeployed may initially be true
Expand Down
7 changes: 7 additions & 0 deletions site/_pages/arch-fault-tolerance.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ can be used to customize them.
| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit |
| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration |
| GracePeriodCeiling | 24 Hours | Not Applicable |
| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration |


The `GracePeriodCeiling` imposes an upper limit on the other grace periods to
reduce the impact of user-added annotations on overall system utilization.
Expand All @@ -73,3 +75,8 @@ AppWrapper enters the `Failed` state and when the process of deleting its resour
begins. Since the AppWrapper continues to consume quota during this delayed deletion period,
this annotation should be used sparingly and only when interactive debugging of
the failed workload is being actively pursued.

All child resources for an AppWrapper that successfully completed will be automatically
deleted `SuccessTTLCeiling` time after the AppWrapper entered the `Succeeded` state.
This duration can be shortened on a per-AppWrapper basis using the
`workload.codeflare.dev.appwrapper/successTTLDuration` annotation.