Skip to content

Commit 802c322

Browse files
authored
Configurable automatic deletion of resources for Succeeded AppWrappers (#124)
Fixes #15.
1 parent 2adbbc8 commit 802c322

File tree

6 files changed

+60
-1
lines changed

6 files changed

+60
-1
lines changed

api/v1beta2/appwrapper_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ const (
139139
RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
140140
DeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration"
141141
DebuggingFailureDeletionDelayDurationAnnotation = "workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration"
142+
SuccessTTLDurationAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
142143
)
143144

144145
//+kubebuilder:object:root=true

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,11 +208,18 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
208208

209209
// Handle Success
210210
if podStatus.succeeded >= podStatus.expected && (podStatus.pending+podStatus.running+podStatus.failed == 0) {
211+
msg := fmt.Sprintf("%v pods succeeded and no running, pending, or failed pods", podStatus.succeeded)
211212
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
212213
Type: string(workloadv1beta2.QuotaReserved),
213214
Status: metav1.ConditionFalse,
214215
Reason: string(workloadv1beta2.AppWrapperSucceeded),
215-
Message: fmt.Sprintf("%v pods succeeded and no running, pending, or failed pods", podStatus.succeeded),
216+
Message: msg,
217+
})
218+
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
219+
Type: string(workloadv1beta2.ResourcesDeployed),
220+
Status: metav1.ConditionTrue,
221+
Reason: string(workloadv1beta2.AppWrapperSucceeded),
222+
Message: msg,
216223
})
217224
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperSucceeded)
218225
}
@@ -370,6 +377,29 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
370377
Message: "No resources deployed",
371378
})
372379
return ctrl.Result{}, r.Status().Update(ctx, aw)
380+
381+
case workloadv1beta2.AppWrapperSucceeded:
382+
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)) {
383+
deletionDelay := r.timeToLiveAfterSucceededDuration(ctx, aw)
384+
whenSucceeded := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
385+
now := time.Now()
386+
deadline := whenSucceeded.Add(deletionDelay)
387+
if now.Before(deadline) {
388+
return ctrl.Result{RequeueAfter: deadline.Sub(now)}, r.Status().Update(ctx, aw)
389+
}
390+
391+
if !r.deleteComponents(ctx, aw) {
392+
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
393+
}
394+
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
395+
Type: string(workloadv1beta2.ResourcesDeployed),
396+
Status: metav1.ConditionFalse,
397+
Reason: string(workloadv1beta2.AppWrapperSucceeded),
398+
Message: fmt.Sprintf("Time to live after success of %v expired", deletionDelay),
399+
})
400+
return ctrl.Result{}, r.Status().Update(ctx, aw)
401+
}
402+
return ctrl.Result{}, nil
373403
}
374404

375405
return ctrl.Result{}, nil
@@ -495,6 +525,19 @@ func (r *AppWrapperReconciler) debuggingFailureDeletionDelay(ctx context.Context
495525
return 0 * time.Second
496526
}
497527

528+
func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
529+
if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLDurationAnnotation]; ok {
530+
if duration, err := time.ParseDuration(userPeriod); err == nil {
531+
if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTLCeiling {
532+
return duration
533+
}
534+
} else {
535+
log.FromContext(ctx).Info("Malformed successTTL annotation", "annotation", userPeriod, "error", err)
536+
}
537+
}
538+
return r.Config.FaultTolerance.SuccessTTLCeiling
539+
}
540+
498541
func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
499542
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(condition)) {
500543
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{

pkg/config/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ type FaultToleranceConfig struct {
4343
RetryLimit int32 `json:"retryLimit,omitempty"`
4444
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
4545
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
46+
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
4647
}
4748

4849
type CertManagementConfig struct {
@@ -86,6 +87,7 @@ func NewAppWrapperConfig() *AppWrapperConfig {
8687
RetryLimit: 3,
8788
DeletionGracePeriod: 10 * time.Minute,
8889
GracePeriodCeiling: 24 * time.Hour,
90+
SuccessTTLCeiling: 7 * 24 * time.Hour,
8991
},
9092
}
9193
}
@@ -107,6 +109,9 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
107109
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
108110
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling)
109111
}
112+
if config.FaultTolerance.SuccessTTLCeiling <= 0 {
113+
return fmt.Errorf("SuccessTTLCeiling %v is not a positive duration", config.FaultTolerance.SuccessTTLCeiling)
114+
}
110115

111116
return nil
112117
}

samples/wrapped-job.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ metadata:
44
name: sample-job
55
annotations:
66
kueue.x-k8s.io/queue-name: user-queue
7+
workload.codeflare.dev.appwrapper/successTTLDuration: "1m"
78
spec:
89
components:
910
- template:

site/_pages/arch-controller.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ but will become false when the Framework Controller succeeds at deleting all res
114114
in the Resuming phase.
115115

116116
ResourcesDeployed will be true in the Succeeded state (green), but QuotaReserved will be false.
117+
After a configurable delay, the Framework controller will eventually delete the resources of
118+
Succeeded AppWrappers and ResourcesDeployed will become false.
117119

118120
Any phase may transition to the Terminating phase (not shown) when the AppWrapper is deleted.
119121
During the Terminating phase, QuotaReserved and ResourcesDeployed may initially be true

site/_pages/arch-fault-tolerance.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ can be used to customize them.
6262
| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit |
6363
| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration |
6464
| GracePeriodCeiling | 24 Hours | Not Applicable |
65+
| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration |
66+
6567

6668
The `GracePeriodCeiling` imposes an upper limit on the other grace periods to
6769
reduce the impact of user-added annotations on overall system utilization.
@@ -73,3 +75,8 @@ AppWrapper enters the `Failed` state and when the process of deleting its resour
7375
begins. Since the AppWrapper continues to consume quota during this delayed deletion period,
7476
this annotation should be used sparingly and only when interactive debugging of
7577
the failed workload is being actively pursued.
78+
79+
All child resources for an AppWrapper that successfully completed will be automatically
80+
deleted `SuccessTTLCeiling` time after the AppWrapper entered the `Succeeded` state.
81+
This duration can be shortened on a per-AppWrapper basis using the
82+
`workload.codeflare.dev.appwrapper/successTTLDuration` annotation.

0 commit comments

Comments
 (0)