diff --git a/api/v1beta2/appwrapper_types.go b/api/v1beta2/appwrapper_types.go index 89f0418..b6ec126 100644 --- a/api/v1beta2/appwrapper_types.go +++ b/api/v1beta2/appwrapper_types.go @@ -133,14 +133,14 @@ const ( ) const ( - AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration" - WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration" - FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration" - ResetPauseDurationAnnotation = "workload.codeflare.dev.appwrapper/resetPauseDuration" - RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit" - DeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration" - DebuggingFailureDeletionDelayDurationAnnotation = "workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration" - SuccessTTLDurationAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration" + AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration" + WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration" + FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration" + RetryPausePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/retryPausePeriodDuration" + RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit" + ForcefulDeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration" + DeletionOnFailureGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration" + SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration" ) //+kubebuilder:object:root=true diff --git a/internal/controller/appwrapper/appwrapper_controller.go b/internal/controller/appwrapper/appwrapper_controller.go index 388b5b4..dbee642 100644 --- a/internal/controller/appwrapper/appwrapper_controller.go +++ b/internal/controller/appwrapper/appwrapper_controller.go @@ -322,7 +322,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Pause before transitioning to Resuming to heuristically allow transient system problems to subside whenReset := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.Unhealthy)).LastTransitionTime - pauseDuration := r.resettingPauseDuration(ctx, aw) + pauseDuration := r.retryPauseDuration(ctx, aw) now := time.Now() deadline := whenReset.Add(pauseDuration) if now.Before(deadline) { @@ -342,14 +342,14 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request) // When an appwrapper is annotated with a non-zero debugging delay, // we hold quota for the delay period and do not delete the resources of // a failed appwrapper unless Kueue preempts it by setting Suspend to true. 
- deletionDelay := r.debuggingFailureDeletionDelay(ctx, aw) + deletionDelay := r.deletionOnFailureGraceDuration(ctx, aw) if deletionDelay > 0 && !aw.Spec.Suspend { meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ Type: string(workloadv1beta2.DeletingResources), Status: metav1.ConditionFalse, Reason: "DeletionPaused", - Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation, deletionDelay), + Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DeletionOnFailureGracePeriodAnnotation, deletionDelay), }) whenDelayed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime @@ -457,8 +457,8 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv func (r *AppWrapperReconciler) limitDuration(desired time.Duration) time.Duration { if desired < 0 { return 0 * time.Second - } else if desired > r.Config.FaultTolerance.GracePeriodCeiling { - return r.Config.FaultTolerance.GracePeriodCeiling + } else if desired > r.Config.FaultTolerance.GracePeriodMaximum { + return r.Config.FaultTolerance.GracePeriodMaximum } else { return desired } @@ -469,7 +469,7 @@ func (r *AppWrapperReconciler) admissionGraceDuration(ctx context.Context, aw *w if duration, err := time.ParseDuration(userPeriod); err == nil { return r.limitDuration(duration) } else { - log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err) + log.FromContext(ctx).Info("Malformed admission grace period annotation", "annotation", userPeriod, "error", err) } } return r.limitDuration(r.Config.FaultTolerance.AdmissionGracePeriod) @@ -480,7 +480,7 @@ func (r *AppWrapperReconciler) warmupGraceDuration(ctx context.Context, aw *work if duration, err := time.ParseDuration(userPeriod); err == nil { return r.limitDuration(duration) } else { - log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err) + log.FromContext(ctx).Info("Malformed warmup grace period annotation", "annotation", userPeriod, "error", err) } } return r.limitDuration(r.Config.FaultTolerance.WarmupGracePeriod) @@ -508,50 +508,50 @@ func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1bet return r.Config.FaultTolerance.RetryLimit } -func (r *AppWrapperReconciler) resettingPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration { - if userPeriod, ok := aw.Annotations[workloadv1beta2.ResetPauseDurationAnnotation]; ok { +func (r *AppWrapperReconciler) retryPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration { + if userPeriod, ok := aw.Annotations[workloadv1beta2.RetryPausePeriodDurationAnnotation]; ok { if duration, err := time.ParseDuration(userPeriod); err == nil { return r.limitDuration(duration) } else { - log.FromContext(ctx).Info("Malformed reset pause annotation", "annotation", userPeriod, "error", err) + log.FromContext(ctx).Info("Malformed retry pause annotation", "annotation", userPeriod, "error", err) } } - return r.limitDuration(r.Config.FaultTolerance.ResetPause) + return r.limitDuration(r.Config.FaultTolerance.RetryPausePeriod) } -func (r *AppWrapperReconciler) deletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration { - if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionGracePeriodAnnotation]; ok { +func (r *AppWrapperReconciler) forcefulDeletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration 
{ + if userPeriod, ok := aw.Annotations[workloadv1beta2.ForcefulDeletionGracePeriodAnnotation]; ok { if duration, err := time.ParseDuration(userPeriod); err == nil { return r.limitDuration(duration) } else { - log.FromContext(ctx).Info("Malformed deletion period annotation", "annotation", userPeriod, "error", err) + log.FromContext(ctx).Info("Malformed forceful deletion period annotation", "annotation", userPeriod, "error", err) } } - return r.limitDuration(r.Config.FaultTolerance.DeletionGracePeriod) + return r.limitDuration(r.Config.FaultTolerance.ForcefulDeletionGracePeriod) } -func (r *AppWrapperReconciler) debuggingFailureDeletionDelay(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration { - if userPeriod, ok := aw.Annotations[workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation]; ok { +func (r *AppWrapperReconciler) deletionOnFailureGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration { + if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionOnFailureGracePeriodAnnotation]; ok { if duration, err := time.ParseDuration(userPeriod); err == nil { return r.limitDuration(duration) } else { - log.FromContext(ctx).Info("Malformed delay deletion annotation", "annotation", userPeriod, "error", err) + log.FromContext(ctx).Info("Malformed deletion on failure grace period annotation", "annotation", userPeriod, "error", err) } } return 0 * time.Second } func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration { - if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLDurationAnnotation]; ok { + if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLAnnotation]; ok { if duration, err := time.ParseDuration(userPeriod); err == nil { - if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTLCeiling { + if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTL { return duration } } else { log.FromContext(ctx).Info("Malformed successTTL annotation", "annotation", userPeriod, "error", err) } } - return r.Config.FaultTolerance.SuccessTTLCeiling + return r.Config.FaultTolerance.SuccessTTL } func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) { diff --git a/internal/controller/appwrapper/appwrapper_controller_test.go b/internal/controller/appwrapper/appwrapper_controller_test.go index 7c0b7c0..e206cb1 100644 --- a/internal/controller/appwrapper/appwrapper_controller_test.go +++ b/internal/controller/appwrapper/appwrapper_controller_test.go @@ -58,7 +58,7 @@ var _ = Describe("AppWrapper Controller", func() { } awConfig := config.NewAppWrapperConfig() awConfig.FaultTolerance.FailureGracePeriod = 0 * time.Second - awConfig.FaultTolerance.ResetPause = 0 * time.Second + awConfig.FaultTolerance.RetryPausePeriod = 0 * time.Second awConfig.FaultTolerance.RetryLimit = 0 awReconciler = &AppWrapperReconciler{ Client: k8sClient, diff --git a/internal/controller/appwrapper/resource_management.go b/internal/controller/appwrapper/resource_management.go index cb75d15..27be104 100644 --- a/internal/controller/appwrapper/resource_management.go +++ b/internal/controller/appwrapper/resource_management.go @@ -202,7 +202,7 @@ func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloa remaining++ // no error deleting resource, resource therefore still exists } - deletionGracePeriod := r.deletionGraceDuration(ctx, aw) + deletionGracePeriod := r.forcefulDeletionGraceDuration(ctx,
aw) whenInitiated := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime gracePeriodExpired := time.Now().After(whenInitiated.Time.Add(deletionGracePeriod)) diff --git a/pkg/config/config.go b/pkg/config/config.go index 1c210df..51dafc2 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -37,14 +37,14 @@ type AppWrapperConfig struct { } type FaultToleranceConfig struct { - AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"` - WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"` - FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"` - ResetPause time.Duration `json:"resetPause,omitempty"` - RetryLimit int32 `json:"retryLimit,omitempty"` - DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"` - GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"` - SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"` + AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"` + WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"` + FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"` + RetryPausePeriod time.Duration `json:"resetPause,omitempty"` + RetryLimit int32 `json:"retryLimit,omitempty"` + ForcefulDeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"` + GracePeriodMaximum time.Duration `json:"gracePeriodCeiling,omitempty"` + SuccessTTL time.Duration `json:"successTTLCeiling,omitempty"` } type CertManagementConfig struct { @@ -82,41 +82,41 @@ func NewAppWrapperConfig() *AppWrapperConfig { DisableChildAdmissionCtrl: false, UserRBACAdmissionCheck: true, FaultTolerance: &FaultToleranceConfig{ - AdmissionGracePeriod: 1 * time.Minute, - WarmupGracePeriod: 5 * time.Minute, - FailureGracePeriod: 1 * time.Minute, - ResetPause: 90 * time.Second, - RetryLimit: 3, - DeletionGracePeriod: 10 * time.Minute, - GracePeriodCeiling: 24 * time.Hour, - SuccessTTLCeiling: 7 * 24 * time.Hour, + AdmissionGracePeriod: 1 * time.Minute, + WarmupGracePeriod: 5 * time.Minute, + FailureGracePeriod: 1 * time.Minute, + RetryPausePeriod: 90 * time.Second, + RetryLimit: 3, + ForcefulDeletionGracePeriod: 10 * time.Minute, + GracePeriodMaximum: 24 * time.Hour, + SuccessTTL: 7 * 24 * time.Hour, }, } } func ValidateAppWrapperConfig(config *AppWrapperConfig) error { - if config.FaultTolerance.DeletionGracePeriod > config.FaultTolerance.GracePeriodCeiling { - return fmt.Errorf("DelectionGracePeriod %v exceeds GracePeriodCeiling %v", - config.FaultTolerance.DeletionGracePeriod, config.FaultTolerance.GracePeriodCeiling) + if config.FaultTolerance.ForcefulDeletionGracePeriod > config.FaultTolerance.GracePeriodMaximum { + return fmt.Errorf("ForcefulDeletionGracePeriod %v exceeds GracePeriodCeiling %v", + config.FaultTolerance.ForcefulDeletionGracePeriod, config.FaultTolerance.GracePeriodMaximum) } - if config.FaultTolerance.ResetPause > config.FaultTolerance.GracePeriodCeiling { - return fmt.Errorf("ResetPause %v exceeds GracePeriodCeiling %v", - config.FaultTolerance.ResetPause, config.FaultTolerance.GracePeriodCeiling) + if config.FaultTolerance.RetryPausePeriod > config.FaultTolerance.GracePeriodMaximum { + return fmt.Errorf("RetryPausePeriod %v exceeds GracePeriodCeiling %v", - config.FaultTolerance.ResetPause, config.FaultTolerance.GracePeriodCeiling) + config.FaultTolerance.RetryPausePeriod, config.FaultTolerance.GracePeriodMaximum) } - if config.FaultTolerance.FailureGracePeriod > config.FaultTolerance.GracePeriodCeiling { + if config.FaultTolerance.FailureGracePeriod >
config.FaultTolerance.GracePeriodMaximum { return fmt.Errorf("FailureGracePeriod %v exceeds GracePeriodCeiling %v", - config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodCeiling) + config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodMaximum) } - if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodCeiling { + if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodMaximum { return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v", - config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodCeiling) + config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum) } - if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodCeiling { + if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum { return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v", - config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling) + config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum) } - if config.FaultTolerance.SuccessTTLCeiling <= 0 { - return fmt.Errorf("SuccessTTLCeiling %v is not a positive duration", config.FaultTolerance.SuccessTTLCeiling) + if config.FaultTolerance.SuccessTTL <= 0 { + return fmt.Errorf("SuccessTTL %v is not a positive duration", config.FaultTolerance.SuccessTTL) } return nil diff --git a/samples/wrapped-failing-job.yaml b/samples/wrapped-failing-job.yaml index e69de64..c2c52f8 100644 --- a/samples/wrapped-failing-job.yaml +++ b/samples/wrapped-failing-job.yaml @@ -6,7 +6,7 @@ metadata: kueue.x-k8s.io/queue-name: user-queue annotations: workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s - workload.codeflare.dev.appwrapper/resetPauseDuration: 10s + workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s workload.codeflare.dev.appwrapper/retryLimit: "3" spec: components: diff --git a/samples/wrapped-failing-pod.yaml b/samples/wrapped-failing-pod.yaml index acdc450..f13555b 100644 --- a/samples/wrapped-failing-pod.yaml +++ b/samples/wrapped-failing-pod.yaml @@ -6,9 +6,9 @@ metadata: kueue.x-k8s.io/queue-name: user-queue annotations: workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s - workload.codeflare.dev.appwrapper/resetPauseDuration: 10s + workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s workload.codeflare.dev.appwrapper/retryLimit: "1" - workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration: "5m" + workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "5m" spec: components: - template: diff --git a/site/_pages/arch-fault-tolerance.md b/site/_pages/arch-fault-tolerance.md index f491949..7b29a3f 100644 --- a/site/_pages/arch-fault-tolerance.md +++ b/site/_pages/arch-fault-tolerance.md @@ -15,33 +15,43 @@ workload is unhealthy. A workload can be deemed *unhealthy* if any of the following conditions are true: + There are a non-zero number of `Failed` Pods. + It takes longer than `AdmissionGracePeriod` for the expected - number of Pods to at least reach the `Pending` state. + number of Pods to reach the `Pending` state. + It takes longer than the `WarmupGracePeriod` for the expected - number of Pods to at least reach the `Running` state. + number of Pods to reach the `Running` state.
If a workload is determined to be unhealthy, the AppWrapper controller first waits for a `FailureGracePeriod` to allow the primary resource controller an opportunity to react and return the workload to a -healthy state. If the `FailureGracePeriod` expires, the AppWrapper -controller will *reset* the workload by deleting its resources, waiting -for a `ResetPause`, and then creating new instances of the resources. -During this reset period, the AppWrapper **does not** release the workload's +healthy state. If the `FailureGracePeriod` passes and the workload +is still unhealthy, the AppWrapper controller will *reset* the workload by +deleting its resources, waiting for a `RetryPausePeriod`, and then creating +new instances of the resources. +During this retry pause, the AppWrapper **does not** release the workload's quota; this ensures that when the resources are recreated they will still have sufficient quota to execute. The number of times an AppWrapper is reset is tracked as part of its status; if the number of resets exceeds the `RetryLimit`, then the AppWrapper moves into a `Failed` state and its resources are deleted -(thus finally releasing its quota). If at any time during this retry loop, +(thus finally releasing its quota). If at any time during this retry loop, an AppWrapper is suspended (ie, Kueue decides to preempt the AppWrapper), the AppWrapper controller will respect this request by proceeding to delete -the resources +the resources. + +To support debugging `Failed` workloads, an annotation can be added to an +AppWrapper that adds a `DeletionOnFailureGracePeriod` between the time the +AppWrapper enters the `Failed` state and when the process of deleting its resources +begins. Since the AppWrapper continues to consume quota during this delayed deletion period, +this annotation should be used sparingly and only when interactive debugging of +the failed workload is being actively pursued. + +All child resources for an AppWrapper that successfully completed will be automatically +deleted `SuccessTTL` after the AppWrapper entered the `Succeeded` state. When the AppWrapper controller decides to delete the resources for a workload, -it proceeds through several phases. First it does a normal delete of the +it proceeds through several phases. First it does a normal delete of the resources, allowing the primary resource controllers time to cascade the deletion -through all child resources. During a `DeletionGracePeriod`, the AppWrapper controller -monitors to see if the primary controllers have managed to successfully delete -all of the workload's Pods and resources. If they fail to accomplish this within -the `DeletionGracePeriod`, the AppWrapper controller then initiates a *forceful* +through all child resources. If they are not able to successfully delete +all of the workload's Pods and resources within a `ForcefulDeletionGracePeriod`, +the AppWrapper controller then initiates a *forceful* deletion of all remaining Pods and resources by deleting them with a `GracePeriod` of `0`. An AppWrapper will continue to have its `ResourcesDeployed` condition to be `True` until all resources and Pods are successfully deleted. @@ -57,30 +67,17 @@ and can be customized on a per-AppWrapper basis by adding annotations. The table below lists the parameters, gives their default, and the annotation that can be used to customize them.
-| Parameter | Default Value | Annotation | -|------------------------|---------------|------------------------------------------------------------------| -| AdmissionGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/admissionGracePeriodDuration | -| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration | -| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration | -| ResetPause | 90 Seconds | workload.codeflare.dev.appwrapper/resetPauseDuration | -| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit | -| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration | -| GracePeriodCeiling | 24 Hours | Not Applicable | -| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration | - +| Parameter | Default Value | Annotation | +|------------------------------|---------------|------------------------------------------------------------------------| +| AdmissionGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/admissionGracePeriodDuration | +| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration | +| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration | +| RetryPausePeriod | 90 Seconds | workload.codeflare.dev.appwrapper/retryPausePeriodDuration | +| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit | +| DeletionOnFailureGracePeriod | 0 Seconds | workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration | +| ForcefulDeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration | +| SuccessTTL | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration | +| GracePeriodMaximum | 24 Hours | Not Applicable | -The `GracePeriodCeiling` imposes an upper limit on the other grace periods to -reduce the impact of user-added annotations on overall system utilization. - -To support debugging `Failed` workloads, an additional annotation -`workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration` can -be added to an AppWrapper when it is created to add a delay between the time the -AppWrapper enters the `Failed` state and when the process of deleting its resources -begins. Since the AppWrapper continues to consume quota during this delayed deletion period, -this annotation should be used sparingly and only when interactive debugging of -the failed workload is being actively pursued. - -All child resources for an AppWrapper that successfully completed will be automatically -deleted `SuccessTTLCeiling` time after the AppWrapper entered the `Succeeded` state. -This duration can be shortened on a per-AppWrapper basis using the -`workload.codeflare.dev.appwrapper/successTTLDuration` annotation. +The `GracePeriodMaximum` imposes a system-wide upper limit on all other grace periods to +limit the potential impact of user-added annotations on overall system utilization.
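For reviewers: below is a minimal, hypothetical sketch (not part of this diff) of how the renamed per-AppWrapper annotations documented in the arch-fault-tolerance.md table could be combined on a single workload. The `apiVersion`, metadata name, queue name, duration values, and the empty `components` list are illustrative assumptions, not taken from this change.

```yaml
apiVersion: workload.codeflare.dev/v1beta2    # assumed group/version for the AppWrapper CRD
kind: AppWrapper
metadata:
  name: fault-tolerance-overrides-example     # hypothetical name
  labels:
    kueue.x-k8s.io/queue-name: user-queue     # assumes the same local queue as the samples above
  annotations:
    # Duration values must parse with Go's time.ParseDuration; grace periods are capped at GracePeriodMaximum.
    workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 30s
    workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
    workload.codeflare.dev.appwrapper/retryLimit: "3"
    workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 5m
    workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 5m
    # Only honored when positive and shorter than the configured SuccessTTL (default 7 days).
    workload.codeflare.dev.appwrapper/successTTLDuration: 24h
spec:
  components: []   # placeholder; wrapped resources (e.g. a Job template) go here, as in samples/
```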