adjust names of grace period annotations #128

Merged · 1 commit · May 7, 2024
16 changes: 8 additions & 8 deletions api/v1beta2/appwrapper_types.go
@@ -133,14 +133,14 @@ const (
)

const (
- AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration"
- WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
- FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
- ResetPauseDurationAnnotation = "workload.codeflare.dev.appwrapper/resetPauseDuration"
- RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
- DeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration"
- DebuggingFailureDeletionDelayDurationAnnotation = "workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration"
- SuccessTTLDurationAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
+ AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration"
+ WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
+ FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
+ RetryPausePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/retryPausePeriodDuration"
+ RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
+ ForcefulDeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration"
+ DeletionOnFailureGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration"
+ SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
)

//+kubebuilder:object:root=true
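As a usage sketch (not part of this diff), the renamed annotation constants could be set on an AppWrapper's metadata as shown below. The import path, program structure, and object name are assumptions; the constant names and the example values (10s, 5m) come from this PR and its samples.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	// Assumed import path for the API package changed in this PR.
	workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
)

func main() {
	aw := workloadv1beta2.AppWrapper{
		ObjectMeta: metav1.ObjectMeta{
			Name: "sample-appwrapper", // hypothetical name
			Annotations: map[string]string{
				// Renamed from resetPauseDuration in this PR.
				workloadv1beta2.RetryPausePeriodDurationAnnotation: "10s",
				// Renamed from debuggingFailureDeletionDelayDuration in this PR.
				workloadv1beta2.DeletionOnFailureGracePeriodAnnotation: "5m",
			},
		},
	}
	fmt.Println(aw.Annotations)
}
```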
42 changes: 21 additions & 21 deletions internal/controller/appwrapper/appwrapper_controller.go
@@ -322,7 +322,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)

// Pause before transitioning to Resuming to heuristically allow transient system problems to subside
whenReset := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.Unhealthy)).LastTransitionTime
- pauseDuration := r.resettingPauseDuration(ctx, aw)
+ pauseDuration := r.retryPauseDuration(ctx, aw)
now := time.Now()
deadline := whenReset.Add(pauseDuration)
if now.Before(deadline) {
@@ -342,14 +342,14 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// When an appwrapper is annotated with a non-zero debugging delay,
// we hold quota for the delay period and do not delete the resources of
// a failed appwrapper unless Kueue preempts it by setting Suspend to true.
- deletionDelay := r.debuggingFailureDeletionDelay(ctx, aw)
+ deletionDelay := r.deletionOnFailureGraceDuration(ctx, aw)

if deletionDelay > 0 && !aw.Spec.Suspend {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.DeletingResources),
Status: metav1.ConditionFalse,
Reason: "DeletionPaused",
- Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation, deletionDelay),
+ Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DeletionOnFailureGracePeriodAnnotation, deletionDelay),
})
whenDelayed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime

@@ -457,8 +457,8 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv
func (r *AppWrapperReconciler) limitDuration(desired time.Duration) time.Duration {
if desired < 0 {
return 0 * time.Second
- } else if desired > r.Config.FaultTolerance.GracePeriodCeiling {
- return r.Config.FaultTolerance.GracePeriodCeiling
+ } else if desired > r.Config.FaultTolerance.GracePeriodMaximum {
+ return r.Config.FaultTolerance.GracePeriodMaximum
} else {
return desired
}
@@ -469,7 +469,7 @@ func (r *AppWrapperReconciler) admissionGraceDuration(ctx context.Context, aw *w
if duration, err := time.ParseDuration(userPeriod); err == nil {
return r.limitDuration(duration)
} else {
log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err)
log.FromContext(ctx).Info("Malformed admission grace period annotation", "annotation", userPeriod, "error", err)
}
}
return r.limitDuration(r.Config.FaultTolerance.AdmissionGracePeriod)
@@ -480,7 +480,7 @@ func (r *AppWrapperReconciler) warmupGraceDuration(ctx context.Context, aw *work
if duration, err := time.ParseDuration(userPeriod); err == nil {
return r.limitDuration(duration)
} else {
log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err)
log.FromContext(ctx).Info("Malformed warmup grace period annotation", "annotation", userPeriod, "error", err)
}
}
return r.limitDuration(r.Config.FaultTolerance.WarmupGracePeriod)
@@ -508,50 +508,50 @@ func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1bet
return r.Config.FaultTolerance.RetryLimit
}

- func (r *AppWrapperReconciler) resettingPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
- if userPeriod, ok := aw.Annotations[workloadv1beta2.ResetPauseDurationAnnotation]; ok {
+ func (r *AppWrapperReconciler) retryPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
+ if userPeriod, ok := aw.Annotations[workloadv1beta2.RetryPausePeriodDurationAnnotation]; ok {
if duration, err := time.ParseDuration(userPeriod); err == nil {
return r.limitDuration(duration)
} else {
log.FromContext(ctx).Info("Malformed reset pause annotation", "annotation", userPeriod, "error", err)
log.FromContext(ctx).Info("Malformed retry pause annotation", "annotation", userPeriod, "error", err)
}
}
- return r.limitDuration(r.Config.FaultTolerance.ResetPause)
+ return r.limitDuration(r.Config.FaultTolerance.RetryPausePeriod)
}

- func (r *AppWrapperReconciler) deletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
- if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionGracePeriodAnnotation]; ok {
+ func (r *AppWrapperReconciler) forcefulDeletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
+ if userPeriod, ok := aw.Annotations[workloadv1beta2.ForcefulDeletionGracePeriodAnnotation]; ok {
if duration, err := time.ParseDuration(userPeriod); err == nil {
return r.limitDuration(duration)
} else {
log.FromContext(ctx).Info("Malformed deletion period annotation", "annotation", userPeriod, "error", err)
log.FromContext(ctx).Info("Malformed forceful deletion period annotation", "annotation", userPeriod, "error", err)
}
}
- return r.limitDuration(r.Config.FaultTolerance.DeletionGracePeriod)
+ return r.limitDuration(r.Config.FaultTolerance.ForcefulDeletionGracePeriod)
}

- func (r *AppWrapperReconciler) debuggingFailureDeletionDelay(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
- if userPeriod, ok := aw.Annotations[workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation]; ok {
+ func (r *AppWrapperReconciler) deletionOnFailureGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
+ if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionOnFailureGracePeriodAnnotation]; ok {
if duration, err := time.ParseDuration(userPeriod); err == nil {
return r.limitDuration(duration)
} else {
log.FromContext(ctx).Info("Malformed delay deletion annotation", "annotation", userPeriod, "error", err)
log.FromContext(ctx).Info("Malformed delection on failue delay annotation", "annotation", userPeriod, "error", err)
}
}
return 0 * time.Second
}

func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
- if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLDurationAnnotation]; ok {
+ if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLAnnotation]; ok {
if duration, err := time.ParseDuration(userPeriod); err == nil {
- if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTLCeiling {
+ if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTL {
return duration
}
} else {
log.FromContext(ctx).Info("Malformed successTTL annotation", "annotation", userPeriod, "error", err)
}
}
- return r.Config.FaultTolerance.SuccessTTLCeiling
+ return r.Config.FaultTolerance.SuccessTTL
}

func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
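All of the duration helpers in this controller share one pattern: read the annotation, parse it with time.ParseDuration, clamp the result to the configured GracePeriodMaximum, and fall back to the configured default when the annotation is missing or malformed. A self-contained sketch of that pattern, using hypothetical helper names rather than the controller's actual methods and the 90s/24h defaults from pkg/config/config.go:

```go
package main

import (
	"fmt"
	"time"
)

// clampDuration mirrors limitDuration above: negative values become zero,
// and anything above the ceiling is truncated to the ceiling.
func clampDuration(desired, ceiling time.Duration) time.Duration {
	if desired < 0 {
		return 0
	}
	if desired > ceiling {
		return ceiling
	}
	return desired
}

// durationFromAnnotation is a hypothetical helper: the annotation value wins
// when it parses, otherwise the configured default is used.
func durationFromAnnotation(annotations map[string]string, key string, fallback, ceiling time.Duration) time.Duration {
	if raw, ok := annotations[key]; ok {
		if d, err := time.ParseDuration(raw); err == nil {
			return clampDuration(d, ceiling)
		}
		fmt.Printf("malformed %s annotation: %q\n", key, raw)
	}
	return clampDuration(fallback, ceiling)
}

func main() {
	annotations := map[string]string{
		"workload.codeflare.dev.appwrapper/retryPausePeriodDuration": "10s",
	}
	pause := durationFromAnnotation(annotations,
		"workload.codeflare.dev.appwrapper/retryPausePeriodDuration",
		90*time.Second, // RetryPausePeriod default
		24*time.Hour)   // GracePeriodMaximum default
	fmt.Println(pause) // 10s
}
```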
@@ -58,7 +58,7 @@ var _ = Describe("AppWrapper Controller", func() {
}
awConfig := config.NewAppWrapperConfig()
awConfig.FaultTolerance.FailureGracePeriod = 0 * time.Second
- awConfig.FaultTolerance.ResetPause = 0 * time.Second
+ awConfig.FaultTolerance.RetryPausePeriod = 0 * time.Second
awConfig.FaultTolerance.RetryLimit = 0
awReconciler = &AppWrapperReconciler{
Client: k8sClient,
2 changes: 1 addition & 1 deletion internal/controller/appwrapper/resource_management.go
@@ -202,7 +202,7 @@ func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloa
remaining++ // no error deleting resource, resource therefore still exists
}

- deletionGracePeriod := r.deletionGraceDuration(ctx, aw)
+ deletionGracePeriod := r.forcefulDeletionGraceDuration(ctx, aw)
whenInitiated := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime
gracePeriodExpired := time.Now().After(whenInitiated.Time.Add(deletionGracePeriod))

60 changes: 30 additions & 30 deletions pkg/config/config.go
@@ -37,14 +37,14 @@ type AppWrapperConfig struct {
}

type FaultToleranceConfig struct {
- AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"`
- WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
- FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
- ResetPause time.Duration `json:"resetPause,omitempty"`
- RetryLimit int32 `json:"retryLimit,omitempty"`
- DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
- GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
- SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
+ AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"`
+ WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
+ FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
+ RetryPausePeriod time.Duration `json:"resetPause,omitempty"`
+ RetryLimit int32 `json:"retryLimit,omitempty"`
+ ForcefulDeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
+ GracePeriodMaximum time.Duration `json:"gracePeriodCeiling,omitempty"`
+ SuccessTTL time.Duration `json:"successTTLCeiling,omitempty"`
}

type CertManagementConfig struct {
@@ -82,41 +82,41 @@ func NewAppWrapperConfig() *AppWrapperConfig {
DisableChildAdmissionCtrl: false,
UserRBACAdmissionCheck: true,
FaultTolerance: &FaultToleranceConfig{
- AdmissionGracePeriod: 1 * time.Minute,
- WarmupGracePeriod: 5 * time.Minute,
- FailureGracePeriod: 1 * time.Minute,
- ResetPause: 90 * time.Second,
- RetryLimit: 3,
- DeletionGracePeriod: 10 * time.Minute,
- GracePeriodCeiling: 24 * time.Hour,
- SuccessTTLCeiling: 7 * 24 * time.Hour,
+ AdmissionGracePeriod: 1 * time.Minute,
+ WarmupGracePeriod: 5 * time.Minute,
+ FailureGracePeriod: 1 * time.Minute,
+ RetryPausePeriod: 90 * time.Second,
+ RetryLimit: 3,
+ ForcefulDeletionGracePeriod: 10 * time.Minute,
+ GracePeriodMaximum: 24 * time.Hour,
+ SuccessTTL: 7 * 24 * time.Hour,
},
}
}

func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
- if config.FaultTolerance.DeletionGracePeriod > config.FaultTolerance.GracePeriodCeiling {
- return fmt.Errorf("DelectionGracePeriod %v exceeds GracePeriodCeiling %v",
- config.FaultTolerance.DeletionGracePeriod, config.FaultTolerance.GracePeriodCeiling)
+ if config.FaultTolerance.ForcefulDeletionGracePeriod > config.FaultTolerance.GracePeriodMaximum {
+ return fmt.Errorf("ForcefulDeletionGracePeriod %v exceeds GracePeriodCeiling %v",
+ config.FaultTolerance.ForcefulDeletionGracePeriod, config.FaultTolerance.GracePeriodMaximum)
}
- if config.FaultTolerance.ResetPause > config.FaultTolerance.GracePeriodCeiling {
- return fmt.Errorf("ResetPause %v exceeds GracePeriodCeiling %v",
- config.FaultTolerance.ResetPause, config.FaultTolerance.GracePeriodCeiling)
+ if config.FaultTolerance.RetryPausePeriod > config.FaultTolerance.GracePeriodMaximum {
+ return fmt.Errorf("RetryPausePeriod %v exceeds GracePeriodCeiling %v",
+ config.FaultTolerance.RetryPausePeriod, config.FaultTolerance.GracePeriodMaximum)
}
- if config.FaultTolerance.FailureGracePeriod > config.FaultTolerance.GracePeriodCeiling {
+ if config.FaultTolerance.FailureGracePeriod > config.FaultTolerance.GracePeriodMaximum {
return fmt.Errorf("FailureGracePeriod %v exceeds GracePeriodCeiling %v",
- config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodCeiling)
+ config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodMaximum)
}
- if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodCeiling {
+ if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodMaximum {
return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v",
- config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodCeiling)
+ config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum)
}
- if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodCeiling {
+ if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum {
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
- config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling)
+ config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum)
}
- if config.FaultTolerance.SuccessTTLCeiling <= 0 {
- return fmt.Errorf("SuccessTTLCeiling %v is not a positive duration", config.FaultTolerance.SuccessTTLCeiling)
+ if config.FaultTolerance.SuccessTTL <= 0 {
+ return fmt.Errorf("SuccessTTL %v is not a positive duration", config.FaultTolerance.SuccessTTL)
}

return nil
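To see the renamed config fields and validation together, here is a minimal sketch (the import path is an assumption; the function signatures and defaults are taken from this diff) of ValidateAppWrapperConfig rejecting a retry pause that exceeds the default 24h GracePeriodMaximum:

```go
package main

import (
	"fmt"
	"time"

	// Assumed import path for the pkg/config package changed in this PR.
	"github.com/project-codeflare/appwrapper/pkg/config"
)

func main() {
	cfg := config.NewAppWrapperConfig()
	// 48h exceeds the default GracePeriodMaximum of 24h, so validation should fail.
	cfg.FaultTolerance.RetryPausePeriod = 48 * time.Hour
	if err := config.ValidateAppWrapperConfig(cfg); err != nil {
		fmt.Println("invalid config:", err)
	}
}
```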
2 changes: 1 addition & 1 deletion samples/wrapped-failing-job.yaml
@@ -6,7 +6,7 @@ metadata:
kueue.x-k8s.io/queue-name: user-queue
annotations:
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s
- workload.codeflare.dev.appwrapper/resetPauseDuration: 10s
+ workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
workload.codeflare.dev.appwrapper/retryLimit: "3"
spec:
components:
4 changes: 2 additions & 2 deletions samples/wrapped-failing-pod.yaml
@@ -6,9 +6,9 @@ metadata:
kueue.x-k8s.io/queue-name: user-queue
annotations:
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s
- workload.codeflare.dev.appwrapper/resetPauseDuration: 10s
+ workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
workload.codeflare.dev.appwrapper/retryLimit: "1"
- workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration: "5m"
+ workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "5m"
spec:
components:
- template: