Skip to content

Commit 1151a1f

Browse files
committed
implement forceful deletion
Fixes #36
1 parent 0db706e commit 1151a1f

File tree

4 files changed

+85
-10
lines changed

4 files changed

+85
-10
lines changed

api/v1beta2/appwrapper_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,13 +95,15 @@ const (
9595
ResourcesDeployed AppWrapperCondition = "ResourcesDeployed"
9696
PodsReady AppWrapperCondition = "PodsReady"
9797
Unhealthy AppWrapperCondition = "Unhealthy"
98+
DeletingResources AppWrapperCondition = "DeletingResources"
9899
)
99100

100101
const (
101102
WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
102103
FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
103104
ResetPauseDurationAnnotation = "workload.codeflare.dev.appwrapper/resetPauseDuration"
104105
RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
106+
DeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration"
105107
)
106108

107109
//+kubebuilder:object:root=true

internal/controller/appwrapper/appwrapper_controller.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,17 @@ func (r *AppWrapperReconciler) resettingPauseDuration(ctx context.Context, aw *w
437437
return r.Config.FaultTolerance.ResetPause
438438
}
439439

440+
func (r *AppWrapperReconciler) deletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
441+
if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionGracePeriodAnnotation]; ok {
442+
if duration, err := time.ParseDuration(userPeriod); err == nil {
443+
return duration
444+
} else {
445+
log.FromContext(ctx).Info("Malformed deletion period annotation", "annotation", userPeriod, "error", err)
446+
}
447+
}
448+
return r.Config.FaultTolerance.DeletionGracePeriod
449+
}
450+
440451
func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
441452
if meta.IsStatusConditionTrue(aw.Status.Conditions, string(condition)) {
442453
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{

internal/controller/appwrapper/resource_management.go

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ package appwrapper
1919
import (
2020
"context"
2121
"fmt"
22+
"time"
2223

2324
workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
2425
"github.com/project-codeflare/appwrapper/pkg/utils"
26+
v1 "k8s.io/api/core/v1"
2527
apierrors "k8s.io/apimachinery/pkg/api/errors"
2628
"k8s.io/apimachinery/pkg/api/meta"
2729
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -164,7 +166,11 @@ func (r *AppWrapperReconciler) createComponents(ctx context.Context, aw *workloa
164166
}
165167

166168
func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloadv1beta2.AppWrapper) bool {
167-
// TODO forceful deletion: See https://github.com/project-codeflare/appwrapper/issues/36
169+
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
170+
Type: string(workloadv1beta2.DeletingResources),
171+
Status: metav1.ConditionTrue,
172+
Reason: "DeletionInitiated",
173+
})
168174
log := log.FromContext(ctx)
169175
remaining := 0
170176
for _, component := range aw.Spec.Components {
@@ -181,5 +187,59 @@ func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloa
181187
}
182188
remaining++ // no error deleting resource, resource therefore still exists
183189
}
184-
return remaining == 0
190+
191+
deletionGracePeriod := r.deletionGraceDuration(ctx, aw)
192+
if deletionGracePeriod <= 0 {
193+
// forced deletion is disabled; once remaining is 0 we are done
194+
if remaining == 0 {
195+
clearCondition(aw, workloadv1beta2.DeletingResources, "DeletionComplete", "")
196+
return true
197+
} else {
198+
return false
199+
}
200+
}
201+
202+
pods := &v1.PodList{Items: []v1.Pod{}}
203+
if err := r.List(ctx, pods,
204+
client.UnsafeDisableDeepCopy,
205+
client.InNamespace(aw.Namespace),
206+
client.MatchingLabels{AppWrapperLabel: aw.Name}); err != nil {
207+
log.Error(err, "Pod list error")
208+
}
209+
210+
if remaining == 0 && len(pods.Items) == 0 {
211+
// no resources, no pods, deletion is complete
212+
clearCondition(aw, workloadv1beta2.DeletingResources, "DeletionComplete", "")
213+
return true
214+
}
215+
216+
whenInitiated := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime
217+
if time.Now().Before(whenInitiated.Time.Add(deletionGracePeriod)) {
218+
// Deadline hasn't expired, just requeue the deletion
219+
return false
220+
}
221+
222+
if len(pods.Items) > 0 {
223+
// force deletion of pods first
224+
for _, pod := range pods.Items {
225+
if err := r.Delete(ctx, &pod, client.GracePeriodSeconds(0)); err != nil {
226+
log.Error(err, "Forceful pod deletion error")
227+
}
228+
}
229+
} else {
230+
// force deletion of wrapped resources once pods are gone
231+
for _, component := range aw.Spec.Components {
232+
obj, err := parseComponent(aw, component.Template.Raw)
233+
if err != nil {
234+
log.Error(err, "Parsing error")
235+
continue
236+
}
237+
if err := r.Delete(ctx, obj, client.GracePeriodSeconds(0)); err != nil && !apierrors.IsNotFound(err) {
238+
log.Error(err, "Forceful deletion error")
239+
}
240+
}
241+
}
242+
243+
// requeue deletion
244+
return false
185245
}

pkg/config/config.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@ type AppWrapperConfig struct {
2626
}
2727

2828
type FaultToleranceConfig struct {
29-
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
30-
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
31-
ResetPause time.Duration `json:"resetPause,omitempty"`
32-
RetryLimit int32 `json:"retryLimit,omitempty"`
29+
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
30+
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
31+
ResetPause time.Duration `json:"resetPause,omitempty"`
32+
RetryLimit int32 `json:"retryLimit,omitempty"`
33+
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
3334
}
3435

3536
type CertManagementConfig struct {
@@ -49,10 +50,11 @@ func NewConfig(namespace string) *AppWrapperConfig {
4950
ManageJobsWithoutQueueName: true,
5051
StandaloneMode: false,
5152
FaultTolerance: FaultToleranceConfig{
52-
WarmupGracePeriod: 5 * time.Minute,
53-
FailureGracePeriod: 1 * time.Minute,
54-
ResetPause: 90 * time.Second,
55-
RetryLimit: 3,
53+
WarmupGracePeriod: 5 * time.Minute,
54+
FailureGracePeriod: 1 * time.Minute,
55+
ResetPause: 90 * time.Second,
56+
RetryLimit: 3,
57+
DeletionGracePeriod: 10 * time.Minute,
5658
},
5759
CertManagement: CertManagementConfig{
5860
Namespace: namespace,

0 commit comments

Comments
 (0)