@@ -344,6 +344,12 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
344
344
345
345
conditions .MarkTrue (m , clusterv1 .DrainingSucceededCondition )
346
346
r .recorder .Eventf (m , corev1 .EventTypeNormal , "SuccessfulDrainNode" , "success draining Machine's node %q" , m .Status .NodeRef .Name )
347
+
348
+ // after draining, wait for volumes to be detached from the node
349
+ if err := r .waitForVolumeDetach (ctx , cluster , m .Status .NodeRef .Name , m .Name ); err != nil {
350
+ r .recorder .Eventf (m , corev1 .EventTypeWarning , "FailedWaitForVolumeDetach" , "error wait for volume detach, node %q: %v" , m .Status .NodeRef .Name , err )
351
+ return ctrl.Result {}, err
352
+ }
347
353
}
348
354
}
349
355
@@ -553,6 +559,45 @@ func (r *MachineReconciler) drainNode(ctx context.Context, cluster *clusterv1.Cl
553
559
return nil
554
560
}
555
561
562
+ // pod deletion and volume detach happen asynchronously, so pod could be deleted before volume detached from the node
563
+ // for volume provisioner like vsphere-volume this could be problematic because if the node deleted before detach success
564
+ // then the under line vmdk will be deleted together with the Machine
565
+ // so after drain we wait here for volume detach from the node
566
+ func (r * MachineReconciler ) waitForVolumeDetach (ctx context.Context , cluster * clusterv1.Cluster , nodeName string , machineName string ) error {
567
+ logger := r .Log .WithValues ("machine" , machineName , "node" , nodeName , "cluster" , cluster .Name , "namespace" , cluster .Namespace )
568
+
569
+ restConfig , err := remote .RESTConfig (ctx , r .Client , util .ObjectKey (cluster ))
570
+ if err != nil {
571
+ logger .Error (err , "Error creating a remote client while deleting Machine, won't retry" )
572
+ return nil
573
+ }
574
+ kubeClient , err := kubernetes .NewForConfig (restConfig )
575
+ if err != nil {
576
+ logger .Error (err , "Error creating a remote client while deleting Machine, won't retry" )
577
+ return nil
578
+ }
579
+
580
+ waitErr := wait .PollImmediate (2 * time .Second , 20 * time .Second , func () (bool , error ) {
581
+ node , getErr := kubeClient .CoreV1 ().Nodes ().Get (nodeName , metav1.GetOptions {})
582
+ if getErr != nil {
583
+ if apierrors .IsNotFound (getErr ) {
584
+ logger .Error (getErr , "Could not find node from noderef, it may have already been deleted" )
585
+ return true , nil
586
+ } else {
587
+ return false , getErr
588
+ }
589
+ }
590
+
591
+ return len (node .Status .VolumesAttached ) == 0 , nil
592
+ })
593
+ if waitErr != nil {
594
+ return errors .Wrapf (waitErr , "failed to wait for volume detach from node %s" , nodeName )
595
+ }
596
+
597
+ logger .Info ("Node volumes all detached" , "" )
598
+ return nil
599
+ }
600
+
556
601
func (r * MachineReconciler ) deleteNode (ctx context.Context , cluster * clusterv1.Cluster , name string ) error {
557
602
logger := r .Log .WithValues ("machine" , name , "cluster" , cluster .Name , "namespace" , cluster .Namespace )
558
603
0 commit comments