Commit c340e68

wait for volume detach after node drain
1 parent a911c68 commit c340e68

File tree

1 file changed: +45 -0 lines changed

controllers/machine_controller.go

@@ -344,6 +344,12 @@ func (r *MachineReconciler) reconcileDelete(ctx context.Context, cluster *cluste
 
 			conditions.MarkTrue(m, clusterv1.DrainingSucceededCondition)
 			r.recorder.Eventf(m, corev1.EventTypeNormal, "SuccessfulDrainNode", "success draining Machine's node %q", m.Status.NodeRef.Name)
+
+			// after draining, wait for volumes to be detached from the node
+			if err := r.waitForVolumeDetach(ctx, cluster, m.Status.NodeRef.Name, m.Name); err != nil {
+				r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedWaitForVolumeDetach", "error waiting for volume detach from node %q: %v", m.Status.NodeRef.Name, err)
+				return ctrl.Result{}, err
+			}
 		}
 	}
 
@@ -553,6 +559,45 @@ func (r *MachineReconciler) drainNode(ctx context.Context, cluster *clusterv1.Cl
 	return nil
 }
 
+// Pod deletion and volume detach happen asynchronously, so a pod can be deleted before its volumes are detached from the node.
+// For volume provisioners like vsphere-volume this is problematic: if the node is deleted before the detach has succeeded,
+// the underlying VMDK is deleted together with the Machine.
+// So after draining we wait here for the volumes to detach from the node.
+func (r *MachineReconciler) waitForVolumeDetach(ctx context.Context, cluster *clusterv1.Cluster, nodeName string, machineName string) error {
+	logger := r.Log.WithValues("machine", machineName, "node", nodeName, "cluster", cluster.Name, "namespace", cluster.Namespace)
+
+	restConfig, err := remote.RESTConfig(ctx, r.Client, util.ObjectKey(cluster))
+	if err != nil {
+		logger.Error(err, "Error creating a remote client while deleting Machine, won't retry")
+		return nil
+	}
+	kubeClient, err := kubernetes.NewForConfig(restConfig)
+	if err != nil {
+		logger.Error(err, "Error creating a remote client while deleting Machine, won't retry")
+		return nil
+	}
+
+	waitErr := wait.PollImmediate(2*time.Second, 20*time.Second, func() (bool, error) {
+		node, getErr := kubeClient.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
+		if getErr != nil {
+			if apierrors.IsNotFound(getErr) {
+				logger.Error(getErr, "Could not find node from noderef, it may have already been deleted")
+				return true, nil
+			} else {
+				return false, getErr
+			}
+		}
+
+		return len(node.Status.VolumesAttached) == 0, nil
+	})
+	if waitErr != nil {
+		return errors.Wrapf(waitErr, "failed to wait for volume detach from node %s", nodeName)
+	}
+
+	logger.Info("Node volumes all detached")
+	return nil
+}
+
 func (r *MachineReconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error {
 	logger := r.Log.WithValues("machine", name, "cluster", cluster.Name, "namespace", cluster.Namespace)
 
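For a sense of what the new helper waits on, here is a minimal, standalone sketch (not part of the commit) that exercises the same poll condition against client-go's fake clientset. It assumes the client-go version used by this branch (Nodes().Get and UpdateStatus without a context argument); the node name "worker-0" and the attached-volume name are made up for illustration, and the waitForVolumes helper below only mirrors the poll, it is not the controller method itself.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
)

// waitForVolumes mirrors the poll used by waitForVolumeDetach: it returns once
// the node reports no attached volumes, or an error once the timeout is hit.
func waitForVolumes(client kubernetes.Interface, nodeName string) error {
	return wait.PollImmediate(100*time.Millisecond, 2*time.Second, func() (bool, error) {
		node, err := client.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		return len(node.Status.VolumesAttached) == 0, nil
	})
}

func main() {
	// Hypothetical node that still reports one attached vsphere volume right after drain.
	node := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "worker-0"},
		Status: corev1.NodeStatus{
			VolumesAttached: []corev1.AttachedVolume{
				{Name: "kubernetes.io/vsphere-volume/[datastore1] kubevols/disk-1.vmdk"},
			},
		},
	}
	client := fake.NewSimpleClientset(node)

	// Simulate the attach/detach controller finishing the detach while the poll runs.
	go func() {
		time.Sleep(300 * time.Millisecond)
		node.Status.VolumesAttached = nil
		_, _ = client.CoreV1().Nodes().UpdateStatus(node)
	}()

	// Prints a nil error once VolumesAttached is observed to be empty.
	fmt.Println("wait result:", waitForVolumes(client, "worker-0"))
}

In the real controller, if the volumes take longer than the 20s PollImmediate window, the error is returned from reconcileDelete, so the Machine deletion is requeued by controller-runtime and the wait is retried on the next reconcile.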
