
Commit 9411031

Merge pull request #10662 from typeid/10661

🐛 Machine deletion skips waiting for volumes detached for unreachable Nodes

2 parents: 9a2d8cd + b30fcd1

File tree: 2 files changed (+120 −1 lines)

internal/controllers/machine/machine_controller.go (+10 −1)

@@ -400,6 +400,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
 
 	// After node draining is completed, and if isNodeVolumeDetachingAllowed returns True, make sure all
 	// volumes are detached before proceeding to delete the Node.
+	// In case the node is unreachable, the detachment is skipped.
 	if r.isNodeVolumeDetachingAllowed(m) {
 		// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time,
 		// so its transition time can be used to record the first time we wait for volume detachment.
@@ -689,7 +690,7 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
 	return ctrl.Result{}, nil
 }
 
-// shouldWaitForNodeVolumes returns true if the node status still has volumes attached.
+// shouldWaitForNodeVolumes returns true if the node status still has volumes attached and the node is reachable.
 // Pod deletion and volume detach happen asynchronously, so a pod could be deleted before its volumes are detached from the node.
 // This could cause issues for some storage provisioners; for example, with vsphere-volume this is problematic
 // because if the node is deleted before detach succeeds, the underlying VMDK will be deleted together with the Machine.
@@ -711,6 +712,14 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus
 		return true, err
 	}
 
+	if noderefutil.IsNodeUnreachable(node) {
+		// If a node is unreachable, we can't detach the volume.
+		// We need to skip the detachment as we otherwise block deletions
+		// of unreachable nodes when a volume is attached.
+		log.Info("Skipping volume detachment as node is unreachable.")
+		return false, nil
+	}
+
 	return len(node.Status.VolumesAttached) != 0, nil
 }
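The new guard leans on noderefutil.IsNodeUnreachable, whose implementation is not part of this commit. The test cases below imply that a node counts as unreachable when its NodeReady condition reports status Unknown, which is what the node lifecycle controller sets once it loses contact with the kubelet. A minimal sketch under that assumption, not the package's verbatim code:

package noderefutil

import corev1 "k8s.io/api/core/v1"

// IsNodeUnreachable reports whether a node should be treated as unreachable.
// Sketch only: behavior inferred from the tests in this commit, i.e. a
// NodeReady condition with status Unknown means the kubelet has stopped
// reporting and the node cannot be reached.
func IsNodeUnreachable(node *corev1.Node) bool {
	if node == nil {
		return false
	}
	for _, c := range node.Status.Conditions {
		if c.Type == corev1.NodeReady {
			return c.Status == corev1.ConditionUnknown
		}
	}
	return false
}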
internal/controllers/machine/machine_controller_test.go (+110 −0)

@@ -1591,6 +1591,116 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
 	}
 }
 
+func TestShouldWaitForNodeVolumes(t *testing.T) {
+	testCluster := &clusterv1.Cluster{
+		TypeMeta:   metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()},
+		ObjectMeta: metav1.ObjectMeta{Namespace: metav1.NamespaceDefault, Name: "test-cluster"},
+	}
+
+	attachedVolumes := []corev1.AttachedVolume{
+		{
+			Name:       "test-volume",
+			DevicePath: "test-path",
+		},
+	}
+
+	tests := []struct {
+		name     string
+		node     *corev1.Node
+		expected bool
+	}{
+		{
+			name: "Node has volumes attached",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "test-node",
+				},
+				Status: corev1.NodeStatus{
+					Conditions: []corev1.NodeCondition{
+						{
+							Type:   corev1.NodeReady,
+							Status: corev1.ConditionTrue,
+						},
+					},
+					VolumesAttached: attachedVolumes,
+				},
+			},
+			expected: true,
+		},
+		{
+			name: "Node has no volumes attached",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "test-node",
+				},
+				Status: corev1.NodeStatus{
+					Conditions: []corev1.NodeCondition{
+						{
+							Type:   corev1.NodeReady,
+							Status: corev1.ConditionTrue,
+						},
+					},
+				},
+			},
+			expected: false,
+		},
+		{
+			name: "Node is unreachable and has volumes attached",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "unreachable-node",
+				},
+				Status: corev1.NodeStatus{
+					Conditions: []corev1.NodeCondition{
+						{
+							Type:   corev1.NodeReady,
+							Status: corev1.ConditionUnknown,
+						},
+					},
+					VolumesAttached: attachedVolumes,
+				},
+			},
+			expected: false,
+		},
+		{
+			name: "Node is unreachable and has no volumes attached",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "unreachable-node",
+				},
+				Status: corev1.NodeStatus{
+					Conditions: []corev1.NodeCondition{
+						{
+							Type:   corev1.NodeReady,
+							Status: corev1.ConditionUnknown,
+						},
+					},
+				},
+			},
+			expected: false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+
+			var objs []client.Object
+			objs = append(objs, testCluster, tt.node)
+
+			c := fake.NewClientBuilder().WithObjects(objs...).Build()
+			tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(testCluster))
+			r := &Reconciler{
+				Client:  c,
+				Tracker: tracker,
+			}
+
+			got, err := r.shouldWaitForNodeVolumes(ctx, testCluster, tt.node.Name)
+			g.Expect(err).ToNot(HaveOccurred())
+			g.Expect(got).To(Equal(tt.expected))
+		})
+	}
+}
+
 func TestIsDeleteNodeAllowed(t *testing.T) {
 	deletionts := metav1.Now()
 
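Assembled from the hunks above, the helper after this commit reads roughly as follows. Only the error path of the node lookup (return true, err) appears in the diff, so the lookup itself is an assumption based on the test setup, which wires a ClusterCacheTracker for workload-cluster access; remoteClient and the surrounding calls are illustrative placeholders, not verbatim source.

// Sketch of shouldWaitForNodeVolumes after this change; node lookup assumed.
func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clusterv1.Cluster, name string) (bool, error) {
	log := ctrl.LoggerFrom(ctx)

	// Assumed: obtain a client for the workload cluster via the tracker.
	remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
	if err != nil {
		return true, err
	}

	node := &corev1.Node{}
	if err := remoteClient.Get(ctx, client.ObjectKey{Name: name}, node); err != nil {
		// Shown in the diff: keep waiting when the node cannot be read.
		return true, err
	}

	if noderefutil.IsNodeUnreachable(node) {
		// New in this commit: an unreachable node can never confirm volume
		// detachment, so waiting would block Machine deletion indefinitely.
		log.Info("Skipping volume detachment as node is unreachable.")
		return false, nil
	}

	// Pre-existing behavior: wait while the kubelet still reports volumes.
	return len(node.Status.VolumesAttached) != 0, nil
}

The net effect: a Machine whose Node has gone unreachable with volumes still attached now returns false from this check and proceeds to deletion, instead of requeueing indefinitely.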