Skip to content

Commit bb90c1a

Browse files
committed
fix(10661): volumes don't block deletion of unreachable nodes
1 parent eaf97bc commit bb90c1a

File tree

2 files changed

+121
-1
lines changed

2 files changed

+121
-1
lines changed

internal/controllers/machine/machine_controller.go

+10-1
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
401401

402402
// After node draining is completed, and if isNodeVolumeDetachingAllowed returns True, make sure all
403403
// volumes are detached before proceeding to delete the Node.
404+
// In case the node is unreachable, the detachment is skipped.
404405
if r.isNodeVolumeDetachingAllowed(m) {
405406
// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time,
406407
// so its transition time can be used to record the first time we wait for volume detachment.
@@ -690,7 +691,7 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
690691
return ctrl.Result{}, nil
691692
}
692693

693-
// shouldWaitForNodeVolumes returns true if node status still have volumes attached
694+
// shouldWaitForNodeVolumes returns true if node status still have volumes attached and the node is reachable
694695
// pod deletion and volume detach happen asynchronously, so pod could be deleted before volume detached from the node
695696
// this could cause issue for some storage provisioner, for example, vsphere-volume this is problematic
696697
// because if the node is deleted before detach success, then the underline VMDK will be deleted together with the Machine
@@ -712,6 +713,14 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus
712713
return true, err
713714
}
714715

716+
if noderefutil.IsNodeUnreachable(node) {
717+
// If a node is unreachable, we can't detach the volume.
718+
// We need to skip the detachment as we otherwise block deletions
719+
// of unreachable nodes when a volume is attached.
720+
log.Info("Skipping volume detachment as node is unreachable.")
721+
return false, nil
722+
}
723+
715724
return len(node.Status.VolumesAttached) != 0, nil
716725
}
717726

internal/controllers/machine/machine_controller_test.go

+111
Original file line numberDiff line numberDiff line change
@@ -1599,6 +1599,117 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
15991599
}
16001600
}
16011601

1602+
func TestShouldWaitForNodeVolumes(t *testing.T) {
1603+
testCluster := &clusterv1.Cluster{
1604+
TypeMeta: metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()},
1605+
ObjectMeta: metav1.ObjectMeta{Namespace: metav1.NamespaceDefault, Name: "test-cluster"},
1606+
}
1607+
1608+
attachedVolumes := []corev1.AttachedVolume{
1609+
{
1610+
Name: "test-volume",
1611+
DevicePath: "test-path",
1612+
},
1613+
}
1614+
1615+
tests := []struct {
1616+
name string
1617+
node *corev1.Node
1618+
expected bool
1619+
}{
1620+
{
1621+
name: "Node has volumes attached",
1622+
node: &corev1.Node{
1623+
ObjectMeta: metav1.ObjectMeta{
1624+
Name: "test-node",
1625+
},
1626+
Status: corev1.NodeStatus{
1627+
Conditions: []corev1.NodeCondition{
1628+
{
1629+
Type: corev1.NodeReady,
1630+
Status: corev1.ConditionTrue,
1631+
},
1632+
},
1633+
VolumesAttached: attachedVolumes,
1634+
},
1635+
},
1636+
expected: true,
1637+
},
1638+
{
1639+
name: "Node has no volumes attached",
1640+
node: &corev1.Node{
1641+
ObjectMeta: metav1.ObjectMeta{
1642+
Name: "test-node",
1643+
},
1644+
Status: corev1.NodeStatus{
1645+
Conditions: []corev1.NodeCondition{
1646+
{
1647+
Type: corev1.NodeReady,
1648+
Status: corev1.ConditionTrue,
1649+
},
1650+
},
1651+
},
1652+
},
1653+
expected: false,
1654+
},
1655+
{
1656+
name: "Node is unreachable and has volumes attached",
1657+
node: &corev1.Node{
1658+
ObjectMeta: metav1.ObjectMeta{
1659+
Name: "unreachable-node",
1660+
},
1661+
Status: corev1.NodeStatus{
1662+
Conditions: []corev1.NodeCondition{
1663+
{
1664+
Type: corev1.NodeReady,
1665+
Status: corev1.ConditionUnknown,
1666+
},
1667+
},
1668+
VolumesAttached: attachedVolumes,
1669+
},
1670+
},
1671+
expected: false,
1672+
},
1673+
{
1674+
name: "Node is unreachable and has no volumes attached",
1675+
node: &corev1.Node{
1676+
ObjectMeta: metav1.ObjectMeta{
1677+
Name: "unreachable-node",
1678+
},
1679+
Status: corev1.NodeStatus{
1680+
Conditions: []corev1.NodeCondition{
1681+
{
1682+
Type: corev1.NodeReady,
1683+
Status: corev1.ConditionUnknown,
1684+
},
1685+
},
1686+
},
1687+
},
1688+
expected: false,
1689+
},
1690+
}
1691+
for _, tt := range tests {
1692+
t.Run(tt.name, func(t *testing.T) {
1693+
g := NewWithT(t)
1694+
1695+
var objs []client.Object
1696+
objs = append(objs, testCluster, tt.node)
1697+
1698+
c := fake.NewClientBuilder().WithObjects(objs...).Build()
1699+
tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(testCluster))
1700+
r := &Reconciler{
1701+
Client: c,
1702+
UnstructuredCachingClient: c,
1703+
Tracker: tracker,
1704+
}
1705+
1706+
got, err := r.shouldWaitForNodeVolumes(ctx, testCluster, tt.node.Name)
1707+
g.Expect(err).ToNot(HaveOccurred())
1708+
g.Expect(got).To(Equal(tt.expected))
1709+
})
1710+
}
1711+
}
1712+
16021713
func TestIsDeleteNodeAllowed(t *testing.T) {
16031714
deletionts := metav1.Now()
16041715

0 commit comments

Comments
 (0)