From 867f640123931ed217cc7c62aa7c8150a8026bb0 Mon Sep 17 00:00:00 2001 From: daimaxiaxie Date: Tue, 18 Mar 2025 18:02:11 +0800 Subject: [PATCH] drain node by infraMachine ProviderID --- .../controllers/machine/machine_controller.go | 19 +++++-- .../machine/machine_controller_test.go | 55 ++++++++++++++++++- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index fbe00eb7a097..4a6c5350541e 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -53,6 +53,7 @@ import ( "sigs.k8s.io/cluster-api/controllers/external" "sigs.k8s.io/cluster-api/controllers/noderefutil" "sigs.k8s.io/cluster-api/feature" + "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/internal/controllers/machine/drain" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -435,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result s.deletingReason = clusterv1.MachineDeletingV1Beta2Reason s.deletingMessage = "Deletion started" - err := r.isDeleteNodeAllowed(ctx, cluster, m) + err := r.isDeleteNodeAllowed(ctx, cluster, m, s.infraMachine) isDeleteNodeAllowed := err == nil if err != nil { switch err { @@ -713,14 +714,24 @@ func (r *Reconciler) nodeVolumeDetachTimeoutExceeded(machine *clusterv1.Machine) // isDeleteNodeAllowed returns nil only if the Machine's NodeRef is not nil // and if the Machine is not the last control plane node in the cluster. -func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine) error { +func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1.Cluster, machine *clusterv1.Machine, infraMachine *unstructured.Unstructured) error { log := ctrl.LoggerFrom(ctx) // Return early if the cluster is being deleted. if !cluster.DeletionTimestamp.IsZero() { return errClusterIsBeingDeleted } - if machine.Status.NodeRef == nil && machine.Spec.ProviderID != nil { + var providerID string + if machine.Spec.ProviderID != nil { + providerID = *machine.Spec.ProviderID + } else if infraMachine != nil { + // Fallback to retrieve from infraMachine. + if providerIDFromInfraMachine, err := contract.InfrastructureMachine().ProviderID().Get(infraMachine); err == nil { + providerID = *providerIDFromInfraMachine + } + } + + if machine.Status.NodeRef == nil && providerID != "" { // If we don't have a node reference, but a provider id has been set, // try to retrieve the node one more time. // @@ -731,7 +742,7 @@ func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1 if err != nil { log.Error(err, "Failed to get cluster client while deleting Machine and checking for nodes") } else { - node, err := r.getNode(ctx, remoteClient, *machine.Spec.ProviderID) + node, err := r.getNode(ctx, remoteClient, providerID) if err != nil && err != ErrNodeNotFound { log.Error(err, "Failed to get node while deleting Machine") } else if err == nil { diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go index 0db71b250ac2..2008917ddd7f 100644 --- a/internal/controllers/machine/machine_controller_test.go +++ b/internal/controllers/machine/machine_controller_test.go @@ -2530,6 +2530,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { name string cluster *clusterv1.Cluster machine *clusterv1.Machine + infraMachine *unstructured.Unstructured expectedError error }{ { @@ -2556,6 +2557,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, Status: clusterv1.MachineStatus{}, }, + infraMachine: nil, expectedError: errNilNodeRef, }, { @@ -2586,6 +2588,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, }, + infraMachine: nil, expectedError: errNoControlPlaneNodes, }, { @@ -2618,6 +2621,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, }, + infraMachine: nil, expectedError: errNoControlPlaneNodes, }, { @@ -2648,6 +2652,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, }, + infraMachine: nil, expectedError: nil, }, { @@ -2661,6 +2666,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, machine: &clusterv1.Machine{}, + infraMachine: nil, expectedError: errClusterIsBeingDeleted, }, { @@ -2699,6 +2705,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, }, + infraMachine: nil, expectedError: nil, }, { @@ -2737,6 +2744,7 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, }, + infraMachine: nil, expectedError: errControlPlaneIsBeingDeleted, }, { @@ -2775,8 +2783,40 @@ func TestIsDeleteNodeAllowed(t *testing.T) { }, }, }, + infraMachine: nil, expectedError: errControlPlaneIsBeingDeleted, }, + { + name: "no nodeRef, infrastructure machine has providerID", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: metav1.NamespaceDefault, + }, + }, + machine: &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "created", + Namespace: metav1.NamespaceDefault, + Labels: map[string]string{ + clusterv1.ClusterNameLabel: "test-cluster", + }, + Finalizers: []string{clusterv1.MachineFinalizer}, + }, + Spec: clusterv1.MachineSpec{ + ClusterName: "test-cluster", + InfrastructureRef: corev1.ObjectReference{}, + Bootstrap: clusterv1.Bootstrap{DataSecretName: ptr.To("data")}, + }, + Status: clusterv1.MachineStatus{}, + }, + infraMachine: &unstructured.Unstructured{Object: map[string]interface{}{ + "spec": map[string]interface{}{ + "providerID": "test-node-1", + }, + }}, + expectedError: nil, + }, } emp := &unstructured.Unstructured{ @@ -2815,6 +2855,16 @@ func TestIsDeleteNodeAllowed(t *testing.T) { empBeingDeleted.SetDeletionTimestamp(&metav1.Time{Time: time.Now()}) empBeingDeleted.SetFinalizers([]string{"block-deletion"}) + testNodeA := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-1", + }, + Spec: corev1.NodeSpec{ + ProviderID: "test-node-1", + }, + } + remoteClient := fake.NewClientBuilder().WithIndex(&corev1.Node{}, "spec.providerID", index.NodeByProviderID).WithObjects(testNodeA).Build() + for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) @@ -2875,10 +2925,11 @@ func TestIsDeleteNodeAllowed(t *testing.T) { empBeingDeleted, ).Build() mr := &Reconciler{ - Client: c, + Client: c, + ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(tc.cluster)), } - err := mr.isDeleteNodeAllowed(ctx, tc.cluster, tc.machine) + err := mr.isDeleteNodeAllowed(ctx, tc.cluster, tc.machine, tc.infraMachine) if tc.expectedError == nil { g.Expect(err).ToNot(HaveOccurred()) } else {