diff --git a/internal/controller/appwrapper/node_health_monitor.go b/internal/controller/appwrapper/node_health_monitor.go
index 4c935b6..5e2b87f 100644
--- a/internal/controller/appwrapper/node_health_monitor.go
+++ b/internal/controller/appwrapper/node_health_monitor.go
@@ -52,7 +52,8 @@ var (
 	// noScheduleNodes is a mapping from Node names to ResourceLists of unschedulable resources.
 	// A resource may be unschedulable either because:
 	// (a) the Node is cordoned (node.Spec.Unschedulable is true) or
-	// (b) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
+	// (b) the Node has been marked as NotReady by Kubernetes or
+	// (c) Autopilot has labeled the Node with a NoExecute or NoSchedule taint for the resource.
 	noScheduleNodes = make(map[string]v1.ResourceList)
 	// noScheduleNodesMutex synchronizes access to noScheduleNodes
 	noScheduleNodesMutex sync.RWMutex
@@ -136,7 +137,7 @@ func (r *NodeHealthMonitor) updateNoExecuteNodes(ctx context.Context, node *v1.N
 // update noScheduleNodes entry for node
 func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.Node) {
 	var noScheduleResources v1.ResourceList
-	if node.Spec.Unschedulable {
+	if r.nodeIsUnscheduable(node) {
 		noScheduleResources = node.Status.Capacity.DeepCopy()
 		delete(noScheduleResources, v1.ResourcePods)
 	} else {
@@ -178,6 +179,18 @@ func (r *NodeHealthMonitor) updateNoScheduleNodes(ctx context.Context, node *v1.
 	}
 }
 
+func (r *NodeHealthMonitor) nodeIsUnscheduable(node *v1.Node) bool {
+	if node.Spec.Unschedulable {
+		return true
+	}
+	for _, taint := range node.Spec.Taints {
+		if taint.Key == "node.kubernetes.io/unreachable" || taint.Key == "node.kubernetes.io/not-ready" {
+			return true
+		}
+	}
+	return false
+}
+
 // SetupWithManager sets up the controller with the Manager.
 func (r *NodeHealthMonitor) SetupWithManager(mgr ctrl.Manager) error {
 	return ctrl.NewControllerManagedBy(mgr).
diff --git a/internal/controller/appwrapper/node_health_monitor_test.go b/internal/controller/appwrapper/node_health_monitor_test.go
index 2a3f8cb..0397788 100644
--- a/internal/controller/appwrapper/node_health_monitor_test.go
+++ b/internal/controller/appwrapper/node_health_monitor_test.go
@@ -43,7 +43,13 @@ var _ = Describe("NodeMonitor Controller", func() {
 		Expect(k8sClient.Create(ctx, node)).To(Succeed())
 		node = getNode(nodeName)
 		node.Status.Capacity = nodeGPUs
+		node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
+			Type:   v1.NodeReady,
+			Status: v1.ConditionTrue,
+		})
 		Expect(k8sClient.Status().Update(ctx, node)).To(Succeed())
+		node.Spec.Taints = []v1.Taint{}
+		Expect(k8sClient.Update(ctx, node)).To(Succeed())
 	}
 
 	deleteNode := func(nodeName string) {
@@ -106,6 +112,62 @@ var _ = Describe("NodeMonitor Controller", func() {
 		Expect(err).NotTo(HaveOccurred())
 		Expect(noExecuteNodes).Should(BeEmpty())
 
+		By("A Node tainted as unreachable is detected as unschedulable")
+		node = getNode(node1Name.Name)
+		node.Spec.Taints = append(node.Spec.Taints, v1.Taint{Key: "node.kubernetes.io/unreachable", Effect: v1.TaintEffectNoExecute})
+		Expect(k8sClient.Update(ctx, node)).Should(Succeed())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
+		Expect(err).NotTo(HaveOccurred())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(noScheduleNodes).Should(HaveLen(1))
+		Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
+		Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
+
+		By("Repeated reconcile does not change map")
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
+		Expect(err).NotTo(HaveOccurred())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(noScheduleNodes).Should(HaveLen(1))
+		Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
+		Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
+
+		By("Removing the taint updates noScheduleNodes")
+		node.Spec.Taints = []v1.Taint{}
+		Expect(k8sClient.Update(ctx, node)).Should(Succeed())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(noScheduleNodes).Should(BeEmpty())
+
+		By("A Node tainted as not-ready is detected as unschedulable")
+		node = getNode(node1Name.Name)
+		node.Spec.Taints = append(node.Spec.Taints, v1.Taint{Key: "node.kubernetes.io/not-ready", Effect: v1.TaintEffectNoExecute})
+		Expect(k8sClient.Update(ctx, node)).Should(Succeed())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
+		Expect(err).NotTo(HaveOccurred())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(noScheduleNodes).Should(HaveLen(1))
+		Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
+		Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
+
+		By("Repeated reconcile does not change map")
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
+		Expect(err).NotTo(HaveOccurred())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(noScheduleNodes).Should(HaveLen(1))
+		Expect(noScheduleNodes).Should(HaveKey(node1Name.Name))
+		Expect(noScheduleNodes[node1Name.Name]).Should(HaveKey(v1.ResourceName("nvidia.com/gpu")))
+
+		By("Removing the taint updates noScheduleNodes")
+		node.Spec.Taints = []v1.Taint{}
+		Expect(k8sClient.Update(ctx, node)).Should(Succeed())
+		_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(noScheduleNodes).Should(BeEmpty())
+
 		deleteNode(node1Name.Name)
 		deleteNode(node2Name.Name)
 	})
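For reference, a minimal standalone sketch of the unschedulability check introduced above, assuming only k8s.io/api/core/v1 as a dependency; the nodeIsUnschedulable helper and the main function below are illustrative and are not part of this patch. k8s.io/api also exports these well-known taint keys as v1.TaintNodeNotReady and v1.TaintNodeUnreachable, which could replace the string literals.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// nodeIsUnschedulable mirrors the helper added to node_health_monitor.go:
// a Node is treated as unschedulable if it is cordoned, or if the node
// lifecycle controller has tainted it as unreachable or not-ready.
func nodeIsUnschedulable(node *v1.Node) bool {
	if node.Spec.Unschedulable {
		return true
	}
	for _, taint := range node.Spec.Taints {
		if taint.Key == "node.kubernetes.io/unreachable" || taint.Key == "node.kubernetes.io/not-ready" {
			return true
		}
	}
	return false
}

func main() {
	// Cordoned node: kubectl cordon sets Spec.Unschedulable.
	cordoned := &v1.Node{Spec: v1.NodeSpec{Unschedulable: true}}

	// Node that Kubernetes has tainted as not-ready with a NoExecute effect.
	notReady := &v1.Node{Spec: v1.NodeSpec{Taints: []v1.Taint{
		{Key: "node.kubernetes.io/not-ready", Effect: v1.TaintEffectNoExecute},
	}}}

	fmt.Println(nodeIsUnschedulable(cordoned))   // true
	fmt.Println(nodeIsUnschedulable(notReady))   // true
	fmt.Println(nodeIsUnschedulable(&v1.Node{})) // false
}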