Skip to content

Add a check for head pod imagePullSecrets #601

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion pkg/controllers/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{RequeueAfter: requeueTime}, err
}

if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
return ctrl.Result{RequeueAfter: requeueTime}, err
}
Comment on lines +216 to +218
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this not occur in WorkerPods too?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Theoretically yes. Practically, the worker Pods' init container, which waits for the head Pod to become ready, prevents it from happening. I'd be inclined to keep the logic simple at the moment and iterate if necessary.


_, err = r.kubeClient.RbacV1().ClusterRoleBindings().Apply(ctx, desiredOAuthClusterRoleBinding(cluster), metav1.ApplyOptions{FieldManager: controllerName, Force: true})
if err != nil {
logger.Error(err, "Failed to update OAuth ClusterRoleBinding")
Expand Down Expand Up @@ -470,6 +474,7 @@ func generateCACertificate() ([]byte, []byte, error) {

return privateKeyPem, certPem, nil
}

func desiredWorkersNetworkPolicy(cluster *rayv1.RayCluster) *networkingv1ac.NetworkPolicyApplyConfiguration {
return networkingv1ac.NetworkPolicy(cluster.Name+"-workers", cluster.Namespace).
WithLabels(map[string]string{RayClusterNameLabel: cluster.Name}).
Expand All @@ -486,6 +491,7 @@ func desiredWorkersNetworkPolicy(cluster *rayv1.RayCluster) *networkingv1ac.Netw
metav1ac.OwnerReference().WithUID(cluster.UID).WithName(cluster.Name).WithKind(cluster.Kind).WithAPIVersion(cluster.APIVersion).WithController(true),
)
}

func desiredHeadNetworkPolicy(cluster *rayv1.RayCluster, cfg *config.KubeRayConfiguration, kubeRayNamespaces []string) *networkingv1ac.NetworkPolicyApplyConfiguration {
allSecuredPorts := []*networkingv1ac.NetworkPolicyPortApplyConfiguration{
networkingv1ac.NetworkPolicyPort().WithProtocol(corev1.ProtocolTCP).WithPort(intstr.FromInt(8443)),
Expand Down Expand Up @@ -544,6 +550,49 @@ func desiredHeadNetworkPolicy(cluster *rayv1.RayCluster, cfg *config.KubeRayConf
)
}

// deleteHeadPodIfMissingImagePullSecrets deletes the cluster's head pod when
// the OAuth ServiceAccount lists at least one image pull secret that the head
// pod's spec does not reference.
//
// It returns nil when no head pod is found or when no secret is missing, and a
// wrapped error when fetching the ServiceAccount, fetching the head pod, or
// deleting the pod fails.
func (r *RayClusterReconciler) deleteHeadPodIfMissingImagePullSecrets(ctx context.Context, cluster *rayv1.RayCluster) error {
serviceAccount, err := r.kubeClient.CoreV1().ServiceAccounts(cluster.Namespace).Get(ctx, oauthServiceAccountNameFromCluster(cluster), metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get OAuth ServiceAccount: %w", err)
}

headPod, err := getHeadPod(ctx, r, cluster)
if err != nil {
return fmt.Errorf("failed to get head pod: %w", err)
}

// No head pod was found for this cluster: nothing to compare or delete.
if headPod == nil {
return nil
}

// Seed the set with every secret name on the ServiceAccount, then drop the
// names the pod already references; whatever remains is missing from the pod.
missingSecrets := map[string]bool{}
for _, secret := range serviceAccount.ImagePullSecrets {
missingSecrets[secret.Name] = true
}
for _, secret := range headPod.Spec.ImagePullSecrets {
delete(missingSecrets, secret.Name)
}
// NOTE(review): this function only deletes the pod; presumably it is
// recreated elsewhere with the ServiceAccount's current secrets — confirm.
if len(missingSecrets) > 0 {
if err := r.kubeClient.CoreV1().Pods(headPod.Namespace).Delete(ctx, headPod.Name, metav1.DeleteOptions{}); err != nil {
return fmt.Errorf("failed to delete head pod: %w", err)
}
}
return nil
Comment on lines +568 to +580
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think I fully understand this part:
What would happen if the ServiceAccount has multiple ImagePullSecrets? Could it then get into an infinite reconcile loop by repeatedly deleting the head Pod?

Copy link
Contributor Author

@Ygnas Ygnas Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zdtsw I don't think it should. I just tried adding extra ImagePullSecrets to the SA: the head Pod was deleted and recreated with the extra ones. I think all the ImagePullSecrets always end up in the head Pod.
image

}

// getHeadPod lists the pods in the cluster's namespace that carry the labels
// ray.io/node-type=head and ray.io/cluster=<cluster name>, and returns a
// pointer to the first item of that list.
//
// It returns (nil, nil) when the list is empty, and (nil, err) when the List
// call itself fails.
func getHeadPod(ctx context.Context, r *RayClusterReconciler, cluster *rayv1.RayCluster) (*corev1.Pod, error) {
	selector := fmt.Sprintf("ray.io/node-type=head,ray.io/cluster=%s", cluster.Name)
	pods := r.kubeClient.CoreV1().Pods(cluster.Namespace)

	found, err := pods.List(ctx, metav1.ListOptions{LabelSelector: selector})
	if err != nil {
		return nil, err
	}
	if len(found.Items) == 0 {
		return nil, nil
	}
	// If several pods match, only the first one in the returned list is used.
	return &found.Items[0], nil
}

// SetupWithManager sets up the controller with the Manager.
func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.kubeClient = kubernetes.NewForConfigOrDie(mgr.GetConfig())
Expand Down Expand Up @@ -577,7 +626,8 @@ func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
NamespacedName: client.ObjectKey{
Name: name,
Namespace: namespace,
}}}
},
}}
}),
)
if r.IsOpenShift {
Expand Down
50 changes: 48 additions & 2 deletions pkg/controllers/raycluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import (

var _ = Describe("RayCluster controller", func() {
Context("RayCluster controller test", func() {
var rayClusterName = "test-raycluster"
rayClusterName := "test-raycluster"
var namespaceName string
BeforeEach(func(ctx SpecContext) {
By("Creating a namespace for running the tests.")
Expand Down Expand Up @@ -145,6 +145,53 @@ var _ = Describe("RayCluster controller", func() {
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceName, Equal(foundRayCluster.Name)))
})

// Verifies that a pod labelled as the cluster's head pod is deleted once the
// OAuth ServiceAccount gains an image pull secret the pod does not reference.
It("should delete the head pod if missing image pull secrets", func(ctx SpecContext) {
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))

// Wait until the OAuth ServiceAccount exists and is owned by a RayCluster.
Eventually(func() (*corev1.ServiceAccount, error) {
return k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(foundRayCluster), metav1.GetOptions{})
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceKind, Equal("RayCluster")))

// Create a pod by hand carrying the head-pod labels for this cluster and no
// imagePullSecrets in its spec.
headPodName := "head-pod"
headPod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: headPodName,
Namespace: namespaceName,
Labels: map[string]string{
"ray.io/node-type": "head",
"ray.io/cluster": foundRayCluster.Name,
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "head-container",
Image: "busybox",
},
},
},
}
_, err = k8sClient.CoreV1().Pods(namespaceName).Create(ctx, headPod, metav1.CreateOptions{})
Expect(err).To(Not(HaveOccurred()))

// Confirm the pod can be read back before changing the ServiceAccount.
Eventually(func() (*corev1.Pod, error) {
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
}).WithTimeout(time.Second * 10).ShouldNot(BeNil())

sa, err := k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(foundRayCluster), metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))

// Add a pull secret to the ServiceAccount that the pod above does not have.
sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: "test-image-pull-secret"})
_, err = k8sClient.CoreV1().ServiceAccounts(namespaceName).Update(ctx, sa, metav1.UpdateOptions{})
Expect(err).To(Not(HaveOccurred()))

// The pod should disappear: Get must eventually return a NotFound error.
Eventually(func() error {
_, err := k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
return err
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
})

It("should remove CRB when the RayCluster is deleted", func(ctx SpecContext) {
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
Expect(err).To(Not(HaveOccurred()))
Expand All @@ -157,7 +204,6 @@ var _ = Describe("RayCluster controller", func() {
return err
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
})

})
})

Expand Down