Skip to content

Commit 3cd0556

Browse files
committed
Don't delete Ray head Pod when ImagePullSecret is provided
1 parent 1945b65 commit 3cd0556

File tree

4 files changed

+122
-4
lines changed

4 files changed

+122
-4
lines changed

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ replace sigs.k8s.io/kueue v0.8.3 => github.com/opendatahub-io/kueue v0.8.3
4141
// This replace directive deal with the unaligned Ray operator version transitively pulled by Kueue 0.8.3
4242
replace github.com/ray-project/kuberay/ray-operator v1.2.1 => github.com/ray-project/kuberay/ray-operator v1.1.1
4343

44+
replace github.com/project-codeflare/codeflare-common => /home/ksuta/src/github.com/project-codeflare/codeflare-common
45+
4446
require (
4547
github.com/aymerick/douceur v0.2.0 // indirect
4648
github.com/beorn7/perks v1.0.1 // indirect

pkg/controllers/raycluster_controller.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,11 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
219219
return ctrl.Result{RequeueAfter: requeueTime}, err
220220
}
221221

222-
if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
223-
return ctrl.Result{RequeueAfter: requeueTime}, err
222+
if len(cluster.Spec.HeadGroupSpec.Template.Spec.ImagePullSecrets) == 0 {
223+
// Delete head pod only if user doesn't specify own imagePullSecrets and imagePullSecrets from OAuth ServiceAccount are not present in the head Pod
224+
if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
225+
return ctrl.Result{RequeueAfter: requeueTime}, err
226+
}
224227
}
225228

226229
_, err = r.kubeClient.RbacV1().ClusterRoleBindings().Apply(ctx, desiredOAuthClusterRoleBinding(cluster), metav1.ApplyOptions{FieldManager: controllerName, Force: true})

pkg/controllers/raycluster_controller_test.go

+67
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,73 @@ var _ = Describe("RayCluster controller", func() {
192192
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
193193
})
194194

195+
It("should not delete the head pod if RayCluster CR provides image pull secrets", func(ctx SpecContext) {
196+
By("creating an instance of the RayCluster CR with imagePullSecret")
197+
rayclusterWithPullSecret := &rayv1.RayCluster{
198+
ObjectMeta: metav1.ObjectMeta{
199+
Name: "pull-secret-cluster",
200+
Namespace: namespaceName,
201+
},
202+
Spec: rayv1.RayClusterSpec{
203+
HeadGroupSpec: rayv1.HeadGroupSpec{
204+
Template: corev1.PodTemplateSpec{
205+
Spec: corev1.PodSpec{
206+
ImagePullSecrets: []corev1.LocalObjectReference{{Name: "custom-pull-secret"}},
207+
Containers: []corev1.Container{},
208+
},
209+
},
210+
RayStartParams: map[string]string{},
211+
},
212+
},
213+
}
214+
_, err := rayClient.RayV1().RayClusters(namespaceName).Create(ctx, rayclusterWithPullSecret, metav1.CreateOptions{})
215+
Expect(err).To(Not(HaveOccurred()))
216+
217+
Eventually(func() (*corev1.ServiceAccount, error) {
218+
return k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(rayclusterWithPullSecret), metav1.GetOptions{})
219+
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceKind, Equal("RayCluster")))
220+
221+
headPodName := "head-pod"
222+
headPod := &corev1.Pod{
223+
ObjectMeta: metav1.ObjectMeta{
224+
Name: headPodName,
225+
Namespace: namespaceName,
226+
Labels: map[string]string{
227+
"ray.io/node-type": "head",
228+
"ray.io/cluster": rayclusterWithPullSecret.Name,
229+
},
230+
},
231+
Spec: corev1.PodSpec{
232+
ImagePullSecrets: []corev1.LocalObjectReference{
233+
{Name: "custom-pull-secret"},
234+
},
235+
Containers: []corev1.Container{
236+
{
237+
Name: "head-container",
238+
Image: "busybox",
239+
},
240+
},
241+
},
242+
}
243+
_, err = k8sClient.CoreV1().Pods(namespaceName).Create(ctx, headPod, metav1.CreateOptions{})
244+
Expect(err).To(Not(HaveOccurred()))
245+
246+
Eventually(func() (*corev1.Pod, error) {
247+
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
248+
}).WithTimeout(time.Second * 10).ShouldNot(BeNil())
249+
250+
sa, err := k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(rayclusterWithPullSecret), metav1.GetOptions{})
251+
Expect(err).To(Not(HaveOccurred()))
252+
253+
sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: "test-image-pull-secret"})
254+
_, err = k8sClient.CoreV1().ServiceAccounts(namespaceName).Update(ctx, sa, metav1.UpdateOptions{})
255+
Expect(err).To(Not(HaveOccurred()))
256+
257+
Consistently(func() (*corev1.Pod, error) {
258+
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
259+
}).WithTimeout(time.Second * 5).Should(Not(BeNil()))
260+
})
261+
195262
It("should remove CRB when the RayCluster is deleted", func(ctx SpecContext) {
196263
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
197264
Expect(err).To(Not(HaveOccurred()))

test/e2e/mnist_rayjob_raycluster_test.go

+48-2
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,52 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number
211211
test.Eventually(AppWrappers(test, namespace), TestTimeoutShort).Should(BeEmpty())
212212
}
213213

214+
// Verifying https://github.com/project-codeflare/codeflare-operator/issues/649
215+
func TestRayClusterImagePullSecret(t *testing.T) {
216+
test := With(t)
217+
218+
// Create a namespace
219+
namespace := test.NewTestNamespace()
220+
221+
// Create Kueue resources
222+
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
223+
defer func() {
224+
_ = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
225+
}()
226+
clusterQueue := createClusterQueue(test, resourceFlavor, 0)
227+
defer func() {
228+
_ = test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
229+
}()
230+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
231+
232+
// Create ServiceAccount, wait until corresponding imagePullSecret is available
233+
sa := CreateServiceAccount(test, namespace.Name)
234+
test.Eventually(ServiceAccount(test, sa.Namespace, sa.Name), TestTimeoutShort).
235+
Should(
236+
HaveField("ImagePullSecrets", HaveLen(1)),
237+
)
238+
sa = GetServiceAccount(test, sa.Namespace, sa.Name)
239+
240+
// Create MNIST training script
241+
mnist := constructMNISTConfigMap(test, namespace)
242+
mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{})
243+
test.Expect(err).NotTo(HaveOccurred())
244+
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
245+
246+
// Create RayCluster with imagePullSecret and assign it to the localqueue
247+
rayCluster := constructRayCluster(test, namespace, mnist, 0)
248+
rayCluster.Spec.HeadGroupSpec.Template.Spec.ImagePullSecrets = sa.ImagePullSecrets
249+
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
250+
test.Expect(err).NotTo(HaveOccurred())
251+
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
252+
253+
test.T().Logf("Waiting for RayCluster %s/%s to be running", rayCluster.Namespace, rayCluster.Name)
254+
test.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium).
255+
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
256+
}
257+
258+
// Helper functions
259+
214260
func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.ConfigMap {
215261
return &corev1.ConfigMap{
216262
TypeMeta: metav1.TypeMeta{
@@ -274,11 +320,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
274320
Resources: corev1.ResourceRequirements{
275321
Requests: corev1.ResourceList{
276322
corev1.ResourceCPU: resource.MustParse("250m"),
277-
corev1.ResourceMemory: resource.MustParse("512Mi"),
323+
corev1.ResourceMemory: resource.MustParse("2G"),
278324
},
279325
Limits: corev1.ResourceList{
280326
corev1.ResourceCPU: resource.MustParse("1"),
281-
corev1.ResourceMemory: resource.MustParse("2G"),
327+
corev1.ResourceMemory: resource.MustParse("4G"),
282328
},
283329
},
284330
},

0 commit comments

Comments
 (0)