sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/raycluster/raycluster_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package raycluster

import (
	"fmt"

	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadraycluster "sigs.k8s.io/kueue/pkg/controller/jobs/raycluster"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingraycluster "sigs.k8s.io/kueue/pkg/util/testingjobs/raycluster"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName                 = "test-job"
	instanceKey             = "cloud.provider.com/instance"
	priorityClassName       = "test-priority-class"
	priorityValue     int32 = 10
)

var (
	ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("RayCluster controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}

		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadraycluster.GetWorkloadNameForRayCluster(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})
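	// The manager in this suite runs with WithManageJobsWithoutQueueName(true),
	// so the controller suspends and manages RayClusters even when they carry
	// no queue-name label or annotation.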
	ginkgo.It("Should reconcile RayClusters", func() {
		ginkgo.By("checking the job gets suspended when created unsuspended")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(priorityValue).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())

		job := testingraycluster.MakeCluster(jobName, ns.Name).
			Suspend(false).
			WithPriorityClassName(priorityClassName).
			Obj()
		err := k8sClient.Create(ctx, job)
		gomega.Expect(err).To(gomega.Succeed())
		createdJob := &rayv1.RayCluster{}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: ns.Name}, createdJob); err != nil {
				return false
			}
			return *createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created without queue assigned")
		createdWorkload := &kueue.Workload{}
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
		gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set")
		gomega.Expect(metav1.IsControlledBy(createdWorkload, createdJob)).To(gomega.BeTrue(), "The Workload should be owned by the Job")

		ginkgo.By("checking the workload is created with priority and priorityName")
		gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName))
		gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(priorityValue))

		ginkgo.By("checking the workload is updated with queue name when the job does")
		jobQueueName := "test-queue"
		createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.QueueName == jobQueueName
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking a second non-matching workload is deleted")
		secondWl := &kueue.Workload{
			ObjectMeta: metav1.ObjectMeta{
				Name:      workloadraycluster.GetWorkloadNameForRayCluster("second-workload"),
				Namespace: createdWorkload.Namespace,
			},
			Spec: *createdWorkload.Spec.DeepCopy(),
		}

		gomega.Expect(ctrl.SetControllerReference(createdJob, secondWl, scheme.Scheme)).Should(gomega.Succeed())
		secondWl.Spec.PodSets[0].Count += 1

		gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			wl := &kueue.Workload{}
			key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace}
			return k8sClient.Get(ctx, key, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// check the original wl is still there
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())

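		// No scheduler runs in this suite, so admission is granted by hand:
		// util.SetQuotaReservation writes the quota reservation into the
		// Workload status and SyncAdmittedConditionForWorkloads sets the
		// Admitted condition, which is what lets the controller unsuspend
		// the RayCluster below.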
		ginkgo.By("checking the job is unsuspended when workload is assigned")
		onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())
		spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotFlavor)).Should(gomega.Succeed())
		clusterQueue := testing.MakeClusterQueue("cluster-queue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		admission := testing.MakeAdmission(clusterQueue.Name).PodSets(
			kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[0].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "on-demand",
				},
			}, kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[1].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "spot",
				},
			},
		).Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)

		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !*createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(len(createdJob.Spec.HeadGroupSpec.Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJob.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

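		// Resizing the worker group makes the admitted Workload obsolete: the
		// controller is expected to re-suspend the cluster, strip the node
		// selectors it injected, and recreate the Workload with pod counts
		// matching the new replica count.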
		ginkgo.By("checking the job gets suspended when parallelism changes and the added node selectors are removed")
		parallelism := ptr.Deref(job.Spec.WorkerGroupSpecs[0].Replicas, 1)
		newParallelism := int32(parallelism + 1)
		createdJob.Spec.WorkerGroupSpecs[0].Replicas = &newParallelism
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return *createdJob.Spec.Suspend && len(createdJob.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector) == 0
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String()))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is updated with new count")
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.PodSets[1].Count == newParallelism
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())

		ginkgo.By("checking the job is unsuspended and selectors added when workload is assigned again")
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !*createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(len(createdJob.Spec.HeadGroupSpec.Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJob.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	})
})

var _ = ginkgo.Describe("Job controller RayCluster for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})
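	// This manager is started without WithManageJobsWithoutQueueName, so a
	// Workload should only be created for clusters that reference a queue
	// through the queue-name label (constants.QueueLabel).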
	ginkgo.It("Should reconcile jobs only when queue is set", func() {
		ginkgo.By("checking the workload is not created when queue name is not set")
		job := testingraycluster.MakeCluster(jobName, ns.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		createdJob := &rayv1.RayCluster{}
		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

		createdWorkload := &kueue.Workload{}
		wlLookupKey := types.NamespacedName{Name: workloadraycluster.GetWorkloadNameForRayCluster(jobName), Namespace: ns.Name}
		gomega.Eventually(func() bool {
			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created when queue name is set")
		jobQueueName := "test-queue"
		if createdJob.Labels == nil {
			createdJob.Labels = map[string]string{constants.QueueLabel: jobQueueName}
		} else {
			createdJob.Labels[constants.QueueLabel] = jobQueueName
		}
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})
})

var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	type podsReadyTestSpec struct {
		beforeJobStatus *rayv1.RayClusterStatus
		beforeCondition *metav1.Condition
		jobStatus       rayv1.RayClusterStatus
		suspended       bool
		wantCondition   *metav1.Condition
	}

	var defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})

	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadraycluster.GetWorkloadNameForRayCluster(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})
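	// Every table entry below drives the same flow: create and admit a
	// cluster, optionally apply beforeJobStatus/beforeCondition as an
	// intermediate state, apply jobStatus, optionally revoke the quota
	// reservation (suspended), and finally compare the Workload's PodsReady
	// condition against wantCondition.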
	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec podsReadyTestSpec) {
			ginkgo.By("Create a job")
			job := testingraycluster.MakeCluster(jobName, ns.Name).Obj()
			jobQueueName := "test-queue"
			job.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
			gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
			createdJob := &rayv1.RayCluster{}
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			ginkgo.By("Fetch the workload created for the job")
			createdWorkload := &kueue.Workload{}
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("Admit the workload created for the job")
			admission := testing.MakeAdmission("foo").PodSets(
				kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[0].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				}, kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[1].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				},
			).Obj()
			gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
			util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())

			ginkgo.By("Wait for the job to be unsuspended")
			gomega.Eventually(func() bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
				return *createdJob.Spec.Suspend
			}, util.Timeout, util.Interval).Should(gomega.BeFalse())

			if podsReadyTestSpec.beforeJobStatus != nil {
				ginkgo.By("Update the job status to simulate its initial progress towards completion")
				createdJob.Status = *podsReadyTestSpec.beforeJobStatus
				gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
			}

			if podsReadyTestSpec.beforeCondition != nil {
				ginkgo.By("Update the workload status")
				gomega.Eventually(func() *metav1.Condition {
					gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
					return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
				}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.beforeCondition, ignoreConditionTimestamps))
			}

			ginkgo.By("Update the job status to simulate its progress towards completion")
			createdJob.Status = podsReadyTestSpec.jobStatus
			gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			if podsReadyTestSpec.suspended {
				ginkgo.By("Unset admission of the workload to suspend the job")
				gomega.Eventually(func() error {
					// the update may need to be retried due to a conflict as the workload
					// also gets updated due to setting of the job status.
					if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
						return err
					}
					return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			}

			ginkgo.By("Verify the PodsReady condition is added")
			gomega.Eventually(func() *metav1.Condition {
				gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.wantCondition, ignoreConditionTimestamps))
		},
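		// The entries cover the PodsReady transitions: the condition stays
		// False until the RayCluster reports rayv1.Ready, turns True while it
		// is Ready, and reverts to False once the quota reservation is
		// released and the cluster is suspended again.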
		ginkgo.Entry("No progress", podsReadyTestSpec{
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running RayCluster", podsReadyTestSpec{
			jobStatus: rayv1.RayClusterStatus{
				State: rayv1.Ready,
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running RayCluster; PodsReady=False before", podsReadyTestSpec{
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			jobStatus: rayv1.RayClusterStatus{
				State: rayv1.Ready,
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", podsReadyTestSpec{
			beforeJobStatus: &rayv1.RayClusterStatus{
				State: rayv1.Ready,
			},
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			jobStatus: rayv1.RayClusterStatus{
				State: rayv1.Ready,
			},
			suspended: true,
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

var _ = ginkgo.Describe("RayCluster Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "4").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "4").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, spotUntaintedFlavor, true)
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())
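		// Flavors are tried in the order they appear in the ResourceGroup:
		// the head's 3 CPUs fit the 4-CPU spot-untainted quota, while the
		// worker group's 4 CPUs only fit on-demand.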
		ginkgo.By("checking a dev job starts")
		job := testingraycluster.MakeCluster("dev-job", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "3").
			RequestWorkerGroup(corev1.ResourceCPU, "4").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		createdJob := &rayv1.RayCluster{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job.Name, Namespace: job.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return *createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
		gomega.Expect(createdJob.Spec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		gomega.Expect(createdJob.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)

		ginkgo.By("checking a second no-fit RayCluster does not start")
		job2 := testingraycluster.MakeCluster("dev-job2", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "2").
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, job2)).Should(gomega.Succeed())
		createdJob2 := &rayv1.RayCluster{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job2.Name, Namespace: job2.Namespace}, createdJob2)).
				Should(gomega.Succeed())
			return *createdJob2.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 1)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)

		ginkgo.By("deleting the job", func() {
			gomega.Expect(k8sClient.Delete(ctx, job)).Should(gomega.Succeed())
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, types.NamespacedName{Name: job.Name, Namespace: job.Namespace}, job)
			}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		})

		// Users should not have to delete the workload. This is usually done
		// by the garbage collector, but there is no garbage collection in the
		// integration test environment.
		ginkgo.By("deleting the workload", func() {
			wl := &kueue.Workload{}
			wlKey := types.NamespacedName{Name: workloadraycluster.GetWorkloadNameForRayCluster(job.Name), Namespace: job.Namespace}
			gomega.Expect(k8sClient.Get(ctx, wlKey, wl)).Should(gomega.Succeed())
			gomega.Expect(k8sClient.Delete(ctx, wl)).Should(gomega.Succeed())
		})
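		// With the first cluster and its Workload gone, the released quota is
		// enough for the second cluster: its head and worker groups (2 CPUs
		// each) now both fit the preferred spot-untainted flavor.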
		ginkgo.By("checking the second RayCluster starts when the first one was deleted")
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job2.Name, Namespace: job2.Namespace}, createdJob2)).
				Should(gomega.Succeed())
			return *createdJob2.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
		gomega.Expect(createdJob2.Spec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		gomega.Expect(createdJob2.Spec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
	})
})

var _ = ginkgo.Describe("Job controller with preemption enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns             *corev1.Namespace
		onDemandFlavor *kueue.ResourceFlavor
		clusterQueue   *kueue.ClusterQueue
		localQueue     *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "4").Obj(),
			).
			Preemption(kueue.ClusterQueuePreemption{
				WithinClusterQueue: kueue.PreemptionPolicyLowerPriority,
			}).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())

		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("creating priority")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(priorityValue).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
	})

	ginkgo.It("Should preempt lower priority RayClusters when resource insufficient", func() {
		ginkgo.By("Create a low priority RayCluster")
		lowPriorityJob := testingraycluster.MakeCluster("raycluster-with-low-priority", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "1").
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, lowPriorityJob)).Should(gomega.Succeed())

		ginkgo.By("Wait for the low priority workload to be admitted")
		createdJob := &rayv1.RayCluster{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return *createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
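		// The low-priority cluster holds 3 of the queue's 4 CPUs (1 for the
		// head plus 2 for the worker group). The high-priority cluster below
		// asks for 4 CPUs in total, so admitting it requires evicting the
		// low-priority workload under the LowerPriority preemption policy.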
		ginkgo.By("Create a high priority RayCluster which will preempt the lower one")
		highPriorityJob := testingraycluster.MakeCluster("raycluster-with-high-priority", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "2").
			WithPriorityClassName(priorityClassName).
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, highPriorityJob)).Should(gomega.Succeed())

		ginkgo.By("High priority workload should be admitted")
		highPriorityWL := &kueue.Workload{}
		highPriorityLookupKey := types.NamespacedName{Name: workloadraycluster.GetWorkloadNameForRayCluster(highPriorityJob.Name), Namespace: ns.Name}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, highPriorityLookupKey, highPriorityWL); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(highPriorityWL.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority workload should not be admitted")
		createdWorkload := &kueue.Workload{}
		lowPriorityLookupKey := types.NamespacedName{Name: workloadraycluster.GetWorkloadNameForRayCluster(lowPriorityJob.Name), Namespace: ns.Name}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lowPriorityLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionFalse(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority RayCluster should be suspended")
		createdJob = &rayv1.RayCluster{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return *createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Delete high priority raycluster")
		gomega.Expect(k8sClient.Delete(ctx, highPriorityJob)).To(gomega.Succeed())
		gomega.EventuallyWithOffset(1, func() error {
			raycluster := &rayv1.RayCluster{}
			return k8sClient.Get(ctx, client.ObjectKeyFromObject(highPriorityJob), raycluster)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// Manually delete the workload because there is no garbage collection controller.
		gomega.Expect(k8sClient.Delete(ctx, highPriorityWL)).To(gomega.Succeed())
		gomega.EventuallyWithOffset(1, func() error {
			wl := &kueue.Workload{}
			return k8sClient.Get(ctx, highPriorityLookupKey, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())

		ginkgo.By("Low priority workload should be admitted again")
		createdWorkload = &kueue.Workload{}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lowPriorityLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority RayCluster should be unsuspended")
		createdJob = &rayv1.RayCluster{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return *createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
	})
})