sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/rayjob/rayjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rayjob

import (
	"fmt"

	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	rayjobapi "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadrayjob "sigs.k8s.io/kueue/pkg/controller/jobs/rayjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingrayjob "sigs.k8s.io/kueue/pkg/util/testingjobs/rayjob"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName                 = "test-job"
	instanceKey             = "cloud.provider.com/instance"
	priorityClassName       = "test-priority-class"
	priorityValue     int32 = 10
)

var (
	ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
)

// +kubebuilder:docs-gen:collapse=Imports

// setInitStatus marks the RayJob's deployment status as Suspended, emulating
// the initial status that the KubeRay operator would otherwise report.
func setInitStatus(name, namespace string) {
	createdJob := &rayjobapi.RayJob{}
	nsName := types.NamespacedName{Name: name, Namespace: namespace}
	gomega.EventuallyWithOffset(1, func() error {
		if err := k8sClient.Get(ctx, nsName, createdJob); err != nil {
			return err
		}
		createdJob.Status.JobDeploymentStatus = rayjobapi.JobDeploymentStatusSuspended
		return k8sClient.Status().Update(ctx, createdJob)
	}, util.Timeout, util.Interval).Should(gomega.Succeed())
}

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}

		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile RayJobs", func() {
ginkgo.By("checking the job gets suspended when created unsuspended") 106 priorityClass := testing.MakePriorityClass(priorityClassName). 107 PriorityValue(priorityValue).Obj() 108 gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed()) 109 110 job := testingrayjob.MakeJob(jobName, ns.Name). 111 Suspend(false). 112 WithPriorityClassName(priorityClassName). 113 Obj() 114 err := k8sClient.Create(ctx, job) 115 gomega.Expect(err).To(gomega.Succeed()) 116 createdJob := &rayjobapi.RayJob{} 117 118 setInitStatus(jobName, ns.Name) 119 gomega.Eventually(func() bool { 120 if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: ns.Name}, createdJob); err != nil { 121 return false 122 } 123 return createdJob.Spec.Suspend 124 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 125 126 ginkgo.By("checking the workload is created without queue assigned") 127 createdWorkload := &kueue.Workload{} 128 gomega.Eventually(func() error { 129 return k8sClient.Get(ctx, wlLookupKey, createdWorkload) 130 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 131 gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set") 132 gomega.Expect(metav1.IsControlledBy(createdWorkload, createdJob)).To(gomega.BeTrue(), "The Workload should be owned by the Job") 133 134 ginkgo.By("checking the workload is created with priority and priorityName") 135 gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName)) 136 gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(priorityValue)) 137 138 ginkgo.By("checking the workload is updated with queue name when the job does") 139 jobQueueName := "test-queue" 140 createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName} 141 gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed()) 142 gomega.Eventually(func() bool { 143 if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { 144 return false 145 } 146 return createdWorkload.Spec.QueueName == jobQueueName 147 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 148 149 ginkgo.By("checking a second non-matching workload is deleted") 150 secondWl := &kueue.Workload{ 151 ObjectMeta: metav1.ObjectMeta{ 152 Name: workloadrayjob.GetWorkloadNameForRayJob("second-workload"), 153 Namespace: createdWorkload.Namespace, 154 }, 155 Spec: *createdWorkload.Spec.DeepCopy(), 156 } 157 gomega.Expect(ctrl.SetControllerReference(createdJob, secondWl, scheme.Scheme)).Should(gomega.Succeed()) 158 secondWl.Spec.PodSets[0].Count += 1 159 160 gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed()) 161 gomega.Eventually(func() error { 162 wl := &kueue.Workload{} 163 key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace} 164 return k8sClient.Get(ctx, key, wl) 165 }, util.Timeout, util.Interval).Should(testing.BeNotFoundError()) 166 // check the original wl is still there 167 gomega.Eventually(func() error { 168 return k8sClient.Get(ctx, wlLookupKey, createdWorkload) 169 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 170 171 ginkgo.By("checking the job is unsuspended when workload is assigned") 172 onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj() 173 gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed()) 174 spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj() 175 gomega.Expect(k8sClient.Create(ctx, 
		clusterQueue := testing.MakeClusterQueue("cluster-queue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		admission := testing.MakeAdmission(clusterQueue.Name).PodSets(
			kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[0].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "on-demand",
				},
			}, kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[1].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "spot",
				},
			},
		).Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(len(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the job gets suspended when parallelism changes and the added node selectors are removed")
		parallelism := ptr.Deref(job.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas, 1)
		newParallelism := int32(parallelism + 1)
		createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas = &newParallelism
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return createdJob.Spec.Suspend && len(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector) == 0
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String()))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is updated with new count")
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.PodSets[1].Count == newParallelism
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())

		ginkgo.By("checking the job is unsuspended and selectors added when workload is assigned again")
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(len(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is finished when job is completed")
		createdJob.Status.JobDeploymentStatus = rayjobapi.JobDeploymentStatusComplete
		createdJob.Status.JobStatus = rayjobapi.JobStatusSucceeded
		createdJob.Status.Message = "Job finished by test"

		gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed())
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadFinished)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	})
})

var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile jobs only when queue is set", func() {
		ginkgo.By("checking the workload is not created when queue name is not set")
		job := testingrayjob.MakeJob(jobName, ns.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		createdJob := &rayjobapi.RayJob{}
		setInitStatus(jobName, ns.Name)
		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

		createdWorkload := &kueue.Workload{}
		wlLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(jobName), Namespace: ns.Name}
		gomega.Eventually(func() bool {
			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created when queue name is set")
		jobQueueName := "test-queue"
		if createdJob.Labels == nil {
			createdJob.Labels = map[string]string{constants.QueueLabel: jobQueueName}
		} else {
			createdJob.Labels[constants.QueueLabel] = jobQueueName
		}
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})

})

var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	type podsReadyTestSpec struct {
		beforeJobStatus *rayjobapi.RayJobStatus
		beforeCondition *metav1.Condition
		jobStatus       rayjobapi.RayJobStatus
		suspended       bool
		wantCondition   *metav1.Condition
	}

	var defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})

	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec podsReadyTestSpec) {
			ginkgo.By("Create a job")
			job := testingrayjob.MakeJob(jobName, ns.Name).Obj()
			jobQueueName := "test-queue"
			job.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
			gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
			setInitStatus(jobName, ns.Name)
			createdJob := &rayjobapi.RayJob{}
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			ginkgo.By("Fetch the workload created for the job")
			createdWorkload := &kueue.Workload{}
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("Admit the workload created for the job")
			admission := testing.MakeAdmission("foo").PodSets(
				kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[0].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				}, kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[1].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				},
			).Obj()
			gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
			util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())

			ginkgo.By("Await for the job to be unsuspended")
			gomega.Eventually(func() bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
				return createdJob.Spec.Suspend
			}, util.Timeout, util.Interval).Should(gomega.BeFalse())

			if podsReadyTestSpec.beforeJobStatus != nil {
				ginkgo.By("Update the job status to simulate its initial progress towards completion")
				createdJob.Status = *podsReadyTestSpec.beforeJobStatus
				gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
			}

			if podsReadyTestSpec.beforeCondition != nil {
				ginkgo.By("Update the workload status")
				gomega.Eventually(func() *metav1.Condition {
					gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
					return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
				}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.beforeCondition, ignoreConditionTimestamps))
			}

			ginkgo.By("Update the job status to simulate its progress towards completion")
			createdJob.Status = podsReadyTestSpec.jobStatus
			gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			if podsReadyTestSpec.suspended {
				ginkgo.By("Unset admission of the workload to suspend the job")
				gomega.Eventually(func() error {
					// the update may need to be retried due to a conflict as the workload gets
					// also updated due to setting of the job status.
					if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
						return err
					}
					return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			}

			ginkgo.By("Verify the PodsReady condition is added")
			gomega.Eventually(func() *metav1.Condition {
				gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.wantCondition, ignoreConditionTimestamps))
		},
		ginkgo.Entry("No progress", podsReadyTestSpec{
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running RayJob", podsReadyTestSpec{
			jobStatus: rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusRunning,
				RayClusterStatus: rayjobapi.RayClusterStatus{
					State: rayjobapi.Ready,
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running RayJob; PodsReady=False before", podsReadyTestSpec{
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			jobStatus: rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusRunning,
				RayClusterStatus: rayjobapi.RayClusterStatus{
					State: rayjobapi.Ready,
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", podsReadyTestSpec{
			beforeJobStatus: &rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusRunning,
				RayClusterStatus: rayjobapi.RayClusterStatus{
					State: rayjobapi.Ready,
				},
			},
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			jobStatus: rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusSuspended,
			},
			suspended: true,
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, spotUntaintedFlavor, true)
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("checking a dev job starts")
		job := testingrayjob.MakeJob("dev-job", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "3").
			RequestWorkerGroup(corev1.ResourceCPU, "4").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		setInitStatus(job.Name, job.Namespace)
		createdJob := &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job.Name, Namespace: job.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)

	})
})

var _ = ginkgo.Describe("Job controller with preemption enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns             *corev1.Namespace
		onDemandFlavor *kueue.ResourceFlavor
		clusterQueue   *kueue.ClusterQueue
		localQueue     *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "4").Obj(),
			).
			Preemption(kueue.ClusterQueuePreemption{
				WithinClusterQueue: kueue.PreemptionPolicyLowerPriority,
			}).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())

		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("creating priority")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(priorityValue).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
	})

	ginkgo.It("Should preempt lower priority rayJobs when resource insufficient", func() {
		ginkgo.By("Create a low priority rayJob")
		lowPriorityJob := testingrayjob.MakeJob("rayjob-with-low-priority", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "1").
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, lowPriorityJob)).Should(gomega.Succeed())
		setInitStatus(lowPriorityJob.Name, lowPriorityJob.Namespace)

		ginkgo.By("Await for the low priority workload to be admitted")
		createdJob := &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())

		ginkgo.By("Create a high priority rayJob which will preempt the lower one")
		highPriorityJob := testingrayjob.MakeJob("rayjob-with-high-priority", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "2").
			WithPriorityClassName(priorityClassName).
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, highPriorityJob)).Should(gomega.Succeed())
		setInitStatus(highPriorityJob.Name, highPriorityJob.Namespace)

		ginkgo.By("High priority workload should be admitted")
		highPriorityWL := &kueue.Workload{}
		highPriorityLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(highPriorityJob.Name), Namespace: ns.Name}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, highPriorityLookupKey, highPriorityWL); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(highPriorityWL.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority workload should not be admitted")
		createdWorkload := &kueue.Workload{}
		lowPriorityLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(lowPriorityJob.Name), Namespace: ns.Name}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lowPriorityLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionFalse(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority rayJob should be suspended")
		createdJob = &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Delete high priority rayjob")
		gomega.Expect(k8sClient.Delete(ctx, highPriorityJob)).To(gomega.Succeed())
		gomega.EventuallyWithOffset(1, func() error {
			rayjob := &rayjobapi.RayJob{}
			return k8sClient.Get(ctx, client.ObjectKeyFromObject(highPriorityJob), rayjob)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// Manually delete workload because no garbage collection controller.
		gomega.Expect(k8sClient.Delete(ctx, highPriorityWL)).To(gomega.Succeed())
		gomega.EventuallyWithOffset(1, func() error {
			wl := &kueue.Workload{}
			return k8sClient.Get(ctx, highPriorityLookupKey, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())

		ginkgo.By("Low priority workload should be admitted again")
		createdWorkload = &kueue.Workload{}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lowPriorityLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority rayJob should be unsuspended")
		createdJob = &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
	})
})