sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/pytorchjob/pytorchjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pytorchjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/controller-runtime/pkg/client"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob"
	"sigs.k8s.io/kueue/pkg/workload"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{pytorchCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile PyTorchJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(&kftraining.PyTorchJob{})}
		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.PyTorchJobReplicaTypeMaster,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.PyTorchJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
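	// Unlike the suite above, this manager is started without
	// jobframework.WithManageJobsWithoutQueueName, so a Workload is only
	// created for PyTorchJobs that reference a queue.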
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{pytorchCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile jobs only when queue is set", func() {
		ginkgo.By("checking the workload is not created when queue name is not set")
		job := testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		createdJob := &kftraining.PyTorchJob{}
		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

		createdWorkload := &kueue.Workload{}
		wlLookupKey := types.NamespacedName{Name: workloadpytorchjob.GetWorkloadNameForPyTorchJob(jobName), Namespace: ns.Name}
		gomega.Eventually(func() bool {
			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created when queue name is set")
		createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})

	ginkgo.When("the queue has admission checks", func() {
		var (
			clusterQueueAc *kueue.ClusterQueue
			localQueue     *kueue.LocalQueue
			testFlavor     *kueue.ResourceFlavor
			jobLookupKey   *types.NamespacedName
			wlLookupKey    *types.NamespacedName
			admissionCheck *kueue.AdmissionCheck
		)

		ginkgo.BeforeEach(func() {
			admissionCheck = testing.MakeAdmissionCheck("check").ControllerName("ac-controller").Obj()
			gomega.Expect(k8sClient.Create(ctx, admissionCheck)).To(gomega.Succeed())
			util.SetAdmissionCheckActive(ctx, k8sClient, admissionCheck, metav1.ConditionTrue)
			clusterQueueAc = testing.MakeClusterQueue("prod-cq-with-checks").
				ResourceGroup(
					*testing.MakeFlavorQuotas("test-flavor").Resource(corev1.ResourceCPU, "5").Obj(),
				).AdmissionChecks("check").Obj()
			gomega.Expect(k8sClient.Create(ctx, clusterQueueAc)).Should(gomega.Succeed())
			localQueue = testing.MakeLocalQueue("queue", ns.Name).ClusterQueue(clusterQueueAc.Name).Obj()
			gomega.Expect(k8sClient.Create(ctx, localQueue)).To(gomega.Succeed())
			testFlavor = testing.MakeResourceFlavor("test-flavor").Label(instanceKey, "test-flavor").Obj()
			gomega.Expect(k8sClient.Create(ctx, testFlavor)).Should(gomega.Succeed())

			jobLookupKey = &types.NamespacedName{Name: jobName, Namespace: ns.Name}
			wlLookupKey = &types.NamespacedName{Name: workloadpytorchjob.GetWorkloadNameForPyTorchJob(jobName), Namespace: ns.Name}
		})

		ginkgo.AfterEach(func() {
			gomega.Expect(util.DeleteAdmissionCheck(ctx, k8sClient, admissionCheck)).To(gomega.Succeed())
			util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, testFlavor, true)
			gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
			util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueueAc, true)
		})

		ginkgo.It("labels and annotations should be propagated from admission check to job", func() {
			createdJob := &kftraining.PyTorchJob{}
			createdWorkload := &kueue.Workload{}

			ginkgo.By("creating the job with pod labels & annotations", func() {
				job := testingpytorchjob.MakePyTorchJob(jobName, ns.Name).
					PodAnnotation(kftraining.PyTorchJobReplicaTypeWorker, "old-ann-key", "old-ann-value").
					PodLabel(kftraining.PyTorchJobReplicaTypeWorker, "old-label-key", "old-label-value").
					Queue(localQueue.Name).
					Obj()
				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			})

			ginkgo.By("fetch the job and verify it is suspended as the checks are not ready", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			ginkgo.By("fetch the created workload", func() {
				gomega.Eventually(func() error {
					return k8sClient.Get(ctx, *wlLookupKey, createdWorkload)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("add labels & annotations to the admission check", func() {
				gomega.Eventually(func() error {
					var newWL kueue.Workload
					gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(createdWorkload), &newWL)).To(gomega.Succeed())
					workload.SetAdmissionCheckState(&newWL.Status.AdmissionChecks, kueue.AdmissionCheckState{
						Name:  "check",
						State: kueue.CheckStateReady,
						PodSetUpdates: []kueue.PodSetUpdate{
							{
								Name: "master",
							},
							{
								Name: "worker",
								Annotations: map[string]string{
									"ann1": "ann-value1",
								},
								Labels: map[string]string{
									"label1": "label-value1",
								},
								NodeSelector: map[string]string{
									"selector1": "selector-value1",
								},
								Tolerations: []corev1.Toleration{
									{
										Key:      "selector1",
										Value:    "selector-value1",
										Operator: corev1.TolerationOpEqual,
										Effect:   corev1.TaintEffectNoSchedule,
									},
								},
							},
						},
					})
					return k8sClient.Status().Update(ctx, &newWL)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("admit the workload", func() {
				admission := testing.MakeAdmission(clusterQueueAc.Name).
					PodSets(
						kueue.PodSetAssignment{
							Name: "master",
							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
								corev1.ResourceCPU: "test-flavor",
							},
							Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
						},
						kueue.PodSetAssignment{
							Name: "worker",
							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
								corev1.ResourceCPU: "test-flavor",
							},
							Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
						},
					).
					Obj()
				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			})

			ginkgo.By("await for the job to start", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			})

			ginkgo.By("verify the PodSetUpdates are propagated to the running job", func() {
				worker := createdJob.Spec.PyTorchReplicaSpecs[kftraining.PyTorchJobReplicaTypeWorker].Template
				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value1"))
				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value1"))
				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
				gomega.Expect(worker.Spec.NodeSelector).Should(gomega.HaveKeyWithValue(instanceKey, "test-flavor"))
				gomega.Expect(worker.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value1"))
				gomega.Expect(worker.Spec.Tolerations).Should(gomega.BeComparableTo(
					[]corev1.Toleration{
						{
							Key:      "selector1",
							Value:    "selector-value1",
							Operator: corev1.TolerationOpEqual,
							Effect:   corev1.TaintEffectNoSchedule,
						},
					},
				))
			})

			ginkgo.By("delete the localQueue to prevent readmission", func() {
				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("clear the workload's admission to stop the job", func() {
				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			})

			ginkgo.By("await for the job to be suspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			ginkgo.By("verify the PodSetUpdates are restored", func() {
				worker := createdJob.Spec.PyTorchReplicaSpecs[kftraining.PyTorchJobReplicaTypeWorker].Template
				gomega.Expect(worker.Annotations).ShouldNot(gomega.HaveKey("ann1"))
				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
				gomega.Expect(worker.Labels).ShouldNot(gomega.HaveKey("label1"))
				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
				gomega.Expect(worker.Spec.NodeSelector).ShouldNot(gomega.HaveKey(instanceKey))
				gomega.Expect(worker.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1"))
			})
		})
	})
})

var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{pytorchCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Parallelism(2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(&kftraining.PyTorchJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.PyTorchJobReplicaTypeMaster,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.PyTorchJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running PyTorchJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running PyTorchJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
admission", 424 }, 425 }), 426 ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{ 427 BeforeJobStatus: &kftraining.JobStatus{ 428 Conditions: []kftraining.JobCondition{ 429 { 430 Type: kftraining.JobRunning, 431 Status: corev1.ConditionTrue, 432 Reason: "Running", 433 }, 434 }, 435 }, 436 BeforeCondition: &metav1.Condition{ 437 Type: kueue.WorkloadPodsReady, 438 Status: metav1.ConditionTrue, 439 Reason: "PodsReady", 440 Message: "All pods were ready or succeeded since the workload admission", 441 }, 442 JobStatus: kftraining.JobStatus{ 443 Conditions: []kftraining.JobCondition{ 444 { 445 Type: kftraining.JobRunning, 446 Status: corev1.ConditionFalse, 447 Reason: "Suspended", 448 }, 449 }, 450 }, 451 Suspended: true, 452 WantCondition: &metav1.Condition{ 453 Type: kueue.WorkloadPodsReady, 454 Status: metav1.ConditionFalse, 455 Reason: "PodsReady", 456 Message: "Not all pods are ready or succeeded", 457 }, 458 }), 459 ) 460 }) 461 462 var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { 463 var ( 464 ns *corev1.Namespace 465 onDemandFlavor *kueue.ResourceFlavor 466 spotUntaintedFlavor *kueue.ResourceFlavor 467 clusterQueue *kueue.ClusterQueue 468 localQueue *kueue.LocalQueue 469 ) 470 471 ginkgo.BeforeAll(func() { 472 fwk = &framework.Framework{ 473 CRDPath: crdPath, 474 DepCRDPaths: []string{pytorchCrdPath}, 475 } 476 cfg := fwk.Init() 477 ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup()) 478 }) 479 ginkgo.AfterAll(func() { 480 fwk.Teardown() 481 }) 482 483 ginkgo.BeforeEach(func() { 484 ns = &corev1.Namespace{ 485 ObjectMeta: metav1.ObjectMeta{ 486 GenerateName: "core-", 487 }, 488 } 489 gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) 490 491 onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj() 492 gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed()) 493 494 spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj() 495 gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed()) 496 497 clusterQueue = testing.MakeClusterQueue("dev-clusterqueue"). 498 ResourceGroup( 499 *testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(), 500 *testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(), 501 ).Obj() 502 gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed()) 503 }) 504 ginkgo.AfterEach(func() { 505 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 506 util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true) 507 util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true) 508 gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed()) 509 }) 510 511 ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() { 512 ginkgo.By("creating localQueue") 513 localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj() 514 gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed()) 515 516 kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)( 517 testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Queue(localQueue.Name). 518 Request(kftraining.PyTorchJobReplicaTypeMaster, corev1.ResourceCPU, "3"). 519 Request(kftraining.PyTorchJobReplicaTypeWorker, corev1.ResourceCPU, "4"). 
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(&kftraining.PyTorchJob{})}

		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.PyTorchJobReplicaTypeMaster,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.PyTorchJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})

	ginkgo.When("The workload's admission is removed", func() {
		ginkgo.It("Should restore the original node selectors", func() {

			localQueue := testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
			job := testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.PyTorchJobReplicaTypeMaster, corev1.ResourceCPU, "3").
				Request(kftraining.PyTorchJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj()
			lookupKey := types.NamespacedName{Name: job.Name, Namespace: job.Namespace}
			createdJob := &kftraining.PyTorchJob{}

			nodeSelectors := func(j *kftraining.PyTorchJob) map[kftraining.ReplicaType]map[string]string {
				ret := map[kftraining.ReplicaType]map[string]string{}
				for k := range j.Spec.PyTorchReplicaSpecs {
					ret[k] = j.Spec.PyTorchReplicaSpecs[k].Template.Spec.NodeSelector
				}
				return ret
			}

			ginkgo.By("create a job", func() {
				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			})

			ginkgo.By("job should be suspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			// backup the node selectors
			originalNodeSelectors := nodeSelectors(createdJob)

			ginkgo.By("create a localQueue", func() {
				gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("job should be unsuspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			})

			ginkgo.By("the node selectors should be updated", func() {
				gomega.Eventually(func() map[kftraining.ReplicaType]map[string]string {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return nodeSelectors(createdJob)
				}, util.Timeout, util.Interval).ShouldNot(gomega.Equal(originalNodeSelectors))
			})

			ginkgo.By("delete the localQueue to prevent readmission", func() {
				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("clear the workload's admission to stop the job", func() {
				wl := &kueue.Workload{}
				wlKey := types.NamespacedName{Name: workloadpytorchjob.GetWorkloadNameForPyTorchJob(job.Name), Namespace: job.Namespace}
				gomega.Expect(k8sClient.Get(ctx, wlKey, wl)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, wl, nil)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, wl)
			})

			ginkgo.By("the node selectors should be restored", func() {
				gomega.Eventually(func() map[kftraining.ReplicaType]map[string]string {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return nodeSelectors(createdJob)
				}, util.Timeout, util.Interval).Should(gomega.Equal(originalNodeSelectors))
			})
		})
	})
})