sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/mpijob/mpijob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mpijob

import (
	"fmt"

	"github.com/google/go-cmp/cmp/cmpopts"
	kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadmpijob "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
	testingmpijob "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob"
	"sigs.k8s.io/kueue/pkg/workload"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
)

var (
	ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mpiCrdPath},
		}

		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(false, jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
		wlLookupKey = types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile MPIJobs", func() {
		ginkgo.By("checking the job gets suspended when created unsuspended")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(int32(priorityValue)).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())

		job := testingmpijob.MakeMPIJob(jobName, ns.Name).PriorityClass(priorityClassName).Obj()
		err := k8sClient.Create(ctx, job)
		gomega.Expect(err).To(gomega.Succeed())
		createdJob := &kubeflow.MPIJob{}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: ns.Name}, createdJob); err != nil {
				return false
			}
			return createdJob.Spec.RunPolicy.Suspend != nil && *createdJob.Spec.RunPolicy.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created without queue assigned")
		createdWorkload := &kueue.Workload{}
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
		gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set")
		gomega.Expect(metav1.IsControlledBy(createdWorkload, createdJob)).To(gomega.BeTrue(), "The Workload should be owned by the Job")

		ginkgo.By("checking the workload is created with priority and priorityName")
		gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName))
		gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(int32(priorityValue)))

		ginkgo.By("checking the workload is updated with queue name when the job does")
		jobQueueName := "test-queue"
		createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.QueueName == jobQueueName
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking a second non-matching workload is deleted")
		secondWl := &kueue.Workload{
			ObjectMeta: metav1.ObjectMeta{
				Name:      workloadmpijob.GetWorkloadNameForMPIJob("second-workload"),
				Namespace: createdWorkload.Namespace,
			},
			Spec: *createdWorkload.Spec.DeepCopy(),
		}
		gomega.Expect(ctrl.SetControllerReference(createdJob, secondWl, scheme.Scheme)).Should(gomega.Succeed())
		secondWl.Spec.PodSets[0].Count += 1

		gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			wl := &kueue.Workload{}
			key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace}
			return k8sClient.Get(ctx, key, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// check the original wl is still there
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())

		ginkgo.By("checking the job is unsuspended when workload is assigned")
		onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())
		spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotFlavor)).Should(gomega.Succeed())
		clusterQueue := testing.MakeClusterQueue("cluster-queue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		admission := testing.MakeAdmission(clusterQueue.Name).
			PodSets(
				kueue.PodSetAssignment{
					Name: "Launcher",
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "on-demand",
					},
					Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
				},
				kueue.PodSetAssignment{
					Name: "Worker",
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "spot",
					},
					Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
				},
			).
			Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !*createdJob.Spec.RunPolicy.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(len(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return len(createdWorkload.Status.Conditions) == 2
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the job gets suspended when parallelism changes and the added node selectors are removed")
		parallelism := ptr.Deref(job.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas, 1)
		newParallelism := int32(parallelism + 1)
		createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas = &newParallelism
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return createdJob.Spec.RunPolicy.Suspend != nil && *createdJob.Spec.RunPolicy.Suspend &&
				len(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector) == 0
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String()))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is updated with new count")
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.PodSets[1].Count == newParallelism
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())

		ginkgo.By("checking the job is unsuspended and selectors added when workload is assigned again")
		admission = testing.MakeAdmission(clusterQueue.Name).
			PodSets(
				kueue.PodSetAssignment{
					Name: "Launcher",
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "on-demand",
					},
					Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
				},
				kueue.PodSetAssignment{
					Name: "Worker",
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "spot",
					},
					Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
				},
			).
			Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !*createdJob.Spec.RunPolicy.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(len(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return len(createdWorkload.Status.Conditions) == 2
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is finished when job is completed")
		createdJob.Status.Conditions = append(createdJob.Status.Conditions,
			kubeflow.JobCondition{
				Type:               kubeflow.JobSucceeded,
				Status:             corev1.ConditionTrue,
				LastTransitionTime: metav1.Now(),
			})
		gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			err := k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			if err != nil || len(createdWorkload.Status.Conditions) == 2 {
				return false
			}

			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadFinished)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	})

	ginkgo.When("the queue has admission checks", func() {
		var (
			clusterQueueAc *kueue.ClusterQueue
			localQueue     *kueue.LocalQueue
			testFlavor     *kueue.ResourceFlavor
			jobLookupKey   *types.NamespacedName
			wlLookupKey    *types.NamespacedName
			admissionCheck *kueue.AdmissionCheck
		)

		ginkgo.BeforeEach(func() {
			admissionCheck = testing.MakeAdmissionCheck("check").ControllerName("ac-controller").Obj()
			gomega.Expect(k8sClient.Create(ctx, admissionCheck)).To(gomega.Succeed())
			util.SetAdmissionCheckActive(ctx, k8sClient, admissionCheck, metav1.ConditionTrue)
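			// The ClusterQueue below lists "check" in its admissionChecks, so workloads submitted
			// through it need both a quota reservation and a ready admission check before they run.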
			clusterQueueAc = testing.MakeClusterQueue("prod-cq-with-checks").
				ResourceGroup(
					*testing.MakeFlavorQuotas("test-flavor").Resource(corev1.ResourceCPU, "5").Obj(),
				).AdmissionChecks("check").Obj()
			gomega.Expect(k8sClient.Create(ctx, clusterQueueAc)).Should(gomega.Succeed())
			localQueue = testing.MakeLocalQueue("queue", ns.Name).ClusterQueue(clusterQueueAc.Name).Obj()
			gomega.Expect(k8sClient.Create(ctx, localQueue)).To(gomega.Succeed())
			testFlavor = testing.MakeResourceFlavor("test-flavor").Label(instanceKey, "test-flavor").Obj()
			gomega.Expect(k8sClient.Create(ctx, testFlavor)).Should(gomega.Succeed())

			jobLookupKey = &types.NamespacedName{Name: jobName, Namespace: ns.Name}
			wlLookupKey = &types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(jobName), Namespace: ns.Name}
		})

		ginkgo.AfterEach(func() {
			gomega.Expect(util.DeleteAdmissionCheck(ctx, k8sClient, admissionCheck)).To(gomega.Succeed())
			util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, testFlavor, true)
			gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
			util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueueAc, true)
		})

		ginkgo.It("labels and annotations should be propagated from admission check to job", func() {
			createdJob := &kubeflow.MPIJob{}
			createdWorkload := &kueue.Workload{}

			ginkgo.By("creating the job with pod labels & annotations", func() {
				job := testingmpijob.MakeMPIJob(jobName, ns.Name).
					Queue(localQueue.Name).
					PodAnnotation(kubeflow.MPIReplicaTypeWorker, "old-ann-key", "old-ann-value").
					PodLabel(kubeflow.MPIReplicaTypeWorker, "old-label-key", "old-label-value").
					Obj()
				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			})

			ginkgo.By("fetch the job and verify it is suspended as the checks are not ready", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			ginkgo.By("fetch the created workload", func() {
				gomega.Eventually(func() error {
					return k8sClient.Get(ctx, *wlLookupKey, createdWorkload)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("add labels & annotations to the admission check", func() {
				gomega.Eventually(func() error {
					var newWL kueue.Workload
					gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(createdWorkload), &newWL)).To(gomega.Succeed())
					workload.SetAdmissionCheckState(&newWL.Status.AdmissionChecks, kueue.AdmissionCheckState{
						Name:  "check",
						State: kueue.CheckStateReady,
						PodSetUpdates: []kueue.PodSetUpdate{
							{
								Name: "launcher",
								Annotations: map[string]string{
									"ann1": "ann-value-for-launcher",
								},
								Labels: map[string]string{
									"label1": "label-value-for-launcher",
								},
								NodeSelector: map[string]string{
									"selector1": "selector-value-for-launcher",
								},
							},
							{
								Name: "worker",
								Annotations: map[string]string{
									"ann1": "ann-value1",
								},
								Labels: map[string]string{
									"label1": "label-value1",
								},
								NodeSelector: map[string]string{
									"selector1": "selector-value1",
								},
								Tolerations: []corev1.Toleration{
									{
										Key:      "selector1",
										Value:    "selector-value1",
										Operator: corev1.TolerationOpEqual,
										Effect:   corev1.TaintEffectNoSchedule,
									},
								},
							},
						},
					})
					return k8sClient.Status().Update(ctx, &newWL)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("admit the workload", func() {
				admission := testing.MakeAdmission(clusterQueueAc.Name).
					PodSets(
						kueue.PodSetAssignment{
							Name: "launcher",
							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
								corev1.ResourceCPU: "test-flavor",
							},
							Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
						},
						kueue.PodSetAssignment{
							Name: "worker",
							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
								corev1.ResourceCPU: "test-flavor",
							},
							Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
						},
					).
					Obj()
				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			})

			ginkgo.By("wait for the job to start", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			})

			ginkgo.By("verify the PodSetUpdates are propagated to the running job, for worker", func() {
				worker := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template
				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value1"))
				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value1"))
				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
				gomega.Expect(worker.Spec.NodeSelector).Should(gomega.HaveKeyWithValue(instanceKey, "test-flavor"))
				gomega.Expect(worker.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value1"))
				gomega.Expect(worker.Spec.Tolerations).Should(gomega.BeComparableTo(
					[]corev1.Toleration{
						{
							Key:      "selector1",
							Value:    "selector-value1",
							Operator: corev1.TolerationOpEqual,
							Effect:   corev1.TaintEffectNoSchedule,
						},
					},
				))
			})

			ginkgo.By("verify the PodSetUpdates are propagated to the running job, for launcher", func() {
				launcher := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template
				gomega.Expect(launcher.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value-for-launcher"))
				gomega.Expect(launcher.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value-for-launcher"))
				gomega.Expect(launcher.Spec.NodeSelector).Should(gomega.HaveKeyWithValue(instanceKey, "test-flavor"))
				gomega.Expect(launcher.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value-for-launcher"))
			})

			ginkgo.By("delete the localQueue to prevent readmission", func() {
				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("clear the workload's admission to stop the job", func() {
				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			})

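			// With the quota reservation cleared, the controller should suspend the MPIJob again
			// and roll back the PodSetUpdates it applied at admission; the steps below verify this.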
			ginkgo.By("wait for the job to be suspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			ginkgo.By("verify the PodSetUpdates are restored for worker", func() {
				worker := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template
				gomega.Expect(worker.Annotations).ShouldNot(gomega.HaveKey("ann1"))
				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
				gomega.Expect(worker.Labels).ShouldNot(gomega.HaveKey("label1"))
				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
				gomega.Expect(worker.Spec.NodeSelector).ShouldNot(gomega.HaveKey(instanceKey))
				gomega.Expect(worker.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1"))
			})

			ginkgo.By("verify the PodSetUpdates are restored for launcher", func() {
				launcher := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template
				gomega.Expect(launcher.Annotations).ShouldNot(gomega.HaveKey("ann1"))
				gomega.Expect(launcher.Labels).ShouldNot(gomega.HaveKey("label1"))
				gomega.Expect(launcher.Spec.NodeSelector).ShouldNot(gomega.HaveKey(instanceKey))
				gomega.Expect(launcher.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1"))
			})
		})
	})
})

var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mpiCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(true))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns             *corev1.Namespace
		childLookupKey types.NamespacedName
		parentJobName  = jobName + "-parent"
		childJobName   = jobName + "-child"
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
		childLookupKey = types.NamespacedName{Name: childJobName, Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile jobs only when queue is set", func() {
		ginkgo.By("checking the workload is not created when queue name is not set")
		job := testingmpijob.MakeMPIJob(jobName, ns.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		createdJob := &kubeflow.MPIJob{}
		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

		createdWorkload := &kueue.Workload{}
		wlLookupKey := types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(jobName), Namespace: ns.Name}
		gomega.Eventually(func() bool {
			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created when queue name is set")
		jobQueueName := "test-queue"
		createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
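		// Updating the job with the queue annotation should now trigger creation of its Workload.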
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})

	ginkgo.It("Should suspend a job if the parent workload does not exist", func() {
		ginkgo.By("Creating the parent job which has a queue name")
		parentJob := testingmpijob.MakeMPIJob(parentJobName, ns.Name).
			UID(parentJobName).
			Queue("test").
			Suspend(false).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, parentJob)).Should(gomega.Succeed())

		ginkgo.By("Creating the child job which uses the parent workload annotation")
		childJob := testingjob.MakeJob(childJobName, ns.Name).
			OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind).
			Suspend(false).
			ParentWorkload("non-existing-parent-workload").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, childJob)).Should(gomega.Succeed())

		ginkgo.By("checking that the child job is suspended")
		gomega.Eventually(func() *bool {
			gomega.Expect(k8sClient.Get(ctx, childLookupKey, childJob)).Should(gomega.Succeed())
			return childJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
	})

	ginkgo.It("Should not suspend a child job if the parent job doesn't have a queue name", func() {
		ginkgo.By("Creating the parent job which doesn't have a queue name")
		parentJob := testingmpijob.MakeMPIJob(parentJobName, ns.Name).
			UID(parentJobName).
			Suspend(false).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, parentJob)).Should(gomega.Succeed())

		ginkgo.By("Creating the child job which has ownerReference with known existing workload owner")
		childJob := testingjob.MakeJob(childJobName, ns.Name).
			OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind).
			ParentWorkload(jobframework.GetWorkloadNameForOwnerWithGVK(parentJobName, kubeflow.SchemeGroupVersionKind)).
			Suspend(false).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, childJob)).Should(gomega.Succeed())

		ginkgo.By("Checking that the child job isn't suspended")
		gomega.Eventually(func() *bool {
			gomega.Expect(k8sClient.Get(ctx, childLookupKey, childJob)).Should(gomega.Succeed())
			return childJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
	})
})

var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	type podsReadyTestSpec struct {
		beforeJobStatus *kubeflow.JobStatus
		beforeCondition *metav1.Condition
		jobStatus       kubeflow.JobStatus
		suspended       bool
		wantCondition   *metav1.Condition
	}

	var (
		ns            *corev1.Namespace
		wlLookupKey   types.NamespacedName
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mpiCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(false, jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
		wlLookupKey = types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec podsReadyTestSpec) {
			ginkgo.By("Create a job")
			job := testingmpijob.MakeMPIJob(jobName, ns.Name).Parallelism(2).Obj()
			jobQueueName := "test-queue"
			job.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
			gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
			createdJob := &kubeflow.MPIJob{}
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			ginkgo.By("Fetch the workload created for the job")
			createdWorkload := &kueue.Workload{}
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("Admit the workload created for the job")
			admission := testing.MakeAdmission("foo").
				PodSets(
					kueue.PodSetAssignment{
						Name: "Launcher",
						Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
							corev1.ResourceCPU: "default",
						},
						Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
					},
					kueue.PodSetAssignment{
						Name: "Worker",
						Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
							corev1.ResourceCPU: "default",
						},
						Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
					},
				).
				Obj()
			gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
			util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())

			ginkgo.By("Wait for the job to be unsuspended")
			gomega.Eventually(func() *bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
				return createdJob.Spec.RunPolicy.Suspend
			}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))

			if podsReadyTestSpec.beforeJobStatus != nil {
				ginkgo.By("Update the job status to simulate its initial progress towards completion")
				createdJob.Status = *podsReadyTestSpec.beforeJobStatus
				gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
			}

			if podsReadyTestSpec.beforeCondition != nil {
				ginkgo.By("Update the workload status")
				gomega.Eventually(func() *metav1.Condition {
					gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
					return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
				}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.beforeCondition, ignoreConditionTimestamps))
			}

			ginkgo.By("Update the job status to simulate its progress towards completion")
			createdJob.Status = podsReadyTestSpec.jobStatus
			gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			if podsReadyTestSpec.suspended {
				ginkgo.By("Unset admission of the workload to suspend the job")
				gomega.Eventually(func() error {
					// The update may need to be retried due to a conflict, as the workload
					// also gets updated when the job status is set.
					if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
						return err
					}
					return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			}

			ginkgo.By("Verify the PodsReady condition is added")
			gomega.Eventually(func() *metav1.Condition {
				gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.wantCondition, ignoreConditionTimestamps))
		},
		ginkgo.Entry("No progress", podsReadyTestSpec{
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running MPIJob", podsReadyTestSpec{
			jobStatus: kubeflow.JobStatus{
				Conditions: []kubeflow.JobCondition{
					{
						Type:   kubeflow.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running MPIJob; PodsReady=False before", podsReadyTestSpec{
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			jobStatus: kubeflow.JobStatus{
				Conditions: []kubeflow.JobCondition{
					{
						Type:   kubeflow.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", podsReadyTestSpec{
			beforeJobStatus: &kubeflow.JobStatus{
				Conditions: []kubeflow.JobCondition{
					{
						Type:   kubeflow.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			jobStatus: kubeflow.JobStatus{
				Conditions: []kubeflow.JobCondition{
					{
						Type:   kubeflow.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			suspended: true,
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mpiCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, spotUntaintedFlavor, true)
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("checking a dev job starts")
		job := testingmpijob.MakeMPIJob("dev-job", ns.Name).Queue(localQueue.Name).
			Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3").
			Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "4").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		createdJob := &kubeflow.MPIJob{}
		gomega.Eventually(func() *bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job.Name, Namespace: job.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.RunPolicy.Suspend
		}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
		gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
	})

	ginkgo.When("The workload's admission is removed", func() {
		ginkgo.It("Should restore the original node selectors", func() {
			localQueue := testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
			job := testingmpijob.MakeMPIJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3").
				Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj()
			lookupKey := types.NamespacedName{Name: job.Name, Namespace: job.Namespace}
			createdJob := &kubeflow.MPIJob{}

			nodeSelectors := func(j *kubeflow.MPIJob) map[kubeflow.MPIReplicaType]map[string]string {
				ret := map[kubeflow.MPIReplicaType]map[string]string{}
				for k := range j.Spec.MPIReplicaSpecs {
					ret[k] = j.Spec.MPIReplicaSpecs[k].Template.Spec.NodeSelector
				}
				return ret
			}

			ginkgo.By("create a job", func() {
				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			})

			ginkgo.By("job should be suspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			// back up the node selectors
			originalNodeSelectors := nodeSelectors(createdJob)

			ginkgo.By("create a localQueue", func() {
				gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("job should be unsuspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.RunPolicy.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			})

			ginkgo.By("the node selectors should be updated", func() {
				gomega.Eventually(func() map[kubeflow.MPIReplicaType]map[string]string {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return nodeSelectors(createdJob)
				}, util.Timeout, util.Interval).ShouldNot(gomega.Equal(originalNodeSelectors))
			})

			ginkgo.By("delete the localQueue to prevent readmission", func() {
				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("clear the workload's admission to stop the job", func() {
				wl := &kueue.Workload{}
				wlKey := types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(job.Name), Namespace: job.Namespace}
				gomega.Expect(k8sClient.Get(ctx, wlKey, wl)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, wl, nil)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, wl)
			})

			ginkgo.By("the node selectors should be restored", func() {
				gomega.Eventually(func() map[kubeflow.MPIReplicaType]map[string]string {
					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
					return nodeSelectors(createdJob)
				}, util.Timeout, util.Interval).Should(gomega.Equal(originalNodeSelectors))
			})
		})
	})
})