sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/jobset/jobset_controller_test.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package jobset 18 19 import ( 20 "fmt" 21 22 "github.com/google/go-cmp/cmp/cmpopts" 23 "github.com/onsi/ginkgo/v2" 24 "github.com/onsi/gomega" 25 corev1 "k8s.io/api/core/v1" 26 apierrors "k8s.io/apimachinery/pkg/api/errors" 27 apimeta "k8s.io/apimachinery/pkg/api/meta" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/types" 30 "k8s.io/client-go/kubernetes/scheme" 31 "k8s.io/utils/ptr" 32 ctrl "sigs.k8s.io/controller-runtime" 33 "sigs.k8s.io/controller-runtime/pkg/client" 34 jobsetapi "sigs.k8s.io/jobset/api/jobset/v1alpha2" 35 36 configapi "sigs.k8s.io/kueue/apis/config/v1beta1" 37 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 38 "sigs.k8s.io/kueue/pkg/controller/constants" 39 "sigs.k8s.io/kueue/pkg/controller/jobframework" 40 workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset" 41 "sigs.k8s.io/kueue/pkg/util/testing" 42 testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset" 43 "sigs.k8s.io/kueue/pkg/workload" 44 "sigs.k8s.io/kueue/test/integration/framework" 45 "sigs.k8s.io/kueue/test/util" 46 ) 47 48 const ( 49 jobSetName = "test-job" 50 instanceKey = "cloud.provider.com/instance" 51 priorityClassName = "test-priority-class" 52 priorityValue int32 = 10 53 ) 54 55 var ( 56 ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime") 57 ) 58 59 var _ = ginkgo.Describe("JobSet controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { 60 ginkgo.BeforeAll(func() { 61 fwk = &framework.Framework{ 62 CRDPath: crdPath, 63 DepCRDPaths: []string{jobsetCrdPath}, 64 } 65 cfg = fwk.Init() 66 ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true))) 67 }) 68 ginkgo.AfterAll(func() { 69 fwk.Teardown() 70 }) 71 72 var ( 73 ns *corev1.Namespace 74 wlLookupKey types.NamespacedName 75 ) 76 ginkgo.BeforeEach(func() { 77 ns = &corev1.Namespace{ 78 ObjectMeta: metav1.ObjectMeta{ 79 GenerateName: "jobset-", 80 }, 81 } 82 gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) 83 84 wlLookupKey = types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name} 85 }) 86 ginkgo.AfterEach(func() { 87 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 88 }) 89 90 ginkgo.It("Should reconcile JobSets", func() { 91 ginkgo.By("checking the JobSet gets suspended when created unsuspended") 92 priorityClass := testing.MakePriorityClass(priorityClassName). 93 PriorityValue(priorityValue).Obj() 94 gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed()) 95 96 jobSet := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs( 97 testingjobset.ReplicatedJobRequirements{ 98 Name: "replicated-job-1", 99 Replicas: 1, 100 Parallelism: 1, 101 Completions: 1, 102 }, testingjobset.ReplicatedJobRequirements{ 103 Name: "replicated-job-2", 104 Replicas: 3, 105 Parallelism: 1, 106 Completions: 1, 107 }, 108 ).Suspend(false). 109 PriorityClass(priorityClassName). 110 Obj() 111 err := k8sClient.Create(ctx, jobSet) 112 gomega.Expect(err).To(gomega.Succeed()) 113 createdJobSet := &jobsetapi.JobSet{} 114 115 gomega.Eventually(func() bool { 116 if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobSetName, Namespace: ns.Name}, createdJobSet); err != nil { 117 return false 118 } 119 return ptr.Deref(createdJobSet.Spec.Suspend, false) 120 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 121 122 ginkgo.By("checking the workload is created without queue assigned") 123 createdWorkload := &kueue.Workload{} 124 gomega.Eventually(func() error { 125 return k8sClient.Get(ctx, wlLookupKey, createdWorkload) 126 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 127 gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set") 128 gomega.Expect(metav1.IsControlledBy(createdWorkload, createdJobSet)).To(gomega.BeTrue(), "The Workload should be owned by the JobSet") 129 130 ginkgo.By("checking the workload is created with priority and priorityName") 131 gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName)) 132 gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(priorityValue)) 133 134 ginkgo.By("checking the workload is updated with queue name when the JobSet does") 135 jobSetQueueName := "test-queue" 136 createdJobSet.Annotations = map[string]string{constants.QueueLabel: jobSetQueueName} 137 gomega.Expect(k8sClient.Update(ctx, createdJobSet)).Should(gomega.Succeed()) 138 gomega.Eventually(func() bool { 139 if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { 140 return false 141 } 142 return createdWorkload.Spec.QueueName == jobSetQueueName 143 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 144 145 ginkgo.By("checking a second non-matching workload is deleted") 146 secondWl := &kueue.Workload{ 147 ObjectMeta: metav1.ObjectMeta{ 148 Name: workloadjobset.GetWorkloadNameForJobSet("second-workload"), 149 Namespace: createdWorkload.Namespace, 150 }, 151 Spec: *createdWorkload.Spec.DeepCopy(), 152 } 153 gomega.Expect(ctrl.SetControllerReference(createdJobSet, secondWl, scheme.Scheme)).Should(gomega.Succeed()) 154 secondWl.Spec.PodSets[0].Count += 1 155 gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed()) 156 gomega.Eventually(func() error { 157 wl := &kueue.Workload{} 158 key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace} 159 return k8sClient.Get(ctx, key, wl) 160 }, util.Timeout, util.Interval).Should(testing.BeNotFoundError()) 161 // check the original wl is still there 162 //gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed()) 163 gomega.Eventually(func() error { 164 return k8sClient.Get(ctx, wlLookupKey, createdWorkload) 165 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 166 167 ginkgo.By("checking the JobSet is unsuspended when workload is assigned") 168 onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj() 169 gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed()) 170 spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj() 171 gomega.Expect(k8sClient.Create(ctx, spotFlavor)).Should(gomega.Succeed()) 172 clusterQueue := testing.MakeClusterQueue("cluster-queue"). 173 ResourceGroup( 174 *testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(), 175 *testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(), 176 ).Obj() 177 admission := testing.MakeAdmission(clusterQueue.Name).PodSets( 178 kueue.PodSetAssignment{ 179 Name: createdWorkload.Spec.PodSets[0].Name, 180 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 181 corev1.ResourceCPU: "on-demand", 182 }, 183 }, kueue.PodSetAssignment{ 184 Name: createdWorkload.Spec.PodSets[1].Name, 185 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 186 corev1.ResourceCPU: "spot", 187 }, 188 }, 189 ).Obj() 190 gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) 191 util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) 192 lookupKey := types.NamespacedName{Name: jobSetName, Namespace: ns.Name} 193 gomega.Eventually(func() bool { 194 if err := k8sClient.Get(ctx, lookupKey, createdJobSet); err != nil { 195 return false 196 } 197 return !ptr.Deref(createdJobSet.Spec.Suspend, false) 198 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 199 gomega.Eventually(func() bool { 200 ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name)) 201 return ok 202 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 203 gomega.Expect(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector).Should(gomega.Equal(map[string]string{instanceKey: onDemandFlavor.Name})) 204 gomega.Expect(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector).Should(gomega.Equal(map[string]string{instanceKey: spotFlavor.Name})) 205 gomega.Eventually(func() bool { 206 if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { 207 return false 208 } 209 return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved) 210 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 211 212 ginkgo.By("checking the JobSet gets suspended when parallelism changes and the added node selectors are removed") 213 parallelism := jobSet.Spec.ReplicatedJobs[0].Replicas 214 newParallelism := parallelism + 1 215 createdJobSet.Spec.ReplicatedJobs[0].Replicas = newParallelism 216 gomega.Expect(k8sClient.Update(ctx, createdJobSet)).Should(gomega.Succeed()) 217 gomega.Eventually(func() bool { 218 if err := k8sClient.Get(ctx, lookupKey, createdJobSet); err != nil { 219 return false 220 } 221 return createdJobSet.Spec.Suspend != nil && *createdJobSet.Spec.Suspend && 222 len(jobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector) == 0 223 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 224 gomega.Eventually(func() bool { 225 ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String())) 226 return ok 227 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 228 229 ginkgo.By("checking the workload is updated with new count") 230 gomega.Eventually(func() bool { 231 if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { 232 return false 233 } 234 return createdWorkload.Spec.PodSets[0].Count == int32(newParallelism) 235 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 236 gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil()) 237 238 ginkgo.By("checking the JobSet is unsuspended and selectors added when workload is assigned again") 239 admission = testing.MakeAdmission(clusterQueue.Name). 240 PodSets( 241 kueue.PodSetAssignment{ 242 Name: "replicated-job-1", 243 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 244 corev1.ResourceCPU: "on-demand", 245 }, 246 Count: ptr.To(createdWorkload.Spec.PodSets[0].Count), 247 }, 248 kueue.PodSetAssignment{ 249 Name: "replicated-job-2", 250 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 251 corev1.ResourceCPU: "spot", 252 }, 253 Count: ptr.To(createdWorkload.Spec.PodSets[1].Count), 254 }, 255 ). 256 Obj() 257 gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) 258 util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) 259 gomega.Eventually(func() bool { 260 if err := k8sClient.Get(ctx, lookupKey, createdJobSet); err != nil { 261 return false 262 } 263 return !*createdJobSet.Spec.Suspend 264 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 265 266 gomega.Expect(len(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector)).Should(gomega.Equal(1)) 267 gomega.Expect(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) 268 gomega.Expect(len(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector)).Should(gomega.Equal(1)) 269 gomega.Expect(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name)) 270 271 ginkgo.By("checking the workload is finished when JobSet is completed") 272 createdJobSet.Status.Conditions = append(createdJobSet.Status.Conditions, 273 metav1.Condition{ 274 Type: string(jobsetapi.JobSetCompleted), 275 Status: metav1.ConditionStatus(corev1.ConditionTrue), 276 Reason: "AllJobsCompleted", 277 Message: "jobset completed successfully", 278 LastTransitionTime: metav1.Now(), 279 }) 280 gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet)).Should(gomega.Succeed()) 281 gomega.Eventually(func() bool { 282 err := k8sClient.Get(ctx, wlLookupKey, createdWorkload) 283 if err != nil { 284 return false 285 } 286 return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadFinished) 287 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 288 }) 289 290 ginkgo.When("the queue has admission checks", func() { 291 var ( 292 clusterQueueAc *kueue.ClusterQueue 293 localQueue *kueue.LocalQueue 294 testFlavor *kueue.ResourceFlavor 295 jobLookupKey *types.NamespacedName 296 wlLookupKey *types.NamespacedName 297 admissionCheck *kueue.AdmissionCheck 298 ) 299 300 ginkgo.BeforeEach(func() { 301 admissionCheck = testing.MakeAdmissionCheck("check").ControllerName("ac-controller").Obj() 302 gomega.Expect(k8sClient.Create(ctx, admissionCheck)).To(gomega.Succeed()) 303 util.SetAdmissionCheckActive(ctx, k8sClient, admissionCheck, metav1.ConditionTrue) 304 clusterQueueAc = testing.MakeClusterQueue("prod-cq-with-checks"). 305 ResourceGroup( 306 *testing.MakeFlavorQuotas("test-flavor").Resource(corev1.ResourceCPU, "5").Obj(), 307 ).AdmissionChecks("check").Obj() 308 gomega.Expect(k8sClient.Create(ctx, clusterQueueAc)).Should(gomega.Succeed()) 309 localQueue = testing.MakeLocalQueue("queue", ns.Name).ClusterQueue(clusterQueueAc.Name).Obj() 310 gomega.Expect(k8sClient.Create(ctx, localQueue)).To(gomega.Succeed()) 311 testFlavor = testing.MakeResourceFlavor("test-flavor").Label(instanceKey, "test-flavor").Obj() 312 gomega.Expect(k8sClient.Create(ctx, testFlavor)).Should(gomega.Succeed()) 313 314 jobLookupKey = &types.NamespacedName{Name: jobSetName, Namespace: ns.Name} 315 wlLookupKey = &types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name} 316 }) 317 318 ginkgo.AfterEach(func() { 319 gomega.Expect(util.DeleteAdmissionCheck(ctx, k8sClient, admissionCheck)).To(gomega.Succeed()) 320 util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, testFlavor, true) 321 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 322 util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueueAc, true) 323 }) 324 325 ginkgo.It("labels and annotations should be propagated from admission check to job", func() { 326 createdJob := &jobsetapi.JobSet{} 327 createdWorkload := &kueue.Workload{} 328 329 ginkgo.By("creating the job", func() { 330 job := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs( 331 testingjobset.ReplicatedJobRequirements{ 332 Name: "replicated-job-1", 333 Replicas: 1, 334 Parallelism: 1, 335 Completions: 1, 336 }, testingjobset.ReplicatedJobRequirements{ 337 Name: "replicated-job-2", 338 Replicas: 3, 339 Parallelism: 1, 340 Completions: 1, 341 }, 342 ). 343 Queue("queue"). 344 Request("replicated-job-1", corev1.ResourceCPU, "1"). 345 Request("replicated-job-2", corev1.ResourceCPU, "1"). 346 Obj() 347 job.Spec.ReplicatedJobs[0].Template.Spec.Template.Annotations = map[string]string{ 348 "old-ann-key": "old-ann-value", 349 } 350 job.Spec.ReplicatedJobs[0].Template.Spec.Template.Labels = map[string]string{ 351 "old-label-key": "old-label-value", 352 } 353 gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed()) 354 }) 355 356 ginkgo.By("fetch the job and verify it is suspended as the checks are not ready", func() { 357 gomega.Eventually(func() *bool { 358 gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed()) 359 return createdJob.Spec.Suspend 360 }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true))) 361 }) 362 363 ginkgo.By("checking the workload is created", func() { 364 gomega.Eventually(func() error { 365 return k8sClient.Get(ctx, *wlLookupKey, createdWorkload) 366 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 367 }) 368 369 ginkgo.By("add labels & annotations to the admission check in PodSetUpdates", func() { 370 gomega.Eventually(func() error { 371 var newWL kueue.Workload 372 gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(createdWorkload), &newWL)).To(gomega.Succeed()) 373 workload.SetAdmissionCheckState(&newWL.Status.AdmissionChecks, kueue.AdmissionCheckState{ 374 Name: "check", 375 State: kueue.CheckStateReady, 376 PodSetUpdates: []kueue.PodSetUpdate{ 377 { 378 Name: "replicated-job-1", 379 Annotations: map[string]string{ 380 "ann1": "ann-value1", 381 }, 382 Labels: map[string]string{ 383 "label1": "label-value1", 384 }, 385 NodeSelector: map[string]string{ 386 "selector1": "selector-value1", 387 }, 388 Tolerations: []corev1.Toleration{ 389 { 390 Key: "selector1", 391 Value: "selector-value1", 392 Operator: corev1.TolerationOpEqual, 393 Effect: corev1.TaintEffectNoSchedule, 394 }, 395 }, 396 }, 397 { 398 Name: "replicated-job-2", 399 Annotations: map[string]string{ 400 "ann1": "ann-value2", 401 }, 402 Labels: map[string]string{ 403 "label1": "label-value2", 404 }, 405 NodeSelector: map[string]string{ 406 "selector1": "selector-value2", 407 }, 408 }, 409 }, 410 }) 411 return k8sClient.Status().Update(ctx, &newWL) 412 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 413 }) 414 415 ginkgo.By("admit the workload", func() { 416 admission := testing.MakeAdmission(clusterQueueAc.Name). 417 PodSets( 418 kueue.PodSetAssignment{ 419 Name: createdWorkload.Spec.PodSets[0].Name, 420 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 421 corev1.ResourceCPU: "test-flavor", 422 }, 423 }, kueue.PodSetAssignment{ 424 Name: createdWorkload.Spec.PodSets[1].Name, 425 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 426 corev1.ResourceCPU: "test-flavor", 427 }, 428 }, 429 ). 430 Obj() 431 gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed()) 432 gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) 433 util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) 434 }) 435 436 ginkgo.By("await for the job to be admitted", func() { 437 gomega.Eventually(func() *bool { 438 gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed()) 439 return createdJob.Spec.Suspend 440 }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false))) 441 }) 442 443 ginkgo.By("verify the PodSetUpdates are propagated to the running job, for replicated-job-1", func() { 444 replica1 := createdJob.Spec.ReplicatedJobs[0].Template.Spec.Template 445 gomega.Expect(replica1.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value1")) 446 gomega.Expect(replica1.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value")) 447 gomega.Expect(replica1.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value1")) 448 gomega.Expect(replica1.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value")) 449 gomega.Expect(replica1.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value1")) 450 gomega.Expect(replica1.Spec.Tolerations).Should(gomega.BeComparableTo( 451 []corev1.Toleration{ 452 { 453 Key: "selector1", 454 Value: "selector-value1", 455 Operator: corev1.TolerationOpEqual, 456 Effect: corev1.TaintEffectNoSchedule, 457 }, 458 }, 459 )) 460 }) 461 462 ginkgo.By("verify the PodSetUpdates are propagated to the running job, for replicated-job-2", func() { 463 replica2 := createdJob.Spec.ReplicatedJobs[1].Template.Spec.Template 464 gomega.Expect(replica2.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value2")) 465 gomega.Expect(replica2.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value2")) 466 gomega.Expect(replica2.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value2")) 467 }) 468 469 ginkgo.By("delete the localQueue to prevent readmission", func() { 470 gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed()) 471 }) 472 473 ginkgo.By("clear the workload's admission to stop the job", func() { 474 gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed()) 475 gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)).Should(gomega.Succeed()) 476 util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) 477 }) 478 479 ginkgo.By("await for the job to be suspended", func() { 480 gomega.Eventually(func() *bool { 481 gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed()) 482 return createdJob.Spec.Suspend 483 }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true))) 484 }) 485 486 ginkgo.By("verify the PodSetUpdates are restored for replicated-job-1", func() { 487 replica1 := createdJob.Spec.ReplicatedJobs[0].Template.Spec.Template 488 gomega.Expect(replica1.Annotations).ShouldNot(gomega.HaveKey("ann1")) 489 gomega.Expect(replica1.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value")) 490 gomega.Expect(replica1.Labels).ShouldNot(gomega.HaveKey("label1")) 491 gomega.Expect(replica1.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value")) 492 gomega.Expect(replica1.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1")) 493 }) 494 495 ginkgo.By("verify the PodSetUpdates are restored for replicated-job-2", func() { 496 replica2 := createdJob.Spec.ReplicatedJobs[1].Template.Spec.Template 497 gomega.Expect(replica2.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1")) 498 gomega.Expect(replica2.Annotations).ShouldNot(gomega.HaveKey("ann1")) 499 gomega.Expect(replica2.Labels).ShouldNot(gomega.HaveKey("label1")) 500 }) 501 }) 502 }) 503 }) 504 505 var _ = ginkgo.Describe("JobSet controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { 506 ginkgo.BeforeAll(func() { 507 fwk = &framework.Framework{ 508 CRDPath: crdPath, 509 DepCRDPaths: []string{jobsetCrdPath}, 510 } 511 cfg = fwk.Init() 512 ctx, k8sClient = fwk.RunManager(cfg, managerSetup()) 513 }) 514 ginkgo.AfterAll(func() { 515 fwk.Teardown() 516 }) 517 518 var ( 519 ns *corev1.Namespace 520 ) 521 ginkgo.BeforeEach(func() { 522 ns = &corev1.Namespace{ 523 ObjectMeta: metav1.ObjectMeta{ 524 GenerateName: "jobset-", 525 }, 526 } 527 gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) 528 }) 529 ginkgo.AfterEach(func() { 530 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 531 }) 532 533 ginkgo.It("Should reconcile jobs only when queue is set", func() { 534 ginkgo.By("checking the workload is not created when queue name is not set") 535 jobSet := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs( 536 testingjobset.ReplicatedJobRequirements{ 537 Name: "replicated-job-1", 538 Replicas: 1, 539 Parallelism: 1, 540 Completions: 1, 541 }, testingjobset.ReplicatedJobRequirements{ 542 Name: "replicated-job-2", 543 Replicas: 3, 544 Parallelism: 1, 545 Completions: 1, 546 }, 547 ).Suspend(false). 548 Obj() 549 gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed()) 550 lookupKey := types.NamespacedName{Name: jobSetName, Namespace: ns.Name} 551 createdJobSet := &jobsetapi.JobSet{} 552 gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed()) 553 554 createdWorkload := &kueue.Workload{} 555 wlLookupKey := types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name} 556 gomega.Eventually(func() bool { 557 return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload)) 558 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) 559 560 ginkgo.By("checking the workload is created when queue name is set") 561 jobQueueName := "test-queue" 562 if createdJobSet.Labels == nil { 563 createdJobSet.Labels = map[string]string{constants.QueueLabel: jobQueueName} 564 } else { 565 createdJobSet.Labels[constants.QueueLabel] = jobQueueName 566 } 567 gomega.Expect(k8sClient.Update(ctx, createdJobSet)).Should(gomega.Succeed()) 568 gomega.Eventually(func() error { 569 return k8sClient.Get(ctx, wlLookupKey, createdWorkload) 570 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 571 }) 572 }) 573 574 var _ = ginkgo.Describe("JobSet controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { 575 type podsReadyTestSpec struct { 576 beforeJobSetStatus *jobsetapi.JobSetStatus 577 beforeCondition *metav1.Condition 578 jobSetStatus jobsetapi.JobSetStatus 579 suspended bool 580 wantCondition *metav1.Condition 581 } 582 583 var defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj() 584 585 ginkgo.BeforeAll(func() { 586 fwk = &framework.Framework{ 587 CRDPath: crdPath, 588 DepCRDPaths: []string{jobsetCrdPath}, 589 } 590 cfg = fwk.Init() 591 ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true}))) 592 593 ginkgo.By("Create a resource flavor") 594 gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed()) 595 }) 596 597 ginkgo.AfterAll(func() { 598 util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true) 599 fwk.Teardown() 600 }) 601 602 var ( 603 ns *corev1.Namespace 604 wlLookupKey types.NamespacedName 605 ) 606 ginkgo.BeforeEach(func() { 607 ns = &corev1.Namespace{ 608 ObjectMeta: metav1.ObjectMeta{ 609 GenerateName: "jobset-", 610 }, 611 } 612 gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) 613 614 wlLookupKey = types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name} 615 }) 616 ginkgo.AfterEach(func() { 617 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 618 }) 619 620 ginkgo.DescribeTable("Single job at different stages of progress towards completion", 621 func(podsReadyTestSpec podsReadyTestSpec) { 622 ginkgo.By("Create a job") 623 jobSet := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs( 624 testingjobset.ReplicatedJobRequirements{ 625 Name: "replicated-job-1", 626 Replicas: 1, 627 Parallelism: 1, 628 Completions: 1, 629 }, testingjobset.ReplicatedJobRequirements{ 630 Name: "replicated-job-2", 631 Replicas: 3, 632 Parallelism: 1, 633 Completions: 1, 634 }, 635 ).Obj() 636 jobSetQueueName := "test-queue" 637 jobSet.Annotations = map[string]string{constants.QueueLabel: jobSetQueueName} 638 gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed()) 639 lookupKey := types.NamespacedName{Name: jobSetName, Namespace: ns.Name} 640 createdJobSet := &jobsetapi.JobSet{} 641 gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed()) 642 643 ginkgo.By("Fetch the workload created for the JobSet") 644 createdWorkload := &kueue.Workload{} 645 gomega.Eventually(func() error { 646 return k8sClient.Get(ctx, wlLookupKey, createdWorkload) 647 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 648 649 ginkgo.By("Admit the workload created for the JobSet") 650 admission := testing.MakeAdmission("foo").PodSets( 651 kueue.PodSetAssignment{ 652 Name: createdWorkload.Spec.PodSets[0].Name, 653 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 654 corev1.ResourceCPU: "default", 655 }, 656 }, kueue.PodSetAssignment{ 657 Name: createdWorkload.Spec.PodSets[1].Name, 658 Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{ 659 corev1.ResourceCPU: "default", 660 }, 661 }, 662 ).Obj() 663 gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed()) 664 util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) 665 gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed()) 666 667 ginkgo.By("Await for the JobSet to be unsuspended") 668 gomega.Eventually(func() bool { 669 gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed()) 670 return ptr.Deref(createdJobSet.Spec.Suspend, false) 671 }, util.Timeout, util.Interval).Should(gomega.BeFalse()) 672 673 if podsReadyTestSpec.beforeJobSetStatus != nil { 674 ginkgo.By("Update the JobSet status to simulate its initial progress towards completion") 675 createdJobSet.Status = *podsReadyTestSpec.beforeJobSetStatus 676 gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet)).Should(gomega.Succeed()) 677 gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed()) 678 } 679 680 if podsReadyTestSpec.beforeCondition != nil { 681 ginkgo.By("Update the workload status") 682 gomega.Eventually(func() *metav1.Condition { 683 gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed()) 684 return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady) 685 }, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.beforeCondition, ignoreConditionTimestamps)) 686 } 687 688 ginkgo.By("Update the JobSet status to simulate its progress towards completion") 689 createdJobSet.Status = podsReadyTestSpec.jobSetStatus 690 gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet)).Should(gomega.Succeed()) 691 gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed()) 692 693 if podsReadyTestSpec.suspended { 694 ginkgo.By("Unset admission of the workload to suspend the JobSet") 695 gomega.Eventually(func() error { 696 // the update may need to be retried due to a conflict as the workload gets 697 // also updated due to setting of the job status. 698 if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { 699 return err 700 } 701 return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil) 702 }, util.Timeout, util.Interval).Should(gomega.Succeed()) 703 util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload) 704 } 705 706 ginkgo.By("Verify the PodsReady condition is added") 707 gomega.Eventually(func() *metav1.Condition { 708 gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed()) 709 return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady) 710 }, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.wantCondition, ignoreConditionTimestamps)) 711 }, 712 ginkgo.Entry("No progress", podsReadyTestSpec{ 713 wantCondition: &metav1.Condition{ 714 Type: kueue.WorkloadPodsReady, 715 Status: metav1.ConditionFalse, 716 Reason: "PodsReady", 717 Message: "Not all pods are ready or succeeded", 718 }, 719 }), 720 ginkgo.Entry("Running JobSet", podsReadyTestSpec{ 721 jobSetStatus: jobsetapi.JobSetStatus{ 722 ReplicatedJobsStatus: []jobsetapi.ReplicatedJobStatus{ 723 { 724 Name: "replicated-job-1", 725 Ready: 1, 726 Succeeded: 0, 727 }, 728 { 729 Name: "replicated-job-2", 730 Ready: 2, 731 Succeeded: 1, 732 }, 733 }, 734 }, 735 wantCondition: &metav1.Condition{ 736 Type: kueue.WorkloadPodsReady, 737 Status: metav1.ConditionTrue, 738 Reason: "PodsReady", 739 Message: "All pods were ready or succeeded since the workload admission", 740 }, 741 }), 742 ginkgo.Entry("Running JobSet; PodsReady=False before", podsReadyTestSpec{ 743 beforeCondition: &metav1.Condition{ 744 Type: kueue.WorkloadPodsReady, 745 Status: metav1.ConditionFalse, 746 Reason: "PodsReady", 747 Message: "Not all pods are ready or succeeded", 748 }, 749 jobSetStatus: jobsetapi.JobSetStatus{ 750 ReplicatedJobsStatus: []jobsetapi.ReplicatedJobStatus{ 751 { 752 Name: "replicated-job-1", 753 Ready: 1, 754 Succeeded: 0, 755 }, 756 { 757 Name: "replicated-job-2", 758 Ready: 2, 759 Succeeded: 1, 760 }, 761 }, 762 }, 763 wantCondition: &metav1.Condition{ 764 Type: kueue.WorkloadPodsReady, 765 Status: metav1.ConditionTrue, 766 Reason: "PodsReady", 767 Message: "All pods were ready or succeeded since the workload admission", 768 }, 769 }), 770 ginkgo.Entry("JobSet suspended; PodsReady=True before", podsReadyTestSpec{ 771 beforeJobSetStatus: &jobsetapi.JobSetStatus{ 772 ReplicatedJobsStatus: []jobsetapi.ReplicatedJobStatus{ 773 { 774 Name: "replicated-job-1", 775 Ready: 1, 776 Succeeded: 0, 777 }, 778 { 779 Name: "replicated-job-2", 780 Ready: 2, 781 Succeeded: 1, 782 }, 783 }, 784 }, 785 beforeCondition: &metav1.Condition{ 786 Type: kueue.WorkloadPodsReady, 787 Status: metav1.ConditionTrue, 788 Reason: "PodsReady", 789 Message: "All pods were ready or succeeded since the workload admission", 790 }, 791 suspended: true, 792 wantCondition: &metav1.Condition{ 793 Type: kueue.WorkloadPodsReady, 794 Status: metav1.ConditionFalse, 795 Reason: "PodsReady", 796 Message: "Not all pods are ready or succeeded", 797 }, 798 }), 799 ) 800 }) 801 802 var _ = ginkgo.Describe("JobSet controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { 803 ginkgo.BeforeAll(func() { 804 fwk = &framework.Framework{ 805 CRDPath: crdPath, 806 DepCRDPaths: []string{jobsetCrdPath}, 807 } 808 cfg = fwk.Init() 809 ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup()) 810 }) 811 ginkgo.AfterAll(func() { 812 fwk.Teardown() 813 }) 814 815 var ( 816 ns *corev1.Namespace 817 onDemandFlavor *kueue.ResourceFlavor 818 spotUntaintedFlavor *kueue.ResourceFlavor 819 clusterQueue *kueue.ClusterQueue 820 localQueue *kueue.LocalQueue 821 ) 822 823 ginkgo.BeforeEach(func() { 824 ns = &corev1.Namespace{ 825 ObjectMeta: metav1.ObjectMeta{ 826 GenerateName: "jobset-", 827 }, 828 } 829 gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) 830 831 onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj() 832 gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed()) 833 834 spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj() 835 gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed()) 836 837 clusterQueue = testing.MakeClusterQueue("dev-clusterqueue"). 838 ResourceGroup( 839 *testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "1").Obj(), 840 *testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(), 841 ).Obj() 842 gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed()) 843 }) 844 ginkgo.AfterEach(func() { 845 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 846 util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true) 847 util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true) 848 util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, spotUntaintedFlavor, true) 849 }) 850 851 ginkgo.It("Should schedule JobSets as they fit in their ClusterQueue", func() { 852 ginkgo.By("creating localQueue") 853 localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj() 854 gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed()) 855 856 ginkgo.By("checking a dev job starts") 857 jobSet := testingjobset.MakeJobSet("dev-job", ns.Name).ReplicatedJobs( 858 testingjobset.ReplicatedJobRequirements{ 859 Name: "replicated-job-1", 860 Replicas: 1, 861 Parallelism: 1, 862 Completions: 1, 863 }, testingjobset.ReplicatedJobRequirements{ 864 Name: "replicated-job-2", 865 Replicas: 3, 866 Parallelism: 1, 867 Completions: 1, 868 }, 869 ).Queue(localQueue.Name). 870 Request("replicated-job-1", corev1.ResourceCPU, "1"). 871 Request("replicated-job-2", corev1.ResourceCPU, "1"). 872 Obj() 873 gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed()) 874 createdJobSet := &jobsetapi.JobSet{} 875 gomega.Eventually(func() bool { 876 gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobSet.Name, Namespace: jobSet.Namespace}, createdJobSet)). 877 Should(gomega.Succeed()) 878 return ptr.Deref(createdJobSet.Spec.Suspend, false) 879 }, util.Timeout, util.Interval).Should(gomega.BeFalse()) 880 fmt.Println(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector) 881 gomega.Expect(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name)) 882 gomega.Expect(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) 883 util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0) 884 util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1) 885 886 }) 887 888 ginkgo.It("Should allow reclaim of resources that are no longer needed", func() { 889 ginkgo.By("creating localQueue", func() { 890 localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj() 891 gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed()) 892 }) 893 894 jobSet1 := testingjobset.MakeJobSet("dev-jobset1", ns.Name).ReplicatedJobs( 895 testingjobset.ReplicatedJobRequirements{ 896 Name: "replicated-job-1", 897 Replicas: 2, 898 Parallelism: 4, 899 Completions: 8, 900 }, testingjobset.ReplicatedJobRequirements{ 901 Name: "replicated-job-2", 902 Replicas: 3, 903 Parallelism: 4, 904 Completions: 4, 905 }, 906 ).Queue(localQueue.Name). 907 Request("replicated-job-1", corev1.ResourceCPU, "250m"). 908 Request("replicated-job-2", corev1.ResourceCPU, "250m"). 909 Obj() 910 lookupKey1 := types.NamespacedName{Name: jobSet1.Name, Namespace: jobSet1.Namespace} 911 912 ginkgo.By("checking the first jobset starts", func() { 913 gomega.Expect(k8sClient.Create(ctx, jobSet1)).Should(gomega.Succeed()) 914 createdJobSet1 := &jobsetapi.JobSet{} 915 gomega.Eventually(func() *bool { 916 gomega.Expect(k8sClient.Get(ctx, lookupKey1, createdJobSet1)).Should(gomega.Succeed()) 917 return createdJobSet1.Spec.Suspend 918 }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false))) 919 util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0) 920 util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1) 921 }) 922 923 jobSet2 := testingjobset.MakeJobSet("dev-jobset2", ns.Name).ReplicatedJobs( 924 testingjobset.ReplicatedJobRequirements{ 925 Name: "replicated-job-1", 926 Replicas: 2, 927 Parallelism: 1, 928 Completions: 1, 929 }, testingjobset.ReplicatedJobRequirements{ 930 Name: "replicated-job-2", 931 Replicas: 1, 932 Parallelism: 1, 933 Completions: 1, 934 }, 935 ).Queue(localQueue.Name). 936 Request("replicated-job-1", corev1.ResourceCPU, "1"). 937 Request("replicated-job-2", corev1.ResourceCPU, "1"). 938 Obj() 939 940 lookupKey2 := types.NamespacedName{Name: jobSet2.Name, Namespace: jobSet2.Namespace} 941 942 ginkgo.By("checking a second no-fit jobset does not start", func() { 943 gomega.Expect(k8sClient.Create(ctx, jobSet2)).Should(gomega.Succeed()) 944 createdJobSet2 := &jobsetapi.JobSet{} 945 gomega.Eventually(func() *bool { 946 gomega.Expect(k8sClient.Get(ctx, lookupKey2, createdJobSet2)).Should(gomega.Succeed()) 947 return createdJobSet2.Spec.Suspend 948 }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true))) 949 util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 1) 950 util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1) 951 }) 952 953 ginkgo.By("checking the second job starts when the first one needs less then two cpus", func() { 954 createdJobSet1 := &jobsetapi.JobSet{} 955 gomega.Expect(k8sClient.Get(ctx, lookupKey1, createdJobSet1)).Should(gomega.Succeed()) 956 createdJobSet1 = (&testingjobset.JobSetWrapper{JobSet: *createdJobSet1}).JobsStatus( 957 jobsetapi.ReplicatedJobStatus{ 958 Name: "replicated-job-1", 959 Succeeded: 2, 960 }, 961 jobsetapi.ReplicatedJobStatus{ 962 Name: "replicated-job-2", 963 Succeeded: 1, 964 }, 965 ).Obj() 966 gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet1)).Should(gomega.Succeed()) 967 968 wl := &kueue.Workload{} 969 wlKey := types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSet1.Name), Namespace: jobSet1.Namespace} 970 gomega.Eventually(func() []kueue.ReclaimablePod { 971 gomega.Expect(k8sClient.Get(ctx, wlKey, wl)).Should(gomega.Succeed()) 972 return wl.Status.ReclaimablePods 973 974 }, util.Timeout, util.Interval).Should(gomega.BeComparableTo([]kueue.ReclaimablePod{ 975 { 976 Name: "replicated-job-1", 977 Count: 8, 978 }, 979 { 980 Name: "replicated-job-2", 981 Count: 4, 982 }, 983 })) 984 985 createdJobSet2 := &jobsetapi.JobSet{} 986 gomega.Eventually(func() *bool { 987 gomega.Expect(k8sClient.Get(ctx, lookupKey2, createdJobSet2)).Should(gomega.Succeed()) 988 return createdJobSet2.Spec.Suspend 989 }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false))) 990 util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0) 991 util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 2) 992 }) 993 }) 994 })