sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/jobset/jobset_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package jobset

import (
	"fmt"

	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	jobsetapi "sigs.k8s.io/jobset/api/jobset/v1alpha2"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset"
	"sigs.k8s.io/kueue/pkg/workload"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobSetName              = "test-job"
	instanceKey             = "cloud.provider.com/instance"
	priorityClassName       = "test-priority-class"
	priorityValue     int32 = 10
)

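// ignoreConditionTimestamps lets condition comparisons disregard
// LastTransitionTime, which naturally differs between test runs.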
var (
	ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
)

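// This suite runs the manager with manageJobsWithoutQueueName enabled, so the
// controller reconciles JobSets even when they carry no queue-name label.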
var _ = ginkgo.Describe("JobSet controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{jobsetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "jobset-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

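	// Walks a single JobSet through the Kueue lifecycle: suspension on
	// creation, workload creation, admission, unsuspension, resize,
	// re-admission, and completion.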
	ginkgo.It("Should reconcile JobSets", func() {
		ginkgo.By("checking the JobSet gets suspended when created unsuspended")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(priorityValue).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())

		jobSet := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs(
			testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-1",
				Replicas:    1,
				Parallelism: 1,
				Completions: 1,
			}, testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-2",
				Replicas:    3,
				Parallelism: 1,
				Completions: 1,
			},
		).Suspend(false).
			PriorityClass(priorityClassName).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed())
		createdJobSet := &jobsetapi.JobSet{}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobSetName, Namespace: ns.Name}, createdJobSet); err != nil {
				return false
			}
			return ptr.Deref(createdJobSet.Spec.Suspend, false)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created without queue assigned")
		createdWorkload := &kueue.Workload{}
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
		gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set")
		gomega.Expect(metav1.IsControlledBy(createdWorkload, createdJobSet)).To(gomega.BeTrue(), "The Workload should be owned by the JobSet")

		ginkgo.By("checking the workload is created with priority and priorityClassName")
		gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName))
		gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(priorityValue))

		ginkgo.By("checking the workload is updated with the queue name when the JobSet is")
		jobSetQueueName := "test-queue"
		createdJobSet.Annotations = map[string]string{constants.QueueLabel: jobSetQueueName}
		gomega.Expect(k8sClient.Update(ctx, createdJobSet)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.QueueName == jobSetQueueName
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking a second non-matching workload is deleted")
		secondWl := &kueue.Workload{
			ObjectMeta: metav1.ObjectMeta{
				Name:      workloadjobset.GetWorkloadNameForJobSet("second-workload"),
				Namespace: createdWorkload.Namespace,
			},
			Spec: *createdWorkload.Spec.DeepCopy(),
		}
		gomega.Expect(ctrl.SetControllerReference(createdJobSet, secondWl, scheme.Scheme)).Should(gomega.Succeed())
		secondWl.Spec.PodSets[0].Count += 1
		gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			wl := &kueue.Workload{}
			key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace}
			return k8sClient.Get(ctx, key, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// check the original workload is still there
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())

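		// Admission assigns one flavor per pod set; the controller should then
		// unsuspend the JobSet and inject each flavor's node label into the
		// pod template of the corresponding replicated job.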
		ginkgo.By("checking the JobSet is unsuspended when workload is assigned")
		onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())
		spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotFlavor)).Should(gomega.Succeed())
		clusterQueue := testing.MakeClusterQueue("cluster-queue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		admission := testing.MakeAdmission(clusterQueue.Name).PodSets(
			kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[0].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "on-demand",
				},
			}, kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[1].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "spot",
				},
			},
		).Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		lookupKey := types.NamespacedName{Name: jobSetName, Namespace: ns.Name}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJobSet); err != nil {
				return false
			}
			return !ptr.Deref(createdJobSet.Spec.Suspend, false)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector).Should(gomega.Equal(map[string]string{instanceKey: onDemandFlavor.Name}))
		gomega.Expect(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector).Should(gomega.Equal(map[string]string{instanceKey: spotFlavor.Name}))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

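		// Changing the replica count makes the existing workload no longer
		// match the JobSet: the controller re-suspends the JobSet, restores
		// the pod templates, and replaces the workload with one that has the
		// new pod counts.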
		ginkgo.By("checking the JobSet gets suspended when the replica count changes and the added node selectors are removed")
		replicas := jobSet.Spec.ReplicatedJobs[0].Replicas
		newReplicas := replicas + 1
		createdJobSet.Spec.ReplicatedJobs[0].Replicas = newReplicas
		gomega.Expect(k8sClient.Update(ctx, createdJobSet)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJobSet); err != nil {
				return false
			}
			// check the re-fetched object, not the stale local copy, for both
			// the suspension and the removal of the injected node selectors
			return ptr.Deref(createdJobSet.Spec.Suspend, false) &&
				len(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector) == 0
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String()))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is updated with new count")
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.PodSets[0].Count == newReplicas
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())

		ginkgo.By("checking the JobSet is unsuspended and selectors added when workload is assigned again")
		admission = testing.MakeAdmission(clusterQueue.Name).
			PodSets(
				kueue.PodSetAssignment{
					Name: "replicated-job-1",
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "on-demand",
					},
					Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
				},
				kueue.PodSetAssignment{
					Name: "replicated-job-2",
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "spot",
					},
					Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
				},
			).
			Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJobSet); err != nil {
				return false
			}
			return !*createdJobSet.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		gomega.Expect(len(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(len(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector)).Should(gomega.Equal(1))
		gomega.Expect(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))

		ginkgo.By("checking the workload is finished when JobSet is completed")
		createdJobSet.Status.Conditions = append(createdJobSet.Status.Conditions,
			metav1.Condition{
				Type:               string(jobsetapi.JobSetCompleted),
				Status:             metav1.ConditionStatus(corev1.ConditionTrue),
				Reason:             "AllJobsCompleted",
				Message:            "jobset completed successfully",
				LastTransitionTime: metav1.Now(),
			})
		gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			err := k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			if err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadFinished)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	})

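	// Admission checks gate admission: the workload only starts once every
	// check reports Ready. A check may also attach PodSetUpdates that the
	// controller applies to the pod templates when the JobSet starts.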
	ginkgo.When("the queue has admission checks", func() {
		var (
			clusterQueueAc *kueue.ClusterQueue
			localQueue     *kueue.LocalQueue
			testFlavor     *kueue.ResourceFlavor
			jobLookupKey   *types.NamespacedName
			wlLookupKey    *types.NamespacedName
			admissionCheck *kueue.AdmissionCheck
		)

		ginkgo.BeforeEach(func() {
			admissionCheck = testing.MakeAdmissionCheck("check").ControllerName("ac-controller").Obj()
			gomega.Expect(k8sClient.Create(ctx, admissionCheck)).To(gomega.Succeed())
			util.SetAdmissionCheckActive(ctx, k8sClient, admissionCheck, metav1.ConditionTrue)
			clusterQueueAc = testing.MakeClusterQueue("prod-cq-with-checks").
				ResourceGroup(
					*testing.MakeFlavorQuotas("test-flavor").Resource(corev1.ResourceCPU, "5").Obj(),
				).AdmissionChecks("check").Obj()
			gomega.Expect(k8sClient.Create(ctx, clusterQueueAc)).Should(gomega.Succeed())
			localQueue = testing.MakeLocalQueue("queue", ns.Name).ClusterQueue(clusterQueueAc.Name).Obj()
			gomega.Expect(k8sClient.Create(ctx, localQueue)).To(gomega.Succeed())
			testFlavor = testing.MakeResourceFlavor("test-flavor").Label(instanceKey, "test-flavor").Obj()
			gomega.Expect(k8sClient.Create(ctx, testFlavor)).Should(gomega.Succeed())

			jobLookupKey = &types.NamespacedName{Name: jobSetName, Namespace: ns.Name}
			wlLookupKey = &types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name}
		})

		ginkgo.AfterEach(func() {
			gomega.Expect(util.DeleteAdmissionCheck(ctx, k8sClient, admissionCheck)).To(gomega.Succeed())
			util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, testFlavor, true)
			gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
			util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueueAc, true)
		})

		ginkgo.It("labels and annotations should be propagated from admission check to job", func() {
			createdJob := &jobsetapi.JobSet{}
			createdWorkload := &kueue.Workload{}

			ginkgo.By("creating the job", func() {
				job := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs(
					testingjobset.ReplicatedJobRequirements{
						Name:        "replicated-job-1",
						Replicas:    1,
						Parallelism: 1,
						Completions: 1,
					}, testingjobset.ReplicatedJobRequirements{
						Name:        "replicated-job-2",
						Replicas:    3,
						Parallelism: 1,
						Completions: 1,
					},
				).
					Queue("queue").
					Request("replicated-job-1", corev1.ResourceCPU, "1").
					Request("replicated-job-2", corev1.ResourceCPU, "1").
					Obj()
				job.Spec.ReplicatedJobs[0].Template.Spec.Template.Annotations = map[string]string{
					"old-ann-key": "old-ann-value",
				}
				job.Spec.ReplicatedJobs[0].Template.Spec.Template.Labels = map[string]string{
					"old-label-key": "old-label-value",
				}
				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			})

			ginkgo.By("fetch the job and verify it is suspended as the checks are not ready", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			ginkgo.By("checking the workload is created", func() {
				gomega.Eventually(func() error {
					return k8sClient.Get(ctx, *wlLookupKey, createdWorkload)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

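			// Mark the check Ready with per-pod-set updates; these should be
			// applied to the pod templates on start and rolled back once the
			// JobSet is suspended again.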
			ginkgo.By("add labels & annotations to the admission check's PodSetUpdates", func() {
				gomega.Eventually(func() error {
					var newWL kueue.Workload
					gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(createdWorkload), &newWL)).To(gomega.Succeed())
					workload.SetAdmissionCheckState(&newWL.Status.AdmissionChecks, kueue.AdmissionCheckState{
						Name:  "check",
						State: kueue.CheckStateReady,
						PodSetUpdates: []kueue.PodSetUpdate{
							{
								Name: "replicated-job-1",
								Annotations: map[string]string{
									"ann1": "ann-value1",
								},
								Labels: map[string]string{
									"label1": "label-value1",
								},
								NodeSelector: map[string]string{
									"selector1": "selector-value1",
								},
								Tolerations: []corev1.Toleration{
									{
										Key:      "selector1",
										Value:    "selector-value1",
										Operator: corev1.TolerationOpEqual,
										Effect:   corev1.TaintEffectNoSchedule,
									},
								},
							},
							{
								Name: "replicated-job-2",
								Annotations: map[string]string{
									"ann1": "ann-value2",
								},
								Labels: map[string]string{
									"label1": "label-value2",
								},
								NodeSelector: map[string]string{
									"selector1": "selector-value2",
								},
							},
						},
					})
					return k8sClient.Status().Update(ctx, &newWL)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("admit the workload", func() {
				admission := testing.MakeAdmission(clusterQueueAc.Name).
					PodSets(
						kueue.PodSetAssignment{
							Name: createdWorkload.Spec.PodSets[0].Name,
							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
								corev1.ResourceCPU: "test-flavor",
							},
						}, kueue.PodSetAssignment{
							Name: createdWorkload.Spec.PodSets[1].Name,
							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
								corev1.ResourceCPU: "test-flavor",
							},
						},
					).
					Obj()
				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			})

			ginkgo.By("wait for the job to be admitted", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			})

			ginkgo.By("verify the PodSetUpdates are propagated to the running job, for replicated-job-1", func() {
				replica1 := createdJob.Spec.ReplicatedJobs[0].Template.Spec.Template
				gomega.Expect(replica1.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value1"))
				gomega.Expect(replica1.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
				gomega.Expect(replica1.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value1"))
				gomega.Expect(replica1.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
				gomega.Expect(replica1.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value1"))
				gomega.Expect(replica1.Spec.Tolerations).Should(gomega.BeComparableTo(
					[]corev1.Toleration{
						{
							Key:      "selector1",
							Value:    "selector-value1",
							Operator: corev1.TolerationOpEqual,
							Effect:   corev1.TaintEffectNoSchedule,
						},
					},
				))
			})

			ginkgo.By("verify the PodSetUpdates are propagated to the running job, for replicated-job-2", func() {
				replica2 := createdJob.Spec.ReplicatedJobs[1].Template.Spec.Template
				gomega.Expect(replica2.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value2"))
				gomega.Expect(replica2.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value2"))
				gomega.Expect(replica2.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value2"))
			})

			ginkgo.By("delete the localQueue to prevent readmission", func() {
				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
			})

			ginkgo.By("clear the workload's admission to stop the job", func() {
				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			})

			ginkgo.By("wait for the job to be suspended", func() {
				gomega.Eventually(func() *bool {
					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
					return createdJob.Spec.Suspend
				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			})

			ginkgo.By("verify the PodSetUpdates are restored for replicated-job-1", func() {
				replica1 := createdJob.Spec.ReplicatedJobs[0].Template.Spec.Template
				gomega.Expect(replica1.Annotations).ShouldNot(gomega.HaveKey("ann1"))
				gomega.Expect(replica1.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
				gomega.Expect(replica1.Labels).ShouldNot(gomega.HaveKey("label1"))
				gomega.Expect(replica1.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
				gomega.Expect(replica1.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1"))
			})

			ginkgo.By("verify the PodSetUpdates are restored for replicated-job-2", func() {
				replica2 := createdJob.Spec.ReplicatedJobs[1].Template.Spec.Template
				gomega.Expect(replica2.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1"))
				gomega.Expect(replica2.Annotations).ShouldNot(gomega.HaveKey("ann1"))
				gomega.Expect(replica2.Labels).ShouldNot(gomega.HaveKey("label1"))
			})
		})
	})
})

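// Without manageJobsWithoutQueueName, the controller must ignore JobSets that
// lack the queue-name label and only create a workload once it is set, e.g.:
//
//	metadata:
//	  labels:
//	    kueue.x-k8s.io/queue-name: test-queue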
var _ = ginkgo.Describe("JobSet controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{jobsetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "jobset-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile jobs only when queue is set", func() {
		ginkgo.By("checking the workload is not created when queue name is not set")
		jobSet := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs(
			testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-1",
				Replicas:    1,
				Parallelism: 1,
				Completions: 1,
			}, testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-2",
				Replicas:    3,
				Parallelism: 1,
				Completions: 1,
			},
		).Suspend(false).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed())
		lookupKey := types.NamespacedName{Name: jobSetName, Namespace: ns.Name}
		createdJobSet := &jobsetapi.JobSet{}
		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed())

		createdWorkload := &kueue.Workload{}
		wlLookupKey := types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name}
		// Use Consistently: the workload must never appear while the queue
		// name is unset; Eventually would pass trivially on the first
		// not-found poll.
		gomega.Consistently(func() bool {
			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
		}, util.ConsistentDuration, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created when queue name is set")
		jobQueueName := "test-queue"
		if createdJobSet.Labels == nil {
			createdJobSet.Labels = map[string]string{constants.QueueLabel: jobQueueName}
		} else {
			createdJobSet.Labels[constants.QueueLabel] = jobQueueName
		}
		gomega.Expect(k8sClient.Update(ctx, createdJobSet)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})
})

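// With waitForPodsReady enabled, the controller maintains a PodsReady
// condition on the workload, derived from the ready and succeeded counts in
// the JobSet's per-replicated-job status.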
var _ = ginkgo.Describe("JobSet controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	type podsReadyTestSpec struct {
		beforeJobSetStatus *jobsetapi.JobSetStatus
		beforeCondition    *metav1.Condition
		jobSetStatus       jobsetapi.JobSetStatus
		suspended          bool
		wantCondition      *metav1.Condition
	}

	var defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{jobsetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})

	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "jobset-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSetName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

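	// Each entry drives the JobSet status to a given stage and asserts the
	// resulting PodsReady condition, optionally verifying an intermediate
	// condition and, for suspension cases, un-admitting the workload.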
	ginkgo.DescribeTable("Single JobSet at different stages of progress towards completion",
		func(podsReadyTestSpec podsReadyTestSpec) {
			ginkgo.By("Create a JobSet")
			jobSet := testingjobset.MakeJobSet(jobSetName, ns.Name).ReplicatedJobs(
				testingjobset.ReplicatedJobRequirements{
					Name:        "replicated-job-1",
					Replicas:    1,
					Parallelism: 1,
					Completions: 1,
				}, testingjobset.ReplicatedJobRequirements{
					Name:        "replicated-job-2",
					Replicas:    3,
					Parallelism: 1,
					Completions: 1,
				},
			).Obj()
			jobSetQueueName := "test-queue"
			jobSet.Annotations = map[string]string{constants.QueueLabel: jobSetQueueName}
			gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed())
			lookupKey := types.NamespacedName{Name: jobSetName, Namespace: ns.Name}
			createdJobSet := &jobsetapi.JobSet{}
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed())

			ginkgo.By("Fetch the workload created for the JobSet")
			createdWorkload := &kueue.Workload{}
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("Admit the workload created for the JobSet")
			admission := testing.MakeAdmission("foo").PodSets(
				kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[0].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				}, kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[1].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				},
			).Obj()
			gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
			util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())

			ginkgo.By("Wait for the JobSet to be unsuspended")
			gomega.Eventually(func() bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed())
				return ptr.Deref(createdJobSet.Spec.Suspend, false)
			}, util.Timeout, util.Interval).Should(gomega.BeFalse())

			if podsReadyTestSpec.beforeJobSetStatus != nil {
				ginkgo.By("Update the JobSet status to simulate its initial progress towards completion")
				createdJobSet.Status = *podsReadyTestSpec.beforeJobSetStatus
				gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet)).Should(gomega.Succeed())
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed())
			}

			if podsReadyTestSpec.beforeCondition != nil {
				ginkgo.By("Update the workload status")
				gomega.Eventually(func() *metav1.Condition {
					gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
					return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
				}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.beforeCondition, ignoreConditionTimestamps))
			}

			ginkgo.By("Update the JobSet status to simulate its progress towards completion")
			createdJobSet.Status = podsReadyTestSpec.jobSetStatus
			gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet)).Should(gomega.Succeed())
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJobSet)).Should(gomega.Succeed())

			if podsReadyTestSpec.suspended {
				ginkgo.By("Unset admission of the workload to suspend the JobSet")
				gomega.Eventually(func() error {
					// The update may need to be retried on conflict, as the
					// workload is also updated when the JobSet status is set.
					if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
						return err
					}
					return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			}

			ginkgo.By("Verify the PodsReady condition is added")
			gomega.Eventually(func() *metav1.Condition {
				gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.wantCondition, ignoreConditionTimestamps))
		},
		ginkgo.Entry("No progress", podsReadyTestSpec{
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running JobSet", podsReadyTestSpec{
			jobSetStatus: jobsetapi.JobSetStatus{
				ReplicatedJobsStatus: []jobsetapi.ReplicatedJobStatus{
					{
						Name:      "replicated-job-1",
						Ready:     1,
						Succeeded: 0,
					},
					{
						Name:      "replicated-job-2",
						Ready:     2,
						Succeeded: 1,
					},
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running JobSet; PodsReady=False before", podsReadyTestSpec{
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			jobSetStatus: jobsetapi.JobSetStatus{
				ReplicatedJobsStatus: []jobsetapi.ReplicatedJobStatus{
					{
						Name:      "replicated-job-1",
						Ready:     1,
						Succeeded: 0,
					},
					{
						Name:      "replicated-job-2",
						Ready:     2,
						Succeeded: 1,
					},
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("JobSet suspended; PodsReady=True before", podsReadyTestSpec{
			beforeJobSetStatus: &jobsetapi.JobSetStatus{
				ReplicatedJobsStatus: []jobsetapi.ReplicatedJobStatus{
					{
						Name:      "replicated-job-1",
						Ready:     1,
						Succeeded: 0,
					},
					{
						Name:      "replicated-job-2",
						Ready:     2,
						Succeeded: 1,
					},
				},
			},
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			suspended: true,
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

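// These cases run the scheduler alongside the controller, so admission comes
// from ClusterQueue capacity instead of manually set quota reservations.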
var _ = ginkgo.Describe("JobSet controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{jobsetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "jobset-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "1").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, spotUntaintedFlavor, true)
	})

	ginkgo.It("Should schedule JobSets as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("checking a dev job starts")
		jobSet := testingjobset.MakeJobSet("dev-job", ns.Name).ReplicatedJobs(
			testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-1",
				Replicas:    1,
				Parallelism: 1,
				Completions: 1,
			}, testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-2",
				Replicas:    3,
				Parallelism: 1,
				Completions: 1,
			},
		).Queue(localQueue.Name).
			Request("replicated-job-1", corev1.ResourceCPU, "1").
			Request("replicated-job-2", corev1.ResourceCPU, "1").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, jobSet)).Should(gomega.Succeed())
		createdJobSet := &jobsetapi.JobSet{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobSet.Name, Namespace: jobSet.Namespace}, createdJobSet)).
				Should(gomega.Succeed())
			return ptr.Deref(createdJobSet.Spec.Suspend, false)
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
		gomega.Expect(createdJobSet.Spec.ReplicatedJobs[0].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		gomega.Expect(createdJobSet.Spec.ReplicatedJobs[1].Template.Spec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
	})

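	// As replicated jobs succeed, their pods are reported as reclaimable and
	// release quota before the whole JobSet finishes. jobSet1 reserves 5 CPUs
	// (8 + 12 pods at 250m); once 12 of its pods (8 + 4) are reclaimable, only
	// 2 CPUs stay reserved and the 3-CPU jobSet2 fits in the 6-CPU queue.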
	ginkgo.It("Should allow reclaim of resources that are no longer needed", func() {
		ginkgo.By("creating localQueue", func() {
			localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
			gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())
		})

		jobSet1 := testingjobset.MakeJobSet("dev-jobset1", ns.Name).ReplicatedJobs(
			testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-1",
				Replicas:    2,
				Parallelism: 4,
				Completions: 8,
			}, testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-2",
				Replicas:    3,
				Parallelism: 4,
				Completions: 4,
			},
		).Queue(localQueue.Name).
			Request("replicated-job-1", corev1.ResourceCPU, "250m").
			Request("replicated-job-2", corev1.ResourceCPU, "250m").
			Obj()
		lookupKey1 := types.NamespacedName{Name: jobSet1.Name, Namespace: jobSet1.Namespace}

		ginkgo.By("checking the first jobset starts", func() {
			gomega.Expect(k8sClient.Create(ctx, jobSet1)).Should(gomega.Succeed())
			createdJobSet1 := &jobsetapi.JobSet{}
			gomega.Eventually(func() *bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey1, createdJobSet1)).Should(gomega.Succeed())
				return createdJobSet1.Spec.Suspend
			}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
			util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
		})

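		// jobSet2 needs 3 CPUs in total (2 + 1 pods at 1 CPU each), which does
		// not fit next to jobSet1's 5 CPUs until some of jobSet1's pods become
		// reclaimable.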
		jobSet2 := testingjobset.MakeJobSet("dev-jobset2", ns.Name).ReplicatedJobs(
			testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-1",
				Replicas:    2,
				Parallelism: 1,
				Completions: 1,
			}, testingjobset.ReplicatedJobRequirements{
				Name:        "replicated-job-2",
				Replicas:    1,
				Parallelism: 1,
				Completions: 1,
			},
		).Queue(localQueue.Name).
			Request("replicated-job-1", corev1.ResourceCPU, "1").
			Request("replicated-job-2", corev1.ResourceCPU, "1").
			Obj()

		lookupKey2 := types.NamespacedName{Name: jobSet2.Name, Namespace: jobSet2.Namespace}

		ginkgo.By("checking a second no-fit jobset does not start", func() {
			gomega.Expect(k8sClient.Create(ctx, jobSet2)).Should(gomega.Succeed())
			createdJobSet2 := &jobsetapi.JobSet{}
			gomega.Eventually(func() *bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey2, createdJobSet2)).Should(gomega.Succeed())
				return createdJobSet2.Spec.Suspend
			}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
			util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 1)
			util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
		})

		ginkgo.By("checking the second jobset starts when the first one needs no more than two CPUs", func() {
			createdJobSet1 := &jobsetapi.JobSet{}
			gomega.Expect(k8sClient.Get(ctx, lookupKey1, createdJobSet1)).Should(gomega.Succeed())
			createdJobSet1 = (&testingjobset.JobSetWrapper{JobSet: *createdJobSet1}).JobsStatus(
				jobsetapi.ReplicatedJobStatus{
					Name:      "replicated-job-1",
					Succeeded: 2,
				},
				jobsetapi.ReplicatedJobStatus{
					Name:      "replicated-job-2",
					Succeeded: 1,
				},
			).Obj()
			gomega.Expect(k8sClient.Status().Update(ctx, createdJobSet1)).Should(gomega.Succeed())

			wl := &kueue.Workload{}
			wlKey := types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSet1.Name), Namespace: jobSet1.Namespace}
			gomega.Eventually(func() []kueue.ReclaimablePod {
				gomega.Expect(k8sClient.Get(ctx, wlKey, wl)).Should(gomega.Succeed())
				return wl.Status.ReclaimablePods
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo([]kueue.ReclaimablePod{
				{
					Name:  "replicated-job-1",
					Count: 8,
				},
				{
					Name:  "replicated-job-2",
					Count: 4,
				},
			}))

			createdJobSet2 := &jobsetapi.JobSet{}
			gomega.Eventually(func() *bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey2, createdJobSet2)).Should(gomega.Succeed())
				return createdJobSet2.Spec.Suspend
			}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
			util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
			util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 2)
		})
	})
})