sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/pytorchjob/pytorchjob_controller_test.go

sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/pytorchjob/pytorchjob_controller_test.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package pytorchjob
    18  
    19  import (
    20  	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    21  	"github.com/onsi/ginkgo/v2"
    22  	"github.com/onsi/gomega"
    23  	corev1 "k8s.io/api/core/v1"
    24  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/types"
    27  	"k8s.io/utils/ptr"
    28  	"sigs.k8s.io/controller-runtime/pkg/client"
    29  
    30  	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
    31  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    32  	"sigs.k8s.io/kueue/pkg/controller/constants"
    33  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    34  	workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob"
    35  	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
    36  	"sigs.k8s.io/kueue/pkg/util/testing"
    37  	testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob"
    38  	"sigs.k8s.io/kueue/pkg/workload"
    39  	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
    40  	"sigs.k8s.io/kueue/test/integration/framework"
    41  	"sigs.k8s.io/kueue/test/util"
    42  )
    43  
    44  const (
    45  	jobName           = "test-job"
    46  	instanceKey       = "cloud.provider.com/instance"
    47  	priorityClassName = "test-priority-class"
    48  	priorityValue     = 10
    49  	jobQueueName      = "test-queue"
    50  )
    51  
    52  // +kubebuilder:docs-gen:collapse=Imports
    53  
    54  var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
    55  
    56  	ginkgo.BeforeAll(func() {
    57  		fwk = &framework.Framework{
    58  			CRDPath:     crdPath,
    59  			DepCRDPaths: []string{pytorchCrdPath},
    60  		}
    61  		cfg = fwk.Init()
    62  		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
    63  	})
    64  	ginkgo.AfterAll(func() {
    65  		fwk.Teardown()
    66  	})
    67  
    68  	var (
    69  		ns *corev1.Namespace
    70  	)
    71  	ginkgo.BeforeEach(func() {
    72  		ns = &corev1.Namespace{
    73  			ObjectMeta: metav1.ObjectMeta{
    74  				GenerateName: "core-",
    75  			},
    76  		}
    77  		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
    78  	})
    79  	ginkgo.AfterEach(func() {
    80  		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
    81  	})
    82  
    83  	ginkgo.It("Should reconcile PyTorchJobs", func() {
    84  		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Obj())}
    85  		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(&kftraining.PyTorchJob{})}
    86  		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
    87  			{
    88  				RoleName:    kftraining.PyTorchJobReplicaTypeMaster,
    89  				ResourceCPU: "on-demand",
    90  			},
    91  			{
    92  				RoleName:    kftraining.PyTorchJobReplicaTypeWorker,
    93  				ResourceCPU: "spot",
    94  			},
    95  		})
    96  	})
    97  })
    98  
    99  var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
   100  	ginkgo.BeforeAll(func() {
   101  		fwk = &framework.Framework{
   102  			CRDPath:     crdPath,
   103  			DepCRDPaths: []string{pytorchCrdPath},
   104  		}
   105  		cfg := fwk.Init()
   106  		ctx, k8sClient = fwk.RunManager(cfg, managerSetup())
   107  	})
   108  	ginkgo.AfterAll(func() {
   109  		fwk.Teardown()
   110  	})
   111  
   112  	var (
   113  		ns *corev1.Namespace
   114  	)
   115  	ginkgo.BeforeEach(func() {
   116  		ns = &corev1.Namespace{
   117  			ObjectMeta: metav1.ObjectMeta{
   118  				GenerateName: "core-",
   119  			},
   120  		}
   121  		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
   122  	})
   123  	ginkgo.AfterEach(func() {
   124  		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
   125  	})
   126  
   127  	ginkgo.It("Should reconcile jobs only when queue is set", func() {
   128  		ginkgo.By("checking the workload is not created when queue name is not set")
   129  		job := testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Obj()
   130  		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
   131  		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
   132  		createdJob := &kftraining.PyTorchJob{}
   133  		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
   134  
   135  		createdWorkload := &kueue.Workload{}
   136  		wlLookupKey := types.NamespacedName{Name: workloadpytorchjob.GetWorkloadNameForPyTorchJob(jobName), Namespace: ns.Name}
   137  		gomega.Eventually(func() bool {
   138  			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
   139  		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
   140  
   141  		ginkgo.By("checking the workload is created when queue name is set")
   142  		createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
   143  		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
   144  		gomega.Eventually(func() error {
   145  			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
   146  		}, util.Timeout, util.Interval).Should(gomega.Succeed())
   147  	})
   148  
   149  	ginkgo.When("the queue has admission checks", func() {
   150  		var (
   151  			clusterQueueAc *kueue.ClusterQueue
   152  			localQueue     *kueue.LocalQueue
   153  			testFlavor     *kueue.ResourceFlavor
   154  			jobLookupKey   *types.NamespacedName
   155  			wlLookupKey    *types.NamespacedName
   156  			admissionCheck *kueue.AdmissionCheck
   157  		)
   158  
   159  		ginkgo.BeforeEach(func() {
   160  			admissionCheck = testing.MakeAdmissionCheck("check").ControllerName("ac-controller").Obj()
   161  			gomega.Expect(k8sClient.Create(ctx, admissionCheck)).To(gomega.Succeed())
   162  			util.SetAdmissionCheckActive(ctx, k8sClient, admissionCheck, metav1.ConditionTrue)
   163  			clusterQueueAc = testing.MakeClusterQueue("prod-cq-with-checks").
   164  				ResourceGroup(
   165  					*testing.MakeFlavorQuotas("test-flavor").Resource(corev1.ResourceCPU, "5").Obj(),
   166  				).AdmissionChecks("check").Obj()
   167  			gomega.Expect(k8sClient.Create(ctx, clusterQueueAc)).Should(gomega.Succeed())
   168  			localQueue = testing.MakeLocalQueue("queue", ns.Name).ClusterQueue(clusterQueueAc.Name).Obj()
   169  			gomega.Expect(k8sClient.Create(ctx, localQueue)).To(gomega.Succeed())
   170  			testFlavor = testing.MakeResourceFlavor("test-flavor").Label(instanceKey, "test-flavor").Obj()
   171  			gomega.Expect(k8sClient.Create(ctx, testFlavor)).Should(gomega.Succeed())
   172  
   173  			jobLookupKey = &types.NamespacedName{Name: jobName, Namespace: ns.Name}
   174  			wlLookupKey = &types.NamespacedName{Name: workloadpytorchjob.GetWorkloadNameForPyTorchJob(jobName), Namespace: ns.Name}
   175  		})
   176  
   177  		ginkgo.AfterEach(func() {
   178  			gomega.Expect(util.DeleteAdmissionCheck(ctx, k8sClient, admissionCheck)).To(gomega.Succeed())
   179  			util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, testFlavor, true)
   180  			gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
   181  			util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueueAc, true)
   182  		})
   183  
   184  		ginkgo.It("labels and annotations should be propagated from admission check to job", func() {
   185  			createdJob := &kftraining.PyTorchJob{}
   186  			createdWorkload := &kueue.Workload{}
   187  
   188  			ginkgo.By("creating the job with pod labels & annotations", func() {
   189  				job := testingpytorchjob.MakePyTorchJob(jobName, ns.Name).
   190  					PodAnnotation(kftraining.PyTorchJobReplicaTypeWorker, "old-ann-key", "old-ann-value").
   191  					PodLabel(kftraining.PyTorchJobReplicaTypeWorker, "old-label-key", "old-label-value").
   192  					Queue(localQueue.Name).
   193  					Obj()
   194  				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
   195  			})
   196  
   197  			ginkgo.By("fetch the job and verify it is suspended as the checks are not ready", func() {
   198  				gomega.Eventually(func() *bool {
   199  					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
   200  					return createdJob.Spec.RunPolicy.Suspend
   201  				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
   202  			})
   203  
   204  			ginkgo.By("fetch the created workload", func() {
   205  				gomega.Eventually(func() error {
   206  					return k8sClient.Get(ctx, *wlLookupKey, createdWorkload)
   207  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   208  			})
   209  
   210  			ginkgo.By("add labels & annotations to the admission check", func() {
   211  				gomega.Eventually(func() error {
   212  					var newWL kueue.Workload
   213  					gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(createdWorkload), &newWL)).To(gomega.Succeed())
   214  					workload.SetAdmissionCheckState(&newWL.Status.AdmissionChecks, kueue.AdmissionCheckState{
   215  						Name:  "check",
   216  						State: kueue.CheckStateReady,
   217  						PodSetUpdates: []kueue.PodSetUpdate{
   218  							{
   219  								Name: "master",
   220  							},
   221  							{
   222  								Name: "worker",
   223  								Annotations: map[string]string{
   224  									"ann1": "ann-value1",
   225  								},
   226  								Labels: map[string]string{
   227  									"label1": "label-value1",
   228  								},
   229  								NodeSelector: map[string]string{
   230  									"selector1": "selector-value1",
   231  								},
   232  								Tolerations: []corev1.Toleration{
   233  									{
   234  										Key:      "selector1",
   235  										Value:    "selector-value1",
   236  										Operator: corev1.TolerationOpEqual,
   237  										Effect:   corev1.TaintEffectNoSchedule,
   238  									},
   239  								},
   240  							},
   241  						},
   242  					})
   243  					return k8sClient.Status().Update(ctx, &newWL)
   244  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   245  			})
   246  
   247  			ginkgo.By("admit the workload", func() {
   248  				admission := testing.MakeAdmission(clusterQueueAc.Name).
   249  					PodSets(
   250  						kueue.PodSetAssignment{
   251  							Name: "master",
   252  							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
   253  								corev1.ResourceCPU: "test-flavor",
   254  							},
   255  							Count: ptr.To(createdWorkload.Spec.PodSets[0].Count),
   256  						},
   257  						kueue.PodSetAssignment{
   258  							Name: "worker",
   259  							Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
   260  								corev1.ResourceCPU: "test-flavor",
   261  							},
   262  							Count: ptr.To(createdWorkload.Spec.PodSets[1].Count),
   263  						},
   264  					).
   265  					Obj()
   266  				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
   267  				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
   268  				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
   269  			})
   270  
   271  			ginkgo.By("await for the job to start", func() {
   272  				gomega.Eventually(func() *bool {
   273  					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
   274  					return createdJob.Spec.RunPolicy.Suspend
   275  				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
   276  			})
   277  
   278  			ginkgo.By("verify the PodSetUpdates are propagated to the running job", func() {
   279  				worker := createdJob.Spec.PyTorchReplicaSpecs[kftraining.PyTorchJobReplicaTypeWorker].Template
   280  				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value1"))
   281  				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
   282  				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value1"))
   283  				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
   284  				gomega.Expect(worker.Spec.NodeSelector).Should(gomega.HaveKeyWithValue(instanceKey, "test-flavor"))
   285  				gomega.Expect(worker.Spec.NodeSelector).Should(gomega.HaveKeyWithValue("selector1", "selector-value1"))
   286  				gomega.Expect(worker.Spec.Tolerations).Should(gomega.BeComparableTo(
   287  					[]corev1.Toleration{
   288  						{
   289  							Key:      "selector1",
   290  							Value:    "selector-value1",
   291  							Operator: corev1.TolerationOpEqual,
   292  							Effect:   corev1.TaintEffectNoSchedule,
   293  						},
   294  					},
   295  				))
   296  			})
   297  
   298  			ginkgo.By("delete the localQueue to prevent readmission", func() {
   299  				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
   300  			})
   301  
   302  			ginkgo.By("clear the workload's admission to stop the job", func() {
   303  				gomega.Expect(k8sClient.Get(ctx, *wlLookupKey, createdWorkload)).Should(gomega.Succeed())
   304  				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)).Should(gomega.Succeed())
   305  				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
   306  			})
   307  
   308  			ginkgo.By("await for the job to be suspended", func() {
   309  				gomega.Eventually(func() *bool {
   310  					gomega.Expect(k8sClient.Get(ctx, *jobLookupKey, createdJob)).Should(gomega.Succeed())
   311  					return createdJob.Spec.RunPolicy.Suspend
   312  				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
   313  			})
   314  
   315  			ginkgo.By("verify the PodSetUpdates are restored", func() {
   316  				worker := createdJob.Spec.PyTorchReplicaSpecs[kftraining.PyTorchJobReplicaTypeWorker].Template
   317  				gomega.Expect(worker.Annotations).ShouldNot(gomega.HaveKey("ann1"))
   318  				gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value"))
   319  				gomega.Expect(worker.Labels).ShouldNot(gomega.HaveKey("label1"))
   320  				gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("old-label-key", "old-label-value"))
   321  				gomega.Expect(worker.Spec.NodeSelector).ShouldNot(gomega.HaveKey(instanceKey))
   322  				gomega.Expect(worker.Spec.NodeSelector).ShouldNot(gomega.HaveKey("selector1"))
   323  			})
   324  		})
   325  	})
   326  })
   327  
   328  var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
   329  	var (
   330  		ns            *corev1.Namespace
   331  		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
   332  	)
   333  
   334  	ginkgo.BeforeAll(func() {
   335  		fwk = &framework.Framework{
   336  			CRDPath:     crdPath,
   337  			DepCRDPaths: []string{pytorchCrdPath},
   338  		}
   339  		cfg := fwk.Init()
   340  		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))
   341  
   342  		ginkgo.By("Create a resource flavor")
   343  		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
   344  	})
   345  	ginkgo.AfterAll(func() {
   346  		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
   347  		fwk.Teardown()
   348  	})
   349  
   350  	ginkgo.BeforeEach(func() {
   351  		ns = &corev1.Namespace{
   352  			ObjectMeta: metav1.ObjectMeta{
   353  				GenerateName: "core-",
   354  			},
   355  		}
   356  		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
   357  	})
   358  	ginkgo.AfterEach(func() {
   359  		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
   360  	})
   361  
   362  	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
   363  		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
   364  			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Parallelism(2).Obj())}
   365  			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(&kftraining.PyTorchJob{})}
   366  
   367  			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
   368  				{
   369  					RoleName:    kftraining.PyTorchJobReplicaTypeMaster,
   370  					ResourceCPU: "default",
   371  				},
   372  				{
   373  					RoleName:    kftraining.PyTorchJobReplicaTypeWorker,
   374  					ResourceCPU: "default",
   375  				},
   376  			})
   377  		},
   378  		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
   379  			WantCondition: &metav1.Condition{
   380  				Type:    kueue.WorkloadPodsReady,
   381  				Status:  metav1.ConditionFalse,
   382  				Reason:  "PodsReady",
   383  				Message: "Not all pods are ready or succeeded",
   384  			},
   385  		}),
   386  		ginkgo.Entry("Running PyTorchJob", kftesting.PodsReadyTestSpec{
   387  			JobStatus: kftraining.JobStatus{
   388  				Conditions: []kftraining.JobCondition{
   389  					{
   390  						Type:   kftraining.JobRunning,
   391  						Status: corev1.ConditionTrue,
   392  						Reason: "Running",
   393  					},
   394  				},
   395  			},
   396  			WantCondition: &metav1.Condition{
   397  				Type:    kueue.WorkloadPodsReady,
   398  				Status:  metav1.ConditionTrue,
   399  				Reason:  "PodsReady",
   400  				Message: "All pods were ready or succeeded since the workload admission",
   401  			},
   402  		}),
   403  		ginkgo.Entry("Running PyTorchJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
   404  			BeforeCondition: &metav1.Condition{
   405  				Type:    kueue.WorkloadPodsReady,
   406  				Status:  metav1.ConditionFalse,
   407  				Reason:  "PodsReady",
   408  				Message: "Not all pods are ready or succeeded",
   409  			},
   410  			JobStatus: kftraining.JobStatus{
   411  				Conditions: []kftraining.JobCondition{
   412  					{
   413  						Type:   kftraining.JobRunning,
   414  						Status: corev1.ConditionTrue,
   415  						Reason: "Running",
   416  					},
   417  				},
   418  			},
   419  			WantCondition: &metav1.Condition{
   420  				Type:    kueue.WorkloadPodsReady,
   421  				Status:  metav1.ConditionTrue,
   422  				Reason:  "PodsReady",
   423  				Message: "All pods were ready or succeeded since the workload admission",
   424  			},
   425  		}),
   426  		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
   427  			BeforeJobStatus: &kftraining.JobStatus{
   428  				Conditions: []kftraining.JobCondition{
   429  					{
   430  						Type:   kftraining.JobRunning,
   431  						Status: corev1.ConditionTrue,
   432  						Reason: "Running",
   433  					},
   434  				},
   435  			},
   436  			BeforeCondition: &metav1.Condition{
   437  				Type:    kueue.WorkloadPodsReady,
   438  				Status:  metav1.ConditionTrue,
   439  				Reason:  "PodsReady",
   440  				Message: "All pods were ready or succeeded since the workload admission",
   441  			},
   442  			JobStatus: kftraining.JobStatus{
   443  				Conditions: []kftraining.JobCondition{
   444  					{
   445  						Type:   kftraining.JobRunning,
   446  						Status: corev1.ConditionFalse,
   447  						Reason: "Suspended",
   448  					},
   449  				},
   450  			},
   451  			Suspended: true,
   452  			WantCondition: &metav1.Condition{
   453  				Type:    kueue.WorkloadPodsReady,
   454  				Status:  metav1.ConditionFalse,
   455  				Reason:  "PodsReady",
   456  				Message: "Not all pods are ready or succeeded",
   457  			},
   458  		}),
   459  	)
   460  })
   461  
   462  var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
   463  	var (
   464  		ns                  *corev1.Namespace
   465  		onDemandFlavor      *kueue.ResourceFlavor
   466  		spotUntaintedFlavor *kueue.ResourceFlavor
   467  		clusterQueue        *kueue.ClusterQueue
   468  		localQueue          *kueue.LocalQueue
   469  	)
   470  
   471  	ginkgo.BeforeAll(func() {
   472  		fwk = &framework.Framework{
   473  			CRDPath:     crdPath,
   474  			DepCRDPaths: []string{pytorchCrdPath},
   475  		}
   476  		cfg := fwk.Init()
   477  		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
   478  	})
   479  	ginkgo.AfterAll(func() {
   480  		fwk.Teardown()
   481  	})
   482  
   483  	ginkgo.BeforeEach(func() {
   484  		ns = &corev1.Namespace{
   485  			ObjectMeta: metav1.ObjectMeta{
   486  				GenerateName: "core-",
   487  			},
   488  		}
   489  		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
   490  
   491  		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
   492  		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())
   493  
   494  		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
   495  		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())
   496  
   497  		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
   498  			ResourceGroup(
   499  				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
   500  				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
   501  			).Obj()
   502  		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
   503  	})
   504  	ginkgo.AfterEach(func() {
   505  		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
   506  		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
   507  		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
   508  		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
   509  	})
   510  
   511  	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
   512  		ginkgo.By("creating localQueue")
   513  		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
   514  		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())
   515  
   516  		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(
   517  			testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Queue(localQueue.Name).
   518  				Request(kftraining.PyTorchJobReplicaTypeMaster, corev1.ResourceCPU, "3").
   519  				Request(kftraining.PyTorchJobReplicaTypeWorker, corev1.ResourceCPU, "4").
   520  				Obj(),
   521  		)}
   522  		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpytorchjob.JobControl)(&kftraining.PyTorchJob{})}
   523  
   524  		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
   525  			{
   526  				RoleName:    kftraining.PyTorchJobReplicaTypeMaster,
   527  				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
   528  			},
   529  			{
   530  				RoleName:    kftraining.PyTorchJobReplicaTypeWorker,
   531  				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
   532  			},
   533  		})
   534  	})
   535  
   536  	ginkgo.When("The workload's admission is removed", func() {
   537  		ginkgo.It("Should restore the original node selectors", func() {
   538  
   539  			localQueue := testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
   540  			job := testingpytorchjob.MakePyTorchJob(jobName, ns.Name).Queue(localQueue.Name).
   541  				Request(kftraining.PyTorchJobReplicaTypeMaster, corev1.ResourceCPU, "3").
   542  				Request(kftraining.PyTorchJobReplicaTypeWorker, corev1.ResourceCPU, "4").
   543  				Obj()
   544  			lookupKey := types.NamespacedName{Name: job.Name, Namespace: job.Namespace}
   545  			createdJob := &kftraining.PyTorchJob{}
   546  
   547  			nodeSelectors := func(j *kftraining.PyTorchJob) map[kftraining.ReplicaType]map[string]string {
   548  				ret := map[kftraining.ReplicaType]map[string]string{}
   549  				for k := range j.Spec.PyTorchReplicaSpecs {
   550  					ret[k] = j.Spec.PyTorchReplicaSpecs[k].Template.Spec.NodeSelector
   551  				}
   552  				return ret
   553  			}
   554  
   555  			ginkgo.By("create a job", func() {
   556  				gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
   557  			})
   558  
   559  			ginkgo.By("job should be suspend", func() {
   560  				gomega.Eventually(func() *bool {
   561  					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
   562  					return createdJob.Spec.RunPolicy.Suspend
   563  				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(true)))
   564  			})
   565  
   566  			// backup the node selectors
   567  			originalNodeSelectors := nodeSelectors(createdJob)
   568  
   569  			ginkgo.By("create a localQueue", func() {
   570  				gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())
   571  			})
   572  
   573  			ginkgo.By("job should be unsuspended", func() {
   574  				gomega.Eventually(func() *bool {
   575  					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
   576  					return createdJob.Spec.RunPolicy.Suspend
   577  				}, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false)))
   578  			})
   579  
   580  			ginkgo.By("the node selectors should be updated", func() {
   581  				gomega.Eventually(func() map[kftraining.ReplicaType]map[string]string {
   582  					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
   583  					return nodeSelectors(createdJob)
   584  				}, util.Timeout, util.Interval).ShouldNot(gomega.Equal(originalNodeSelectors))
   585  			})
   586  
   587  			ginkgo.By("delete the localQueue to prevent readmission", func() {
   588  				gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, localQueue)).Should(gomega.Succeed())
   589  			})
   590  
   591  			ginkgo.By("clear the workload's admission to stop the job", func() {
   592  				wl := &kueue.Workload{}
   593  				wlKey := types.NamespacedName{Name: workloadpytorchjob.GetWorkloadNameForPyTorchJob(job.Name), Namespace: job.Namespace}
   594  				gomega.Expect(k8sClient.Get(ctx, wlKey, wl)).Should(gomega.Succeed())
   595  				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, wl, nil)).Should(gomega.Succeed())
   596  				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, wl)
   597  			})
   598  
   599  			ginkgo.By("the node selectors should be restored", func() {
   600  				gomega.Eventually(func() map[kftraining.ReplicaType]map[string]string {
   601  					gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
   602  					return nodeSelectors(createdJob)
   603  				}, util.Timeout, util.Interval).Should(gomega.Equal(originalNodeSelectors))
   604  			})
   605  		})
   606  	})
   607  })