sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/rayjob/rayjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rayjob

import (
	"fmt"

	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	rayjobapi "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadrayjob "sigs.k8s.io/kueue/pkg/controller/jobs/rayjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingrayjob "sigs.k8s.io/kueue/pkg/util/testingjobs/rayjob"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName                 = "test-job"
	instanceKey             = "cloud.provider.com/instance"
	priorityClassName       = "test-priority-class"
	priorityValue     int32 = 10
)

var (
	ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
)

// +kubebuilder:docs-gen:collapse=Imports

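// setInitStatus simulates the KubeRay operator's initial reconciliation by
// marking the RayJob's deployment status as suspended; no real KubeRay
// operator runs in this integration environment.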
func setInitStatus(name, namespace string) {
	createdJob := &rayjobapi.RayJob{}
	nsName := types.NamespacedName{Name: name, Namespace: namespace}
	gomega.EventuallyWithOffset(1, func() error {
		if err := k8sClient.Get(ctx, nsName, createdJob); err != nil {
			return err
		}
		createdJob.Status.JobDeploymentStatus = rayjobapi.JobDeploymentStatusSuspended
		return k8sClient.Status().Update(ctx, createdJob)
	}, util.Timeout, util.Interval).Should(gomega.Succeed())
}

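// This suite runs the manager with manageJobsWithoutQueueName enabled, so
// RayJobs are reconciled even when they carry no queue-name label or
// annotation.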
var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}

		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile RayJobs", func() {
		ginkgo.By("checking the job gets suspended when created unsuspended")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(priorityValue).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())

		job := testingrayjob.MakeJob(jobName, ns.Name).
			Suspend(false).
			WithPriorityClassName(priorityClassName).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).To(gomega.Succeed())
		createdJob := &rayjobapi.RayJob{}

		setInitStatus(jobName, ns.Name)
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: ns.Name}, createdJob); err != nil {
				return false
			}
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created without queue assigned")
		createdWorkload := &kueue.Workload{}
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
		gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set")
		gomega.Expect(metav1.IsControlledBy(createdWorkload, createdJob)).To(gomega.BeTrue(), "The Workload should be owned by the Job")

		ginkgo.By("checking the workload is created with priority and priorityName")
		gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName))
		gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(priorityValue))

		ginkgo.By("checking the workload queue name is updated when the job's queue annotation is set")
		jobQueueName := "test-queue"
		createdJob.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.QueueName == jobQueueName
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking a second non-matching workload is deleted")
		secondWl := &kueue.Workload{
			ObjectMeta: metav1.ObjectMeta{
				Name:      workloadrayjob.GetWorkloadNameForRayJob("second-workload"),
				Namespace: createdWorkload.Namespace,
			},
			Spec: *createdWorkload.Spec.DeepCopy(),
		}
		gomega.Expect(ctrl.SetControllerReference(createdJob, secondWl, scheme.Scheme)).Should(gomega.Succeed())
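		// Bump the pod count so the second workload no longer matches the job's
		// pod sets; the controller is expected to delete such stale workloads.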
		secondWl.Spec.PodSets[0].Count++

		gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			wl := &kueue.Workload{}
			key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace}
			return k8sClient.Get(ctx, key, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// Check that the original workload is still there.
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())

		ginkgo.By("checking the job is unsuspended when workload is assigned")
		onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())
		spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotFlavor)).Should(gomega.Succeed())
		clusterQueue := testing.MakeClusterQueue("cluster-queue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
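		// PodSets[0] is the RayJob's head group and PodSets[1] its worker group;
		// assign the head to the on-demand flavor and the workers to spot.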
		admission := testing.MakeAdmission(clusterQueue.Name).PodSets(
			kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[0].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "on-demand",
				},
			}, kueue.PodSetAssignment{
				Name: createdWorkload.Spec.PodSets[1].Name,
				Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
					corev1.ResourceCPU: "spot",
				},
			},
		).Obj()
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector).Should(gomega.HaveLen(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector).Should(gomega.HaveLen(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the job gets suspended when parallelism changes and the added node selectors are removed")
		parallelism := ptr.Deref(job.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas, 1)
		newParallelism := parallelism + 1
		createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Replicas = &newParallelism
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return createdJob.Spec.Suspend && len(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector) == 0
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Eventually(func() bool {
			ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String()))
			return ok
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is updated with new count")
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return createdWorkload.Spec.PodSets[1].Count == newParallelism
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())

		ginkgo.By("checking the job is unsuspended and selectors added when workload is assigned again")
		gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil {
				return false
			}
			return !createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector).Should(gomega.HaveLen(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector).Should(gomega.HaveLen(1))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name))
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadQuotaReserved)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is finished when job is completed")
		createdJob.Status.JobDeploymentStatus = rayjobapi.JobDeploymentStatusComplete
		createdJob.Status.JobStatus = rayjobapi.JobStatusSucceeded
		createdJob.Status.Message = "Job finished by test"

		gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed())
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadFinished)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	})
})

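// This suite runs the manager with the default setup, so only RayJobs that
// carry a queue name are reconciled.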
var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue are managed", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile jobs only when queue is set", func() {
		ginkgo.By("checking the workload is not created when queue name is not set")
		job := testingrayjob.MakeJob(jobName, ns.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
		createdJob := &rayjobapi.RayJob{}
		setInitStatus(jobName, ns.Name)
		gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

		createdWorkload := &kueue.Workload{}
		wlLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(jobName), Namespace: ns.Name}
		gomega.Consistently(func() bool {
			return apierrors.IsNotFound(k8sClient.Get(ctx, wlLookupKey, createdWorkload))
		}, util.ConsistentDuration, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("checking the workload is created when queue name is set")
		jobQueueName := "test-queue"
		if createdJob.Labels == nil {
			createdJob.Labels = make(map[string]string)
		}
		createdJob.Labels[constants.QueueLabel] = jobQueueName
		gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed())
		gomega.Eventually(func() error {
			return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})
})

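// This suite enables waitForPodsReady, which maintains a PodsReady condition
// on admitted workloads based on the RayJob's reported status.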
var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
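	// podsReadyTestSpec describes one table entry: the optional job status and
	// PodsReady condition to establish up front, the status update to apply,
	// whether the workload's quota reservation is then removed, and the expected
	// final PodsReady condition.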
	type podsReadyTestSpec struct {
		beforeJobStatus *rayjobapi.RayJobStatus
		beforeCondition *metav1.Condition
		jobStatus       rayjobapi.RayJobStatus
		suspended       bool
		wantCondition   *metav1.Condition
	}

	var defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})

	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	var (
		ns          *corev1.Namespace
		wlLookupKey types.NamespacedName
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		wlLookupKey = types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(jobName), Namespace: ns.Name}
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec podsReadyTestSpec) {
			ginkgo.By("Create a job")
			job := testingrayjob.MakeJob(jobName, ns.Name).Obj()
			jobQueueName := "test-queue"
			job.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName}
			gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
			lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name}
			setInitStatus(jobName, ns.Name)
			createdJob := &rayjobapi.RayJob{}
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			ginkgo.By("Fetch the workload created for the job")
			createdWorkload := &kueue.Workload{}
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("Admit the workload created for the job")
			admission := testing.MakeAdmission("foo").PodSets(
				kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[0].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				}, kueue.PodSetAssignment{
					Name: createdWorkload.Spec.PodSets[1].Name,
					Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
						corev1.ResourceCPU: "default",
					},
				},
			).Obj()
			gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
			util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())

			ginkgo.By("Wait for the job to be unsuspended")
			gomega.Eventually(func() bool {
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
				return createdJob.Spec.Suspend
			}, util.Timeout, util.Interval).Should(gomega.BeFalse())

			if podsReadyTestSpec.beforeJobStatus != nil {
				ginkgo.By("Update the job status to simulate its initial progress towards completion")
				createdJob.Status = *podsReadyTestSpec.beforeJobStatus
				gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
				gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())
			}

			if podsReadyTestSpec.beforeCondition != nil {
				ginkgo.By("Update the workload status")
				gomega.Eventually(func() *metav1.Condition {
					gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
					return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
				}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.beforeCondition, ignoreConditionTimestamps))
			}

			ginkgo.By("Update the job status to simulate its progress towards completion")
			createdJob.Status = podsReadyTestSpec.jobStatus
			gomega.Expect(k8sClient.Status().Update(ctx, createdJob)).Should(gomega.Succeed())
			gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed())

			if podsReadyTestSpec.suspended {
				ginkgo.By("Unset admission of the workload to suspend the job")
				gomega.Eventually(func() error {
					// The update may need to be retried due to a conflict, because the
					// workload is also updated when the job status is set.
					if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
						return err
					}
					return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
			}

			ginkgo.By("Verify the PodsReady condition is added")
			gomega.Eventually(func() *metav1.Condition {
				gomega.Expect(k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
				return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.wantCondition, ignoreConditionTimestamps))
		},
		ginkgo.Entry("No progress", podsReadyTestSpec{
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running RayJob", podsReadyTestSpec{
			jobStatus: rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusRunning,
				RayClusterStatus: rayjobapi.RayClusterStatus{
					State: rayjobapi.Ready,
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running RayJob; PodsReady=False before", podsReadyTestSpec{
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			jobStatus: rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusRunning,
				RayClusterStatus: rayjobapi.RayClusterStatus{
					State: rayjobapi.Ready,
				},
			},
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", podsReadyTestSpec{
			beforeJobStatus: &rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusRunning,
				RayClusterStatus: rayjobapi.RayClusterStatus{
					State: rayjobapi.Ready,
				},
			},
			beforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			jobStatus: rayjobapi.RayJobStatus{
				JobDeploymentStatus: rayjobapi.JobDeploymentStatusSuspended,
			},
			suspended: true,
			wantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

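// This suite runs both the manager and the scheduler, so workloads are
// admitted through real ClusterQueue/LocalQueue scheduling rather than
// through manually crafted Admission objects.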
var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, spotUntaintedFlavor, true)
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("checking a dev job starts")
		job := testingrayjob.MakeJob("dev-job", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "3").
			RequestWorkerGroup(corev1.ResourceCPU, "4").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed())
		setInitStatus(job.Name, job.Namespace)
		createdJob := &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job.Name, Namespace: job.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
		gomega.Expect(createdJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name))
		gomega.Expect(createdJob.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name))
		util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
		util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
	})
})

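// This suite exercises preemption: the ClusterQueue is configured to preempt
// lower priority workloads within the same queue when quota runs out.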
var _ = ginkgo.Describe("Job controller with preemption enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{rayCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns             *corev1.Namespace
		onDemandFlavor *kueue.ResourceFlavor
		clusterQueue   *kueue.ClusterQueue
		localQueue     *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "4").Obj(),
			).
			Preemption(kueue.ClusterQueuePreemption{
				WithinClusterQueue: kueue.PreemptionPolicyLowerPriority,
			}).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())

		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		ginkgo.By("creating priority")
		priorityClass := testing.MakePriorityClass(priorityClassName).
			PriorityValue(priorityValue).Obj()
		gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
	})

	ginkgo.It("Should preempt lower priority rayJobs when resources are insufficient", func() {
		ginkgo.By("Create a low priority rayJob")
		lowPriorityJob := testingrayjob.MakeJob("rayjob-with-low-priority", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "1").
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, lowPriorityJob)).Should(gomega.Succeed())
		setInitStatus(lowPriorityJob.Name, lowPriorityJob.Namespace)

		ginkgo.By("Wait for the low priority workload to be admitted")
		createdJob := &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())

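		// The ClusterQueue has only 4 CPUs of quota and the low priority job
		// already reserves part of it, so this job (2 CPUs head + 2 CPUs worker
		// group) can only be admitted by preempting the low priority one.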
		ginkgo.By("Create a high priority rayJob which will preempt the lower one")
		highPriorityJob := testingrayjob.MakeJob("rayjob-with-high-priority", ns.Name).Queue(localQueue.Name).
			RequestHead(corev1.ResourceCPU, "2").
			WithPriorityClassName(priorityClassName).
			RequestWorkerGroup(corev1.ResourceCPU, "2").
			Obj()
		gomega.Expect(k8sClient.Create(ctx, highPriorityJob)).Should(gomega.Succeed())
		setInitStatus(highPriorityJob.Name, highPriorityJob.Namespace)

		ginkgo.By("High priority workload should be admitted")
		highPriorityWL := &kueue.Workload{}
		highPriorityLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(highPriorityJob.Name), Namespace: ns.Name}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, highPriorityLookupKey, highPriorityWL); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(highPriorityWL.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority workload should not be admitted")
		createdWorkload := &kueue.Workload{}
		lowPriorityLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(lowPriorityJob.Name), Namespace: ns.Name}

		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lowPriorityLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionFalse(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority rayJob should be suspended")
		createdJob = &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Delete high priority rayjob")
		gomega.Expect(k8sClient.Delete(ctx, highPriorityJob)).To(gomega.Succeed())
		gomega.Eventually(func() error {
			rayjob := &rayjobapi.RayJob{}
			return k8sClient.Get(ctx, client.ObjectKeyFromObject(highPriorityJob), rayjob)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
		// Manually delete the workload because the test environment runs no
		// garbage collection controller.
		gomega.Expect(k8sClient.Delete(ctx, highPriorityWL)).To(gomega.Succeed())
		gomega.Eventually(func() error {
			wl := &kueue.Workload{}
			return k8sClient.Get(ctx, highPriorityLookupKey, wl)
		}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())

		ginkgo.By("Low priority workload should be admitted again")
		createdWorkload = &kueue.Workload{}
		gomega.Eventually(func() bool {
			if err := k8sClient.Get(ctx, lowPriorityLookupKey, createdWorkload); err != nil {
				return false
			}
			return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)
		}, util.Timeout, util.Interval).Should(gomega.BeTrue())

		ginkgo.By("Low priority rayJob should be unsuspended")
		createdJob = &rayjobapi.RayJob{}
		gomega.Eventually(func() bool {
			gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: lowPriorityJob.Name, Namespace: lowPriorityJob.Namespace}, createdJob)).
				Should(gomega.Succeed())
			return createdJob.Spec.Suspend
		}, util.Timeout, util.Interval).Should(gomega.BeFalse())
	})
})