sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/xgboostjob/xgboostjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package xgboostjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadxgboostjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/xgboostjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingxgboostjob "sigs.k8s.io/kueue/pkg/util/testingjobs/xgboostjob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

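// Common fixture values shared by the specs in this file. The fwk, cfg, ctx
// and k8sClient variables used below are package-level, presumably declared
// in this package's suite_test.go alongside crdPath and xgbCrdPath, as in
// the sibling Kubeflow job suites.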
const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

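// This suite starts the manager with WithManageJobsWithoutQueueName(true),
// so XGBoostJobs are reconciled by Kueue even when they carry no
// kueue.x-k8s.io/queue-name label.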
var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{xgbCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

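	// The kubeflowjob.KubeflowJob wrapper adapts the typed XGBoostJob to the
	// generic Kubeflow test helpers; ShouldReconcileJob asserts that a Workload
	// is created with Master and Worker pod sets admitted under the given flavors.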
	ginkgo.It("Should reconcile XGBoostJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(testingxgboostjob.MakeXGBoostJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(&kftraining.XGBoostJob{})}
		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeMaster,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

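// This suite enables waitForPodsReady and checks how the Workload's PodsReady
// condition tracks the XGBoostJob's JobRunning condition across the job's
// lifecycle.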
var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{xgbCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

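	// Each entry seeds the XGBoostJob with an initial status (and, optionally,
	// a prior PodsReady condition on the Workload), then asserts the condition
	// the Workload should converge to.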
	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(testingxgboostjob.MakeXGBoostJob(jobName, ns.Name).Parallelism(2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(&kftraining.XGBoostJob{})}
			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.XGBoostJobReplicaTypeMaster,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.XGBoostJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running XGBoostJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running XGBoostJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
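		// Suspending a running job must flip PodsReady back to False, since its
		// pods are torn down while the job waits to be re-admitted.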
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

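// This suite runs the manager together with the scheduler and verifies flavor
// assignment: the ClusterQueue lists spot-untainted before on-demand, so the
// scheduler fills the spot-untainted quota first and falls back to on-demand
// for requests that no longer fit there.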
var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{xgbCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})

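	// The expected assignments below follow from the quota set up in
	// BeforeEach: the Master's 3-CPU request is admitted under spot-untainted,
	// and the Worker's 4-CPU request no longer fits the remaining spot-untainted
	// quota, so it falls back to on-demand.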
	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(
			testingxgboostjob.MakeXGBoostJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.XGBoostJobReplicaTypeMaster, corev1.ResourceCPU, "3").
				Request(kftraining.XGBoostJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(&kftraining.XGBoostJob{})}
		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeMaster,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})