sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/mxjob/mxjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mxjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadmxjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/mxjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingmxjob "sigs.k8s.io/kueue/pkg/util/testingjobs/mxjob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

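// Shared fixtures for the specs in this file; instanceKey is the node label
// that the test ResourceFlavors below are labeled with.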
const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
)

// +kubebuilder:docs-gen:collapse=Imports

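// This suite runs the MXJob controller with manageJobsWithoutQueueName
// enabled, so Kueue manages MXJobs even when they carry no queue-name label.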
var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mxnetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile MXJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(testingmxjob.MakeMXJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(&kftraining.MXJob{})}

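		// Each PodSetsResource entry pairs an MXJob replica type with the
		// resource flavor the shared helper expects to see assigned for that
		// pod set's CPU request.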
		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.MXJobReplicaTypeScheduler,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeServer,
				ResourceCPU: "spot",
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

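// This suite enables waitForPodsReady and verifies how the workload's
// PodsReady condition tracks the MXJob's own status transitions.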
var _ = ginkgo.Describe("Job controller when waitForPodsReady is enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mxnetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

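	// Each table entry models one stage of the job lifecycle: the optional
	// Before* fields describe the initial state, JobStatus is the status the
	// MXJob is driven to, and WantCondition is the PodsReady condition the
	// workload is expected to report afterwards.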
	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(testingmxjob.MakeMXJob(jobName, ns.Name).Parallelism(2, 2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(&kftraining.MXJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.MXJobReplicaTypeScheduler,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.MXJobReplicaTypeServer,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.MXJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running MXJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running MXJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

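// This suite runs the job controller together with the scheduler and checks
// that admitted MXJobs receive flavor assignments that respect the
// ClusterQueue's quotas.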
var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mxnetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "8").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

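		// The ClusterQueue above offers 8 CPUs of spot-untainted quota and
		// 5 CPUs of on-demand quota; the job below requests 3 (Scheduler),
		// 4 (Server), and 4 (Worker) CPUs.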
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(
			testingmxjob.MakeMXJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.MXJobReplicaTypeScheduler, corev1.ResourceCPU, "3").
				Request(kftraining.MXJobReplicaTypeServer, corev1.ResourceCPU, "4").
				Request(kftraining.MXJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(&kftraining.MXJob{})}

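		// spot-untainted is listed first in the ResourceGroup, so it is tried
		// first: Scheduler (3) and Server (4) fit within its 8 CPUs, while the
		// Worker's 4 CPUs no longer do and should fall back to on-demand.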
		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.MXJobReplicaTypeScheduler,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeServer,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})