sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/paddlejob/paddlejob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
    16  
    17  package paddlejob
    18  
    19  import (
    20  	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    21  	"github.com/onsi/ginkgo/v2"
    22  	"github.com/onsi/gomega"
    23  	corev1 "k8s.io/api/core/v1"
    24  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    25  
    26  	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
    27  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    28  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    29  	workloadpaddlejob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/paddlejob"
    30  	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
    31  
    32  	"sigs.k8s.io/kueue/pkg/util/testing"
    33  	testingpaddlejob "sigs.k8s.io/kueue/pkg/util/testingjobs/paddlejob"
    34  	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
    35  	"sigs.k8s.io/kueue/test/integration/framework"
    36  	"sigs.k8s.io/kueue/test/util"
    37  )
    38  
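// instanceKey is the node label that the ResourceFlavors in these suites use
// to target instance types; the remaining constants name shared test fixtures.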
const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

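// This suite runs the job controller with manageJobsWithoutQueueName enabled,
// so PaddleJobs are reconciled even when they carry no kueue queue-name label.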
var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{paddleCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

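	// Hands a freshly built PaddleJob to the shared Kubeflow helper, which drives
	// the reconcile flow and expects the listed flavor to end up assigned to the
	// CPU request of each replica type.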
	ginkgo.It("Should reconcile PaddleJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(testingpaddlejob.MakePaddleJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(&kftraining.PaddleJob{})}

		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.PaddleJobReplicaTypeMaster,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.PaddleJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

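// This suite enables waitForPodsReady and checks how the WorkloadPodsReady
// condition on the workload tracks the PaddleJob's own status transitions.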
var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{paddleCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

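	// Each table entry applies the given PaddleJob status (optionally after a
	// "before" status and condition) and asserts the expected WorkloadPodsReady
	// condition, plus whether the job ends up suspended.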
	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(testingpaddlejob.MakePaddleJob(jobName, ns.Name).Parallelism(2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(&kftraining.PaddleJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.PaddleJobReplicaTypeMaster,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.PaddleJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running PaddleJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running PaddleJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

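// This suite runs the job controller together with the scheduler, so admission
// is driven by the flavors and cluster queue created in BeforeEach.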
var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{paddleCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})

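	// The cluster queue offers 5 CPUs per flavor with spot-untainted listed first,
	// so the master's 3-CPU request is expected to fit spot-untainted, while the
	// worker's 4-CPU request no longer fits what remains and falls back to on-demand.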
	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(
			testingpaddlejob.MakePaddleJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.PaddleJobReplicaTypeMaster, corev1.ResourceCPU, "3").
				Request(kftraining.PaddleJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(&kftraining.PaddleJob{})}

		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.PaddleJobReplicaTypeMaster,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.PaddleJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})