sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/tfjob/tfjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tfjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingtfjob "sigs.k8s.io/kueue/pkg/util/testingjobs/tfjob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

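// Shared fixture names, plus the node-label key that the ResourceFlavors in
// these suites use to emulate distinct instance types.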
const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

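// This suite runs the TFJob reconciler with manageJobsWithoutQueueName
// enabled, so TFJobs are managed by Kueue even when they carry no
// kueue.x-k8s.io/queue-name label.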
var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{tensorflowCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile TFJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(testingtfjob.MakeTFJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(&kftraining.TFJob{})}

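		// kftesting.ShouldReconcileJob runs the generic kubeflow reconcile
		// assertions; each PodSetsResource entry names the CPU flavor expected
		// for the corresponding replica type.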
		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.TFJobReplicaTypeChief,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.TFJobReplicaTypePS,
				ResourceCPU: "spot",
			},
			{
				RoleName:    kftraining.TFJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

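// This suite enables waitForPodsReady and exercises the PodsReady condition
// on the workload as the TFJob's own status moves between running and
// suspended.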
var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{tensorflowCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

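	// Each table entry seeds an optional starting condition and job status,
	// applies a new TFJob status, and asserts the resulting PodsReady
	// condition on the workload via the shared kubeflow test helper.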
	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(testingtfjob.MakeTFJob(jobName, ns.Name).Parallelism(2, 2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(&kftraining.TFJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.TFJobReplicaTypeChief,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.TFJobReplicaTypePS,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.TFJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running TFJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running TFJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

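// This suite wires the job controller together with the Kueue scheduler and
// verifies flavor assignment against the ClusterQueue quotas defined below.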
var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{tensorflowCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "8").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(
			testingtfjob.MakeTFJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceCPU, "3").
				Request(kftraining.TFJobReplicaTypePS, corev1.ResourceCPU, "4").
				Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(&kftraining.TFJob{})}

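		// With the quotas above, spot-untainted (8 CPU) fits the Chief (3) and
		// PS (4) requests but not the Worker on top of them, so the Worker (4)
		// lands on on-demand (5 CPU). This assumes the testing helper's
		// default of a single replica per role.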
		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.TFJobReplicaTypeChief,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.TFJobReplicaTypePS,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.TFJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})