sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/mxjob/mxjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mxjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadmxjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/mxjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingmxjob "sigs.k8s.io/kueue/pkg/util/testingjobs/mxjob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
)

// +kubebuilder:docs-gen:collapse=Imports
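
// fwk, cfg, ctx, and k8sClient, along with crdPath, mxnetCrdPath,
// managerSetup, and managerAndSchedulerSetup, are package-level declarations
// shared by the suites below; they are presumably provided by this package's
// suite setup file (a suite_test.go, following the convention of Kueue's
// other integration test packages), which is not shown here.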

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mxnetCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile MXJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(testingmxjob.MakeMXJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(&kftraining.MXJob{})}

		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.MXJobReplicaTypeScheduler,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeServer,
				ResourceCPU: "spot",
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mxnetCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(testingmxjob.MakeMXJob(jobName, ns.Name).Parallelism(2, 2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(&kftraining.MXJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.MXJobReplicaTypeScheduler,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.MXJobReplicaTypeServer,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.MXJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running MXJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running MXJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})
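
// The suite below also starts the scheduler, so admitted workloads receive
// real flavor assignments. The ClusterQueue created in BeforeEach offers
// 8 CPUs of the spot-untainted flavor and 5 CPUs of on-demand, which the
// scheduling test uses to force a split flavor assignment across the
// MXJob's roles.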

var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{mxnetCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "8").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})
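
	// Expectation behind the flavor assertions below: assuming MakeMXJob's
	// defaults of one replica per role (the wrapper's defaults are not shown
	// in this file), the scheduler (3 CPUs) plus the server (4 CPUs) fit
	// within the 8-CPU spot-untainted quota, while adding the worker
	// (4 CPUs) would exceed it, so the worker should fall back to the
	// on-demand flavor.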
	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(
			testingmxjob.MakeMXJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.MXJobReplicaTypeScheduler, corev1.ResourceCPU, "3").
				Request(kftraining.MXJobReplicaTypeServer, corev1.ResourceCPU, "4").
				Request(kftraining.MXJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadmxjob.JobControl)(&kftraining.MXJob{})}

		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.MXJobReplicaTypeScheduler,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeServer,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.MXJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})
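
// Running this file on its own requires the envtest setup (the CRDs at
// crdPath and mxnetCrdPath, plus kubebuilder assets) that framework.Framework
// expects. In the Kueue repository the integration suites are normally driven
// through the repo's integration-test make target rather than `go test`
// directly; that target is an assumption based on Kueue's tooling
// conventions, not something this file shows.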