sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/paddlejob/paddlejob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package paddlejob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadpaddlejob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/paddlejob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"

	"sigs.k8s.io/kueue/pkg/util/testing"
	testingpaddlejob "sigs.k8s.io/kueue/pkg/util/testingjobs/paddlejob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{paddleCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile PaddleJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(testingpaddlejob.MakePaddleJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(&kftraining.PaddleJob{})}

		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.PaddleJobReplicaTypeMaster,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.PaddleJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{paddleCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(testingpaddlejob.MakePaddleJob(jobName, ns.Name).Parallelism(2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(&kftraining.PaddleJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.PaddleJobReplicaTypeMaster,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.PaddleJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running PaddleJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running PaddleJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{paddleCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(
			testingpaddlejob.MakePaddleJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.PaddleJobReplicaTypeMaster, corev1.ResourceCPU, "3").
				Request(kftraining.PaddleJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadpaddlejob.JobControl)(&kftraining.PaddleJob{})}

		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.PaddleJobReplicaTypeMaster,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.PaddleJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})