sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/tfjob/tfjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tfjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingtfjob "sigs.k8s.io/kueue/pkg/util/testingjobs/tfjob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{tensorflowCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile TFJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(testingtfjob.MakeTFJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(&kftraining.TFJob{})}

		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.TFJobReplicaTypeChief,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.TFJobReplicaTypePS,
				ResourceCPU: "spot",
			},
			{
				RoleName:    kftraining.TFJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

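// The suite below runs the manager with waitForPodsReady enabled and checks,
// via the shared kftesting table helper, that the Workload's PodsReady
// condition tracks the TFJob's Running condition through the admission,
// running, and suspension stages exercised by the table entries.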
var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{tensorflowCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(testingtfjob.MakeTFJob(jobName, ns.Name).Parallelism(2, 2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(&kftraining.TFJob{})}

			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.TFJobReplicaTypeChief,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.TFJobReplicaTypePS,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.TFJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running TFJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running TFJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

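// The suite below also starts the scheduler (managerAndSchedulerSetup), so
// admitting a TFJob depends on the ClusterQueue quota and flavor assignment
// rather than on reconciliation alone.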
admission", 215 }, 216 JobStatus: kftraining.JobStatus{ 217 Conditions: []kftraining.JobCondition{ 218 { 219 Type: kftraining.JobRunning, 220 Status: corev1.ConditionFalse, 221 Reason: "Suspended", 222 }, 223 }, 224 }, 225 Suspended: true, 226 WantCondition: &metav1.Condition{ 227 Type: kueue.WorkloadPodsReady, 228 Status: metav1.ConditionFalse, 229 Reason: "PodsReady", 230 Message: "Not all pods are ready or succeeded", 231 }, 232 }), 233 ) 234 }) 235 236 var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { 237 var ( 238 ns *corev1.Namespace 239 onDemandFlavor *kueue.ResourceFlavor 240 spotUntaintedFlavor *kueue.ResourceFlavor 241 clusterQueue *kueue.ClusterQueue 242 localQueue *kueue.LocalQueue 243 ) 244 245 ginkgo.BeforeAll(func() { 246 fwk = &framework.Framework{ 247 CRDPath: crdPath, 248 DepCRDPaths: []string{tensorflowCrdPath}, 249 } 250 cfg := fwk.Init() 251 ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup()) 252 }) 253 ginkgo.AfterAll(func() { 254 fwk.Teardown() 255 }) 256 257 ginkgo.BeforeEach(func() { 258 ns = &corev1.Namespace{ 259 ObjectMeta: metav1.ObjectMeta{ 260 GenerateName: "core-", 261 }, 262 } 263 gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) 264 265 onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj() 266 gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed()) 267 268 spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj() 269 gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed()) 270 271 clusterQueue = testing.MakeClusterQueue("dev-clusterqueue"). 272 ResourceGroup( 273 *testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "8").Obj(), 274 *testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(), 275 ).Obj() 276 gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed()) 277 }) 278 ginkgo.AfterEach(func() { 279 gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed()) 280 util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true) 281 util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true) 282 gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed()) 283 }) 284 285 ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() { 286 ginkgo.By("creating localQueue") 287 localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj() 288 gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed()) 289 290 kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)( 291 testingtfjob.MakeTFJob(jobName, ns.Name).Queue(localQueue.Name). 292 Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceCPU, "3"). 293 Request(kftraining.TFJobReplicaTypePS, corev1.ResourceCPU, "4"). 294 Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceCPU, "4"). 
	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(
			testingtfjob.MakeTFJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceCPU, "3").
				Request(kftraining.TFJobReplicaTypePS, corev1.ResourceCPU, "4").
				Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadtfjob.JobControl)(&kftraining.TFJob{})}

		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.TFJobReplicaTypeChief,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.TFJobReplicaTypePS,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.TFJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})