sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/xgboostjob/xgboostjob_controller_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package xgboostjob

import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	workloadxgboostjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/xgboostjob"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"
	testingxgboostjob "sigs.k8s.io/kueue/pkg/util/testingjobs/xgboostjob"
	kftesting "sigs.k8s.io/kueue/test/integration/controller/jobs/kubeflow"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

const (
	jobName           = "test-job"
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{xgbCrdPath},
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithManageJobsWithoutQueueName(true)))
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	var (
		ns *corev1.Namespace
	)
	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.It("Should reconcile XGBoostJobs", func() {
		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(testingxgboostjob.MakeXGBoostJob(jobName, ns.Name).Obj())}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(&kftraining.XGBoostJob{})}
		kftesting.ShouldReconcileJob(ctx, k8sClient, kfJob, createdJob, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeMaster,
				ResourceCPU: "on-demand",
			},
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeWorker,
				ResourceCPU: "spot",
			},
		})
	})
})

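// This suite re-initializes the test framework with waitForPodsReady enabled
// and walks a single XGBoostJob through the transitions of the workload's
// PodsReady condition, passing each table entry's JobStatus fixture to the
// shared kftesting helper.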
var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns            *corev1.Namespace
		defaultFlavor = testing.MakeResourceFlavor("default").Label(instanceKey, "default").Obj()
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{xgbCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerSetup(jobframework.WithWaitForPodsReady(&configapi.WaitForPodsReady{Enable: true})))

		ginkgo.By("Create a resource flavor")
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).Should(gomega.Succeed())
	})
	ginkgo.AfterAll(func() {
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, defaultFlavor, true)
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
	})

	ginkgo.DescribeTable("Single job at different stages of progress towards completion",
		func(podsReadyTestSpec kftesting.PodsReadyTestSpec) {
			kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(testingxgboostjob.MakeXGBoostJob(jobName, ns.Name).Parallelism(2).Obj())}
			createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(&kftraining.XGBoostJob{})}
			kftesting.JobControllerWhenWaitForPodsReadyEnabled(ctx, k8sClient, kfJob, createdJob, podsReadyTestSpec, []kftesting.PodSetsResource{
				{
					RoleName:    kftraining.XGBoostJobReplicaTypeMaster,
					ResourceCPU: "default",
				},
				{
					RoleName:    kftraining.XGBoostJobReplicaTypeWorker,
					ResourceCPU: "default",
				},
			})
		},
		ginkgo.Entry("No progress", kftesting.PodsReadyTestSpec{
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
		ginkgo.Entry("Running XGBoostJob", kftesting.PodsReadyTestSpec{
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Running XGBoostJob; PodsReady=False before", kftesting.PodsReadyTestSpec{
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
		}),
		ginkgo.Entry("Job suspended; PodsReady=True before", kftesting.PodsReadyTestSpec{
			BeforeJobStatus: &kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionTrue,
						Reason: "Running",
					},
				},
			},
			BeforeCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionTrue,
				Reason:  "PodsReady",
				Message: "All pods were ready or succeeded since the workload admission",
			},
			JobStatus: kftraining.JobStatus{
				Conditions: []kftraining.JobCondition{
					{
						Type:   kftraining.JobRunning,
						Status: corev1.ConditionFalse,
						Reason: "Suspended",
					},
				},
			},
			Suspended: true,
			WantCondition: &metav1.Condition{
				Type:    kueue.WorkloadPodsReady,
				Status:  metav1.ConditionFalse,
				Reason:  "PodsReady",
				Message: "Not all pods are ready or succeeded",
			},
		}),
	)
})

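// This suite runs the manager together with the scheduler
// (managerAndSchedulerSetup), exercising admission end to end: the assertion
// helper is given the expectation that the master replicas land on the
// spot-untainted flavor and the workers on the on-demand flavor.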
var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
	var (
		ns                  *corev1.Namespace
		onDemandFlavor      *kueue.ResourceFlavor
		spotUntaintedFlavor *kueue.ResourceFlavor
		clusterQueue        *kueue.ClusterQueue
		localQueue          *kueue.LocalQueue
	)

	ginkgo.BeforeAll(func() {
		fwk = &framework.Framework{
			CRDPath:     crdPath,
			DepCRDPaths: []string{xgbCrdPath},
		}
		cfg := fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, managerAndSchedulerSetup())
	})
	ginkgo.AfterAll(func() {
		fwk.Teardown()
	})

	ginkgo.BeforeEach(func() {
		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "core-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		onDemandFlavor = testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
		gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())

		spotUntaintedFlavor = testing.MakeResourceFlavor("spot-untainted").Label(instanceKey, "spot-untainted").Obj()
		gomega.Expect(k8sClient.Create(ctx, spotUntaintedFlavor)).Should(gomega.Succeed())

		clusterQueue = testing.MakeClusterQueue("dev-clusterqueue").
			ResourceGroup(
				*testing.MakeFlavorQuotas("spot-untainted").Resource(corev1.ResourceCPU, "5").Obj(),
				*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			).Obj()
		gomega.Expect(k8sClient.Create(ctx, clusterQueue)).Should(gomega.Succeed())
	})
	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, clusterQueue, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sClient, onDemandFlavor, true)
		gomega.Expect(util.DeleteResourceFlavor(ctx, k8sClient, spotUntaintedFlavor)).To(gomega.Succeed())
	})

	ginkgo.It("Should schedule jobs as they fit in their ClusterQueue", func() {
		ginkgo.By("creating localQueue")
		localQueue = testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, localQueue)).Should(gomega.Succeed())

		kfJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(
			testingxgboostjob.MakeXGBoostJob(jobName, ns.Name).Queue(localQueue.Name).
				Request(kftraining.XGBoostJobReplicaTypeMaster, corev1.ResourceCPU, "3").
				Request(kftraining.XGBoostJobReplicaTypeWorker, corev1.ResourceCPU, "4").
				Obj(),
		)}
		createdJob := kubeflowjob.KubeflowJob{KFJobControl: (*workloadxgboostjob.JobControl)(&kftraining.XGBoostJob{})}
		kftesting.ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx, k8sClient, kfJob, createdJob, clusterQueue, []kftesting.PodSetsResource{
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeMaster,
				ResourceCPU: kueue.ResourceFlavorReference(spotUntaintedFlavor.Name),
			},
			{
				RoleName:    kftraining.XGBoostJobReplicaTypeWorker,
				ResourceCPU: kueue.ResourceFlavorReference(onDemandFlavor.Name),
			},
		})
	})
})