sigs.k8s.io/kueue@v0.6.2/test/integration/controller/jobs/kubeflow/kubeflowjob.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubeflow

import (
	"context"
	"fmt"

	"github.com/google/go-cmp/cmp/cmpopts"
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"

	"sigs.k8s.io/controller-runtime/pkg/client"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	"sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/kubeflowjob"
	"sigs.k8s.io/kueue/pkg/util/testing"

	"sigs.k8s.io/kueue/test/util"
)

const (
	instanceKey       = "cloud.provider.com/instance"
	priorityClassName = "test-priority-class"
	priorityValue     = 10
	jobQueueName      = "test-queue"
)

var (
	ignoreConditionTimestamps = cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
)

type PodsReadyTestSpec struct {
	BeforeJobStatus *kftraining.JobStatus
	BeforeCondition *metav1.Condition
	JobStatus       kftraining.JobStatus
	Suspended       bool
	WantCondition   *metav1.Condition
}

var ReplicaTypeWorker = kftraining.ReplicaType("Worker")
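
// ShouldReconcileJob drives a KubeflowJob through the full reconcile cycle exercised by
// the per-framework integration suites: the job is created unsuspended and must be
// suspended by Kueue; its Workload is created with the propagated priority and, once the
// queue annotation is set on the job, with the queue name; a second, non-matching
// Workload owned by the job is deleted; admitting the Workload unsuspends the job and
// injects the assigned flavors' node selectors; changing the Worker replica count
// re-suspends the job and replaces the Workload; re-admission unsuspends it again; and
// marking the job succeeded finishes the Workload.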
func ShouldReconcileJob(ctx context.Context, k8sClient client.Client, job, createdJob kubeflowjob.KubeflowJob, podSetsResources []PodSetsResource) {
	ginkgo.By("checking the job gets suspended when created unsuspended")
	priorityClass := testing.MakePriorityClass(priorityClassName).
		PriorityValue(int32(priorityValue)).Obj()
	gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed())

	if job.KFJobControl.RunPolicy().SchedulingPolicy == nil {
		job.KFJobControl.RunPolicy().SchedulingPolicy = &kftraining.SchedulingPolicy{}
	}
	job.KFJobControl.RunPolicy().SchedulingPolicy.PriorityClass = priorityClassName
	err := k8sClient.Create(ctx, job.Object())
	gomega.Expect(err).To(gomega.Succeed())

	lookupKey := client.ObjectKeyFromObject(job.Object())

	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, lookupKey, createdJob.Object()); err != nil {
			return false
		}
		return createdJob.IsSuspended()
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())

	wlLookupKey := types.NamespacedName{
		Name:      jobframework.GetWorkloadNameForOwnerWithGVK(job.Object().GetName(), job.GVK()),
		Namespace: job.Object().GetNamespace(),
	}

	ginkgo.By("checking the workload is created without queue assigned")
	createdWorkload := util.AwaitAndVerifyCreatedWorkload(ctx, k8sClient, wlLookupKey, createdJob.Object())
	util.VerifyWorkloadPriority(createdWorkload, priorityClassName, priorityValue)
	gomega.Expect(createdWorkload.Spec.QueueName).Should(gomega.Equal(""), "The Workload shouldn't have .spec.queueName set")

	ginkgo.By("checking the workload is created with priority and priorityName")
	gomega.Expect(createdWorkload.Spec.PriorityClassName).Should(gomega.Equal(priorityClassName))
	gomega.Expect(*createdWorkload.Spec.Priority).Should(gomega.Equal(int32(priorityValue)))

	ginkgo.By("checking the workload is updated with queue name when the job does")
	createdJob.Object().SetAnnotations(map[string]string{constants.QueueAnnotation: jobQueueName})
	gomega.Expect(k8sClient.Update(ctx, createdJob.Object())).Should(gomega.Succeed())
	util.AwaitAndVerifyWorkloadQueueName(ctx, k8sClient, createdWorkload, wlLookupKey, jobQueueName)

	ginkgo.By("checking a second non-matching workload is deleted")
	secondWl := &kueue.Workload{
		ObjectMeta: metav1.ObjectMeta{
			Name:      jobframework.GetWorkloadNameForOwnerWithGVK("second-workload", job.GVK()),
			Namespace: createdWorkload.Namespace,
		},
		Spec: *createdWorkload.Spec.DeepCopy(),
	}
	gomega.Expect(ctrl.SetControllerReference(createdJob.Object(), secondWl, scheme.Scheme)).Should(gomega.Succeed())
	secondWl.Spec.PodSets[0].Count += 1

	gomega.Expect(k8sClient.Create(ctx, secondWl)).Should(gomega.Succeed())
	gomega.Eventually(func() error {
		wl := &kueue.Workload{}
		key := types.NamespacedName{Name: secondWl.Name, Namespace: secondWl.Namespace}
		return k8sClient.Get(ctx, key, wl)
	}, util.Timeout, util.Interval).Should(testing.BeNotFoundError())
	// check the original wl is still there
	gomega.Eventually(func() error {
		return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
	}, util.Timeout, util.Interval).Should(gomega.Succeed())
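
	// Admitting the Workload below (setting the quota reservation and syncing the Admitted
	// condition) is what lets the job controller unsuspend the job and inject the assigned
	// flavor's node selector (instanceKey) into each replica type's pod template.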
	ginkgo.By("checking the job is unsuspended when workload is assigned")
	onDemandFlavor := testing.MakeResourceFlavor("on-demand").Label(instanceKey, "on-demand").Obj()
	gomega.Expect(k8sClient.Create(ctx, onDemandFlavor)).Should(gomega.Succeed())
	spotFlavor := testing.MakeResourceFlavor("spot").Label(instanceKey, "spot").Obj()
	gomega.Expect(k8sClient.Create(ctx, spotFlavor)).Should(gomega.Succeed())
	clusterQueue := testing.MakeClusterQueue("cluster-queue").
		ResourceGroup(
			*testing.MakeFlavorQuotas("on-demand").Resource(corev1.ResourceCPU, "5").Obj(),
			*testing.MakeFlavorQuotas("spot").Resource(corev1.ResourceCPU, "5").Obj(),
		).Obj()
	admission := testing.MakeAdmission(clusterQueue.Name).PodSets(CreatePodSetAssigment(createdWorkload, podSetsResources)...).Obj()
	gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
	util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, lookupKey, createdJob.Object()); err != nil {
			return false
		}
		return !createdJob.IsSuspended()
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	gomega.Eventually(func() bool {
		ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name))
		return ok
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	for _, psr := range podSetsResources {
		gomega.Expect(createdJob.KFJobControl.ReplicaSpecs()[psr.RoleName].Template.Spec.NodeSelector).
			To(gomega.BeComparableTo(map[string]string{instanceKey: string(psr.ResourceCPU)}))
	}
	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
			return false
		}
		return len(createdWorkload.Status.Conditions) == 2
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())

	ginkgo.By("checking the job gets suspended when parallelism changes and the added node selectors are removed")
	parallelism := ptr.Deref(job.KFJobControl.ReplicaSpecs()[ReplicaTypeWorker].Replicas, 1)
	newParallelism := parallelism + 1
	createdJob.KFJobControl.ReplicaSpecs()[ReplicaTypeWorker].Replicas = &newParallelism
	gomega.Expect(k8sClient.Update(ctx, createdJob.Object())).Should(gomega.Succeed())
	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, lookupKey, createdJob.Object()); err != nil {
			return false
		}
		return createdJob.IsSuspended() &&
			len(createdJob.KFJobControl.ReplicaSpecs()[ReplicaTypeWorker].Template.Spec.NodeSelector) == 0
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	gomega.Eventually(func() bool {
		ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String()))
		return ok
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())

	ginkgo.By("checking the workload is updated with new count")
	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
			return false
		}
		return workerPodSetsCount(createdWorkload, podSetsResources) == newParallelism
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	gomega.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())
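
	// The replica-count change above made the old Workload non-matching, so the controller
	// deleted it (see the DeletedWorkload event) and created a fresh one with the updated
	// count under the same deterministic name; that new Workload must be admitted again.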
	ginkgo.By("checking the job is unsuspended and selectors added when workload is assigned again")
	admission = testing.MakeAdmission(clusterQueue.Name).PodSets(CreatePodSetAssigment(createdWorkload, podSetsResources)...).Obj()
	gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
	util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, lookupKey, createdJob.Object()); err != nil {
			return false
		}
		return !createdJob.IsSuspended()
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())
	for _, psr := range podSetsResources {
		gomega.Expect(createdJob.KFJobControl.ReplicaSpecs()[psr.RoleName].Template.Spec.NodeSelector).
			To(gomega.BeComparableTo(map[string]string{instanceKey: string(psr.ResourceCPU)}))
	}
	gomega.Eventually(func() bool {
		if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
			return false
		}
		return len(createdWorkload.Status.Conditions) == 2
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())

	ginkgo.By("checking the workload is finished when job is completed")
	createdJob.KFJobControl.JobStatus().Conditions = append(createdJob.KFJobControl.JobStatus().Conditions,
		kftraining.JobCondition{
			Type:               kftraining.JobSucceeded,
			Status:             corev1.ConditionTrue,
			LastTransitionTime: metav1.Now(),
		})
	gomega.Expect(k8sClient.Status().Update(ctx, createdJob.Object())).Should(gomega.Succeed())
	gomega.Eventually(func() bool {
		err := k8sClient.Get(ctx, wlLookupKey, createdWorkload)
		if err != nil || len(createdWorkload.Status.Conditions) == 2 {
			return false
		}

		return apimeta.IsStatusConditionTrue(createdWorkload.Status.Conditions, kueue.WorkloadFinished)
	}, util.Timeout, util.Interval).Should(gomega.BeTrue())
}
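
// JobControllerWhenWaitForPodsReadyEnabled checks how the PodsReady condition on the
// job's Workload evolves when the waitForPodsReady feature is enabled. It creates and
// admits the job, optionally applies podsReadyTestSpec.BeforeJobStatus and verifies
// podsReadyTestSpec.BeforeCondition, then applies podsReadyTestSpec.JobStatus (and, if
// requested, unsets the admission to suspend the job) and finally expects the workload's
// PodsReady condition to match podsReadyTestSpec.WantCondition.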
func JobControllerWhenWaitForPodsReadyEnabled(ctx context.Context, k8sClient client.Client, job, createdJob kubeflowjob.KubeflowJob, podsReadyTestSpec PodsReadyTestSpec, podSetsResources []PodSetsResource) {
	ginkgo.By("Create a job")
	job.Object().SetAnnotations(map[string]string{constants.QueueAnnotation: jobQueueName})
	gomega.ExpectWithOffset(1, k8sClient.Create(ctx, job.Object())).Should(gomega.Succeed())
	lookupKey := client.ObjectKeyFromObject(job.Object())
	gomega.ExpectWithOffset(1, k8sClient.Get(ctx, lookupKey, createdJob.Object())).Should(gomega.Succeed())

	wlLookupKey := types.NamespacedName{
		Name:      jobframework.GetWorkloadNameForOwnerWithGVK(job.Object().GetName(), job.GVK()),
		Namespace: job.Object().GetNamespace(),
	}

	ginkgo.By("Fetch the workload created for the job")
	createdWorkload := &kueue.Workload{}
	gomega.EventuallyWithOffset(1, func() error {
		return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
	}, util.Timeout, util.Interval).Should(gomega.Succeed())

	ginkgo.By("Admit the workload created for the job")
	admission := testing.MakeAdmission("foo").PodSets(CreatePodSetAssigment(createdWorkload, podSetsResources)...).Obj()
	gomega.ExpectWithOffset(1, util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
	util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
	gomega.ExpectWithOffset(1, k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())

	ginkgo.By("Wait for the job to be unsuspended")
	gomega.EventuallyWithOffset(1, func() bool {
		gomega.ExpectWithOffset(1, k8sClient.Get(ctx, lookupKey, createdJob.Object())).Should(gomega.Succeed())
		return createdJob.IsSuspended()
	}, util.Timeout, util.Interval).Should(gomega.BeFalse())

	if podsReadyTestSpec.BeforeJobStatus != nil {
		ginkgo.By("Update the job status to simulate its initial progress towards completion")
		createdJob.KFJobControl.JobStatus().Conditions = append(createdJob.KFJobControl.JobStatus().Conditions, podsReadyTestSpec.BeforeJobStatus.Conditions...)
		gomega.ExpectWithOffset(1, k8sClient.Status().Update(ctx, createdJob.Object())).Should(gomega.Succeed())
		gomega.ExpectWithOffset(1, k8sClient.Get(ctx, lookupKey, createdJob.Object())).Should(gomega.Succeed())
	}

	if podsReadyTestSpec.BeforeCondition != nil {
		ginkgo.By("Update the workload status")
		gomega.EventuallyWithOffset(1, func() *metav1.Condition {
			gomega.ExpectWithOffset(1, k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
			return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
		}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.BeforeCondition, ignoreConditionTimestamps))
	}

	ginkgo.By("Update the job status to simulate its progress towards completion")
	createdJob.KFJobControl.JobStatus().Conditions = append(createdJob.KFJobControl.JobStatus().Conditions, podsReadyTestSpec.JobStatus.Conditions...)
	gomega.ExpectWithOffset(1, k8sClient.Status().Update(ctx, createdJob.Object())).Should(gomega.Succeed())
	gomega.ExpectWithOffset(1, k8sClient.Get(ctx, lookupKey, createdJob.Object())).Should(gomega.Succeed())

	if podsReadyTestSpec.Suspended {
		ginkgo.By("Unset admission of the workload to suspend the job")
		gomega.EventuallyWithOffset(1, func() error {
			// The update may need to be retried on conflict, because the workload is also
			// being updated as a result of setting the job status.
			if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil {
				return err
			}
			return util.SetQuotaReservation(ctx, k8sClient, createdWorkload, nil)
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
		util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)
	}

	ginkgo.By("Verify the PodsReady condition is added")
	gomega.EventuallyWithOffset(1, func() *metav1.Condition {
		gomega.ExpectWithOffset(1, k8sClient.Get(ctx, wlLookupKey, createdWorkload)).Should(gomega.Succeed())
		return apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadPodsReady)
	}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(podsReadyTestSpec.WantCondition, ignoreConditionTimestamps))
}
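
// ShouldScheduleJobsAsTheyFitInTheirClusterQueue verifies that a job whose Workload fits
// the available quota starts right away: it is unsuspended, each replica type carries the
// node selector of its assigned flavor, and the cluster queue reports zero pending and
// one reserving workload.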
func ShouldScheduleJobsAsTheyFitInTheirClusterQueue(ctx context.Context, k8sClient client.Client, job, createdJob kubeflowjob.KubeflowJob, clusterQueue *kueue.ClusterQueue, podSetsResources []PodSetsResource) {
	ginkgo.By("checking a job starts")
	gomega.ExpectWithOffset(1, k8sClient.Create(ctx, job.Object())).Should(gomega.Succeed())
	gomega.EventuallyWithOffset(1, func() bool {
		gomega.ExpectWithOffset(1, k8sClient.Get(ctx, client.ObjectKeyFromObject(job.Object()), createdJob.Object())).
			Should(gomega.Succeed())
		return createdJob.IsSuspended()
	}, util.Timeout, util.Interval).Should(gomega.BeFalse())
	for _, psr := range podSetsResources {
		gomega.ExpectWithOffset(1, createdJob.KFJobControl.ReplicaSpecs()[psr.RoleName].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(string(psr.ResourceCPU)))
	}
	util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0)
	util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1)
}

type PodSetsResource struct {
	RoleName    kftraining.ReplicaType
	ResourceCPU kueue.ResourceFlavorReference
}

func CreatePodSetAssigment(createdWorkload *kueue.Workload, podSetsResource []PodSetsResource) []kueue.PodSetAssignment {
	pda := []kueue.PodSetAssignment{}
	for i, psr := range podSetsResource {
		pda = append(pda, kueue.PodSetAssignment{
			Name: string(psr.RoleName),
			Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
				corev1.ResourceCPU: psr.ResourceCPU,
			},
			Count: ptr.To(createdWorkload.Spec.PodSets[i].Count),
		})
	}
	return pda
}

func workerPodSetsCount(wl *kueue.Workload, podSetsResources []PodSetsResource) int32 {
	idx := -1
	for i, psr := range podSetsResources {
		if psr.RoleName == ReplicaTypeWorker {
			idx = i
		}
	}
	return wl.Spec.PodSets[idx].Count
}
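
// Usage sketch (editorial note, not part of the upstream helpers): a framework-specific
// suite is expected to wrap its job in kubeflowjob.KubeflowJob and pass one
// PodSetsResource per replica type, using the flavor names that ShouldReconcileJob
// creates ("on-demand", "spot"). The adapter expressions below are placeholders; the
// replica-type constants come from the training-operator API.
//
//	job := kubeflowjob.KubeflowJob{KFJobControl: /* framework-specific adapter around a PyTorchJob */}
//	createdJob := kubeflowjob.KubeflowJob{KFJobControl: /* adapter around an empty PyTorchJob */}
//	ShouldReconcileJob(ctx, k8sClient, job, createdJob, []PodSetsResource{
//		{RoleName: kftraining.PyTorchJobReplicaTypeMaster, ResourceCPU: "on-demand"},
//		{RoleName: kftraining.PyTorchJobReplicaTypeWorker, ResourceCPU: "spot"},
//	})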