github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go (about) 1 // Copyright 2022 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package paddle 16 17 import ( 18 "context" 19 "fmt" 20 21 . "github.com/onsi/ginkgo/v2" 22 . "github.com/onsi/gomega" 23 corev1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/api/errors" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/types" 27 "k8s.io/utils/pointer" 28 "sigs.k8s.io/controller-runtime/pkg/client" 29 30 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 31 commonutil "github.com/kubeflow/training-operator/pkg/util" 32 "github.com/kubeflow/training-operator/pkg/util/testutil" 33 ) 34 35 var _ = Describe("PaddleJob controller", func() { 36 // Define utility constants for object names and testing timeouts/durations and intervals. 37 const ( 38 expectedPort = int32(8080) 39 ) 40 Context("When creating the PaddleJob", func() { 41 const name = "test-job" 42 var ( 43 ctx = context.Background() 44 ns *corev1.Namespace 45 job *kubeflowv1.PaddleJob 46 jobKey types.NamespacedName 47 masterKey types.NamespacedName 48 worker0Key types.NamespacedName 49 ) 50 BeforeEach(func() { 51 ns = &corev1.Namespace{ 52 ObjectMeta: metav1.ObjectMeta{ 53 GenerateName: "paddle-test-", 54 }, 55 } 56 Expect(testK8sClient.Create(ctx, ns)).Should(Succeed()) 57 58 job = newPaddleJobForTest(name, ns.Name) 59 jobKey = client.ObjectKeyFromObject(job) 60 masterKey = types.NamespacedName{ 61 Name: fmt.Sprintf("%s-master-0", name), 62 Namespace: ns.Name, 63 } 64 worker0Key = types.NamespacedName{ 65 Name: fmt.Sprintf("%s-worker-0", name), 66 Namespace: ns.Name, 67 } 68 job.Spec.PaddleReplicaSpecs = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ 69 kubeflowv1.PaddleJobReplicaTypeMaster: { 70 Replicas: pointer.Int32(1), 71 Template: corev1.PodTemplateSpec{ 72 Spec: corev1.PodSpec{ 73 Containers: []corev1.Container{ 74 { 75 Image: "test-image", 76 Name: kubeflowv1.PaddleJobDefaultContainerName, 77 Ports: []corev1.ContainerPort{ 78 { 79 Name: kubeflowv1.PaddleJobDefaultPortName, 80 ContainerPort: expectedPort, 81 Protocol: corev1.ProtocolTCP, 82 }, 83 }, 84 }, 85 }, 86 }, 87 }, 88 }, 89 kubeflowv1.PaddleJobReplicaTypeWorker: { 90 Replicas: pointer.Int32(2), 91 Template: corev1.PodTemplateSpec{ 92 Spec: corev1.PodSpec{ 93 Containers: []corev1.Container{ 94 { 95 Image: "test-image", 96 Name: kubeflowv1.PaddleJobDefaultContainerName, 97 Ports: []corev1.ContainerPort{ 98 { 99 Name: kubeflowv1.PaddleJobDefaultPortName, 100 ContainerPort: expectedPort, 101 Protocol: corev1.ProtocolTCP, 102 }, 103 }, 104 }, 105 }, 106 }, 107 }, 108 }, 109 } 110 }) 111 AfterEach(func() { 112 Expect(testK8sClient.Delete(ctx, job)).Should(Succeed()) 113 Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed()) 114 }) 115 It("Should get the corresponding resources successfully", func() { 116 By("By creating a new PaddleJob") 117 Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) 118 119 created := &kubeflowv1.PaddleJob{} 120 121 // We'll need to retry getting this newly created PaddleJob, given that creation may not immediately happen. 122 Eventually(func() bool { 123 err := testK8sClient.Get(ctx, jobKey, created) 124 return err == nil 125 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 126 127 masterPod := &corev1.Pod{} 128 Eventually(func() bool { 129 err := testK8sClient.Get(ctx, masterKey, masterPod) 130 return err == nil 131 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 132 133 masterSvc := &corev1.Service{} 134 Eventually(func() bool { 135 err := testK8sClient.Get(ctx, masterKey, masterSvc) 136 return err == nil 137 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 138 139 // Check the pod port. 140 Expect(masterPod.Spec.Containers[0].Ports).To(ContainElement(corev1.ContainerPort{ 141 Name: kubeflowv1.PaddleJobDefaultPortName, 142 ContainerPort: expectedPort, 143 Protocol: corev1.ProtocolTCP})) 144 // Check env variable 145 Expect(masterPod.Spec.Containers[0].Env).To(ContainElements(corev1.EnvVar{ 146 Name: EnvMasterEndpoint, 147 Value: fmt.Sprintf("$(POD_IP_DUMMY):%d", expectedPort), 148 })) 149 // Check service port. 150 Expect(masterSvc.Spec.Ports[0].Port).To(Equal(expectedPort)) 151 // Check owner reference. 152 trueVal := true 153 Expect(masterPod.OwnerReferences).To(ContainElement(metav1.OwnerReference{ 154 APIVersion: kubeflowv1.SchemeGroupVersion.String(), 155 Kind: kubeflowv1.PaddleJobKind, 156 Name: name, 157 UID: created.UID, 158 Controller: &trueVal, 159 BlockOwnerDeletion: &trueVal, 160 })) 161 Expect(masterSvc.OwnerReferences).To(ContainElement(metav1.OwnerReference{ 162 APIVersion: kubeflowv1.SchemeGroupVersion.String(), 163 Kind: kubeflowv1.PaddleJobKind, 164 Name: name, 165 UID: created.UID, 166 Controller: &trueVal, 167 BlockOwnerDeletion: &trueVal, 168 })) 169 170 // Test job status. 171 masterPod.Status.Phase = corev1.PodSucceeded 172 masterPod.ResourceVersion = "" 173 Expect(testK8sClient.Status().Update(ctx, masterPod)).Should(Succeed()) 174 Eventually(func() bool { 175 err := testK8sClient.Get(ctx, jobKey, created) 176 if err != nil { 177 return false 178 } 179 return created.Status.ReplicaStatuses != nil && created.Status. 180 ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Succeeded == 1 181 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 182 // Check if the job is succeeded. 183 cond := getCondition(created.Status, kubeflowv1.JobSucceeded) 184 Expect(cond.Status).To(Equal(corev1.ConditionTrue)) 185 }) 186 It("Shouldn't create resources if PaddleJob is suspended", func() { 187 By("By creating a new PaddleJob with suspend=true") 188 job.Spec.RunPolicy.Suspend = pointer.Bool(true) 189 job.Spec.PaddleReplicaSpecs[kubeflowv1.PaddleJobReplicaTypeWorker].Replicas = pointer.Int32(1) 190 Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) 191 192 created := &kubeflowv1.PaddleJob{} 193 masterPod := &corev1.Pod{} 194 workerPod := &corev1.Pod{} 195 masterSvc := &corev1.Service{} 196 workerSvc := &corev1.Service{} 197 198 By("Checking created PaddleJob") 199 Eventually(func() bool { 200 err := testK8sClient.Get(ctx, jobKey, created) 201 return err == nil 202 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 203 By("Checking created PaddleJob has a nil startTime") 204 Consistently(func() *metav1.Time { 205 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 206 return created.Status.StartTime 207 }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) 208 209 By("Checking if the pods and services aren't created") 210 Consistently(func() bool { 211 errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) 212 errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) 213 errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) 214 errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) 215 return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && 216 errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) 217 }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) 218 219 By("Checking if the PaddleJob has suspended condition") 220 Eventually(func() []kubeflowv1.JobCondition { 221 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 222 return created.Status.Conditions 223 }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{ 224 { 225 Type: kubeflowv1.JobCreated, 226 Status: corev1.ConditionTrue, 227 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason), 228 Message: fmt.Sprintf("PaddleJob %s is created.", name), 229 }, 230 { 231 Type: kubeflowv1.JobSuspended, 232 Status: corev1.ConditionTrue, 233 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSuspendedReason), 234 Message: fmt.Sprintf("PaddleJob %s is suspended.", name), 235 }, 236 }, testutil.IgnoreJobConditionsTimes)) 237 }) 238 239 It("Should delete resources after PaddleJob is suspended; Should resume PaddleJob after PaddleJob is unsuspended", func() { 240 By("By creating a new PaddleJob") 241 job.Spec.PaddleReplicaSpecs[kubeflowv1.PaddleJobReplicaTypeWorker].Replicas = pointer.Int32(1) 242 Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) 243 244 created := &kubeflowv1.PaddleJob{} 245 masterPod := &corev1.Pod{} 246 workerPod := &corev1.Pod{} 247 masterSvc := &corev1.Service{} 248 workerSvc := &corev1.Service{} 249 250 // We'll need to retry getting this newly created PaddleJob, given that creation may not immediately happen. 251 By("Checking created PaddleJob") 252 Eventually(func() bool { 253 err := testK8sClient.Get(ctx, jobKey, created) 254 return err == nil 255 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 256 257 var startTimeBeforeSuspended *metav1.Time 258 Eventually(func() *metav1.Time { 259 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 260 startTimeBeforeSuspended = created.Status.StartTime 261 return startTimeBeforeSuspended 262 }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil()) 263 264 By("Checking the created pods and services") 265 Eventually(func() bool { 266 errMaster := testK8sClient.Get(ctx, masterKey, masterPod) 267 errWorker := testK8sClient.Get(ctx, worker0Key, workerPod) 268 return errMaster == nil && errWorker == nil 269 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 270 Eventually(func() bool { 271 errMaster := testK8sClient.Get(ctx, masterKey, masterSvc) 272 errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc) 273 return errMaster == nil && errWorker == nil 274 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 275 276 By("Updating the pod's phase with Running") 277 Eventually(func() error { 278 Expect(testK8sClient.Get(ctx, masterKey, masterPod)).Should(Succeed()) 279 masterPod.Status.Phase = corev1.PodRunning 280 return testK8sClient.Status().Update(ctx, masterPod) 281 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 282 Eventually(func() error { 283 Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed()) 284 workerPod.Status.Phase = corev1.PodRunning 285 return testK8sClient.Status().Update(ctx, workerPod) 286 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 287 288 By("Checking the PaddleJob's condition") 289 Eventually(func() []kubeflowv1.JobCondition { 290 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 291 return created.Status.Conditions 292 }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{ 293 { 294 Type: kubeflowv1.JobCreated, 295 Status: corev1.ConditionTrue, 296 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason), 297 Message: fmt.Sprintf("PaddleJob %s is created.", name), 298 }, 299 { 300 Type: kubeflowv1.JobRunning, 301 Status: corev1.ConditionTrue, 302 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason), 303 Message: fmt.Sprintf("PaddleJob %s is running.", name), 304 }, 305 }, testutil.IgnoreJobConditionsTimes)) 306 307 By("Updating the PaddleJob with suspend=true") 308 Eventually(func() error { 309 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 310 created.Spec.RunPolicy.Suspend = pointer.Bool(true) 311 return testK8sClient.Update(ctx, created) 312 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 313 314 By("Checking if the pods and services are removed") 315 Eventually(func() bool { 316 errMaster := testK8sClient.Get(ctx, masterKey, masterPod) 317 errWorker := testK8sClient.Get(ctx, worker0Key, workerPod) 318 return errors.IsNotFound(errMaster) && errors.IsNotFound(errWorker) 319 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 320 Eventually(func() bool { 321 errMaster := testK8sClient.Get(ctx, masterKey, masterSvc) 322 errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc) 323 return errors.IsNotFound(errMaster) && errors.IsNotFound(errWorker) 324 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 325 Consistently(func() bool { 326 errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) 327 errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) 328 errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) 329 errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) 330 return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && 331 errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) 332 }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) 333 334 By("Checking if the PaddleJob has a suspended condition") 335 Eventually(func() bool { 336 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 337 return created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Active == 0 && 338 created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeWorker].Active == 0 && 339 created.Status.StartTime.Equal(startTimeBeforeSuspended) 340 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 341 Consistently(func() bool { 342 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 343 return created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Active == 0 && 344 created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeWorker].Active == 0 && 345 created.Status.StartTime.Equal(startTimeBeforeSuspended) 346 }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) 347 Expect(created.Status.Conditions).Should(BeComparableTo([]kubeflowv1.JobCondition{ 348 { 349 Type: kubeflowv1.JobCreated, 350 Status: corev1.ConditionTrue, 351 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason), 352 Message: fmt.Sprintf("PaddleJob %s is created.", name), 353 }, 354 { 355 Type: kubeflowv1.JobRunning, 356 Status: corev1.ConditionFalse, 357 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSuspendedReason), 358 Message: fmt.Sprintf("PaddleJob %s is suspended.", name), 359 }, 360 { 361 Type: kubeflowv1.JobSuspended, 362 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSuspendedReason), 363 Message: fmt.Sprintf("PaddleJob %s is suspended.", name), 364 Status: corev1.ConditionTrue, 365 }, 366 }, testutil.IgnoreJobConditionsTimes)) 367 368 By("Unsuspending the PaddleJob") 369 Eventually(func() error { 370 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 371 created.Spec.RunPolicy.Suspend = pointer.Bool(false) 372 return testK8sClient.Update(ctx, created) 373 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 374 Eventually(func() *metav1.Time { 375 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 376 return created.Status.StartTime 377 }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil()) 378 379 By("Check if the pods and services are created") 380 Eventually(func() error { 381 return testK8sClient.Get(ctx, masterKey, masterPod) 382 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 383 Eventually(func() error { 384 return testK8sClient.Get(ctx, worker0Key, workerPod) 385 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 386 Eventually(func() error { 387 return testK8sClient.Get(ctx, masterKey, masterSvc) 388 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 389 Eventually(func() error { 390 return testK8sClient.Get(ctx, worker0Key, workerSvc) 391 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 392 393 By("Updating Pod's condition with running") 394 Eventually(func() error { 395 Expect(testK8sClient.Get(ctx, masterKey, masterPod)).Should(Succeed()) 396 masterPod.Status.Phase = corev1.PodRunning 397 return testK8sClient.Status().Update(ctx, masterPod) 398 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 399 Eventually(func() error { 400 Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed()) 401 workerPod.Status.Phase = corev1.PodRunning 402 return testK8sClient.Status().Update(ctx, workerPod) 403 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 404 405 By("Checking if the PaddleJob has resumed conditions") 406 Eventually(func() []kubeflowv1.JobCondition { 407 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 408 return created.Status.Conditions 409 }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{ 410 { 411 Type: kubeflowv1.JobCreated, 412 Status: corev1.ConditionTrue, 413 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason), 414 Message: fmt.Sprintf("PaddleJob %s is created.", name), 415 }, 416 { 417 Type: kubeflowv1.JobSuspended, 418 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobResumedReason), 419 Message: fmt.Sprintf("PaddleJob %s is resumed.", name), 420 Status: corev1.ConditionFalse, 421 }, 422 { 423 Type: kubeflowv1.JobRunning, 424 Status: corev1.ConditionTrue, 425 Reason: commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason), 426 Message: fmt.Sprintf("PaddleJob %s is running.", name), 427 }, 428 }, testutil.IgnoreJobConditionsTimes)) 429 430 By("Checking if the startTime is updated") 431 Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) 432 }) 433 }) 434 }) 435 436 func newPaddleJobForTest(name, namespace string) *kubeflowv1.PaddleJob { 437 return &kubeflowv1.PaddleJob{ 438 ObjectMeta: metav1.ObjectMeta{ 439 Name: name, 440 Namespace: namespace, 441 }, 442 } 443 } 444 445 // getCondition returns the condition with the provided type. 446 func getCondition(status kubeflowv1.JobStatus, condType kubeflowv1.JobConditionType) *kubeflowv1.JobCondition { 447 for _, condition := range status.Conditions { 448 if condition.Type == condType { 449 return &condition 450 } 451 } 452 return nil 453 }