github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/status_test.go (about) 1 // Copyright 2021 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tensorflow 16 17 import ( 18 "context" 19 "fmt" 20 21 . "github.com/onsi/ginkgo/v2" 22 . "github.com/onsi/gomega" 23 corev1 "k8s.io/api/core/v1" 24 v1 "k8s.io/api/core/v1" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/types" 27 "k8s.io/apimachinery/pkg/util/uuid" 28 "sigs.k8s.io/controller-runtime/pkg/client" 29 30 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 31 tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil" 32 "github.com/kubeflow/training-operator/pkg/util" 33 "github.com/kubeflow/training-operator/pkg/util/testutil" 34 ) 35 36 var _ = Describe("TFJob controller", func() { 37 Context("Test Failed", func() { 38 It("should update TFJob with failed status", func() { 39 By("creating a TFJob with replicaStatues initialized") 40 tfJob := tftestutil.NewTFJob(3, 0) 41 initializeReplicaStatuses(&tfJob.Status, kubeflowv1.TFJobReplicaTypeWorker) 42 43 By("prepare pod") 44 refs := []metav1.OwnerReference{ 45 *reconciler.GenOwnerReference(tfJob), 46 } 47 pod := tftestutil.NewBasePod("pod", tfJob, refs) 48 pod.Status.Phase = v1.PodFailed 49 50 By("update job replica statuses") 51 updateJobReplicaStatuses(&tfJob.Status, kubeflowv1.TFJobReplicaTypeWorker, pod) 52 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Failed).Should(Equal(int32(1))) 53 54 By("update job status") 55 Expect(reconciler.UpdateJobStatus(tfJob, tfJob.Spec.TFReplicaSpecs, &tfJob.Status)).To(Succeed()) 56 57 By("finding failed job status") 58 found := false 59 for _, condition := range tfJob.Status.Conditions { 60 if condition.Type == kubeflowv1.JobFailed { 61 found = true 62 } 63 } 64 Expect(found).To(BeTrue()) 65 }) 66 }) 67 68 Context("Test Status", func() { 69 It("should update TFJob with desired status", func() { 70 type testCase struct { 71 description string 72 tfJob *kubeflowv1.TFJob 73 74 expectedFailedPS int32 75 expectedSucceededPS int32 76 expectedActivePS int32 77 78 expectedFailedWorker int32 79 expectedSucceededWorker int32 80 expectedActiveWorker int32 81 82 expectedFailedChief int32 83 expectedSucceededChief int32 84 expectedActiveChief int32 85 86 restart bool 87 worker0Completed bool 88 89 expectedType kubeflowv1.JobConditionType 90 } 91 92 testCases := []testCase{ 93 { 94 description: "Chief worker is succeeded", 95 tfJob: tftestutil.NewTFJobWithChief(1, 0), 96 expectedFailedPS: 0, 97 expectedSucceededPS: 0, 98 expectedActivePS: 0, 99 expectedFailedWorker: 0, 100 expectedSucceededWorker: 1, 101 expectedActiveWorker: 0, 102 expectedFailedChief: 0, 103 expectedSucceededChief: 1, 104 expectedActiveChief: 0, 105 restart: false, 106 worker0Completed: false, 107 expectedType: kubeflowv1.JobSucceeded, 108 }, 109 { 110 description: "Chief worker is running", 111 tfJob: tftestutil.NewTFJobWithChief(1, 0), 112 expectedFailedPS: 0, 113 expectedSucceededPS: 0, 114 expectedActivePS: 0, 115 expectedFailedWorker: 0, 116 expectedSucceededWorker: 0, 117 expectedActiveWorker: 0, 118 expectedFailedChief: 0, 119 expectedSucceededChief: 0, 120 expectedActiveChief: 1, 121 restart: false, 122 worker0Completed: false, 123 expectedType: kubeflowv1.JobRunning, 124 }, 125 { 126 description: "Chief worker is failed", 127 tfJob: tftestutil.NewTFJobWithChief(1, 0), 128 expectedFailedPS: 0, 129 expectedSucceededPS: 0, 130 expectedActivePS: 0, 131 expectedFailedWorker: 0, 132 expectedSucceededWorker: 0, 133 expectedActiveWorker: 0, 134 expectedFailedChief: 1, 135 expectedSucceededChief: 0, 136 expectedActiveChief: 0, 137 restart: false, 138 worker0Completed: false, 139 expectedType: kubeflowv1.JobFailed, 140 }, 141 { 142 description: "(No chief worker) Worker is failed", 143 tfJob: tftestutil.NewTFJob(1, 0), 144 expectedFailedPS: 0, 145 expectedSucceededPS: 0, 146 expectedActivePS: 0, 147 expectedFailedWorker: 1, 148 expectedSucceededWorker: 0, 149 expectedActiveWorker: 0, 150 expectedFailedChief: 0, 151 expectedSucceededChief: 0, 152 expectedActiveChief: 0, 153 restart: false, 154 worker0Completed: false, 155 expectedType: kubeflowv1.JobFailed, 156 }, 157 { 158 description: "(No chief worker) Worker is succeeded", 159 tfJob: tftestutil.NewTFJob(1, 0), 160 expectedFailedPS: 0, 161 expectedSucceededPS: 0, 162 expectedActivePS: 0, 163 expectedFailedWorker: 0, 164 expectedSucceededWorker: 1, 165 expectedActiveWorker: 0, 166 expectedFailedChief: 0, 167 expectedSucceededChief: 0, 168 expectedActiveChief: 0, 169 restart: false, 170 worker0Completed: false, 171 expectedType: kubeflowv1.JobSucceeded, 172 }, 173 { 174 description: "(No chief worker) Worker is running", 175 tfJob: tftestutil.NewTFJob(1, 0), 176 expectedFailedPS: 0, 177 expectedSucceededPS: 0, 178 expectedActivePS: 0, 179 expectedFailedWorker: 0, 180 expectedSucceededWorker: 0, 181 expectedActiveWorker: 1, 182 expectedFailedChief: 0, 183 expectedSucceededChief: 0, 184 expectedActiveChief: 0, 185 restart: false, 186 worker0Completed: false, 187 expectedType: kubeflowv1.JobRunning, 188 }, 189 { 190 description: "(No chief worker) 2 workers are succeeded, 2 workers are active", 191 tfJob: tftestutil.NewTFJob(4, 2), 192 expectedFailedPS: 0, 193 expectedSucceededPS: 0, 194 expectedActivePS: 2, 195 expectedFailedWorker: 0, 196 expectedSucceededWorker: 2, 197 expectedActiveWorker: 2, 198 expectedFailedChief: 0, 199 expectedSucceededChief: 0, 200 expectedActiveChief: 0, 201 restart: false, 202 worker0Completed: false, 203 expectedType: kubeflowv1.JobRunning, 204 }, 205 { 206 description: "(No chief worker) 2 workers are running, 2 workers are failed", 207 tfJob: tftestutil.NewTFJob(4, 2), 208 expectedFailedPS: 0, 209 expectedSucceededPS: 0, 210 expectedActivePS: 2, 211 expectedFailedWorker: 2, 212 expectedSucceededWorker: 0, 213 expectedActiveWorker: 2, 214 expectedFailedChief: 0, 215 expectedSucceededChief: 0, 216 expectedActiveChief: 0, 217 restart: false, 218 worker0Completed: false, 219 expectedType: kubeflowv1.JobFailed, 220 }, 221 { 222 description: "(No chief worker) 2 workers are succeeded, 2 workers are failed", 223 tfJob: tftestutil.NewTFJob(4, 2), 224 expectedFailedPS: 0, 225 expectedSucceededPS: 0, 226 expectedActivePS: 2, 227 expectedFailedWorker: 2, 228 expectedSucceededWorker: 2, 229 expectedActiveWorker: 0, 230 expectedFailedChief: 0, 231 expectedSucceededChief: 0, 232 expectedActiveChief: 0, 233 restart: false, 234 worker0Completed: false, 235 expectedType: kubeflowv1.JobFailed, 236 }, 237 { 238 description: "(No chief worker) worker-0 are succeeded, 3 workers are active", 239 tfJob: tftestutil.NewTFJob(4, 2), 240 expectedFailedPS: 0, 241 expectedSucceededPS: 0, 242 expectedActivePS: 2, 243 expectedFailedWorker: 0, 244 expectedSucceededWorker: 1, 245 expectedActiveWorker: 3, 246 expectedFailedChief: 0, 247 expectedSucceededChief: 0, 248 expectedActiveChief: 0, 249 restart: false, 250 worker0Completed: true, 251 expectedType: kubeflowv1.JobSucceeded, 252 }, 253 { 254 description: "(No chief worker, successPolicy: AllWorkers) worker-0 are succeeded, 3 workers are active", 255 tfJob: tftestutil.NewTFJobWithSuccessPolicy(4, 0, kubeflowv1.SuccessPolicyAllWorkers), 256 expectedFailedPS: 0, 257 expectedSucceededPS: 0, 258 expectedActivePS: 0, 259 expectedFailedWorker: 0, 260 expectedSucceededWorker: 1, 261 expectedActiveWorker: 3, 262 expectedFailedChief: 0, 263 expectedSucceededChief: 0, 264 expectedActiveChief: 0, 265 restart: false, 266 worker0Completed: true, 267 expectedType: kubeflowv1.JobRunning, 268 }, 269 { 270 description: "(No chief worker, successPolicy: AllWorkers) 4 workers are succeeded", 271 tfJob: tftestutil.NewTFJobWithSuccessPolicy(4, 0, kubeflowv1.SuccessPolicyAllWorkers), 272 expectedFailedPS: 0, 273 expectedSucceededPS: 0, 274 expectedActivePS: 0, 275 expectedFailedWorker: 0, 276 expectedSucceededWorker: 4, 277 expectedActiveWorker: 0, 278 expectedFailedChief: 0, 279 expectedSucceededChief: 0, 280 expectedActiveChief: 0, 281 restart: false, 282 worker0Completed: true, 283 expectedType: kubeflowv1.JobSucceeded, 284 }, 285 { 286 description: "(No chief worker, successPolicy: AllWorkers) worker-0 is succeeded, 2 workers are running, 1 worker is failed", 287 tfJob: tftestutil.NewTFJobWithSuccessPolicy(4, 0, kubeflowv1.SuccessPolicyAllWorkers), 288 expectedFailedPS: 0, 289 expectedSucceededPS: 0, 290 expectedActivePS: 0, 291 expectedFailedWorker: 1, 292 expectedSucceededWorker: 1, 293 expectedActiveWorker: 2, 294 expectedFailedChief: 0, 295 expectedSucceededChief: 0, 296 expectedActiveChief: 0, 297 restart: false, 298 worker0Completed: true, 299 expectedType: kubeflowv1.JobFailed, 300 }, 301 { 302 description: "Chief is running, workers are failed", 303 tfJob: tftestutil.NewTFJobWithChief(4, 2), 304 expectedFailedPS: 0, 305 expectedSucceededPS: 0, 306 expectedActivePS: 2, 307 expectedFailedWorker: 4, 308 expectedSucceededWorker: 0, 309 expectedActiveWorker: 0, 310 expectedFailedChief: 0, 311 expectedSucceededChief: 0, 312 expectedActiveChief: 1, 313 restart: false, 314 worker0Completed: false, 315 expectedType: kubeflowv1.JobRunning, 316 }, 317 { 318 description: "Chief is running, workers are succeeded", 319 tfJob: tftestutil.NewTFJobWithChief(4, 2), 320 expectedFailedPS: 0, 321 expectedSucceededPS: 0, 322 expectedActivePS: 2, 323 expectedFailedWorker: 0, 324 expectedSucceededWorker: 4, 325 expectedActiveWorker: 0, 326 expectedFailedChief: 0, 327 expectedSucceededChief: 0, 328 expectedActiveChief: 1, 329 restart: false, 330 worker0Completed: false, 331 expectedType: kubeflowv1.JobRunning, 332 }, 333 { 334 description: "Chief is running, a PS is failed", 335 tfJob: tftestutil.NewTFJobWithChief(4, 2), 336 expectedFailedPS: 1, 337 expectedSucceededPS: 0, 338 expectedActivePS: 1, 339 expectedFailedWorker: 0, 340 expectedSucceededWorker: 4, 341 expectedActiveWorker: 0, 342 expectedFailedChief: 0, 343 expectedSucceededChief: 0, 344 expectedActiveChief: 1, 345 restart: false, 346 worker0Completed: false, 347 expectedType: kubeflowv1.JobFailed, 348 }, 349 { 350 description: "Chief is failed, workers are succeeded", 351 tfJob: tftestutil.NewTFJobWithChief(4, 2), 352 expectedFailedPS: 0, 353 expectedSucceededPS: 0, 354 expectedActivePS: 2, 355 expectedFailedWorker: 0, 356 expectedSucceededWorker: 4, 357 expectedActiveWorker: 0, 358 expectedFailedChief: 1, 359 expectedSucceededChief: 0, 360 expectedActiveChief: 0, 361 restart: false, 362 worker0Completed: false, 363 expectedType: kubeflowv1.JobFailed, 364 }, 365 { 366 description: "Chief is succeeded, workers are failed", 367 tfJob: tftestutil.NewTFJobWithChief(4, 2), 368 expectedFailedPS: 0, 369 expectedSucceededPS: 0, 370 expectedActivePS: 2, 371 expectedFailedWorker: 4, 372 expectedSucceededWorker: 0, 373 expectedActiveWorker: 0, 374 expectedFailedChief: 0, 375 expectedSucceededChief: 1, 376 expectedActiveChief: 0, 377 restart: false, 378 worker0Completed: false, 379 expectedType: kubeflowv1.JobSucceeded, 380 }, 381 { 382 description: "Chief is failed and restarting", 383 tfJob: tftestutil.NewTFJobWithChief(4, 2), 384 expectedFailedPS: 0, 385 expectedSucceededPS: 0, 386 expectedActivePS: 2, 387 expectedFailedWorker: 4, 388 expectedSucceededWorker: 0, 389 expectedActiveWorker: 0, 390 expectedFailedChief: 1, 391 expectedSucceededChief: 0, 392 expectedActiveChief: 0, 393 restart: true, 394 worker0Completed: false, 395 expectedType: kubeflowv1.JobRestarting, 396 }, 397 } 398 399 jobNameTemplate := "test-status-%d" 400 for i, c := range testCases { 401 reconciler.Log.Info("testing case", "description", c.description) 402 c.tfJob.SetName(fmt.Sprintf(jobNameTemplate, i)) 403 c.tfJob.SetUID(uuid.NewUUID()) 404 405 initializeReplicaStatuses(&c.tfJob.Status, kubeflowv1.TFJobReplicaTypeWorker) 406 initializeReplicaStatuses(&c.tfJob.Status, kubeflowv1.TFJobReplicaTypeChief) 407 initializeReplicaStatuses(&c.tfJob.Status, kubeflowv1.TFJobReplicaTypePS) 408 409 setStatusForTest(c.tfJob, kubeflowv1.TFJobReplicaTypePS, c.expectedFailedPS, c.expectedSucceededPS, c.expectedActivePS, c.restart, c.worker0Completed, testK8sClient) 410 setStatusForTest(c.tfJob, kubeflowv1.TFJobReplicaTypeWorker, c.expectedFailedWorker, c.expectedSucceededWorker, c.expectedActiveWorker, c.restart, c.worker0Completed, testK8sClient) 411 setStatusForTest(c.tfJob, kubeflowv1.TFJobReplicaTypeChief, c.expectedFailedChief, c.expectedSucceededChief, c.expectedActiveChief, c.restart, c.worker0Completed, testK8sClient) 412 413 // Adding this section to make sure all pods are created and cached 414 Eventually(func() error { 415 podList := &corev1.PodList{} 416 basicLabels := reconciler.GenLabels(c.tfJob.GetName()) 417 selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ 418 MatchLabels: basicLabels, 419 }) 420 if err != nil { 421 return err 422 } 423 listOpt := client.MatchingLabelsSelector{ 424 Selector: selector, 425 } 426 err = testK8sClient.List(context.Background(), podList, listOpt) 427 if err != nil { 428 return nil 429 } 430 totalExpectedPodCount := c.expectedFailedPS + c.expectedSucceededPS + c.expectedActivePS + 431 c.expectedFailedWorker + c.expectedSucceededWorker + c.expectedActiveWorker + 432 c.expectedFailedChief + c.expectedSucceededChief + c.expectedActiveChief 433 if len(podList.Items) != int(totalExpectedPodCount) { 434 return fmt.Errorf("pod number (%d) for %s not match for expected pod number %d", 435 len(podList.Items), c.tfJob.GetName(), totalExpectedPodCount) 436 } 437 return nil 438 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 439 440 _ = reconciler.ReconcileJobs(c.tfJob, c.tfJob.Spec.TFReplicaSpecs, c.tfJob.Status, &c.tfJob.Spec.RunPolicy) 441 442 Expect(filterOutConditionTest(c.tfJob.Status)).Should(Succeed()) 443 444 reconciler.Log.Info("checking status", "tfJob.Status", c.tfJob.Status) 445 found := false 446 for _, condition := range c.tfJob.Status.Conditions { 447 if condition.Type == c.expectedType { 448 found = true 449 } 450 } 451 Expect(found).To(BeTrue()) 452 reconciler.Log.Info("passed!", 453 "job name", c.tfJob.GetName(), "job description", c.description) 454 } 455 }) 456 }) 457 }) 458 459 func setStatusForTest(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType, failed, succeeded, active int32, restart bool, worker0Completed bool, client client.Client) { 460 if restart == true { 461 tfJob.Spec.TFReplicaSpecs[rtype].RestartPolicy = kubeflowv1.RestartPolicyExitCode 462 } 463 464 basicLabels := reconciler.GenLabels(tfJob.GetName()) 465 ctx := context.Background() 466 467 Expect(rtype).Should(BeElementOf([]kubeflowv1.ReplicaType{ 468 kubeflowv1.TFJobReplicaTypeWorker, 469 kubeflowv1.TFJobReplicaTypePS, 470 kubeflowv1.TFJobReplicaTypeChief, 471 })) 472 473 refs := []metav1.OwnerReference{ 474 *reconciler.GenOwnerReference(tfJob), 475 } 476 477 var i int32 478 index := 0 479 for i = 0; i < succeeded; i++ { 480 pod := tftestutil.NewPod(tfJob, rtype, index, refs) 481 for k, v := range basicLabels { 482 pod.Labels[k] = v 483 } 484 po := &corev1.Pod{} 485 Expect(client.Create(ctx, pod)).Should(Succeed()) 486 487 key := genKeyFromJob(pod) 488 Eventually(func() error { 489 po = &corev1.Pod{} 490 if err := client.Get(ctx, key, po); err != nil { 491 return err 492 } 493 494 po.Status.Phase = corev1.PodSucceeded 495 if worker0Completed == true && rtype == kubeflowv1.TFJobReplicaTypeWorker && index == 0 { 496 po.Status.ContainerStatuses = []corev1.ContainerStatus{ 497 { 498 Name: reconciler.GetDefaultContainerName(), 499 State: corev1.ContainerState{ 500 Terminated: &corev1.ContainerStateTerminated{ 501 ExitCode: int32(0), // exit with 0 502 }, 503 }, 504 }, 505 } 506 } 507 508 return client.Status().Update(ctx, po) 509 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 510 511 updateJobReplicaStatuses(&tfJob.Status, rtype, po) 512 513 index++ 514 } 515 516 for i = 0; i < failed; i++ { 517 pod := tftestutil.NewPod(tfJob, rtype, index, refs) 518 for k, v := range basicLabels { 519 pod.Labels[k] = v 520 } 521 po := &corev1.Pod{} 522 Expect(client.Create(ctx, pod)).Should(Succeed()) 523 524 key := genKeyFromJob(pod) 525 Eventually(func() error { 526 po = &corev1.Pod{} 527 if err := client.Get(ctx, key, po); err != nil { 528 return err 529 } 530 531 po.Status.Phase = corev1.PodFailed 532 if restart == true { 533 if po.Status.ContainerStatuses == nil { 534 po.Status.ContainerStatuses = []corev1.ContainerStatus{ 535 { 536 Name: reconciler.GetDefaultContainerName(), 537 State: corev1.ContainerState{ 538 Terminated: &corev1.ContainerStateTerminated{ 539 ExitCode: int32(130), // 130 is a retryable code 540 }, 541 }, 542 }, 543 } 544 } 545 } 546 547 return client.Status().Update(ctx, po) 548 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 549 550 updateJobReplicaStatuses(&tfJob.Status, rtype, po) 551 index++ 552 } 553 554 for i = 0; i < active; i++ { 555 pod := tftestutil.NewPod(tfJob, rtype, index, refs) 556 for k, v := range basicLabels { 557 pod.Labels[k] = v 558 } 559 po := &corev1.Pod{} 560 Expect(client.Create(ctx, pod)).Should(Succeed()) 561 562 key := genKeyFromJob(pod) 563 Eventually(func() error { 564 po = &corev1.Pod{} 565 if err := client.Get(ctx, key, po); err != nil { 566 return err 567 } 568 569 po.Status.Phase = corev1.PodRunning 570 571 return client.Status().Update(ctx, po) 572 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 573 574 updateJobReplicaStatuses(&tfJob.Status, rtype, po) 575 index++ 576 } 577 } 578 579 func genKeyFromJob(job client.Object) types.NamespacedName { 580 ns := metav1.NamespaceDefault 581 if job.GetNamespace() != "" { 582 ns = job.GetNamespace() 583 } 584 return types.NamespacedName{ 585 Namespace: ns, 586 Name: job.GetName(), 587 } 588 } 589 590 func filterOutConditionTest(status kubeflowv1.JobStatus) error { 591 flag := util.IsFailed(status) || util.IsSucceeded(status) 592 for _, condition := range status.Conditions { 593 if flag && condition.Type == kubeflowv1.JobRunning && condition.Status == corev1.ConditionTrue { 594 return fmt.Errorf("error condition status when succeeded or failed") 595 } 596 } 597 return nil 598 }