volcano.sh/volcano@v1.9.0/test/e2e/util/job.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package util 18 19 import ( 20 "context" 21 "encoding/json" 22 "fmt" 23 "strconv" 24 "strings" 25 "time" 26 27 . "github.com/onsi/gomega" 28 batchv1 "k8s.io/api/batch/v1" 29 v1 "k8s.io/api/core/v1" 30 "k8s.io/apimachinery/pkg/api/errors" 31 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 "k8s.io/apimachinery/pkg/types" 33 "k8s.io/apimachinery/pkg/util/wait" 34 35 batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1" 36 schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 37 ) 38 39 type TaskSpec struct { 40 Name string 41 Min, Rep int32 42 Img string 43 Command string 44 WorkingDir string 45 Hostport int32 46 Req v1.ResourceList 47 Limit v1.ResourceList 48 Affinity *v1.Affinity 49 Labels map[string]string 50 Policies []batchv1alpha1.LifecyclePolicy 51 RestartPolicy v1.RestartPolicy 52 Tolerations []v1.Toleration 53 DefaultGracefulPeriod *int64 54 Taskpriority string 55 MaxRetry int32 56 } 57 58 type JobSpec struct { 59 Name string 60 Namespace string 61 Queue string 62 Tasks []TaskSpec 63 Policies []batchv1alpha1.LifecyclePolicy 64 Min int32 65 Pri string 66 Plugins map[string][]string 67 Volumes []batchv1alpha1.VolumeSpec 68 NodeName string 69 // ttl seconds after job finished 70 TTL *int32 71 MinSuccess *int32 72 // job max retry 73 MaxRetry int32 74 } 75 76 func Namespace(context *TestContext, job *JobSpec) string { 77 if len(job.Namespace) != 0 { 78 return job.Namespace 79 } 80 81 return context.Namespace 82 } 83 84 func CreateJob(context *TestContext, jobSpec *JobSpec) *batchv1alpha1.Job { 85 job, err := CreateJobInner(context, jobSpec) 86 Expect(err).NotTo(HaveOccurred(), "failed to create job %s in namespace %s", jobSpec.Name, jobSpec.Namespace) 87 return job 88 } 89 90 func CreateJobWithPodGroup(ctx *TestContext, jobSpec *JobSpec, 91 pgName string, annotations map[string]string) *batchv1alpha1.Job { 92 ns := Namespace(ctx, jobSpec) 93 94 job := &batchv1alpha1.Job{ 95 ObjectMeta: metav1.ObjectMeta{ 96 Name: jobSpec.Name, 97 Namespace: ns, 98 Annotations: annotations, 99 }, 100 Spec: batchv1alpha1.JobSpec{ 101 Policies: jobSpec.Policies, 102 Queue: jobSpec.Queue, 103 Plugins: jobSpec.Plugins, 104 TTLSecondsAfterFinished: jobSpec.TTL, 105 }, 106 } 107 108 var min int32 109 for i, task := range jobSpec.Tasks { 110 name := task.Name 111 if len(name) == 0 { 112 name = fmt.Sprintf("%s-task-%d", jobSpec.Name, i) 113 } 114 115 restartPolicy := v1.RestartPolicyOnFailure 116 if len(task.RestartPolicy) > 0 { 117 restartPolicy = task.RestartPolicy 118 } 119 120 ts := batchv1alpha1.TaskSpec{ 121 Name: name, 122 Replicas: task.Rep, 123 Policies: task.Policies, 124 Template: v1.PodTemplateSpec{ 125 ObjectMeta: metav1.ObjectMeta{ 126 Name: name, 127 Labels: task.Labels, 128 }, 129 Spec: v1.PodSpec{ 130 SchedulerName: "volcano", 131 RestartPolicy: restartPolicy, 132 Containers: CreateContainers(task.Img, task.Command, task.WorkingDir, task.Req, task.Limit, task.Hostport), 133 Affinity: task.Affinity, 134 Tolerations: task.Tolerations, 135 PriorityClassName: task.Taskpriority, 136 }, 137 }, 138 } 139 140 if pgName != "" { 141 ts.Template.ObjectMeta.Annotations = map[string]string{schedulingv1beta1.KubeGroupNameAnnotationKey: pgName} 142 } 143 144 if task.DefaultGracefulPeriod != nil { 145 ts.Template.Spec.TerminationGracePeriodSeconds = task.DefaultGracefulPeriod 146 } else { 147 // NOTE: TerminationGracePeriodSeconds is set to 3 in default in case of timeout when restarting tasks in test. 148 var defaultPeriod int64 = 3 149 ts.Template.Spec.TerminationGracePeriodSeconds = &defaultPeriod 150 } 151 152 job.Spec.Tasks = append(job.Spec.Tasks, ts) 153 154 min += task.Min 155 } 156 157 if jobSpec.Min > 0 { 158 job.Spec.MinAvailable = jobSpec.Min 159 } else { 160 job.Spec.MinAvailable = min 161 } 162 163 if jobSpec.Pri != "" { 164 job.Spec.PriorityClassName = jobSpec.Pri 165 } 166 167 job.Spec.Volumes = jobSpec.Volumes 168 169 jobCreated, err := ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 170 Expect(err).NotTo(HaveOccurred(), "failed to create job %s in namespace %s", job.Name, job.Namespace) 171 172 return jobCreated 173 } 174 175 func UpdateJob(ctx *TestContext, job *batchv1alpha1.Job) error { 176 spec, err := json.Marshal(job.Spec) 177 if err != nil { 178 return err 179 } 180 patch := fmt.Sprintf(`[{"op": "replace", "path": "/spec", "value":%s}]`, spec) 181 patchBytes := []byte(patch) 182 _, err = ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Patch(context.TODO(), 183 job.Name, types.JSONPatchType, patchBytes, metav1.PatchOptions{}) 184 return err 185 } 186 187 func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, error) { 188 ns := Namespace(ctx, jobSpec) 189 190 job := &batchv1alpha1.Job{ 191 ObjectMeta: metav1.ObjectMeta{ 192 Name: jobSpec.Name, 193 Namespace: ns, 194 }, 195 Spec: batchv1alpha1.JobSpec{ 196 SchedulerName: "volcano", 197 Policies: jobSpec.Policies, 198 Queue: jobSpec.Queue, 199 Plugins: jobSpec.Plugins, 200 TTLSecondsAfterFinished: jobSpec.TTL, 201 MinSuccess: jobSpec.MinSuccess, 202 MaxRetry: jobSpec.MaxRetry, 203 }, 204 } 205 206 var min int32 207 for i, task := range jobSpec.Tasks { 208 name := task.Name 209 if len(name) == 0 { 210 name = fmt.Sprintf("%s-task-%d", jobSpec.Name, i) 211 } 212 213 restartPolicy := v1.RestartPolicyOnFailure 214 if len(task.RestartPolicy) > 0 { 215 restartPolicy = task.RestartPolicy 216 } 217 218 maxRetry := task.MaxRetry 219 if maxRetry == 0 { 220 maxRetry = -1 221 } 222 223 ts := batchv1alpha1.TaskSpec{ 224 Name: name, 225 Replicas: task.Rep, 226 Policies: task.Policies, 227 MaxRetry: maxRetry, 228 Template: v1.PodTemplateSpec{ 229 ObjectMeta: metav1.ObjectMeta{ 230 Name: name, 231 Labels: task.Labels, 232 }, 233 Spec: v1.PodSpec{ 234 RestartPolicy: restartPolicy, 235 Containers: CreateContainers(task.Img, task.Command, task.WorkingDir, task.Req, task.Limit, task.Hostport), 236 Affinity: task.Affinity, 237 Tolerations: task.Tolerations, 238 PriorityClassName: task.Taskpriority, 239 }, 240 }, 241 } 242 if jobSpec.NodeName != "" { 243 ts.Template.Spec.NodeName = jobSpec.NodeName 244 } 245 246 if task.DefaultGracefulPeriod != nil { 247 ts.Template.Spec.TerminationGracePeriodSeconds = task.DefaultGracefulPeriod 248 } else { 249 // NOTE: TerminationGracePeriodSeconds is set to 3 in default in case of timeout when restarting tasks in test. 250 var defaultPeriod int64 = 3 251 ts.Template.Spec.TerminationGracePeriodSeconds = &defaultPeriod 252 } 253 254 job.Spec.Tasks = append(job.Spec.Tasks, ts) 255 256 min += task.Min 257 } 258 259 if jobSpec.Min > 0 { 260 job.Spec.MinAvailable = jobSpec.Min 261 } else { 262 job.Spec.MinAvailable = min 263 } 264 265 if jobSpec.Pri != "" { 266 job.Spec.PriorityClassName = jobSpec.Pri 267 } 268 269 job.Spec.Volumes = jobSpec.Volumes 270 271 return ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 272 } 273 274 func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum int) error { 275 var additionalError error 276 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 277 pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{}) 278 Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace) 279 280 readyTaskNum := 0 281 for _, pod := range pods.Items { 282 if !metav1.IsControlledBy(&pod, job) { 283 continue 284 } 285 286 for _, p := range phase { 287 if pod.Status.Phase == p { 288 readyTaskNum++ 289 break 290 } 291 } 292 } 293 294 ready := taskNum <= readyTaskNum 295 if !ready { 296 additionalError = fmt.Errorf("expected job '%s' to have %d ready pods, actual got %d", job.Name, 297 taskNum, 298 readyTaskNum) 299 } 300 return ready, nil 301 }) 302 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 303 return fmt.Errorf("[Wait time out]: %s", additionalError) 304 } 305 return err 306 } 307 308 func taskPhaseEx(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum map[string]int) error { 309 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 310 311 pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{}) 312 Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace) 313 314 readyTaskNum := map[string]int{} 315 for _, pod := range pods.Items { 316 if !metav1.IsControlledBy(&pod, job) { 317 continue 318 } 319 320 for _, p := range phase { 321 if pod.Status.Phase == p { 322 readyTaskNum[pod.Spec.PriorityClassName]++ 323 break 324 } 325 } 326 } 327 328 for k, v := range taskNum { 329 if v > readyTaskNum[k] { 330 return false, nil 331 } 332 } 333 334 return true, nil 335 }) 336 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 337 return fmt.Errorf("[Wait time out]") 338 } 339 return err 340 341 } 342 343 func jobUnschedulable(ctx *TestContext, job *batchv1alpha1.Job, now time.Time) error { 344 var additionalError error 345 // TODO(k82cn): check Job's Condition instead of PodGroup's event. 346 err := wait.Poll(10*time.Second, FiveMinute, func() (bool, error) { 347 pgName := job.Name + "-" + string(job.UID) 348 pg, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(job.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{}) 349 if err != nil { 350 additionalError = fmt.Errorf("expected to have job's podgroup %s created, actual got error %s", 351 job.Name, err.Error()) 352 return false, nil 353 } 354 355 events, err := ctx.Kubeclient.CoreV1().Events(pg.Namespace).List(context.TODO(), metav1.ListOptions{}) 356 if err != nil { 357 additionalError = fmt.Errorf("expected to have events for job %s, actual got error %s", 358 job.Name, err.Error()) 359 return false, nil 360 } 361 for _, event := range events.Items { 362 target := event.InvolvedObject 363 if strings.HasPrefix(target.Name, pg.Name) && target.Namespace == pg.Namespace { 364 if event.Reason == string("Unschedulable") || event.Reason == string("FailedScheduling") && event.LastTimestamp.After(now) { 365 return true, nil 366 } 367 } 368 } 369 additionalError = fmt.Errorf( 370 "expected to have 'Unschedulable' events for podgroup %s, actual got nothing", job.Name) 371 return false, nil 372 }) 373 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 374 return fmt.Errorf("[Wait time out]: %s", additionalError) 375 } 376 return err 377 } 378 379 func JobEvicted(ctx *TestContext, job *batchv1alpha1.Job, time time.Time) wait.ConditionFunc { 380 // TODO(k82cn): check Job's conditions instead of PodGroup's event. 381 return func() (bool, error) { 382 pgName := job.Name + "-" + string(job.UID) 383 pg, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(job.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{}) 384 Expect(err).NotTo(HaveOccurred(), "failed to get pod group of job %s in namespace %s", job.Name, job.Namespace) 385 386 events, err := ctx.Kubeclient.CoreV1().Events(pg.Namespace).List(context.TODO(), metav1.ListOptions{}) 387 Expect(err).NotTo(HaveOccurred(), "failed to list events in namespace %s", pg.Namespace) 388 389 for _, event := range events.Items { 390 target := event.InvolvedObject 391 if target.Name == pg.Name && target.Namespace == pg.Namespace { 392 if event.Reason == string("Evict") && event.LastTimestamp.After(time) { 393 return true, nil 394 } 395 } 396 } 397 return false, nil 398 } 399 } 400 401 func WaitJobPhases(ctx *TestContext, job *batchv1alpha1.Job, phases []batchv1alpha1.JobPhase) error { 402 w, err := ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Watch(context.TODO(), metav1.ListOptions{}) 403 if err != nil { 404 return err 405 } 406 defer w.Stop() 407 408 var additionalError error 409 total := int32(0) 410 for _, task := range job.Spec.Tasks { 411 total += task.Replicas 412 } 413 414 ch := w.ResultChan() 415 index := 0 416 timeout := time.After(TenMinute) 417 418 for index < len(phases) { 419 select { 420 case event, open := <-ch: 421 if !open { 422 return fmt.Errorf("watch channel should be always open") 423 } 424 425 newJob := event.Object.(*batchv1alpha1.Job) 426 phase := phases[index] 427 if newJob.Name != job.Name || newJob.Namespace != job.Namespace { 428 continue 429 } 430 431 if newJob.Status.State.Phase != phase { 432 additionalError = fmt.Errorf( 433 "expected job '%s' to be in status %s, actual get %s", 434 job.Name, phase, newJob.Status.State.Phase) 435 continue 436 } 437 438 var flag bool 439 switch phase { 440 case batchv1alpha1.Pending: 441 flag = (newJob.Status.Pending+newJob.Status.Succeeded+ 442 newJob.Status.Failed+newJob.Status.Running) == 0 || 443 (total-newJob.Status.Terminating >= newJob.Status.MinAvailable) 444 case batchv1alpha1.Terminating, batchv1alpha1.Aborting, batchv1alpha1.Restarting, batchv1alpha1.Completing: 445 flag = newJob.Status.Terminating > 0 446 case batchv1alpha1.Terminated, batchv1alpha1.Aborted, batchv1alpha1.Completed: 447 flag = newJob.Status.Pending == 0 && 448 newJob.Status.Running == 0 && 449 newJob.Status.Terminating == 0 450 case batchv1alpha1.Running: 451 flag = newJob.Status.Running >= newJob.Spec.MinAvailable 452 default: 453 return fmt.Errorf("unknown phase %s", phase) 454 } 455 456 if !flag { 457 additionalError = fmt.Errorf( 458 "expected job '%s' to be in status %s, actual detail status %s", 459 job.Name, phase, getJobStatusDetail(newJob)) 460 continue 461 } 462 463 index++ 464 timeout = time.After(TenMinute) 465 466 case <-timeout: 467 return fmt.Errorf("[Wait time out]: %s", additionalError) 468 } 469 } 470 471 return nil 472 } 473 474 func WaitJobStates(ctx *TestContext, job *batchv1alpha1.Job, phases []batchv1alpha1.JobPhase, waitTime time.Duration) error { 475 for _, phase := range phases { 476 err := waitJobPhaseExpect(ctx, job, phase, waitTime) 477 if err != nil { 478 return err 479 } 480 } 481 return nil 482 } 483 484 func getJobStatusDetail(job *batchv1alpha1.Job) string { 485 return fmt.Sprintf("\nName: %s\n Phase: %s\nPending: %d"+ 486 "\nRunning: %d\nSucceeded: %d\nTerminating: %d\nFailed: %d\n ", 487 job.Name, job.Status.State.Phase, job.Status.Pending, job.Status.Running, 488 job.Status.Succeeded, job.Status.Terminating, job.Status.Failed) 489 } 490 491 // WaitJobReady waits for the Job to be ready 492 func WaitJobReady(ctx *TestContext, job *batchv1alpha1.Job) error { 493 return WaitTasksReady(ctx, job, int(job.Spec.MinAvailable)) 494 } 495 496 // WaitJobPending waits for the Job to be pending 497 func WaitJobPending(ctx *TestContext, job *batchv1alpha1.Job) error { 498 return WaitTaskPhase(ctx, job, []v1.PodPhase{v1.PodPending}, int(job.Spec.MinAvailable)) 499 } 500 501 // WaitTasksReady waits for the tasks of a Job to be ready 502 func WaitTasksReady(ctx *TestContext, job *batchv1alpha1.Job, taskNum int) error { 503 return WaitTaskPhase(ctx, job, []v1.PodPhase{v1.PodRunning, v1.PodSucceeded}, taskNum) 504 } 505 506 func WaitTasksReadyEx(ctx *TestContext, job *batchv1alpha1.Job, taskNum map[string]int) error { 507 return taskPhaseEx(ctx, job, []v1.PodPhase{v1.PodRunning, v1.PodSucceeded}, taskNum) 508 } 509 510 // WaitTasksPending waits for the tasks of a Job to be pending 511 func WaitTasksPending(ctx *TestContext, job *batchv1alpha1.Job, taskNum int) error { 512 return WaitTaskPhase(ctx, job, []v1.PodPhase{v1.PodPending}, taskNum) 513 } 514 515 // WaitJobStateReady waits for the state of a Job to be ready 516 func WaitJobStateReady(ctx *TestContext, job *batchv1alpha1.Job) error { 517 return waitJobPhaseExpect(ctx, job, batchv1alpha1.Running, FiveMinute) 518 } 519 520 // WaitJobStatePending waits for the state of a Job to be pending 521 func WaitJobStatePending(ctx *TestContext, job *batchv1alpha1.Job) error { 522 return waitJobPhaseExpect(ctx, job, batchv1alpha1.Pending, FiveMinute) 523 } 524 525 // WaitJobStateAborted waits for the state of a Job to be aborted 526 func WaitJobStateAborted(ctx *TestContext, job *batchv1alpha1.Job) error { 527 return waitJobPhaseExpect(ctx, job, batchv1alpha1.Aborted, FiveMinute) 528 } 529 530 // WaitPodPhaseRunningMoreThanNum waits for the number of running pods to be more than specified number 531 func WaitPodPhaseRunningMoreThanNum(ctx *TestContext, namespace string, num int) error { 532 var additionalError error 533 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 534 clusterPods, err := ctx.Kubeclient.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{}) 535 Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", namespace) 536 537 runningPodNum := 0 538 for _, pod := range clusterPods.Items { 539 if pod.Status.Phase == "Running" { 540 runningPodNum++ 541 } 542 } 543 544 expected := runningPodNum >= num 545 if !expected { 546 additionalError = fmt.Errorf("expected running pod is '%s', actual got %s", strconv.Itoa(runningPodNum), strconv.Itoa(num)) 547 } 548 return expected, nil 549 }) 550 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 551 return fmt.Errorf("[Wait time out]: %s", additionalError) 552 } 553 return err 554 } 555 556 func waitJobPhaseExpect(ctx *TestContext, job *batchv1alpha1.Job, state batchv1alpha1.JobPhase, waitTime time.Duration) error { 557 var additionalError error 558 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 559 job, err := ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Get(context.TODO(), job.Name, metav1.GetOptions{}) 560 Expect(err).NotTo(HaveOccurred()) 561 expected := job.Status.State.Phase == state 562 if !expected { 563 additionalError = fmt.Errorf("expected job '%s' phase in %s, actual got %s", job.Name, 564 state, job.Status.State.Phase) 565 } 566 return expected, nil 567 }) 568 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 569 return fmt.Errorf("[Wait time out]: %s", additionalError) 570 } 571 return err 572 } 573 574 func WaitJobPhaseReady(ctx *TestContext, job *batchv1.Job) error { 575 var additionalError error 576 577 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 578 job, err := ctx.Kubeclient.BatchV1().Jobs(job.Namespace).Get(context.TODO(), job.Name, metav1.GetOptions{}) 579 Expect(err).NotTo(HaveOccurred()) 580 expected := job.Status.Active > 0 581 if !expected { 582 additionalError = fmt.Errorf("expected job '%s' active pod to be greater than 0, actual got %d", job.Name, job.Status.Active) 583 } 584 return expected, nil 585 }) 586 587 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 588 return fmt.Errorf("[Wait time out]: %s", additionalError) 589 } 590 591 return err 592 } 593 594 func WaitJobUnschedulable(ctx *TestContext, job *batchv1alpha1.Job) error { 595 now := time.Now() 596 return jobUnschedulable(ctx, job, now) 597 } 598 599 func CreateContainers(img, command, workingDir string, req, limit v1.ResourceList, hostport int32) []v1.Container { 600 var imageRepo []string 601 container := v1.Container{ 602 Image: img, 603 ImagePullPolicy: v1.PullIfNotPresent, 604 Resources: v1.ResourceRequirements{ 605 Requests: req, 606 Limits: limit, 607 }, 608 } 609 if !strings.Contains(img, ":") { 610 imageRepo = strings.Split(img, "/") 611 } else { 612 imageRepo = strings.Split(img[:strings.Index(img, ":")], "/") 613 } 614 container.Name = imageRepo[len(imageRepo)-1] 615 616 if len(command) > 0 { 617 container.Command = []string{"/bin/sh"} 618 container.Args = []string{"-c", command} 619 } 620 621 if hostport > 0 { 622 container.Ports = []v1.ContainerPort{ 623 { 624 ContainerPort: hostport, 625 HostPort: hostport, 626 }, 627 } 628 } 629 630 if len(workingDir) > 0 { 631 container.WorkingDir = workingDir 632 } 633 634 return []v1.Container{container} 635 } 636 637 // WaitJobCleanedUp waits for the Job to be cleaned up 638 func WaitJobCleanedUp(ctx *TestContext, cleanupjob *batchv1alpha1.Job) error { 639 var additionalError error 640 641 pods := GetTasksOfJob(ctx, cleanupjob) 642 643 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 644 job, err := ctx.Vcclient.BatchV1alpha1().Jobs(cleanupjob.Namespace).Get(context.TODO(), cleanupjob.Name, metav1.GetOptions{}) 645 if err != nil && !errors.IsNotFound(err) { 646 return false, nil 647 } 648 if len(job.Name) != 0 { 649 additionalError = fmt.Errorf("job %s/%s still exist", job.Namespace, job.Name) 650 return false, nil 651 } 652 653 pgName := cleanupjob.Name + "-" + string(cleanupjob.UID) 654 pg, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(cleanupjob.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{}) 655 if err != nil && !errors.IsNotFound(err) { 656 return false, nil 657 } 658 if len(pg.Name) != 0 { 659 additionalError = fmt.Errorf("pdgroup %s/%s still exist", job.Namespace, job.Name) 660 return false, nil 661 } 662 663 return true, nil 664 }) 665 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 666 return fmt.Errorf("[Wait time out]: %s", additionalError) 667 } 668 669 for _, pod := range pods { 670 err := WaitPodGone(ctx, pod.Name, pod.Namespace) 671 if err != nil { 672 return err 673 } 674 } 675 676 return err 677 } 678 679 // GetTasksOfJob returns the tasks belongs to the job 680 func GetTasksOfJob(ctx *TestContext, job *batchv1alpha1.Job) []*v1.Pod { 681 pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{}) 682 Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace) 683 684 var tasks []*v1.Pod 685 686 for _, pod := range pods.Items { 687 if !metav1.IsControlledBy(&pod, job) { 688 continue 689 } 690 duplicatePod := pod.DeepCopy() 691 tasks = append(tasks, duplicatePod) 692 } 693 694 return tasks 695 } 696 697 // WaitPodGone waits the Pod to be deleted when aborting a Job 698 func WaitPodGone(ctx *TestContext, podName, namespace string) error { 699 var additionalError error 700 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 701 _, err := ctx.Kubeclient.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{}) 702 expected := errors.IsNotFound(err) 703 if !expected { 704 additionalError = fmt.Errorf("job related pod should be deleted when aborting job") 705 } 706 707 return expected, nil 708 }) 709 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 710 return fmt.Errorf("[Wait time out]: %s", additionalError) 711 } 712 return err 713 } 714 715 // WaitJobTerminateAction waits for the Job to be terminated 716 func WaitJobTerminateAction(ctx *TestContext, pg *batchv1alpha1.Job) error { 717 return wait.Poll(10*time.Second, FiveMinute, jobTerminateAction(ctx, pg, time.Now())) 718 } 719 720 func jobTerminateAction(ctx *TestContext, pg *batchv1alpha1.Job, time time.Time) wait.ConditionFunc { 721 return func() (bool, error) { 722 events, err := ctx.Kubeclient.CoreV1().Events(pg.Namespace).List(context.TODO(), metav1.ListOptions{}) 723 Expect(err).NotTo(HaveOccurred(), "failed to list events in namespace %s", pg.Namespace) 724 725 for _, event := range events.Items { 726 target := event.InvolvedObject 727 if strings.HasPrefix(target.Name, pg.Name) && target.Namespace == pg.Namespace { 728 if event.Reason == string(ExecuteAction) && strings.Contains(event.Message, "TerminateJob") && event.LastTimestamp.After(time) { 729 return true, nil 730 } 731 } 732 } 733 734 return false, nil 735 } 736 } 737 738 // WaitPodPhase waits for the Pod to be the specified phase 739 func WaitPodPhase(ctx *TestContext, pod *v1.Pod, phase []v1.PodPhase) error { 740 var additionalError error 741 err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { 742 pods, err := ctx.Kubeclient.CoreV1().Pods(pod.Namespace).List(context.TODO(), metav1.ListOptions{}) 743 Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", pod.Namespace) 744 745 for _, p := range phase { 746 for _, pod := range pods.Items { 747 if pod.Status.Phase == p { 748 return true, nil 749 } 750 } 751 } 752 753 additionalError = fmt.Errorf("expected pod '%s' to %v, actual got %s", pod.Name, phase, pod.Status.Phase) 754 return false, nil 755 }) 756 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 757 return fmt.Errorf("[Wait time out]: %s", additionalError) 758 } 759 return err 760 } 761 762 // IsPodScheduled returns whether the Pod is scheduled 763 func IsPodScheduled(pod *v1.Pod) bool { 764 for _, cond := range pod.Status.Conditions { 765 if cond.Type == v1.PodScheduled && cond.Status == v1.ConditionTrue { 766 return true 767 } 768 } 769 return false 770 } 771 772 // WaitTasksCompleted waits for the tasks of a job to be completed 773 func WaitTasksCompleted(ctx *TestContext, job *batchv1alpha1.Job, successNum int32) error { 774 var additionalError error 775 err := wait.Poll(100*time.Millisecond, TwoMinute, func() (bool, error) { 776 pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{}) 777 Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace) 778 779 var succeeded int32 = 0 780 for _, pod := range pods.Items { 781 if !metav1.IsControlledBy(&pod, job) { 782 continue 783 } 784 785 if pod.Status.Phase == "Succeeded" { 786 succeeded++ 787 } 788 } 789 790 ready := succeeded >= successNum 791 if !ready { 792 additionalError = fmt.Errorf("expected job '%s' to have %d succeeded pods, actual got %d", job.Name, 793 successNum, 794 succeeded) 795 } 796 return ready, nil 797 }) 798 if err != nil && strings.Contains(err.Error(), TimeOutMessage) { 799 return fmt.Errorf("[Wait time out]: %s", additionalError) 800 } 801 return err 802 }