volcano.sh/volcano@v1.9.0/pkg/controllers/job/job_controller_actions.go (about) 1 /* 2 Copyright 2019 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package job 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sort" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 v1 "k8s.io/api/core/v1" 29 apierrors "k8s.io/apimachinery/pkg/api/errors" 30 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 31 quotav1 "k8s.io/apiserver/pkg/quota/v1" 32 "k8s.io/klog/v2" 33 34 batch "volcano.sh/apis/pkg/apis/batch/v1alpha1" 35 "volcano.sh/apis/pkg/apis/helpers" 36 scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 37 38 "volcano.sh/volcano/pkg/controllers/apis" 39 jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers" 40 "volcano.sh/volcano/pkg/controllers/job/state" 41 "volcano.sh/volcano/pkg/controllers/util" 42 ) 43 44 var calMutex sync.Mutex 45 46 func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseMap, updateStatus state.UpdateStatusFn) error { 47 job := jobInfo.Job 48 klog.V(3).Infof("Killing Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version) 49 defer klog.V(3).Infof("Finished Job <%s/%s> killing, current version %d", job.Namespace, job.Name, job.Status.Version) 50 51 if job.DeletionTimestamp != nil { 52 klog.Infof("Job <%s/%s> is terminating, skip management process.", 53 job.Namespace, job.Name) 54 return nil 55 } 56 57 var pending, running, terminating, succeeded, failed, unknown int32 58 taskStatusCount := make(map[string]batch.TaskState) 59 60 var errs []error 61 var total int 62 63 for _, pods := range jobInfo.Pods { 64 for _, pod := range pods { 65 total++ 66 67 if pod.DeletionTimestamp != nil { 68 klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name) 69 terminating++ 70 continue 71 } 72 73 maxRetry := job.Spec.MaxRetry 74 lastRetry := false 75 if job.Status.RetryCount >= maxRetry-1 { 76 lastRetry = true 77 } 78 79 // Only retain the Failed and Succeeded pods at the last retry. 80 // If it is not the last retry, kill pod as defined in `podRetainPhase`. 81 retainPhase := podRetainPhase 82 if lastRetry { 83 retainPhase = state.PodRetainPhaseSoft 84 } 85 _, retain := retainPhase[pod.Status.Phase] 86 87 if !retain { 88 err := cc.deleteJobPod(job.Name, pod) 89 if err == nil { 90 terminating++ 91 continue 92 } 93 // record the err, and then collect the pod info like retained pod 94 errs = append(errs, err) 95 cc.resyncTask(pod) 96 } 97 98 classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown) 99 calcPodStatus(pod, taskStatusCount) 100 } 101 } 102 103 if len(errs) != 0 { 104 klog.Errorf("failed to kill pods for job %s/%s, with err %+v", job.Namespace, job.Name, errs) 105 cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason, 106 fmt.Sprintf("Error deleting pods: %+v", errs)) 107 return fmt.Errorf("failed to kill %d pods of %d", len(errs), total) 108 } 109 110 job = job.DeepCopy() 111 // Job version is bumped only when job is killed 112 job.Status.Version++ 113 job.Status.Pending = pending 114 job.Status.Running = running 115 job.Status.Succeeded = succeeded 116 job.Status.Failed = failed 117 job.Status.Terminating = terminating 118 job.Status.Unknown = unknown 119 job.Status.TaskStatusCount = taskStatusCount 120 121 // Update running duration 122 klog.V(3).Infof("Running duration is %s", metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}.ToUnstructured()) 123 job.Status.RunningDuration = &metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)} 124 125 if updateStatus != nil { 126 if updateStatus(&job.Status) { 127 job.Status.State.LastTransitionTime = metav1.Now() 128 jobCondition := newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime) 129 job.Status.Conditions = append(job.Status.Conditions, jobCondition) 130 } 131 } 132 133 // must be called before update job status 134 if err := cc.pluginOnJobDelete(job); err != nil { 135 return err 136 } 137 138 // Update Job status 139 newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) 140 if err != nil { 141 klog.Errorf("Failed to update status of Job %v/%v: %v", 142 job.Namespace, job.Name, err) 143 return err 144 } 145 if e := cc.cache.Update(newJob); e != nil { 146 klog.Errorf("KillJob - Failed to update Job %v/%v in cache: %v", 147 newJob.Namespace, newJob.Name, e) 148 return e 149 } 150 151 // Delete PodGroup 152 pgName := job.Name + "-" + string(job.UID) 153 if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(context.TODO(), pgName, metav1.DeleteOptions{}); err != nil { 154 if !apierrors.IsNotFound(err) { 155 klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v", 156 job.Namespace, job.Name, err) 157 return err 158 } 159 } 160 161 // NOTE(k82cn): DO NOT delete input/output until job is deleted. 162 163 return nil 164 } 165 166 func (cc *jobcontroller) initiateJob(job *batch.Job) (*batch.Job, error) { 167 klog.V(3).Infof("Starting to initiate Job <%s/%s>", job.Namespace, job.Name) 168 jobInstance, err := cc.initJobStatus(job) 169 if err != nil { 170 cc.recorder.Event(job, v1.EventTypeWarning, string(batch.JobStatusError), 171 fmt.Sprintf("Failed to initialize job status, err: %v", err)) 172 return nil, err 173 } 174 175 if err := cc.pluginOnJobAdd(jobInstance); err != nil { 176 cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError), 177 fmt.Sprintf("Execute plugin when job add failed, err: %v", err)) 178 return nil, err 179 } 180 181 newJob, err := cc.createJobIOIfNotExist(jobInstance) 182 if err != nil { 183 cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PVCError), 184 fmt.Sprintf("Failed to create PVC, err: %v", err)) 185 return nil, err 186 } 187 188 if err := cc.createOrUpdatePodGroup(newJob); err != nil { 189 cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError), 190 fmt.Sprintf("Failed to create PodGroup, err: %v", err)) 191 return nil, err 192 } 193 194 return newJob, nil 195 } 196 197 func (cc *jobcontroller) initOnJobUpdate(job *batch.Job) error { 198 klog.V(3).Infof("Starting to initiate Job <%s/%s> on update", job.Namespace, job.Name) 199 200 if err := cc.pluginOnJobUpdate(job); err != nil { 201 cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError), 202 fmt.Sprintf("Execute plugin when job add failed, err: %v", err)) 203 return err 204 } 205 206 if err := cc.createOrUpdatePodGroup(job); err != nil { 207 cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError), 208 fmt.Sprintf("Failed to create PodGroup, err: %v", err)) 209 return err 210 } 211 212 return nil 213 } 214 215 func (cc *jobcontroller) GetQueueInfo(queue string) (*scheduling.Queue, error) { 216 queueInfo, err := cc.queueLister.Get(queue) 217 if err != nil { 218 klog.Errorf("Failed to get queue from queueLister, error: %s", err.Error()) 219 } 220 221 return queueInfo, err 222 } 223 224 func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateStatusFn) error { 225 job := jobInfo.Job 226 klog.V(3).Infof("Starting to sync up Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version) 227 defer klog.V(3).Infof("Finished Job <%s/%s> sync up, current version %d", job.Namespace, job.Name, job.Status.Version) 228 229 if jobInfo.Job.DeletionTimestamp != nil { 230 klog.Infof("Job <%s/%s> is terminating, skip management process.", 231 jobInfo.Job.Namespace, jobInfo.Job.Name) 232 return nil 233 } 234 235 // deep copy job to prevent mutate it 236 job = job.DeepCopy() 237 238 // Find queue that job belongs to, and check if the queue has forwarding metadata 239 queueInfo, err := cc.GetQueueInfo(job.Spec.Queue) 240 if err != nil { 241 return err 242 } 243 244 var jobForwarding bool 245 if len(queueInfo.Spec.ExtendClusters) != 0 { 246 jobForwarding = true 247 if len(job.Annotations) == 0 { 248 job.Annotations = make(map[string]string) 249 } 250 job.Annotations[batch.JobForwardingKey] = "true" 251 job, err = cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{}) 252 if err != nil { 253 klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error()) 254 return err 255 } 256 } 257 258 // Skip job initiation if job is already initiated 259 if !isInitiated(job) { 260 if job, err = cc.initiateJob(job); err != nil { 261 return err 262 } 263 } else { 264 // TODO: optimize this call it only when scale up/down 265 if err = cc.initOnJobUpdate(job); err != nil { 266 return err 267 } 268 } 269 270 if len(queueInfo.Spec.ExtendClusters) != 0 { 271 jobForwarding = true 272 job.Annotations[batch.JobForwardingKey] = "true" 273 _, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{}) 274 if err != nil { 275 klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error()) 276 return err 277 } 278 } 279 280 var syncTask bool 281 pgName := job.Name + "-" + string(job.UID) 282 if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(pgName); pg != nil { 283 if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending { 284 syncTask = true 285 } 286 287 for _, condition := range pg.Status.Conditions { 288 if condition.Type == scheduling.PodGroupUnschedulableType { 289 cc.recorder.Eventf(job, v1.EventTypeWarning, string(batch.PodGroupPending), 290 fmt.Sprintf("PodGroup %s:%s unschedule,reason: %s", job.Namespace, job.Name, condition.Message)) 291 } 292 } 293 } 294 295 var jobCondition batch.JobCondition 296 if !syncTask { 297 if updateStatus != nil { 298 if updateStatus(&job.Status) { 299 job.Status.State.LastTransitionTime = metav1.Now() 300 jobCondition = newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime) 301 job.Status.Conditions = append(job.Status.Conditions, jobCondition) 302 } 303 } 304 newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) 305 if err != nil { 306 klog.Errorf("Failed to update status of Job %v/%v: %v", 307 job.Namespace, job.Name, err) 308 return err 309 } 310 if e := cc.cache.Update(newJob); e != nil { 311 klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v", 312 newJob.Namespace, newJob.Name, e) 313 return e 314 } 315 return nil 316 } 317 318 var running, pending, terminating, succeeded, failed, unknown int32 319 taskStatusCount := make(map[string]batch.TaskState) 320 321 podToCreate := make(map[string][]*v1.Pod) 322 var podToDelete []*v1.Pod 323 var creationErrs []error 324 var deletionErrs []error 325 appendMutex := sync.Mutex{} 326 327 appendError := func(container *[]error, err error) { 328 appendMutex.Lock() 329 defer appendMutex.Unlock() 330 *container = append(*container, err) 331 } 332 333 waitCreationGroup := sync.WaitGroup{} 334 335 for _, ts := range job.Spec.Tasks { 336 ts.Template.Name = ts.Name 337 tc := ts.Template.DeepCopy() 338 name := ts.Template.Name 339 340 pods, found := jobInfo.Pods[name] 341 if !found { 342 pods = map[string]*v1.Pod{} 343 } 344 345 var podToCreateEachTask []*v1.Pod 346 for i := 0; i < int(ts.Replicas); i++ { 347 podName := fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, name, i) 348 if pod, found := pods[podName]; !found { 349 newPod := createJobPod(job, tc, ts.TopologyPolicy, i, jobForwarding) 350 if err := cc.pluginOnPodCreate(job, newPod); err != nil { 351 return err 352 } 353 podToCreateEachTask = append(podToCreateEachTask, newPod) 354 waitCreationGroup.Add(1) 355 } else { 356 delete(pods, podName) 357 if pod.DeletionTimestamp != nil { 358 klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name) 359 atomic.AddInt32(&terminating, 1) 360 continue 361 } 362 363 classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown) 364 calcPodStatus(pod, taskStatusCount) 365 } 366 } 367 podToCreate[ts.Name] = podToCreateEachTask 368 for _, pod := range pods { 369 podToDelete = append(podToDelete, pod) 370 } 371 } 372 373 for taskName, podToCreateEachTask := range podToCreate { 374 if len(podToCreateEachTask) == 0 { 375 continue 376 } 377 go func(taskName string, podToCreateEachTask []*v1.Pod) { 378 taskIndex := jobhelpers.GetTaskIndexUnderJob(taskName, job) 379 if job.Spec.Tasks[taskIndex].DependsOn != nil { 380 if !cc.waitDependsOnTaskMeetCondition(taskName, taskIndex, podToCreateEachTask, job) { 381 klog.V(3).Infof("Job %s/%s depends on task not ready", job.Name, job.Namespace) 382 // release wait group 383 for _, pod := range podToCreateEachTask { 384 go func(pod *v1.Pod) { 385 defer waitCreationGroup.Done() 386 }(pod) 387 } 388 return 389 } 390 } 391 392 for _, pod := range podToCreateEachTask { 393 go func(pod *v1.Pod) { 394 defer waitCreationGroup.Done() 395 newPod, err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{}) 396 if err != nil && !apierrors.IsAlreadyExists(err) { 397 // Failed to create Pod, waitCreationGroup a moment and then create it again 398 // This is to ensure all podsMap under the same Job created 399 // So gang-scheduling could schedule the Job successfully 400 klog.Errorf("Failed to create pod %s for Job %s, err %#v", 401 pod.Name, job.Name, err) 402 appendError(&creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err)) 403 } else { 404 classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown) 405 calcPodStatus(pod, taskStatusCount) 406 klog.V(5).Infof("Created Task <%s> of Job <%s/%s>", 407 pod.Name, job.Namespace, job.Name) 408 } 409 }(pod) 410 } 411 }(taskName, podToCreateEachTask) 412 } 413 414 waitCreationGroup.Wait() 415 416 if len(creationErrs) != 0 { 417 cc.recorder.Event(job, v1.EventTypeWarning, FailedCreatePodReason, 418 fmt.Sprintf("Error creating pods: %+v", creationErrs)) 419 return fmt.Errorf("failed to create %d pods of %d", len(creationErrs), len(podToCreate)) 420 } 421 422 // Delete pods when scale down. 423 waitDeletionGroup := sync.WaitGroup{} 424 waitDeletionGroup.Add(len(podToDelete)) 425 for _, pod := range podToDelete { 426 go func(pod *v1.Pod) { 427 defer waitDeletionGroup.Done() 428 err := cc.deleteJobPod(job.Name, pod) 429 if err != nil { 430 // Failed to delete Pod, waitCreationGroup a moment and then create it again 431 // This is to ensure all podsMap under the same Job created 432 // So gang-scheduling could schedule the Job successfully 433 klog.Errorf("Failed to delete pod %s for Job %s, err %#v", 434 pod.Name, job.Name, err) 435 appendError(&deletionErrs, err) 436 cc.resyncTask(pod) 437 } else { 438 klog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>", 439 pod.Name, job.Namespace, job.Name) 440 atomic.AddInt32(&terminating, 1) 441 } 442 }(pod) 443 } 444 waitDeletionGroup.Wait() 445 446 if len(deletionErrs) != 0 { 447 cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason, 448 fmt.Sprintf("Error deleting pods: %+v", deletionErrs)) 449 return fmt.Errorf("failed to delete %d pods of %d", len(deletionErrs), len(podToDelete)) 450 } 451 job.Status = batch.JobStatus{ 452 State: job.Status.State, 453 454 Pending: pending, 455 Running: running, 456 Succeeded: succeeded, 457 Failed: failed, 458 Terminating: terminating, 459 Unknown: unknown, 460 Version: job.Status.Version, 461 MinAvailable: job.Spec.MinAvailable, 462 TaskStatusCount: taskStatusCount, 463 ControlledResources: job.Status.ControlledResources, 464 Conditions: job.Status.Conditions, 465 RetryCount: job.Status.RetryCount, 466 } 467 468 if updateStatus != nil && updateStatus(&job.Status) { 469 job.Status.State.LastTransitionTime = metav1.Now() 470 jobCondition = newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime) 471 job.Status.Conditions = append(job.Status.Conditions, jobCondition) 472 } 473 newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) 474 if err != nil { 475 klog.Errorf("Failed to update status of Job %v/%v: %v", 476 job.Namespace, job.Name, err) 477 return err 478 } 479 if e := cc.cache.Update(newJob); e != nil { 480 klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v", 481 newJob.Namespace, newJob.Name, e) 482 return e 483 } 484 485 return nil 486 } 487 488 func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskIndex int, podToCreateEachTask []*v1.Pod, job *batch.Job) bool { 489 if job.Spec.Tasks[taskIndex].DependsOn == nil { 490 return true 491 } 492 dependsOn := *job.Spec.Tasks[taskIndex].DependsOn 493 if len(dependsOn.Name) > 1 && dependsOn.Iteration == batch.IterationAny { 494 // any ready to create task, return true 495 for _, task := range dependsOn.Name { 496 if cc.isDependsOnPodsReady(task, job) { 497 return true 498 } 499 } 500 // all not ready to skip create task, return false 501 return false 502 } 503 for _, dependsOnTask := range dependsOn.Name { 504 // any not ready to skip create task, return false 505 if !cc.isDependsOnPodsReady(dependsOnTask, job) { 506 return false 507 } 508 } 509 // all ready to create task, return true 510 return true 511 } 512 513 func (cc *jobcontroller) isDependsOnPodsReady(task string, job *batch.Job) bool { 514 dependsOnPods := jobhelpers.GetPodsNameUnderTask(task, job) 515 dependsOnTaskIndex := jobhelpers.GetTaskIndexUnderJob(task, job) 516 runningPodCount := 0 517 for _, podName := range dependsOnPods { 518 pod, err := cc.podLister.Pods(job.Namespace).Get(podName) 519 if err != nil { 520 // If pod is not found. There are 2 possibilities. 521 // 1. vcjob has been deleted. This function should return true. 522 // 2. pod is not created. This function should return false, continue waiting. 523 if apierrors.IsNotFound(err) { 524 _, errGetJob := cc.jobLister.Jobs(job.Namespace).Get(job.Name) 525 if errGetJob != nil { 526 return apierrors.IsNotFound(errGetJob) 527 } 528 } 529 klog.Errorf("Failed to get pod %v/%v %v", job.Namespace, podName, err) 530 continue 531 } 532 533 if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodSucceeded { 534 klog.V(5).Infof("Sequential state, pod %v/%v of depends on tasks is not running", pod.Namespace, pod.Name) 535 continue 536 } 537 538 allContainerReady := true 539 for _, containerStatus := range pod.Status.ContainerStatuses { 540 if !containerStatus.Ready { 541 allContainerReady = false 542 break 543 } 544 } 545 if allContainerReady { 546 runningPodCount++ 547 } 548 } 549 dependsOnTaskMinReplicas := job.Spec.Tasks[dependsOnTaskIndex].MinAvailable 550 if dependsOnTaskMinReplicas != nil { 551 if runningPodCount < int(*dependsOnTaskMinReplicas) { 552 klog.V(5).Infof("In a depends on startup state, there are already %d pods running, which is less than the minimum number of runs", runningPodCount) 553 return false 554 } 555 } 556 return true 557 } 558 559 func (cc *jobcontroller) createJobIOIfNotExist(job *batch.Job) (*batch.Job, error) { 560 // If PVC does not exist, create them for Job. 561 var needUpdate bool 562 if job.Status.ControlledResources == nil { 563 job.Status.ControlledResources = make(map[string]string) 564 } 565 for index, volume := range job.Spec.Volumes { 566 vcName := volume.VolumeClaimName 567 if len(vcName) == 0 { 568 // NOTE(k82cn): Ensure never have duplicated generated names. 569 for { 570 vcName = jobhelpers.GenPVCName(job.Name) 571 exist, err := cc.checkPVCExist(job, vcName) 572 if err != nil { 573 return job, err 574 } 575 if exist { 576 continue 577 } 578 job.Spec.Volumes[index].VolumeClaimName = vcName 579 needUpdate = true 580 break 581 } 582 // TODO: check VolumeClaim must be set if VolumeClaimName is empty 583 if volume.VolumeClaim != nil { 584 if err := cc.createPVC(job, vcName, volume.VolumeClaim); err != nil { 585 return job, err 586 } 587 } 588 } else { 589 exist, err := cc.checkPVCExist(job, vcName) 590 if err != nil { 591 return job, err 592 } 593 if !exist { 594 return job, fmt.Errorf("pvc %s is not found, the job will be in the Pending state until the PVC is created", vcName) 595 } 596 } 597 job.Status.ControlledResources["volume-pvc-"+vcName] = vcName 598 } 599 if needUpdate { 600 newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{}) 601 if err != nil { 602 klog.Errorf("Failed to update Job %v/%v for volume claim name: %v ", 603 job.Namespace, job.Name, err) 604 return job, err 605 } 606 607 newJob.Status = job.Status 608 return newJob, err 609 } 610 return job, nil 611 } 612 613 func (cc *jobcontroller) checkPVCExist(job *batch.Job, pvc string) (bool, error) { 614 if _, err := cc.pvcLister.PersistentVolumeClaims(job.Namespace).Get(pvc); err != nil { 615 if apierrors.IsNotFound(err) { 616 return false, nil 617 } 618 klog.V(3).Infof("Failed to get PVC %s for job <%s/%s>: %v", 619 pvc, job.Namespace, job.Name, err) 620 return false, err 621 } 622 return true, nil 623 } 624 625 func (cc *jobcontroller) createPVC(job *batch.Job, vcName string, volumeClaim *v1.PersistentVolumeClaimSpec) error { 626 pvc := &v1.PersistentVolumeClaim{ 627 ObjectMeta: metav1.ObjectMeta{ 628 Namespace: job.Namespace, 629 Name: vcName, 630 OwnerReferences: []metav1.OwnerReference{ 631 *metav1.NewControllerRef(job, helpers.JobKind), 632 }, 633 }, 634 Spec: *volumeClaim, 635 } 636 637 klog.V(3).Infof("Try to create PVC: %v", pvc) 638 639 if _, e := cc.kubeClient.CoreV1().PersistentVolumeClaims(job.Namespace).Create(context.TODO(), pvc, metav1.CreateOptions{}); e != nil { 640 klog.V(3).Infof("Failed to create PVC for Job <%s/%s>: %v", 641 job.Namespace, job.Name, e) 642 return e 643 } 644 return nil 645 } 646 647 func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error { 648 // If PodGroup does not exist, create one for Job. 649 pgName := job.Name + "-" + string(job.UID) 650 var pg *scheduling.PodGroup 651 var err error 652 pg, err = cc.pgLister.PodGroups(job.Namespace).Get(pgName) 653 if err != nil { 654 if !apierrors.IsNotFound(err) { 655 klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v", 656 job.Namespace, job.Name, err) 657 return err 658 } 659 // try to get old pg if new pg not exist 660 pg, err = cc.pgLister.PodGroups(job.Namespace).Get(job.Name) 661 if err != nil { 662 if !apierrors.IsNotFound(err) { 663 klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v", 664 job.Namespace, job.Name, err) 665 return err 666 } 667 668 minTaskMember := map[string]int32{} 669 for _, task := range job.Spec.Tasks { 670 if task.MinAvailable != nil { 671 minTaskMember[task.Name] = *task.MinAvailable 672 } else { 673 minTaskMember[task.Name] = task.Replicas 674 } 675 } 676 677 pg := &scheduling.PodGroup{ 678 ObjectMeta: metav1.ObjectMeta{ 679 Namespace: job.Namespace, 680 // add job.UID into its name when create new PodGroup 681 Name: pgName, 682 Annotations: job.Annotations, 683 Labels: job.Labels, 684 OwnerReferences: []metav1.OwnerReference{ 685 *metav1.NewControllerRef(job, helpers.JobKind), 686 }, 687 }, 688 Spec: scheduling.PodGroupSpec{ 689 MinMember: job.Spec.MinAvailable, 690 MinTaskMember: minTaskMember, 691 Queue: job.Spec.Queue, 692 MinResources: cc.calcPGMinResources(job), 693 PriorityClassName: job.Spec.PriorityClassName, 694 }, 695 } 696 697 if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Create(context.TODO(), pg, metav1.CreateOptions{}); err != nil { 698 if !apierrors.IsAlreadyExists(err) { 699 klog.Errorf("Failed to create PodGroup for Job <%s/%s>: %v", 700 job.Namespace, job.Name, err) 701 return err 702 } 703 } 704 return nil 705 } 706 } 707 708 pgShouldUpdate := false 709 if pg.Spec.PriorityClassName != job.Spec.PriorityClassName { 710 pg.Spec.PriorityClassName = job.Spec.PriorityClassName 711 pgShouldUpdate = true 712 } 713 714 minResources := cc.calcPGMinResources(job) 715 if pg.Spec.MinMember != job.Spec.MinAvailable || !reflect.DeepEqual(pg.Spec.MinResources, minResources) { 716 pg.Spec.MinMember = job.Spec.MinAvailable 717 pg.Spec.MinResources = minResources 718 pgShouldUpdate = true 719 } 720 721 if pg.Spec.MinTaskMember == nil { 722 pgShouldUpdate = true 723 pg.Spec.MinTaskMember = make(map[string]int32) 724 } 725 726 for _, task := range job.Spec.Tasks { 727 cnt := task.Replicas 728 if task.MinAvailable != nil { 729 cnt = *task.MinAvailable 730 } 731 732 if taskMember, ok := pg.Spec.MinTaskMember[task.Name]; !ok { 733 pgShouldUpdate = true 734 pg.Spec.MinTaskMember[task.Name] = cnt 735 } else { 736 if taskMember == cnt { 737 continue 738 } 739 740 pgShouldUpdate = true 741 pg.Spec.MinTaskMember[task.Name] = cnt 742 } 743 } 744 745 if !pgShouldUpdate { 746 return nil 747 } 748 749 _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Update(context.TODO(), pg, metav1.UpdateOptions{}) 750 if err != nil { 751 klog.V(3).Infof("Failed to update PodGroup for Job <%s/%s>: %v", 752 job.Namespace, job.Name, err) 753 } 754 return err 755 } 756 757 func (cc *jobcontroller) deleteJobPod(jobName string, pod *v1.Pod) error { 758 err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{}) 759 if err != nil && !apierrors.IsNotFound(err) { 760 klog.Errorf("Failed to delete pod %s/%s for Job %s, err %#v", 761 pod.Namespace, pod.Name, jobName, err) 762 763 return fmt.Errorf("failed to delete pod %s, err %#v", pod.Name, err) 764 } 765 766 return nil 767 } 768 769 func (cc *jobcontroller) calcPGMinResources(job *batch.Job) *v1.ResourceList { 770 // sort task by priorityClasses 771 var tasksPriority TasksPriority 772 for _, task := range job.Spec.Tasks { 773 tp := TaskPriority{0, task} 774 pc := task.Template.Spec.PriorityClassName 775 776 if pc != "" { 777 priorityClass, err := cc.pcLister.Get(pc) 778 if err != nil || priorityClass == nil { 779 klog.Warningf("Ignore task %s priority class %s: %v", task.Name, pc, err) 780 } else { 781 tp.priority = priorityClass.Value 782 } 783 } 784 tasksPriority = append(tasksPriority, tp) 785 } 786 787 sort.Sort(tasksPriority) 788 789 minReq := v1.ResourceList{} 790 podCnt := int32(0) 791 for _, task := range tasksPriority { 792 for i := int32(0); i < task.Replicas; i++ { 793 if podCnt >= job.Spec.MinAvailable { 794 break 795 } 796 797 podCnt++ 798 pod := &v1.Pod{ 799 Spec: task.Template.Spec, 800 } 801 minReq = quotav1.Add(minReq, *util.GetPodQuotaUsage(pod)) 802 } 803 } 804 805 return &minReq 806 } 807 808 func (cc *jobcontroller) initJobStatus(job *batch.Job) (*batch.Job, error) { 809 if job.Status.State.Phase != "" { 810 return job, nil 811 } 812 813 job.Status.State.Phase = batch.Pending 814 job.Status.State.LastTransitionTime = metav1.Now() 815 job.Status.MinAvailable = job.Spec.MinAvailable 816 jobCondition := newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime) 817 job.Status.Conditions = append(job.Status.Conditions, jobCondition) 818 newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) 819 if err != nil { 820 klog.Errorf("Failed to update status of Job %v/%v: %v", 821 job.Namespace, job.Name, err) 822 return nil, err 823 } 824 if err := cc.cache.Update(newJob); err != nil { 825 klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v", 826 newJob.Namespace, newJob.Name, err) 827 return nil, err 828 } 829 830 return newJob, nil 831 } 832 833 func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) { 834 switch pod.Status.Phase { 835 case v1.PodPending: 836 atomic.AddInt32(pending, 1) 837 case v1.PodRunning: 838 atomic.AddInt32(running, 1) 839 case v1.PodSucceeded: 840 atomic.AddInt32(succeeded, 1) 841 case v1.PodFailed: 842 atomic.AddInt32(failed, 1) 843 default: 844 atomic.AddInt32(unknown, 1) 845 } 846 } 847 848 func calcPodStatus(pod *v1.Pod, taskStatusCount map[string]batch.TaskState) { 849 taskName, found := pod.Annotations[batch.TaskSpecKey] 850 if !found { 851 return 852 } 853 854 calMutex.Lock() 855 defer calMutex.Unlock() 856 if _, ok := taskStatusCount[taskName]; !ok { 857 taskStatusCount[taskName] = batch.TaskState{ 858 Phase: make(map[v1.PodPhase]int32), 859 } 860 } 861 862 switch pod.Status.Phase { 863 case v1.PodPending: 864 taskStatusCount[taskName].Phase[v1.PodPending]++ 865 case v1.PodRunning: 866 taskStatusCount[taskName].Phase[v1.PodRunning]++ 867 case v1.PodSucceeded: 868 taskStatusCount[taskName].Phase[v1.PodSucceeded]++ 869 case v1.PodFailed: 870 taskStatusCount[taskName].Phase[v1.PodFailed]++ 871 default: 872 taskStatusCount[taskName].Phase[v1.PodUnknown]++ 873 } 874 } 875 876 func isInitiated(job *batch.Job) bool { 877 if job.Status.State.Phase == "" || job.Status.State.Phase == batch.Pending { 878 return false 879 } 880 881 return true 882 } 883 884 func newCondition(status batch.JobPhase, lastTransitionTime *metav1.Time) batch.JobCondition { 885 return batch.JobCondition{ 886 Status: status, 887 LastTransitionTime: lastTransitionTime, 888 } 889 }