k8s.io/kubernetes@v1.29.3/pkg/controller/job/job_controller.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package job 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sort" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 batch "k8s.io/api/batch/v1" 29 v1 "k8s.io/api/core/v1" 30 apierrors "k8s.io/apimachinery/pkg/api/errors" 31 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 "k8s.io/apimachinery/pkg/labels" 33 "k8s.io/apimachinery/pkg/types" 34 "k8s.io/apimachinery/pkg/util/json" 35 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 36 "k8s.io/apimachinery/pkg/util/sets" 37 "k8s.io/apimachinery/pkg/util/wait" 38 "k8s.io/apiserver/pkg/util/feature" 39 batchinformers "k8s.io/client-go/informers/batch/v1" 40 coreinformers "k8s.io/client-go/informers/core/v1" 41 clientset "k8s.io/client-go/kubernetes" 42 "k8s.io/client-go/kubernetes/scheme" 43 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 44 batchv1listers "k8s.io/client-go/listers/batch/v1" 45 corelisters "k8s.io/client-go/listers/core/v1" 46 "k8s.io/client-go/tools/cache" 47 "k8s.io/client-go/tools/record" 48 "k8s.io/client-go/util/workqueue" 49 "k8s.io/klog/v2" 50 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 51 "k8s.io/kubernetes/pkg/controller" 52 "k8s.io/kubernetes/pkg/controller/job/metrics" 53 "k8s.io/kubernetes/pkg/features" 54 "k8s.io/utils/clock" 55 "k8s.io/utils/integer" 56 "k8s.io/utils/ptr" 57 ) 58 59 // controllerKind contains the schema.GroupVersionKind for this controller type. 60 var controllerKind = batch.SchemeGroupVersion.WithKind("Job") 61 62 var ( 63 // syncJobBatchPeriod is the batch period for controller sync invocations for a Job. 64 syncJobBatchPeriod = time.Second 65 // DefaultJobApiBackOff is the default API backoff period. Exported for tests. 66 DefaultJobApiBackOff = time.Second 67 // MaxJobApiBackOff is the max API backoff period. Exported for tests. 68 MaxJobApiBackOff = time.Minute 69 // DefaultJobPodFailureBackOff is the default pod failure backoff period. Exported for tests. 70 DefaultJobPodFailureBackOff = 10 * time.Second 71 // MaxJobPodFailureBackOff is the max pod failure backoff period. Exported for tests. 72 MaxJobPodFailureBackOff = 10 * time.Minute 73 // MaxUncountedPods is the maximum size the slices in 74 // .status.uncountedTerminatedPods should have to keep their representation 75 // roughly below 20 KB. Exported for tests 76 MaxUncountedPods = 500 77 // MaxPodCreateDeletePerSync is the maximum number of pods that can be 78 // created or deleted in a single sync call. Exported for tests. 79 MaxPodCreateDeletePerSync = 500 80 ) 81 82 // Controller ensures that all Job objects have corresponding pods to 83 // run their configured workload. 84 type Controller struct { 85 kubeClient clientset.Interface 86 podControl controller.PodControlInterface 87 88 // To allow injection of the following for testing. 
89 	updateStatusHandler func(ctx context.Context, job *batch.Job) (*batch.Job, error)
90 	patchJobHandler func(ctx context.Context, job *batch.Job, patch []byte) error
91 	syncHandler func(ctx context.Context, jobKey string) error
92 	// podStoreSynced returns true if the pod store has been synced at least once.
93 	// Added as a member to the struct to allow injection for testing.
94 	podStoreSynced cache.InformerSynced
95 	// jobStoreSynced returns true if the job store has been synced at least once.
96 	// Added as a member to the struct to allow injection for testing.
97 	jobStoreSynced cache.InformerSynced
98 
99 	// A TTLCache of pod creates/deletes each Job expects to see.
100 	expectations controller.ControllerExpectationsInterface
101 
102 	// finalizerExpectations tracks the Pod UIDs for which the controller
103 	// expects to observe the tracking finalizer removed.
104 	finalizerExpectations *uidTrackingExpectations
105 
106 	// A store of jobs
107 	jobLister batchv1listers.JobLister
108 
109 	// A store of pods, populated by the podController
110 	podStore corelisters.PodLister
111 
112 	// Jobs that need to be updated
113 	queue workqueue.RateLimitingInterface
114 
115 	// Orphan deleted pods that still have a Job tracking finalizer to be removed
116 	orphanQueue workqueue.RateLimitingInterface
117 
118 	broadcaster record.EventBroadcaster
119 	recorder record.EventRecorder
120 
121 	clock clock.WithTicker
122 
123 	// Store with information to compute the exponential backoff delay for pod
124 	// recreation in case of pod failures.
125 	podBackoffStore *backoffStore
126 }
127 
128 type syncJobCtx struct {
129 	job *batch.Job
130 	pods []*v1.Pod
131 	finishedCondition *batch.JobCondition
132 	activePods []*v1.Pod
133 	succeeded int32
134 	prevSucceededIndexes orderedIntervals
135 	succeededIndexes orderedIntervals
136 	failedIndexes *orderedIntervals
137 	newBackoffRecord backoffRecord
138 	expectedRmFinalizers sets.Set[string]
139 	uncounted *uncountedTerminatedPods
140 	podsWithDelayedDeletionPerIndex map[int]*v1.Pod
141 	terminating *int32
142 }
143 
144 // NewController creates a new Job controller that keeps the relevant pods
145 // in sync with their corresponding Job objects.
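// A minimal usage sketch (illustrative, not part of this file), assuming a
// configured clientset and a shared informer factory; the names clientSet,
// factory and ctx are placeholders supplied by the caller:
//
//	import (
//		"k8s.io/client-go/informers"
//		"k8s.io/kubernetes/pkg/controller/job"
//	)
//
//	factory := informers.NewSharedInformerFactory(clientSet, 0)
//	jobController, err := job.NewController(ctx, factory.Core().V1().Pods(), factory.Batch().V1().Jobs(), clientSet)
//	if err != nil {
//		// handle the error
//	}
//	factory.Start(ctx.Done())
//	go jobController.Run(ctx, 2)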
146 func NewController(ctx context.Context, podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface) (*Controller, error) { 147 return newControllerWithClock(ctx, podInformer, jobInformer, kubeClient, &clock.RealClock{}) 148 } 149 150 func newControllerWithClock(ctx context.Context, podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface, clock clock.WithTicker) (*Controller, error) { 151 eventBroadcaster := record.NewBroadcaster() 152 logger := klog.FromContext(ctx) 153 154 jm := &Controller{ 155 kubeClient: kubeClient, 156 podControl: controller.RealPodControl{ 157 KubeClient: kubeClient, 158 Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}), 159 }, 160 expectations: controller.NewControllerExpectations(), 161 finalizerExpectations: newUIDTrackingExpectations(), 162 queue: workqueue.NewRateLimitingQueueWithConfig(workqueue.NewItemExponentialFailureRateLimiter(DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.RateLimitingQueueConfig{Name: "job", Clock: clock}), 163 orphanQueue: workqueue.NewRateLimitingQueueWithConfig(workqueue.NewItemExponentialFailureRateLimiter(DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.RateLimitingQueueConfig{Name: "job_orphan_pod", Clock: clock}), 164 broadcaster: eventBroadcaster, 165 recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}), 166 clock: clock, 167 podBackoffStore: newBackoffStore(), 168 } 169 170 if _, err := jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 171 AddFunc: func(obj interface{}) { 172 jm.enqueueSyncJobImmediately(logger, obj) 173 }, 174 UpdateFunc: func(oldObj, newObj interface{}) { 175 jm.updateJob(logger, oldObj, newObj) 176 }, 177 DeleteFunc: func(obj interface{}) { 178 jm.deleteJob(logger, obj) 179 }, 180 }); err != nil { 181 return nil, fmt.Errorf("adding Job event handler: %w", err) 182 } 183 jm.jobLister = jobInformer.Lister() 184 jm.jobStoreSynced = jobInformer.Informer().HasSynced 185 186 if _, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 187 AddFunc: func(obj interface{}) { 188 jm.addPod(logger, obj) 189 }, 190 UpdateFunc: func(oldObj, newObj interface{}) { 191 jm.updatePod(logger, oldObj, newObj) 192 }, 193 DeleteFunc: func(obj interface{}) { 194 jm.deletePod(logger, obj, true) 195 }, 196 }); err != nil { 197 return nil, fmt.Errorf("adding Pod event handler: %w", err) 198 } 199 jm.podStore = podInformer.Lister() 200 jm.podStoreSynced = podInformer.Informer().HasSynced 201 202 jm.updateStatusHandler = jm.updateJobStatus 203 jm.patchJobHandler = jm.patchJob 204 jm.syncHandler = jm.syncJob 205 206 metrics.Register() 207 208 return jm, nil 209 } 210 211 // Run the main goroutine responsible for watching and syncing jobs. 212 func (jm *Controller) Run(ctx context.Context, workers int) { 213 defer utilruntime.HandleCrash() 214 logger := klog.FromContext(ctx) 215 216 // Start events processing pipeline. 
217 jm.broadcaster.StartStructuredLogging(0) 218 jm.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: jm.kubeClient.CoreV1().Events("")}) 219 defer jm.broadcaster.Shutdown() 220 221 defer jm.queue.ShutDown() 222 defer jm.orphanQueue.ShutDown() 223 224 logger.Info("Starting job controller") 225 defer logger.Info("Shutting down job controller") 226 227 if !cache.WaitForNamedCacheSync("job", ctx.Done(), jm.podStoreSynced, jm.jobStoreSynced) { 228 return 229 } 230 231 for i := 0; i < workers; i++ { 232 go wait.UntilWithContext(ctx, jm.worker, time.Second) 233 } 234 235 go wait.UntilWithContext(ctx, jm.orphanWorker, time.Second) 236 237 <-ctx.Done() 238 } 239 240 // getPodJobs returns a list of Jobs that potentially match a Pod. 241 func (jm *Controller) getPodJobs(pod *v1.Pod) []*batch.Job { 242 jobs, err := jm.jobLister.GetPodJobs(pod) 243 if err != nil { 244 return nil 245 } 246 if len(jobs) > 1 { 247 // ControllerRef will ensure we don't do anything crazy, but more than one 248 // item in this list nevertheless constitutes user error. 249 utilruntime.HandleError(fmt.Errorf("user error! more than one job is selecting pods with labels: %+v", pod.Labels)) 250 } 251 ret := make([]*batch.Job, 0, len(jobs)) 252 for i := range jobs { 253 ret = append(ret, &jobs[i]) 254 } 255 return ret 256 } 257 258 // resolveControllerRef returns the controller referenced by a ControllerRef, 259 // or nil if the ControllerRef could not be resolved to a matching controller 260 // of the correct Kind. 261 func (jm *Controller) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *batch.Job { 262 // We can't look up by UID, so look up by Name and then verify UID. 263 // Don't even try to look up by Name if it's the wrong Kind. 264 if controllerRef.Kind != controllerKind.Kind { 265 return nil 266 } 267 job, err := jm.jobLister.Jobs(namespace).Get(controllerRef.Name) 268 if err != nil { 269 return nil 270 } 271 if job.UID != controllerRef.UID { 272 // The controller we found with this Name is not the same one that the 273 // ControllerRef points to. 274 return nil 275 } 276 return job 277 } 278 279 // When a pod is created, enqueue the controller that manages it and update its expectations. 280 func (jm *Controller) addPod(logger klog.Logger, obj interface{}) { 281 pod := obj.(*v1.Pod) 282 recordFinishedPodWithTrackingFinalizer(nil, pod) 283 if pod.DeletionTimestamp != nil { 284 // on a restart of the controller, it's possible a new pod shows up in a state that 285 // is already pending deletion. Prevent the pod from being a creation observation. 286 jm.deletePod(logger, pod, false) 287 return 288 } 289 290 // If it has a ControllerRef, that's all that matters. 291 if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { 292 job := jm.resolveControllerRef(pod.Namespace, controllerRef) 293 if job == nil { 294 return 295 } 296 jobKey, err := controller.KeyFunc(job) 297 if err != nil { 298 return 299 } 300 jm.expectations.CreationObserved(logger, jobKey) 301 jm.enqueueSyncJobBatched(logger, job) 302 return 303 } 304 305 // Otherwise, it's an orphan. 306 // Clean the finalizer. 307 if hasJobTrackingFinalizer(pod) { 308 jm.enqueueOrphanPod(pod) 309 } 310 // Get a list of all matching controllers and sync 311 // them to see if anyone wants to adopt it. 312 // DO NOT observe creation because no controller should be waiting for an 313 // orphan. 
314 	for _, job := range jm.getPodJobs(pod) {
315 		jm.enqueueSyncJobBatched(logger, job)
316 	}
317 }
318 
319 // When a pod is updated, figure out what job(s) manage it and wake them up.
320 // If the labels of the pod have changed we need to awaken both the old
321 // and new job. old and cur must be *v1.Pod types.
322 func (jm *Controller) updatePod(logger klog.Logger, old, cur interface{}) {
323 	curPod := cur.(*v1.Pod)
324 	oldPod := old.(*v1.Pod)
325 	recordFinishedPodWithTrackingFinalizer(oldPod, curPod)
326 	if curPod.ResourceVersion == oldPod.ResourceVersion {
327 		// Periodic resync will send update events for all known pods.
328 		// Two different versions of the same pod will always have different RVs.
329 		return
330 	}
331 	if curPod.DeletionTimestamp != nil {
332 		// When a pod is deleted gracefully, its deletion timestamp is first modified to reflect a grace period,
333 		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
334 		// for modification of the deletion timestamp and expect a job to create more pods asap, not wait
335 		// until the kubelet actually deletes the pod.
336 		jm.deletePod(logger, curPod, false)
337 		return
338 	}
339 
340 	// Don't check if oldPod has the finalizer, as during ownership transfer
341 	// finalizers might be re-added and removed again on behalf of the new owner.
342 	// If all those Pod updates collapse into a single event, the finalizer
343 	// might be removed in oldPod and curPod. We want to record the latest
344 	// state.
345 	finalizerRemoved := !hasJobTrackingFinalizer(curPod)
346 	curControllerRef := metav1.GetControllerOf(curPod)
347 	oldControllerRef := metav1.GetControllerOf(oldPod)
348 	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
349 	if controllerRefChanged && oldControllerRef != nil {
350 		// The ControllerRef was changed. Sync the old controller, if any.
351 		if job := jm.resolveControllerRef(oldPod.Namespace, oldControllerRef); job != nil {
352 			if finalizerRemoved {
353 				key, err := controller.KeyFunc(job)
354 				if err == nil {
355 					jm.finalizerExpectations.finalizerRemovalObserved(logger, key, string(curPod.UID))
356 				}
357 			}
358 			jm.enqueueSyncJobBatched(logger, job)
359 		}
360 	}
361 
362 	// If it has a ControllerRef, that's all that matters.
363 	if curControllerRef != nil {
364 		job := jm.resolveControllerRef(curPod.Namespace, curControllerRef)
365 		if job == nil {
366 			return
367 		}
368 		if finalizerRemoved {
369 			key, err := controller.KeyFunc(job)
370 			if err == nil {
371 				jm.finalizerExpectations.finalizerRemovalObserved(logger, key, string(curPod.UID))
372 			}
373 		}
374 		jm.enqueueSyncJobBatched(logger, job)
375 		return
376 	}
377 
378 	// Otherwise, it's an orphan.
379 	// Clean the finalizer.
380 	if hasJobTrackingFinalizer(curPod) {
381 		jm.enqueueOrphanPod(curPod)
382 	}
383 	// If anything changed, sync matching controllers
384 	// to see if anyone wants to adopt it now.
385 	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
386 	if labelChanged || controllerRefChanged {
387 		for _, job := range jm.getPodJobs(curPod) {
388 			jm.enqueueSyncJobBatched(logger, job)
389 		}
390 	}
391 }
392 
393 // When a pod is deleted, enqueue the job that manages the pod and update its expectations.
394 // obj could be a *v1.Pod, or a DeleteFinalStateUnknown marker item.
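// deletePod is called with final=true from the informer's DeleteFunc and with
// final=false from addPod/updatePod when they observe a pod that is already
// pending deletion; only the final call invokes
// recordFinishedPodWithTrackingFinalizer, and the finalizer-removal
// expectation is cleared once the delete is final or the pod no longer carries
// the tracking finalizer.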
395 func (jm *Controller) deletePod(logger klog.Logger, obj interface{}, final bool) {
396 	pod, ok := obj.(*v1.Pod)
397 	if final {
398 		recordFinishedPodWithTrackingFinalizer(pod, nil)
399 	}
400 
401 	// When a delete is dropped, the relist will notice a pod in the store not
402 	// in the list, leading to the insertion of a tombstone object which contains
403 	// the deleted key/value. Note that this value might be stale. If the pod
404 	// changed labels the new job will not be woken up till the periodic resync.
405 	if !ok {
406 		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
407 		if !ok {
408 			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
409 			return
410 		}
411 		pod, ok = tombstone.Obj.(*v1.Pod)
412 		if !ok {
413 			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %+v", obj))
414 			return
415 		}
416 	}
417 
418 	controllerRef := metav1.GetControllerOf(pod)
419 	hasFinalizer := hasJobTrackingFinalizer(pod)
420 	if controllerRef == nil {
421 		// No controller should care about orphans being deleted.
422 		// But this pod might have belonged to a Job and the GC removed the reference.
423 		if hasFinalizer {
424 			jm.enqueueOrphanPod(pod)
425 		}
426 		return
427 	}
428 	job := jm.resolveControllerRef(pod.Namespace, controllerRef)
429 	if job == nil || IsJobFinished(job) {
430 		// syncJob will not remove this finalizer.
431 		if hasFinalizer {
432 			jm.enqueueOrphanPod(pod)
433 		}
434 		return
435 	}
436 	jobKey, err := controller.KeyFunc(job)
437 	if err != nil {
438 		return
439 	}
440 	jm.expectations.DeletionObserved(logger, jobKey)
441 
442 	// Consider the finalizer removed if this is the final delete. Otherwise,
443 	// this is an update of the deletion timestamp, so check whether the finalizer is already gone.
444 	if final || !hasFinalizer {
445 		jm.finalizerExpectations.finalizerRemovalObserved(logger, jobKey, string(pod.UID))
446 	}
447 
448 	jm.enqueueSyncJobBatched(logger, job)
449 }
450 
451 func (jm *Controller) updateJob(logger klog.Logger, old, cur interface{}) {
452 	oldJob := old.(*batch.Job)
453 	curJob := cur.(*batch.Job)
454 
455 	// KeyFunc never returns an error for a valid Job object.
456 	key, err := controller.KeyFunc(curJob)
457 	if err != nil {
458 		return
459 	}
460 
461 	if curJob.Generation == oldJob.Generation {
462 		// Delay the Job sync when there is no generation change, to batch Job status updates,
463 		// typically triggered by pod events.
464 		jm.enqueueSyncJobBatched(logger, curJob)
465 	} else {
466 		// Trigger immediate sync when spec is changed.
467 		jm.enqueueSyncJobImmediately(logger, curJob)
468 	}
469 
470 	// The job shouldn't be marked as finished until all pod finalizers are removed.
471 	// This is a backup operation for that case.
472 	if IsJobFinished(curJob) {
473 		jm.cleanupPodFinalizers(curJob)
474 	}
475 
476 	// Check if we need to add a new resync for ActiveDeadlineSeconds.
477 	if curJob.Status.StartTime != nil {
478 		curADS := curJob.Spec.ActiveDeadlineSeconds
479 		if curADS == nil {
480 			return
481 		}
482 		oldADS := oldJob.Spec.ActiveDeadlineSeconds
483 		if oldADS == nil || *oldADS != *curADS {
484 			passed := jm.clock.Since(curJob.Status.StartTime.Time)
485 			total := time.Duration(*curADS) * time.Second
486 			// AddAfter will handle total < passed
487 			jm.queue.AddAfter(key, total-passed)
488 			logger.V(4).Info("job's ActiveDeadlineSeconds updated, will resync", "key", key, "interval", total-passed)
489 		}
490 	}
491 }
492 
493 // deleteJob enqueues the job and all the pods associated with it that still
494 // have a finalizer.
495 func (jm *Controller) deleteJob(logger klog.Logger, obj interface{}) { 496 jm.enqueueSyncJobImmediately(logger, obj) 497 jobObj, ok := obj.(*batch.Job) 498 if !ok { 499 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 500 if !ok { 501 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj)) 502 return 503 } 504 jobObj, ok = tombstone.Obj.(*batch.Job) 505 if !ok { 506 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a job %+v", obj)) 507 return 508 } 509 } 510 jm.cleanupPodFinalizers(jobObj) 511 } 512 513 // enqueueSyncJobImmediately tells the Job controller to invoke syncJob 514 // immediately. 515 // It is only used for Job events (creation, deletion, spec update). 516 // obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item. 517 func (jm *Controller) enqueueSyncJobImmediately(logger klog.Logger, obj interface{}) { 518 jm.enqueueSyncJobInternal(logger, obj, 0) 519 } 520 521 // enqueueSyncJobBatched tells the controller to invoke syncJob with a 522 // constant batching delay. 523 // It is used for: 524 // - Pod events (creation, deletion, update) 525 // - Job status update 526 // obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item. 527 func (jm *Controller) enqueueSyncJobBatched(logger klog.Logger, obj interface{}) { 528 jm.enqueueSyncJobInternal(logger, obj, syncJobBatchPeriod) 529 } 530 531 // enqueueSyncJobWithDelay tells the controller to invoke syncJob with a 532 // custom delay, but not smaller than the batching delay. 533 // It is used when pod recreations are delayed due to pod failures. 534 // obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item. 535 func (jm *Controller) enqueueSyncJobWithDelay(logger klog.Logger, obj interface{}, delay time.Duration) { 536 if delay < syncJobBatchPeriod { 537 delay = syncJobBatchPeriod 538 } 539 jm.enqueueSyncJobInternal(logger, obj, delay) 540 } 541 542 func (jm *Controller) enqueueSyncJobInternal(logger klog.Logger, obj interface{}, delay time.Duration) { 543 key, err := controller.KeyFunc(obj) 544 if err != nil { 545 utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) 546 return 547 } 548 // TODO: Handle overlapping controllers better. Either disallow them at admission time or 549 // deterministically avoid syncing controllers that fight over pods. Currently, we only 550 // ensure that the same controller is synced for a given pod. When we periodically relist 551 // all controllers there will still be some replica instability. One way to handle this is 552 // by querying the store for all controllers that this rc overlaps, as well as all 553 // controllers that overlap this rc, and sorting them. 554 logger.Info("enqueueing job", "key", key) 555 jm.queue.AddAfter(key, delay) 556 } 557 558 func (jm *Controller) enqueueOrphanPod(obj *v1.Pod) { 559 key, err := controller.KeyFunc(obj) 560 if err != nil { 561 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err)) 562 return 563 } 564 jm.orphanQueue.Add(key) 565 } 566 567 // worker runs a worker thread that just dequeues items, processes them, and marks them done. 568 // It enforces that the syncHandler is never invoked concurrently with the same key. 
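// A failed sync is retried through the rate-limited queue, whose per-item
// exponential backoff grows from DefaultJobApiBackOff up to MaxJobApiBackOff;
// a successful sync calls Forget so that the backoff for that key is reset.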
569 func (jm *Controller) worker(ctx context.Context) { 570 for jm.processNextWorkItem(ctx) { 571 } 572 } 573 574 func (jm *Controller) processNextWorkItem(ctx context.Context) bool { 575 key, quit := jm.queue.Get() 576 if quit { 577 return false 578 } 579 defer jm.queue.Done(key) 580 581 err := jm.syncHandler(ctx, key.(string)) 582 if err == nil { 583 jm.queue.Forget(key) 584 return true 585 } 586 587 utilruntime.HandleError(fmt.Errorf("syncing job: %w", err)) 588 jm.queue.AddRateLimited(key) 589 590 return true 591 } 592 593 func (jm *Controller) orphanWorker(ctx context.Context) { 594 for jm.processNextOrphanPod(ctx) { 595 } 596 } 597 598 func (jm *Controller) processNextOrphanPod(ctx context.Context) bool { 599 key, quit := jm.orphanQueue.Get() 600 if quit { 601 return false 602 } 603 defer jm.orphanQueue.Done(key) 604 err := jm.syncOrphanPod(ctx, key.(string)) 605 if err != nil { 606 utilruntime.HandleError(fmt.Errorf("Error syncing orphan pod: %v", err)) 607 jm.orphanQueue.AddRateLimited(key) 608 } else { 609 jm.orphanQueue.Forget(key) 610 } 611 612 return true 613 } 614 615 // syncOrphanPod removes the tracking finalizer from an orphan pod if found. 616 func (jm *Controller) syncOrphanPod(ctx context.Context, key string) error { 617 startTime := jm.clock.Now() 618 logger := klog.FromContext(ctx) 619 defer func() { 620 logger.V(4).Info("Finished syncing orphan pod", "pod", key, "elapsed", jm.clock.Since(startTime)) 621 }() 622 623 ns, name, err := cache.SplitMetaNamespaceKey(key) 624 if err != nil { 625 return err 626 } 627 628 sharedPod, err := jm.podStore.Pods(ns).Get(name) 629 if err != nil { 630 if apierrors.IsNotFound(err) { 631 logger.V(4).Info("Orphan pod has been deleted", "pod", key) 632 return nil 633 } 634 return err 635 } 636 // Make sure the pod is still orphaned. 637 if controllerRef := metav1.GetControllerOf(sharedPod); controllerRef != nil { 638 job := jm.resolveControllerRef(sharedPod.Namespace, controllerRef) 639 if job != nil && !IsJobFinished(job) { 640 // The pod was adopted. Do not remove finalizer. 641 return nil 642 } 643 } 644 if patch := removeTrackingFinalizerPatch(sharedPod); patch != nil { 645 if err := jm.podControl.PatchPod(ctx, ns, name, patch); err != nil && !apierrors.IsNotFound(err) { 646 return err 647 } 648 } 649 return nil 650 } 651 652 // getPodsForJob returns the set of pods that this Job should manage. 653 // It also reconciles ControllerRef by adopting/orphaning, adding tracking 654 // finalizers. 655 // Note that the returned Pods are pointers into the cache. 656 func (jm *Controller) getPodsForJob(ctx context.Context, j *batch.Job) ([]*v1.Pod, error) { 657 selector, err := metav1.LabelSelectorAsSelector(j.Spec.Selector) 658 if err != nil { 659 return nil, fmt.Errorf("couldn't convert Job selector: %v", err) 660 } 661 // List all pods to include those that don't match the selector anymore 662 // but have a ControllerRef pointing to this controller. 663 pods, err := jm.podStore.Pods(j.Namespace).List(labels.Everything()) 664 if err != nil { 665 return nil, err 666 } 667 // If any adoptions are attempted, we should first recheck for deletion 668 // with an uncached quorum read sometime after listing Pods (see #42639). 
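	// The canAdoptFunc below performs that recheck: it reads the Job directly
	// from the API server, bypassing the informer cache, and refuses adoption
	// if the returned UID no longer matches the Job this sync started with.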
669 canAdoptFunc := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) { 670 fresh, err := jm.kubeClient.BatchV1().Jobs(j.Namespace).Get(ctx, j.Name, metav1.GetOptions{}) 671 if err != nil { 672 return nil, err 673 } 674 if fresh.UID != j.UID { 675 return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", j.Namespace, j.Name, fresh.UID, j.UID) 676 } 677 return fresh, nil 678 }) 679 cm := controller.NewPodControllerRefManager(jm.podControl, j, selector, controllerKind, canAdoptFunc, batch.JobTrackingFinalizer) 680 // When adopting Pods, this operation adds an ownerRef and finalizers. 681 pods, err = cm.ClaimPods(ctx, pods) 682 if err != nil { 683 return pods, err 684 } 685 // Set finalizer on adopted pods for the remaining calculations. 686 for i, p := range pods { 687 adopted := true 688 for _, r := range p.OwnerReferences { 689 if r.UID == j.UID { 690 adopted = false 691 break 692 } 693 } 694 if adopted && !hasJobTrackingFinalizer(p) { 695 pods[i] = p.DeepCopy() 696 pods[i].Finalizers = append(p.Finalizers, batch.JobTrackingFinalizer) 697 } 698 } 699 return pods, err 700 } 701 702 // syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning 703 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked 704 // concurrently with the same key. 705 func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) { 706 startTime := jm.clock.Now() 707 logger := klog.FromContext(ctx) 708 defer func() { 709 logger.V(4).Info("Finished syncing job", "key", key, "elapsed", jm.clock.Since(startTime)) 710 }() 711 712 ns, name, err := cache.SplitMetaNamespaceKey(key) 713 if err != nil { 714 return err 715 } 716 if len(ns) == 0 || len(name) == 0 { 717 return fmt.Errorf("invalid job key %q: either namespace or name is missing", key) 718 } 719 sharedJob, err := jm.jobLister.Jobs(ns).Get(name) 720 if err != nil { 721 if apierrors.IsNotFound(err) { 722 logger.V(4).Info("Job has been deleted", "key", key) 723 jm.expectations.DeleteExpectations(logger, key) 724 jm.finalizerExpectations.deleteExpectations(logger, key) 725 726 err := jm.podBackoffStore.removeBackoffRecord(key) 727 if err != nil { 728 // re-syncing here as the record has to be removed for finished/deleted jobs 729 return fmt.Errorf("error removing backoff record %w", err) 730 } 731 return nil 732 } 733 return err 734 } 735 // make a copy so we don't mutate the shared cache 736 job := *sharedJob.DeepCopy() 737 738 // if job was finished previously, we don't want to redo the termination 739 if IsJobFinished(&job) { 740 err := jm.podBackoffStore.removeBackoffRecord(key) 741 if err != nil { 742 // re-syncing here as the record has to be removed for finished/deleted jobs 743 return fmt.Errorf("error removing backoff record %w", err) 744 } 745 return nil 746 } 747 748 if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode != batch.NonIndexedCompletion && *job.Spec.CompletionMode != batch.IndexedCompletion { 749 jm.recorder.Event(&job, v1.EventTypeWarning, "UnknownCompletionMode", "Skipped Job sync because completion mode is unknown") 750 return nil 751 } 752 753 completionMode := getCompletionMode(&job) 754 action := metrics.JobSyncActionReconciling 755 756 defer func() { 757 result := "success" 758 if rErr != nil { 759 result = "error" 760 } 761 762 metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result, action).Observe(jm.clock.Since(startTime).Seconds()) 763 
metrics.JobSyncNum.WithLabelValues(completionMode, result, action).Inc() 764 }() 765 766 if job.Status.UncountedTerminatedPods == nil { 767 job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} 768 } 769 770 // Check the expectations of the job before counting active pods, otherwise a new pod can sneak in 771 // and update the expectations after we've retrieved active pods from the store. If a new pod enters 772 // the store after we've checked the expectation, the job sync is just deferred till the next relist. 773 satisfiedExpectations := jm.expectations.SatisfiedExpectations(logger, key) 774 775 pods, err := jm.getPodsForJob(ctx, &job) 776 if err != nil { 777 return err 778 } 779 var terminating *int32 780 if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) { 781 terminating = ptr.To(controller.CountTerminatingPods(pods)) 782 } 783 jobCtx := &syncJobCtx{ 784 job: &job, 785 pods: pods, 786 activePods: controller.FilterActivePods(logger, pods), 787 terminating: terminating, 788 uncounted: newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods), 789 expectedRmFinalizers: jm.finalizerExpectations.getExpectedUIDs(key), 790 } 791 active := int32(len(jobCtx.activePods)) 792 newSucceededPods, newFailedPods := getNewFinishedPods(jobCtx) 793 jobCtx.succeeded = job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(jobCtx.uncounted.succeeded)) 794 failed := job.Status.Failed + int32(nonIgnoredFailedPodsCount(jobCtx, newFailedPods)) + int32(len(jobCtx.uncounted.failed)) 795 var ready *int32 796 if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) { 797 ready = ptr.To(countReadyPods(jobCtx.activePods)) 798 } 799 800 // Job first start. Set StartTime only if the job is not in the suspended state. 801 if job.Status.StartTime == nil && !jobSuspended(&job) { 802 now := metav1.NewTime(jm.clock.Now()) 803 job.Status.StartTime = &now 804 } 805 806 jobCtx.newBackoffRecord = jm.podBackoffStore.newBackoffRecord(key, newSucceededPods, newFailedPods) 807 808 var manageJobErr error 809 810 exceedsBackoffLimit := failed > *job.Spec.BackoffLimit 811 812 if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { 813 if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil { 814 jobCtx.finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition, jm.clock.Now()) 815 } else if failJobMessage := getFailJobMessage(&job, pods); failJobMessage != nil { 816 // Prepare the interim FailureTarget condition to record the failure message before the finalizers (allowing removal of the pods) are removed. 
817 jobCtx.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, batch.JobReasonPodFailurePolicy, *failJobMessage, jm.clock.Now()) 818 } 819 } 820 if jobCtx.finishedCondition == nil { 821 if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) { 822 // check if the number of pod restart exceeds backoff (for restart OnFailure only) 823 // OR if the number of failed jobs increased since the last syncJob 824 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonBackoffLimitExceeded, "Job has reached the specified backoff limit", jm.clock.Now()) 825 } else if jm.pastActiveDeadline(&job) { 826 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded, "Job was active longer than specified deadline", jm.clock.Now()) 827 } else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) { 828 syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - jm.clock.Since(job.Status.StartTime.Time) 829 logger.V(2).Info("Job has activeDeadlineSeconds configuration. Will sync this job again", "key", key, "nextSyncIn", syncDuration) 830 jm.queue.AddAfter(key, syncDuration) 831 } 832 } 833 834 if isIndexedJob(&job) { 835 jobCtx.prevSucceededIndexes, jobCtx.succeededIndexes = calculateSucceededIndexes(logger, &job, pods) 836 jobCtx.succeeded = int32(jobCtx.succeededIndexes.total()) 837 if hasBackoffLimitPerIndex(&job) { 838 jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods) 839 if jobCtx.finishedCondition == nil { 840 if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) { 841 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonMaxFailedIndexesExceeded, "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now()) 842 } else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) { 843 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonFailedIndexes, "Job has failed indexes", jm.clock.Now()) 844 } 845 } 846 jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx) 847 } 848 } 849 suspendCondChanged := false 850 // Remove active pods if Job failed. 851 if jobCtx.finishedCondition != nil { 852 deleted, err := jm.deleteActivePods(ctx, &job, jobCtx.activePods) 853 if deleted != active || !satisfiedExpectations { 854 // Can't declare the Job as finished yet, as there might be remaining 855 // pod finalizers or pods that are not in the informer's cache yet. 856 jobCtx.finishedCondition = nil 857 } 858 active -= deleted 859 manageJobErr = err 860 } else { 861 manageJobCalled := false 862 if satisfiedExpectations && job.DeletionTimestamp == nil { 863 active, action, manageJobErr = jm.manageJob(ctx, &job, jobCtx) 864 manageJobCalled = true 865 } 866 complete := false 867 if job.Spec.Completions == nil { 868 // This type of job is complete when any pod exits with success. 869 // Each pod is capable of 870 // determining whether or not the entire Job is done. Subsequent pods are 871 // not expected to fail, but if they do, the failure is ignored. Once any 872 // pod succeeds, the controller waits for remaining pods to finish, and 873 // then the job is complete. 874 complete = jobCtx.succeeded > 0 && active == 0 875 } else { 876 // Job specifies a number of completions. 
This type of job signals 877 // success by having that number of successes. Since we do not 878 // start more pods than there are remaining completions, there should 879 // not be any remaining active pods once this count is reached. 880 complete = jobCtx.succeeded >= *job.Spec.Completions && active == 0 881 } 882 if complete { 883 jobCtx.finishedCondition = newCondition(batch.JobComplete, v1.ConditionTrue, "", "", jm.clock.Now()) 884 } else if manageJobCalled { 885 // Update the conditions / emit events only if manageJob was called in 886 // this syncJob. Otherwise wait for the right syncJob call to make 887 // updates. 888 if job.Spec.Suspend != nil && *job.Spec.Suspend { 889 // Job can be in the suspended state only if it is NOT completed. 890 var isUpdated bool 891 job.Status.Conditions, isUpdated = ensureJobConditionStatus(job.Status.Conditions, batch.JobSuspended, v1.ConditionTrue, "JobSuspended", "Job suspended", jm.clock.Now()) 892 if isUpdated { 893 suspendCondChanged = true 894 jm.recorder.Event(&job, v1.EventTypeNormal, "Suspended", "Job suspended") 895 } 896 } else { 897 // Job not suspended. 898 var isUpdated bool 899 job.Status.Conditions, isUpdated = ensureJobConditionStatus(job.Status.Conditions, batch.JobSuspended, v1.ConditionFalse, "JobResumed", "Job resumed", jm.clock.Now()) 900 if isUpdated { 901 suspendCondChanged = true 902 jm.recorder.Event(&job, v1.EventTypeNormal, "Resumed", "Job resumed") 903 // Resumed jobs will always reset StartTime to current time. This is 904 // done because the ActiveDeadlineSeconds timer shouldn't go off 905 // whilst the Job is still suspended and resetting StartTime is 906 // consistent with resuming a Job created in the suspended state. 907 // (ActiveDeadlineSeconds is interpreted as the number of seconds a 908 // Job is continuously active.) 909 now := metav1.NewTime(jm.clock.Now()) 910 job.Status.StartTime = &now 911 } 912 } 913 } 914 } 915 916 needsStatusUpdate := suspendCondChanged || active != job.Status.Active || !ptr.Equal(ready, job.Status.Ready) 917 needsStatusUpdate = needsStatusUpdate || !ptr.Equal(job.Status.Terminating, jobCtx.terminating) 918 job.Status.Active = active 919 job.Status.Ready = ready 920 job.Status.Terminating = jobCtx.terminating 921 err = jm.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, needsStatusUpdate) 922 if err != nil { 923 return fmt.Errorf("tracking status: %w", err) 924 } 925 926 return manageJobErr 927 } 928 929 // deleteActivePods issues deletion for active Pods, preserving finalizers. 930 // This is done through DELETE calls that set deletion timestamps. 931 // The method trackJobStatusAndRemoveFinalizers removes the finalizers, after 932 // which the objects can actually be deleted. 933 // Returns number of successfully deletions issued. 
934 func (jm *Controller) deleteActivePods(ctx context.Context, job *batch.Job, pods []*v1.Pod) (int32, error) { 935 errCh := make(chan error, len(pods)) 936 successfulDeletes := int32(len(pods)) 937 wg := sync.WaitGroup{} 938 wg.Add(len(pods)) 939 for i := range pods { 940 go func(pod *v1.Pod) { 941 defer wg.Done() 942 if err := jm.podControl.DeletePod(ctx, job.Namespace, pod.Name, job); err != nil && !apierrors.IsNotFound(err) { 943 atomic.AddInt32(&successfulDeletes, -1) 944 errCh <- err 945 utilruntime.HandleError(err) 946 } 947 }(pods[i]) 948 } 949 wg.Wait() 950 return successfulDeletes, errorFromChannel(errCh) 951 } 952 953 func nonIgnoredFailedPodsCount(jobCtx *syncJobCtx, failedPods []*v1.Pod) int { 954 result := len(failedPods) 955 if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil { 956 for _, p := range failedPods { 957 _, countFailed, _ := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, p) 958 if !countFailed { 959 result-- 960 } 961 } 962 } 963 return result 964 } 965 966 // deleteJobPods deletes the pods, returns the number of successful removals 967 // and any error. 968 func (jm *Controller) deleteJobPods(ctx context.Context, job *batch.Job, jobKey string, pods []*v1.Pod) (int32, error) { 969 errCh := make(chan error, len(pods)) 970 successfulDeletes := int32(len(pods)) 971 logger := klog.FromContext(ctx) 972 973 failDelete := func(pod *v1.Pod, err error) { 974 // Decrement the expected number of deletes because the informer won't observe this deletion 975 jm.expectations.DeletionObserved(logger, jobKey) 976 if !apierrors.IsNotFound(err) { 977 logger.V(2).Info("Failed to delete Pod", "job", klog.KObj(job), "pod", klog.KObj(pod), "err", err) 978 atomic.AddInt32(&successfulDeletes, -1) 979 errCh <- err 980 utilruntime.HandleError(err) 981 } 982 } 983 984 wg := sync.WaitGroup{} 985 wg.Add(len(pods)) 986 for i := range pods { 987 go func(pod *v1.Pod) { 988 defer wg.Done() 989 if patch := removeTrackingFinalizerPatch(pod); patch != nil { 990 if err := jm.podControl.PatchPod(ctx, pod.Namespace, pod.Name, patch); err != nil { 991 failDelete(pod, fmt.Errorf("removing completion finalizer: %w", err)) 992 return 993 } 994 } 995 if err := jm.podControl.DeletePod(ctx, job.Namespace, pod.Name, job); err != nil { 996 failDelete(pod, err) 997 } 998 }(pods[i]) 999 } 1000 wg.Wait() 1001 return successfulDeletes, errorFromChannel(errCh) 1002 } 1003 1004 // trackJobStatusAndRemoveFinalizers does: 1005 // 1. Add finished Pods to .status.uncountedTerminatedPods 1006 // 2. Remove the finalizers from the Pods if they completed or were removed 1007 // or the job was removed. 1008 // 3. Increment job counters for pods that no longer have a finalizer. 1009 // 4. Add Complete condition if satisfied with current counters. 1010 // 1011 // It does this up to a limited number of Pods so that the size of .status 1012 // doesn't grow too much and this sync doesn't starve other Jobs. 1013 func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, jobCtx *syncJobCtx, needsFlush bool) error { 1014 logger := klog.FromContext(ctx) 1015 1016 isIndexed := isIndexedJob(jobCtx.job) 1017 var podsToRemoveFinalizer []*v1.Pod 1018 uncountedStatus := jobCtx.job.Status.UncountedTerminatedPods 1019 var newSucceededIndexes []int 1020 if isIndexed { 1021 // Sort to introduce completed Indexes in order. 
1022 sort.Sort(byCompletionIndex(jobCtx.pods)) 1023 } 1024 uidsWithFinalizer := make(sets.Set[string], len(jobCtx.pods)) 1025 for _, p := range jobCtx.pods { 1026 uid := string(p.UID) 1027 if hasJobTrackingFinalizer(p) && !jobCtx.expectedRmFinalizers.Has(uid) { 1028 uidsWithFinalizer.Insert(uid) 1029 } 1030 } 1031 1032 // Shallow copy, as it will only be used to detect changes in the counters. 1033 oldCounters := jobCtx.job.Status 1034 if cleanUncountedPodsWithoutFinalizers(&jobCtx.job.Status, uidsWithFinalizer) { 1035 needsFlush = true 1036 } 1037 podFailureCountByPolicyAction := map[string]int{} 1038 reachedMaxUncountedPods := false 1039 for _, pod := range jobCtx.pods { 1040 if !hasJobTrackingFinalizer(pod) || jobCtx.expectedRmFinalizers.Has(string(pod.UID)) { 1041 // This pod was processed in a previous sync. 1042 continue 1043 } 1044 considerPodFailed := isPodFailed(pod, jobCtx.job) 1045 if !canRemoveFinalizer(logger, jobCtx, pod, considerPodFailed) { 1046 continue 1047 } 1048 podsToRemoveFinalizer = append(podsToRemoveFinalizer, pod) 1049 if pod.Status.Phase == v1.PodSucceeded && !jobCtx.uncounted.failed.Has(string(pod.UID)) { 1050 if isIndexed { 1051 // The completion index is enough to avoid recounting succeeded pods. 1052 // No need to track UIDs. 1053 ix := getCompletionIndex(pod.Annotations) 1054 if ix != unknownCompletionIndex && ix < int(*jobCtx.job.Spec.Completions) && !jobCtx.prevSucceededIndexes.has(ix) { 1055 newSucceededIndexes = append(newSucceededIndexes, ix) 1056 needsFlush = true 1057 } 1058 } else if !jobCtx.uncounted.succeeded.Has(string(pod.UID)) { 1059 needsFlush = true 1060 uncountedStatus.Succeeded = append(uncountedStatus.Succeeded, pod.UID) 1061 } 1062 } else if considerPodFailed || jobCtx.finishedCondition != nil { 1063 // When the job is considered finished, every non-terminated pod is considered failed 1064 ix := getCompletionIndex(pod.Annotations) 1065 if !jobCtx.uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*jobCtx.job.Spec.Completions))) { 1066 if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil { 1067 _, countFailed, action := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, pod) 1068 if action != nil { 1069 podFailureCountByPolicyAction[string(*action)] += 1 1070 } 1071 if countFailed { 1072 needsFlush = true 1073 uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID) 1074 } 1075 } else { 1076 needsFlush = true 1077 uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID) 1078 } 1079 } 1080 } 1081 if len(newSucceededIndexes)+len(uncountedStatus.Succeeded)+len(uncountedStatus.Failed) >= MaxUncountedPods { 1082 // The controller added enough Pods already to .status.uncountedTerminatedPods 1083 // We stop counting pods and removing finalizers here to: 1084 // 1. Ensure that the UIDs representation are under 20 KB. 1085 // 2. Cap the number of finalizer removals so that syncing of big Jobs 1086 // doesn't starve smaller ones. 1087 // 1088 // The job will be synced again because the Job status and Pod updates 1089 // will put the Job back to the work queue. 
1090 			reachedMaxUncountedPods = true
1091 			break
1092 		}
1093 	}
1094 	if isIndexed {
1095 		jobCtx.succeededIndexes = jobCtx.succeededIndexes.withOrderedIndexes(newSucceededIndexes)
1096 		succeededIndexesStr := jobCtx.succeededIndexes.String()
1097 		if succeededIndexesStr != jobCtx.job.Status.CompletedIndexes {
1098 			needsFlush = true
1099 		}
1100 		jobCtx.job.Status.Succeeded = int32(jobCtx.succeededIndexes.total())
1101 		jobCtx.job.Status.CompletedIndexes = succeededIndexesStr
1102 		var failedIndexesStr *string
1103 		if jobCtx.failedIndexes != nil {
1104 			failedIndexesStr = ptr.To(jobCtx.failedIndexes.String())
1105 		}
1106 		if !ptr.Equal(jobCtx.job.Status.FailedIndexes, failedIndexesStr) {
1107 			jobCtx.job.Status.FailedIndexes = failedIndexesStr
1108 			needsFlush = true
1109 		}
1110 	}
1111 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
1112 		if jobCtx.finishedCondition != nil && jobCtx.finishedCondition.Type == batch.JobFailureTarget {
1113 
1114 			// Append the interim FailureTarget condition to update the job status with before finalizers are removed.
1115 			jobCtx.job.Status.Conditions = append(jobCtx.job.Status.Conditions, *jobCtx.finishedCondition)
1116 			needsFlush = true
1117 
1118 			// Prepare the final Failed condition to update the job status with after the finalizers are removed.
1119 			// It is also used in the enactJobFinished function for reporting.
1120 			jobCtx.finishedCondition = newFailedConditionForFailureTarget(jobCtx.finishedCondition, jm.clock.Now())
1121 		}
1122 	}
1123 	var err error
1124 	if jobCtx.job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, jobCtx, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush); err != nil {
1125 		return err
1126 	}
1127 	jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(jobCtx.job, jobCtx.finishedCondition)
1128 	if jobFinished {
1129 		needsFlush = true
1130 	}
1131 	if needsFlush {
1132 		if _, err := jm.updateStatusHandler(ctx, jobCtx.job); err != nil {
1133 			return fmt.Errorf("removing uncounted pods from status: %w", err)
1134 		}
1135 		if jobFinished {
1136 			jm.recordJobFinished(jobCtx.job, jobCtx.finishedCondition)
1137 		}
1138 		recordJobPodFinished(logger, jobCtx.job, oldCounters)
1139 	}
1140 	return nil
1141 }
1142 
1143 // canRemoveFinalizer determines if the pod's finalizer can be safely removed.
1144 // The finalizer can be removed when:
1145 //   - the entire Job is terminating; or
1146 //   - the pod's index is succeeded; or
1147 //   - the Pod is considered failed, unless its removal is delayed for the
1148 //     purpose of transferring the JobIndexFailureCount annotations to the
1149 //     replacement pod. When the entire Job is terminating, the finalizer can
1150 //     be removed unconditionally.
1151 func canRemoveFinalizer(logger klog.Logger, jobCtx *syncJobCtx, pod *v1.Pod, considerPodFailed bool) bool {
1152 	if jobCtx.job.DeletionTimestamp != nil || jobCtx.finishedCondition != nil || pod.Status.Phase == v1.PodSucceeded {
1153 		return true
1154 	}
1155 	if !considerPodFailed {
1156 		return false
1157 	}
1158 	if hasBackoffLimitPerIndex(jobCtx.job) {
1159 		if index := getCompletionIndex(pod.Annotations); index != unknownCompletionIndex {
1160 			if p, ok := jobCtx.podsWithDelayedDeletionPerIndex[index]; ok && p.UID == pod.UID {
1161 				logger.V(3).Info("Delaying pod finalizer removal to await for pod recreation within the index", "pod", klog.KObj(pod))
1162 				return false
1163 			}
1164 		}
1165 	}
1166 	return true
1167 }
1168 
1169 // flushUncountedAndRemoveFinalizers does:
1170 // 1.
flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition 1171 // if present. 1172 // 2. perform the removal of finalizers from Pods which are in the uncounted 1173 // lists. 1174 // 3. update the counters based on the Pods for which it successfully removed 1175 // the finalizers. 1176 // 4. (if not all removals succeeded) flush Job status again. 1177 // 1178 // Returns whether there are pending changes in the Job status that need to be 1179 // flushed in subsequent calls. 1180 func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, jobCtx *syncJobCtx, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.Set[string], oldCounters *batch.JobStatus, podFailureCountByPolicyAction map[string]int, needsFlush bool) (*batch.Job, bool, error) { 1181 logger := klog.FromContext(ctx) 1182 var err error 1183 if needsFlush { 1184 if jobCtx.job, err = jm.updateStatusHandler(ctx, jobCtx.job); err != nil { 1185 return jobCtx.job, needsFlush, fmt.Errorf("adding uncounted pods to status: %w", err) 1186 } 1187 1188 err = jm.podBackoffStore.updateBackoffRecord(jobCtx.newBackoffRecord) 1189 1190 if err != nil { 1191 // this error might undercount the backoff. 1192 // re-syncing from the current state might not help to recover 1193 // the backoff information 1194 logger.Error(err, "Backoff update failed") 1195 } 1196 1197 recordJobPodFinished(logger, jobCtx.job, *oldCounters) 1198 // Shallow copy, as it will only be used to detect changes in the counters. 1199 *oldCounters = jobCtx.job.Status 1200 needsFlush = false 1201 } 1202 recordJobPodFailurePolicyActions(jobCtx.job, podFailureCountByPolicyAction) 1203 1204 jobKey, err := controller.KeyFunc(jobCtx.job) 1205 if err != nil { 1206 return jobCtx.job, needsFlush, fmt.Errorf("getting job key: %w", err) 1207 } 1208 var rmErr error 1209 if len(podsToRemoveFinalizer) > 0 { 1210 var rmSucceded []bool 1211 rmSucceded, rmErr = jm.removeTrackingFinalizerFromPods(ctx, jobKey, podsToRemoveFinalizer) 1212 for i, p := range podsToRemoveFinalizer { 1213 if rmSucceded[i] { 1214 uidsWithFinalizer.Delete(string(p.UID)) 1215 } 1216 } 1217 } 1218 // Failed to remove some finalizers. Attempt to update the status with the 1219 // partial progress. 1220 if cleanUncountedPodsWithoutFinalizers(&jobCtx.job.Status, uidsWithFinalizer) { 1221 needsFlush = true 1222 } 1223 if rmErr != nil && needsFlush { 1224 if job, err := jm.updateStatusHandler(ctx, jobCtx.job); err != nil { 1225 return job, needsFlush, fmt.Errorf("removing uncounted pods from status: %w", err) 1226 } 1227 } 1228 return jobCtx.job, needsFlush, rmErr 1229 } 1230 1231 // cleanUncountedPodsWithoutFinalizers removes the Pod UIDs from 1232 // .status.uncountedTerminatedPods for which the finalizer was successfully 1233 // removed and increments the corresponding status counters. 1234 // Returns whether there was any status change. 
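// For example (illustrative UIDs): if .status.uncountedTerminatedPods.succeeded
// lists [a, b, c] and only pod b still carries the tracking finalizer
// (uidsWithFinalizer contains just b), then a and c are counted:
// status.Succeeded increases by 2 and the uncounted list shrinks to [b].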
1235 func cleanUncountedPodsWithoutFinalizers(status *batch.JobStatus, uidsWithFinalizer sets.Set[string]) bool { 1236 updated := false 1237 uncountedStatus := status.UncountedTerminatedPods 1238 newUncounted := filterInUncountedUIDs(uncountedStatus.Succeeded, uidsWithFinalizer) 1239 if len(newUncounted) != len(uncountedStatus.Succeeded) { 1240 updated = true 1241 status.Succeeded += int32(len(uncountedStatus.Succeeded) - len(newUncounted)) 1242 uncountedStatus.Succeeded = newUncounted 1243 } 1244 newUncounted = filterInUncountedUIDs(uncountedStatus.Failed, uidsWithFinalizer) 1245 if len(newUncounted) != len(uncountedStatus.Failed) { 1246 updated = true 1247 status.Failed += int32(len(uncountedStatus.Failed) - len(newUncounted)) 1248 uncountedStatus.Failed = newUncounted 1249 } 1250 return updated 1251 } 1252 1253 // removeTrackingFinalizerFromPods removes tracking finalizers from Pods and 1254 // returns an array of booleans where the i-th value is true if the finalizer 1255 // of the i-th Pod was successfully removed (if the pod was deleted when this 1256 // function was called, it's considered as the finalizer was removed successfully). 1257 func (jm *Controller) removeTrackingFinalizerFromPods(ctx context.Context, jobKey string, pods []*v1.Pod) ([]bool, error) { 1258 logger := klog.FromContext(ctx) 1259 errCh := make(chan error, len(pods)) 1260 succeeded := make([]bool, len(pods)) 1261 uids := make([]string, len(pods)) 1262 for i, p := range pods { 1263 uids[i] = string(p.UID) 1264 } 1265 if jobKey != "" { 1266 err := jm.finalizerExpectations.expectFinalizersRemoved(logger, jobKey, uids) 1267 if err != nil { 1268 return succeeded, fmt.Errorf("setting expected removed finalizers: %w", err) 1269 } 1270 } 1271 wg := sync.WaitGroup{} 1272 wg.Add(len(pods)) 1273 for i := range pods { 1274 go func(i int) { 1275 pod := pods[i] 1276 defer wg.Done() 1277 if patch := removeTrackingFinalizerPatch(pod); patch != nil { 1278 if err := jm.podControl.PatchPod(ctx, pod.Namespace, pod.Name, patch); err != nil { 1279 // In case of any failure, we don't expect a Pod update for the 1280 // finalizer removed. Clear expectation now. 1281 if jobKey != "" { 1282 jm.finalizerExpectations.finalizerRemovalObserved(logger, jobKey, string(pod.UID)) 1283 } 1284 if !apierrors.IsNotFound(err) { 1285 errCh <- err 1286 utilruntime.HandleError(fmt.Errorf("removing tracking finalizer: %w", err)) 1287 return 1288 } 1289 } 1290 succeeded[i] = true 1291 } 1292 }(i) 1293 } 1294 wg.Wait() 1295 1296 return succeeded, errorFromChannel(errCh) 1297 } 1298 1299 // enactJobFinished adds the Complete or Failed condition and records events. 1300 // Returns whether the Job was considered finished. 1301 func (jm *Controller) enactJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool { 1302 if finishedCond == nil { 1303 return false 1304 } 1305 if uncounted := job.Status.UncountedTerminatedPods; uncounted != nil { 1306 if len(uncounted.Succeeded) > 0 || len(uncounted.Failed) > 0 { 1307 return false 1308 } 1309 } 1310 job.Status.Conditions, _ = ensureJobConditionStatus(job.Status.Conditions, finishedCond.Type, finishedCond.Status, finishedCond.Reason, finishedCond.Message, jm.clock.Now()) 1311 if finishedCond.Type == batch.JobComplete { 1312 job.Status.CompletionTime = &finishedCond.LastTransitionTime 1313 } 1314 return true 1315 } 1316 1317 // recordJobFinished records events and the job_finished_total metric for a finished job. 
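// The job_finished_total metric is labeled with the completion mode, a result
// of "succeeded" or "failed" and, for failures only, the reason taken from the
// finished condition.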
1318 func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool { 1319 completionMode := getCompletionMode(job) 1320 if finishedCond.Type == batch.JobComplete { 1321 if job.Spec.Completions != nil && job.Status.Succeeded > *job.Spec.Completions { 1322 jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached") 1323 } 1324 jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed") 1325 metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc() 1326 } else { 1327 jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message) 1328 metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc() 1329 } 1330 return true 1331 } 1332 1333 func filterInUncountedUIDs(uncounted []types.UID, include sets.Set[string]) []types.UID { 1334 var newUncounted []types.UID 1335 for _, uid := range uncounted { 1336 if include.Has(string(uid)) { 1337 newUncounted = append(newUncounted, uid) 1338 } 1339 } 1340 return newUncounted 1341 } 1342 1343 // newFailedConditionForFailureTarget creates a job Failed condition based on 1344 // the interim FailureTarget condition. 1345 func newFailedConditionForFailureTarget(condition *batch.JobCondition, now time.Time) *batch.JobCondition { 1346 return newCondition(batch.JobFailed, v1.ConditionTrue, condition.Reason, condition.Message, now) 1347 } 1348 1349 // pastBackoffLimitOnFailure checks if container restartCounts sum exceeds BackoffLimit 1350 // this method applies only to pods with restartPolicy == OnFailure 1351 func pastBackoffLimitOnFailure(job *batch.Job, pods []*v1.Pod) bool { 1352 if job.Spec.Template.Spec.RestartPolicy != v1.RestartPolicyOnFailure { 1353 return false 1354 } 1355 result := int32(0) 1356 for i := range pods { 1357 po := pods[i] 1358 if po.Status.Phase == v1.PodRunning || po.Status.Phase == v1.PodPending { 1359 for j := range po.Status.InitContainerStatuses { 1360 stat := po.Status.InitContainerStatuses[j] 1361 result += stat.RestartCount 1362 } 1363 for j := range po.Status.ContainerStatuses { 1364 stat := po.Status.ContainerStatuses[j] 1365 result += stat.RestartCount 1366 } 1367 } 1368 } 1369 if *job.Spec.BackoffLimit == 0 { 1370 return result > 0 1371 } 1372 return result >= *job.Spec.BackoffLimit 1373 } 1374 1375 // pastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if 1376 // it is exceeded. If the job is currently suspended, the function will always 1377 // return false. 
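// For example, with activeDeadlineSeconds=600 and a StartTime recorded 630
// seconds ago, duration (630s) >= allowedDuration (600s), so the deadline has
// been exceeded and syncJob marks the Job failed with reason DeadlineExceeded.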
func (jm *Controller) pastActiveDeadline(job *batch.Job) bool {
	if job.Spec.ActiveDeadlineSeconds == nil || job.Status.StartTime == nil || jobSuspended(job) {
		return false
	}
	duration := jm.clock.Since(job.Status.StartTime.Time)
	allowedDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds) * time.Second
	return duration >= allowedDuration
}

func newCondition(conditionType batch.JobConditionType, status v1.ConditionStatus, reason, message string, now time.Time) *batch.JobCondition {
	return &batch.JobCondition{
		Type:               conditionType,
		Status:             status,
		LastProbeTime:      metav1.NewTime(now),
		LastTransitionTime: metav1.NewTime(now),
		Reason:             reason,
		Message:            message,
	}
}

// getFailJobMessage returns a job failure message if the job should fail with the current counters.
func getFailJobMessage(job *batch.Job, pods []*v1.Pod) *string {
	if !feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) || job.Spec.PodFailurePolicy == nil {
		return nil
	}
	for _, p := range pods {
		if isPodFailed(p, job) {
			jobFailureMessage, _, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
			if jobFailureMessage != nil {
				return jobFailureMessage
			}
		}
	}
	return nil
}

// getNewFinishedPods returns the list of newly succeeded and failed pods that
// are not yet accounted for in the job status. The list of failed pods can be
// affected by the podFailurePolicy.
func getNewFinishedPods(jobCtx *syncJobCtx) (succeededPods, failedPods []*v1.Pod) {
	succeededPods = getValidPodsWithFilter(jobCtx, jobCtx.uncounted.Succeeded(), func(p *v1.Pod) bool {
		return p.Status.Phase == v1.PodSucceeded
	})
	failedPods = getValidPodsWithFilter(jobCtx, jobCtx.uncounted.Failed(), func(p *v1.Pod) bool {
		return isPodFailed(p, jobCtx.job)
	})
	return succeededPods, failedPods
}

// jobSuspended returns whether the Job's .spec.suspend field is set to true.
func jobSuspended(job *batch.Job) bool {
	return job.Spec.Suspend != nil && *job.Spec.Suspend
}

// manageJob is the core method responsible for managing the number of running
// pods according to what is specified in the job.Spec.
// Respects back-off; does not create new pods if the back-off time has not passed.
// Does NOT modify <activePods>.
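// It returns the updated count of active pods, the sync action label that is
// recorded in the job sync metrics, and any error from pod creation or deletion.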
func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syncJobCtx) (int32, string, error) {
	logger := klog.FromContext(ctx)
	active := int32(len(jobCtx.activePods))
	parallelism := *job.Spec.Parallelism
	jobKey, err := controller.KeyFunc(job)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("Couldn't get key for job %#v: %v", job, err))
		return 0, metrics.JobSyncActionTracking, nil
	}

	if jobSuspended(job) {
		logger.V(4).Info("Deleting all active pods in suspended job", "job", klog.KObj(job), "active", active)
		podsToDelete := activePodsForRemoval(job, jobCtx.activePods, int(active))
		jm.expectations.ExpectDeletions(logger, jobKey, len(podsToDelete))
		removed, err := jm.deleteJobPods(ctx, job, jobKey, podsToDelete)
		active -= removed
		return active, metrics.JobSyncActionPodsDeleted, err
	}

	var terminating int32 = 0
	if onlyReplaceFailedPods(jobCtx.job) {
		// When a PodFailurePolicy is specified but PodReplacementPolicy is disabled,
		// we still need to count terminating pods for the replica counts,
		// but we will not allow updates to the status.
		if jobCtx.terminating == nil {
			terminating = controller.CountTerminatingPods(jobCtx.pods)
		} else {
			terminating = *jobCtx.terminating
		}
	}
	wantActive := int32(0)
	if job.Spec.Completions == nil {
		// Job does not specify a number of completions. Therefore, the number of
		// active pods should be equal to parallelism, unless the job has seen at
		// least one success, in which case leave whatever is running, running.
		if jobCtx.succeeded > 0 {
			wantActive = active
		} else {
			wantActive = parallelism
		}
	} else {
		// Job specifies a specific number of completions. Therefore, the number of
		// active pods should never exceed the number of remaining completions.
		wantActive = *job.Spec.Completions - jobCtx.succeeded
		if wantActive > parallelism {
			wantActive = parallelism
		}
		if wantActive < 0 {
			wantActive = 0
		}
	}

	rmAtLeast := active - wantActive
	if rmAtLeast < 0 {
		rmAtLeast = 0
	}
	podsToDelete := activePodsForRemoval(job, jobCtx.activePods, int(rmAtLeast))
	if len(podsToDelete) > MaxPodCreateDeletePerSync {
		podsToDelete = podsToDelete[:MaxPodCreateDeletePerSync]
	}
	if len(podsToDelete) > 0 {
		jm.expectations.ExpectDeletions(logger, jobKey, len(podsToDelete))
		logger.V(4).Info("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", wantActive)
		removed, err := jm.deleteJobPods(ctx, job, jobKey, podsToDelete)
		active -= removed
		// While it is possible for a Job to require both pod creations and
		// deletions at the same time (e.g. indexed Jobs with repeated indexes), we
		// restrict ourselves to either just pod deletion or pod creation in any
		// given sync cycle. Of these two, pod deletion takes precedence.
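		// For example (illustrative numbers): with completions=10, parallelism=3,
		// succeeded=9 and 3 active pods, wantActive is 1 and rmAtLeast is 2, so two
		// surplus pods are selected for deletion above.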
		return active, metrics.JobSyncActionPodsDeleted, err
	}

	if diff := wantActive - terminating - active; diff > 0 {
		var remainingTime time.Duration
		if !hasBackoffLimitPerIndex(job) {
			// We compute the global remaining time for pod creation when backoffLimitPerIndex is not used.
			remainingTime = jobCtx.newBackoffRecord.getRemainingTime(jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff)
		}
		if remainingTime > 0 {
			jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
			return 0, metrics.JobSyncActionPodsCreated, nil
		}
		if diff > int32(MaxPodCreateDeletePerSync) {
			diff = int32(MaxPodCreateDeletePerSync)
		}

		var indexesToAdd []int
		if isIndexedJob(job) {
			indexesToAdd = firstPendingIndexes(jobCtx, int(diff), int(*job.Spec.Completions))
			if hasBackoffLimitPerIndex(job) {
				indexesToAdd, remainingTime = jm.getPodCreationInfoForIndependentIndexes(logger, indexesToAdd, jobCtx.podsWithDelayedDeletionPerIndex)
				if remainingTime > 0 {
					jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
					return 0, metrics.JobSyncActionPodsCreated, nil
				}
			}
			diff = int32(len(indexesToAdd))
		}

		jm.expectations.ExpectCreations(logger, jobKey, int(diff))
		errCh := make(chan error, diff)
		logger.V(4).Info("Too few pods running", "key", jobKey, "need", wantActive, "creating", diff)

		wait := sync.WaitGroup{}

		active += diff

		podTemplate := job.Spec.Template.DeepCopy()
		if isIndexedJob(job) {
			addCompletionIndexEnvVariables(podTemplate)
		}
		podTemplate.Finalizers = appendJobCompletionFinalizerIfNotFound(podTemplate.Finalizers)

		// Counters for pod creation status (used by the job_pods_creation_total metric).
		var creationsSucceeded, creationsFailed int32 = 0, 0

		// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
		// and double with each successful iteration in a kind of "slow start".
		// This handles attempts to start large numbers of pods that would
		// likely all fail with the same error. For example, a project with a
		// low quota that attempts to create a large number of pods will be
		// prevented from spamming the API service with the pod create requests
		// after one of its pods fails. Conveniently, this also prevents the
		// event spam that those failures would generate.
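		// For example (illustrative, assuming controller.SlowStartInitialBatchSize
		// is 1): creating 20 pods proceeds in batches of 1, 2, 4, 8 and then 5, and
		// the loop below stops early if any creation in a batch fails.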
		for batchSize := int32(integer.IntMin(int(diff), controller.SlowStartInitialBatchSize)); diff > 0; batchSize = integer.Int32Min(2*batchSize, diff) {
			errorCount := len(errCh)
			wait.Add(int(batchSize))
			for i := int32(0); i < batchSize; i++ {
				completionIndex := unknownCompletionIndex
				if len(indexesToAdd) > 0 {
					completionIndex = indexesToAdd[0]
					indexesToAdd = indexesToAdd[1:]
				}
				go func() {
					template := podTemplate
					generateName := ""
					if completionIndex != unknownCompletionIndex {
						template = podTemplate.DeepCopy()
						addCompletionIndexAnnotation(template, completionIndex)

						if feature.DefaultFeatureGate.Enabled(features.PodIndexLabel) {
							addCompletionIndexLabel(template, completionIndex)
						}
						template.Spec.Hostname = fmt.Sprintf("%s-%d", job.Name, completionIndex)
						generateName = podGenerateNameWithIndex(job.Name, completionIndex)
						if hasBackoffLimitPerIndex(job) {
							addIndexFailureCountAnnotation(logger, template, job, jobCtx.podsWithDelayedDeletionPerIndex[completionIndex])
						}
					}
					defer wait.Done()
					err := jm.podControl.CreatePodsWithGenerateName(ctx, job.Namespace, template, job, metav1.NewControllerRef(job, controllerKind), generateName)
					if err != nil {
						if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
							// If the namespace is being torn down, we can safely ignore
							// this error since all subsequent creations will fail.
							return
						}
					}
					if err != nil {
						defer utilruntime.HandleError(err)
						// Decrement the expected number of creates because the informer won't observe this pod.
						logger.V(2).Info("Failed creation, decrementing expectations", "job", klog.KObj(job))
						jm.expectations.CreationObserved(logger, jobKey)
						atomic.AddInt32(&active, -1)
						errCh <- err
						atomic.AddInt32(&creationsFailed, 1)
					}
					atomic.AddInt32(&creationsSucceeded, 1)
				}()
			}
			wait.Wait()
			// Any skipped pods that we never attempted to start shouldn't be expected.
			skippedPods := diff - batchSize
			if errorCount < len(errCh) && skippedPods > 0 {
				logger.V(2).Info("Slow-start failure. Skipping creating pods, decrementing expectations", "skippedCount", skippedPods, "job", klog.KObj(job))
				active -= skippedPods
				for i := int32(0); i < skippedPods; i++ {
					// Decrement the expected number of creates because the informer won't observe this pod.
					jm.expectations.CreationObserved(logger, jobKey)
				}
				// The skipped pods will be retried later. The next controller resync will
				// retry the slow start process.
				break
			}
			diff -= batchSize
		}
		recordJobPodsCreationTotal(job, creationsSucceeded, creationsFailed)
		return active, metrics.JobSyncActionPodsCreated, errorFromChannel(errCh)
	}

	return active, metrics.JobSyncActionTracking, nil
}

// getPodCreationInfoForIndependentIndexes returns the subset of indexesToAdd
// for which pods can already be created. If no indexes are ready, it returns
// the lowest remaining backoff time to pod creation across all indexes.
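// (Illustrative: with per-index backoffs of 0s, 30s and 45s remaining, only the
// first index is returned together with a duration of 0; with 30s and 45s
// remaining and no index ready, an empty slice and 30s are returned.)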
func (jm *Controller) getPodCreationInfoForIndependentIndexes(logger klog.Logger, indexesToAdd []int, podsWithDelayedDeletionPerIndex map[int]*v1.Pod) ([]int, time.Duration) {
	var indexesToAddNow []int
	var minRemainingTimePerIndex *time.Duration
	for _, indexToAdd := range indexesToAdd {
		if remainingTimePerIndex := getRemainingTimePerIndex(logger, jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff, podsWithDelayedDeletionPerIndex[indexToAdd]); remainingTimePerIndex == 0 {
			indexesToAddNow = append(indexesToAddNow, indexToAdd)
		} else if minRemainingTimePerIndex == nil || remainingTimePerIndex < *minRemainingTimePerIndex {
			minRemainingTimePerIndex = &remainingTimePerIndex
		}
	}
	if len(indexesToAddNow) > 0 {
		return indexesToAddNow, 0
	}
	return indexesToAddNow, ptr.Deref(minRemainingTimePerIndex, 0)
}

// activePodsForRemoval returns Pods that should be removed because there
// are too many pods running or, if this is an indexed job, because there are
// repeated indexes, invalid indexes, or some pods don't have indexes.
// Sorts candidate pods in the order such that not-ready < ready, unscheduled
// < scheduled, and pending < running. This ensures that we delete pods
// in the earlier stages whenever possible.
func activePodsForRemoval(job *batch.Job, pods []*v1.Pod, rmAtLeast int) []*v1.Pod {
	var rm, left []*v1.Pod

	if isIndexedJob(job) {
		rm = make([]*v1.Pod, 0, rmAtLeast)
		left = make([]*v1.Pod, 0, len(pods)-rmAtLeast)
		rm, left = appendDuplicatedIndexPodsForRemoval(rm, left, pods, int(*job.Spec.Completions))
	} else {
		left = pods
	}

	if len(rm) < rmAtLeast {
		sort.Sort(controller.ActivePods(left))
		rm = append(rm, left[:rmAtLeast-len(rm)]...)
	}
	return rm
}

// updateJobStatus calls the API to update the job status.
func (jm *Controller) updateJobStatus(ctx context.Context, job *batch.Job) (*batch.Job, error) {
	return jm.kubeClient.BatchV1().Jobs(job.Namespace).UpdateStatus(ctx, job, metav1.UpdateOptions{})
}

func (jm *Controller) patchJob(ctx context.Context, job *batch.Job, data []byte) error {
	_, err := jm.kubeClient.BatchV1().Jobs(job.Namespace).Patch(
		ctx, job.Name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
	return err
}

// getValidPodsWithFilter returns the valid pods that pass the filter.
// Pods are valid if they still carry the tracking finalizer, are not yet
// recorded in the uncounted set, and, for Indexed Jobs, have a valid
// completion index.
func getValidPodsWithFilter(jobCtx *syncJobCtx, uncounted sets.Set[string], filter func(*v1.Pod) bool) []*v1.Pod {
	var result []*v1.Pod
	for _, p := range jobCtx.pods {
		uid := string(p.UID)

		// Pods that don't have a completion finalizer are in the uncounted set or
		// have already been accounted for in the Job status.
		if !hasJobTrackingFinalizer(p) || uncounted.Has(uid) || jobCtx.expectedRmFinalizers.Has(uid) {
			continue
		}
		if isIndexedJob(jobCtx.job) {
			idx := getCompletionIndex(p.Annotations)
			if idx == unknownCompletionIndex || idx >= int(*jobCtx.job.Spec.Completions) {
				continue
			}
		}
		if filter(p) {
			result = append(result, p)
		}
	}
	return result
}

// getCompletionMode returns a string representation of the completion mode.
// Used as a label value for metrics.
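// The possible values are "Indexed" and "NonIndexed".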
func getCompletionMode(job *batch.Job) string {
	if isIndexedJob(job) {
		return string(batch.IndexedCompletion)
	}
	return string(batch.NonIndexedCompletion)
}

func appendJobCompletionFinalizerIfNotFound(finalizers []string) []string {
	for _, fin := range finalizers {
		if fin == batch.JobTrackingFinalizer {
			return finalizers
		}
	}
	return append(finalizers, batch.JobTrackingFinalizer)
}

func removeTrackingFinalizerPatch(pod *v1.Pod) []byte {
	if !hasJobTrackingFinalizer(pod) {
		return nil
	}
	patch := map[string]interface{}{
		"metadata": map[string]interface{}{
			"$deleteFromPrimitiveList/finalizers": []string{batch.JobTrackingFinalizer},
		},
	}
	patchBytes, _ := json.Marshal(patch)
	return patchBytes
}

type uncountedTerminatedPods struct {
	succeeded sets.Set[string]
	failed    sets.Set[string]
}

func newUncountedTerminatedPods(in batch.UncountedTerminatedPods) *uncountedTerminatedPods {
	obj := uncountedTerminatedPods{
		succeeded: make(sets.Set[string], len(in.Succeeded)),
		failed:    make(sets.Set[string], len(in.Failed)),
	}
	for _, v := range in.Succeeded {
		obj.succeeded.Insert(string(v))
	}
	for _, v := range in.Failed {
		obj.failed.Insert(string(v))
	}
	return &obj
}

func (u *uncountedTerminatedPods) Succeeded() sets.Set[string] {
	if u == nil {
		return nil
	}
	return u.succeeded
}

func (u *uncountedTerminatedPods) Failed() sets.Set[string] {
	if u == nil {
		return nil
	}
	return u.failed
}

func errorFromChannel(errCh <-chan error) error {
	select {
	case err := <-errCh:
		return err
	default:
	}
	return nil
}

// ensureJobConditionStatus appends or updates an existing job condition of the
// given type with the given status value. Note that this function will not
// append to the conditions list if the new condition's status is false
// (because going from nothing to false is meaningless); it can, however,
// update the status condition to false. The function returns a bool to let the
// caller know if the list was changed (either appended or updated).
func ensureJobConditionStatus(list []batch.JobCondition, cType batch.JobConditionType, status v1.ConditionStatus, reason, message string, now time.Time) ([]batch.JobCondition, bool) {
	if condition := findConditionByType(list, cType); condition != nil {
		if condition.Status != status || condition.Reason != reason || condition.Message != message {
			*condition = *newCondition(cType, status, reason, message, now)
			return list, true
		}
		return list, false
	}
	// A condition with that type doesn't exist in the list.
	if status != v1.ConditionFalse {
		return append(list, *newCondition(cType, status, reason, message, now)), true
	}
	return list, false
}

func isPodFailed(p *v1.Pod, job *batch.Job) bool {
	if feature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) && feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
		// When PodDisruptionConditions is enabled, orphan Pods and unschedulable
		// terminating Pods are marked as Failed. So we only need to check the phase.
		// TODO(#113855): Stop limiting this behavior to Jobs with podFailurePolicy.
		// For now, we do so to avoid affecting all running Jobs without the
		// ability to opt out into the old behavior.
		return p.Status.Phase == v1.PodFailed
	}
	if p.Status.Phase == v1.PodFailed {
		return true
	}
	if onlyReplaceFailedPods(job) {
		return p.Status.Phase == v1.PodFailed
	}
	// Count deleted Pods as failures to account for orphan Pods that
	// never have a chance to reach the Failed phase.
	return p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
}

func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType) *batch.JobCondition {
	for i := range list {
		if list[i].Type == cType {
			return &list[i]
		}
	}
	return nil
}

func recordJobPodFinished(logger klog.Logger, job *batch.Job, oldCounters batch.JobStatus) {
	completionMode := completionModeStr(job)
	var diff int

	// Updating the succeeded metric must be handled differently for Indexed
	// Jobs to cover the case where the job has been scaled down by reducing
	// completions & parallelism in tandem, so that a previously completed
	// index is now out of range (i.e. index >= spec.Completions).
	if isIndexedJob(job) {
		completions := int(*job.Spec.Completions)
		if job.Status.CompletedIndexes != oldCounters.CompletedIndexes {
			diff = indexesCount(logger, &job.Status.CompletedIndexes, completions) - indexesCount(logger, &oldCounters.CompletedIndexes, completions)
		}
		backoffLimitLabel := backoffLimitMetricsLabel(job)
		metrics.JobFinishedIndexesTotal.WithLabelValues(metrics.Succeeded, backoffLimitLabel).Add(float64(diff))
		if hasBackoffLimitPerIndex(job) && job.Status.FailedIndexes != oldCounters.FailedIndexes {
			if failedDiff := indexesCount(logger, job.Status.FailedIndexes, completions) - indexesCount(logger, oldCounters.FailedIndexes, completions); failedDiff > 0 {
				metrics.JobFinishedIndexesTotal.WithLabelValues(metrics.Failed, backoffLimitLabel).Add(float64(failedDiff))
			}
		}
	} else {
		diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded)
	}
	metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff))

	// Update the failed metric.
	diff = int(job.Status.Failed - oldCounters.Failed)
	metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
}

func indexesCount(logger klog.Logger, indexesStr *string, completions int) int {
	if indexesStr == nil {
		return 0
	}
	return parseIndexesFromString(logger, *indexesStr, completions).total()
}

func backoffLimitMetricsLabel(job *batch.Job) string {
	if hasBackoffLimitPerIndex(job) {
		return "perIndex"
	}
	return "global"
}

func recordJobPodFailurePolicyActions(job *batch.Job, podFailureCountByPolicyAction map[string]int) {
	for action, count := range podFailureCountByPolicyAction {
		metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count))
	}
}

func countReadyPods(pods []*v1.Pod) int32 {
	cnt := int32(0)
	for _, p := range pods {
		if podutil.IsPodReady(p) {
			cnt++
		}
	}
	return cnt
}

// onlyReplaceFailedPods checks if we should apply PodReplacementPolicy.
// PodReplacementPolicy controls when we recreate pods if they are marked as terminating.
// Failed means that we recreate a pod only once it has fully terminated.
func onlyReplaceFailedPods(job *batch.Job) bool {
	// We check PodReplacementPolicy both for nil and Failed, because it is
	// possible that `PodReplacementPolicy` is not defaulted when the
	// `JobPodReplacementPolicy` feature gate is disabled on the API server.
	if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) && job.Spec.PodReplacementPolicy != nil && *job.Spec.PodReplacementPolicy == batch.Failed {
		return true
	}
	return feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil
}

func (jm *Controller) cleanupPodFinalizers(job *batch.Job) {
	// Listing pods shouldn't really fail, as we are just querying the informer cache.
	selector, err := metav1.LabelSelectorAsSelector(job.Spec.Selector)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("parsing deleted job selector: %v", err))
		return
	}
	pods, _ := jm.podStore.Pods(job.Namespace).List(selector)
	for _, pod := range pods {
		if metav1.IsControlledBy(pod, job) && hasJobTrackingFinalizer(pod) {
			jm.enqueueOrphanPod(pod)
		}
	}
}

func recordJobPodsCreationTotal(job *batch.Job, succeeded, failed int32) {
	reason := metrics.PodCreateNew
	if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
		podsTerminating := job.Status.Terminating != nil && *job.Status.Terminating > 0
		isRecreateAction := podsTerminating || job.Status.Failed > 0
		if isRecreateAction {
			reason = metrics.PodRecreateTerminatingOrFailed
			if *job.Spec.PodReplacementPolicy == batch.Failed {
				reason = metrics.PodRecreateFailed
			}
		}
	}
	if succeeded > 0 {
		metrics.JobPodsCreationTotal.WithLabelValues(reason, metrics.Succeeded).Add(float64(succeeded))
	}
	if failed > 0 {
		metrics.JobPodsCreationTotal.WithLabelValues(reason, metrics.Failed).Add(float64(failed))
	}
}
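
// For illustration only (not part of the upstream source): with the
// JobPodReplacementPolicy feature gate enabled, podReplacementPolicy=Failed,
// and a Job that already has failed pods, a sync that creates 5 pods
// successfully and fails to create 2 is recorded roughly as:
//
//	metrics.JobPodsCreationTotal.WithLabelValues(metrics.PodRecreateFailed, metrics.Succeeded).Add(5)
//	metrics.JobPodsCreationTotal.WithLabelValues(metrics.PodRecreateFailed, metrics.Failed).Add(2)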