k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/job/job_controller.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package job 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sort" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 batch "k8s.io/api/batch/v1" 29 v1 "k8s.io/api/core/v1" 30 apierrors "k8s.io/apimachinery/pkg/api/errors" 31 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 32 "k8s.io/apimachinery/pkg/labels" 33 "k8s.io/apimachinery/pkg/types" 34 "k8s.io/apimachinery/pkg/util/json" 35 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 36 "k8s.io/apimachinery/pkg/util/sets" 37 "k8s.io/apimachinery/pkg/util/wait" 38 "k8s.io/apiserver/pkg/util/feature" 39 batchinformers "k8s.io/client-go/informers/batch/v1" 40 coreinformers "k8s.io/client-go/informers/core/v1" 41 clientset "k8s.io/client-go/kubernetes" 42 "k8s.io/client-go/kubernetes/scheme" 43 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 44 batchv1listers "k8s.io/client-go/listers/batch/v1" 45 corelisters "k8s.io/client-go/listers/core/v1" 46 "k8s.io/client-go/tools/cache" 47 "k8s.io/client-go/tools/record" 48 "k8s.io/client-go/util/workqueue" 49 "k8s.io/klog/v2" 50 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 51 "k8s.io/kubernetes/pkg/controller" 52 "k8s.io/kubernetes/pkg/controller/job/metrics" 53 "k8s.io/kubernetes/pkg/controller/job/util" 54 "k8s.io/kubernetes/pkg/features" 55 "k8s.io/utils/clock" 56 "k8s.io/utils/ptr" 57 ) 58 59 // controllerKind contains the schema.GroupVersionKind for this controller type. 60 var controllerKind = batch.SchemeGroupVersion.WithKind("Job") 61 62 var ( 63 // syncJobBatchPeriod is the batch period for controller sync invocations for a Job. 64 syncJobBatchPeriod = time.Second 65 // DefaultJobApiBackOff is the default API backoff period. Exported for tests. 66 DefaultJobApiBackOff = time.Second 67 // MaxJobApiBackOff is the max API backoff period. Exported for tests. 68 MaxJobApiBackOff = time.Minute 69 // DefaultJobPodFailureBackOff is the default pod failure backoff period. Exported for tests. 70 DefaultJobPodFailureBackOff = 10 * time.Second 71 // MaxJobPodFailureBackOff is the max pod failure backoff period. Exported for tests. 72 MaxJobPodFailureBackOff = 10 * time.Minute 73 // MaxUncountedPods is the maximum size the slices in 74 // .status.uncountedTerminatedPods should have to keep their representation 75 // roughly below 20 KB. Exported for tests 76 MaxUncountedPods = 500 77 // MaxPodCreateDeletePerSync is the maximum number of pods that can be 78 // created or deleted in a single sync call. Exported for tests. 79 MaxPodCreateDeletePerSync = 500 80 ) 81 82 // Controller ensures that all Job objects have corresponding pods to 83 // run their configured workload. 84 type Controller struct { 85 kubeClient clientset.Interface 86 podControl controller.PodControlInterface 87 88 // To allow injection of the following for testing. 
89 updateStatusHandler func(ctx context.Context, job *batch.Job) (*batch.Job, error) 90 patchJobHandler func(ctx context.Context, job *batch.Job, patch []byte) error 91 syncHandler func(ctx context.Context, jobKey string) error 92 // podStoreSynced returns true if the pod store has been synced at least once. 93 // Added as a member to the struct to allow injection for testing. 94 podStoreSynced cache.InformerSynced 95 // jobStoreSynced returns true if the job store has been synced at least once. 96 // Added as a member to the struct to allow injection for testing. 97 jobStoreSynced cache.InformerSynced 98 99 // A TTLCache of pod creates/deletes each Job expects to see 100 expectations controller.ControllerExpectationsInterface 101 102 // finalizerExpectations tracks the Pod UIDs for which the controller 103 // expects to observe the tracking finalizer removed. 104 finalizerExpectations *uidTrackingExpectations 105 106 // A store of jobs 107 jobLister batchv1listers.JobLister 108 109 // A store of pods, populated by the podController 110 podStore corelisters.PodLister 111 112 // Jobs that need to be updated 113 queue workqueue.TypedRateLimitingInterface[string] 114 115 // Orphan deleted pods that still have a Job tracking finalizer to be removed 116 orphanQueue workqueue.TypedRateLimitingInterface[string] 117 118 broadcaster record.EventBroadcaster 119 recorder record.EventRecorder 120 121 clock clock.WithTicker 122 123 // Store with information to compute the exponential backoff delay for pod 124 // recreation in case of pod failures. 125 podBackoffStore *backoffStore 126 } 127 128 type syncJobCtx struct { 129 job *batch.Job 130 pods []*v1.Pod 131 finishedCondition *batch.JobCondition 132 activePods []*v1.Pod 133 succeeded int32 134 failed int32 135 prevSucceededIndexes orderedIntervals 136 succeededIndexes orderedIntervals 137 failedIndexes *orderedIntervals 138 newBackoffRecord backoffRecord 139 expectedRmFinalizers sets.Set[string] 140 uncounted *uncountedTerminatedPods 141 podsWithDelayedDeletionPerIndex map[int]*v1.Pod 142 terminating *int32 143 } 144 145 // NewController creates a new Job controller that keeps the relevant pods 146 // in sync with their corresponding Job objects.
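//
// A minimal wiring sketch (an illustration, not part of this file): it shows how a caller
// might construct and run the controller against shared informers. The clientSet, ctx, and
// worker count below are hypothetical placeholders; only NewController and Run are defined
// by this package.
//
//	factory := informers.NewSharedInformerFactory(clientSet, 0)
//	jc, err := job.NewController(ctx, factory.Core().V1().Pods(), factory.Batch().V1().Jobs(), clientSet)
//	if err != nil {
//		klog.Fatalf("creating Job controller: %v", err)
//	}
//	factory.Start(ctx.Done())
//	go jc.Run(ctx, 2)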
147 func NewController(ctx context.Context, podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface) (*Controller, error) { 148 return newControllerWithClock(ctx, podInformer, jobInformer, kubeClient, &clock.RealClock{}) 149 } 150 151 func newControllerWithClock(ctx context.Context, podInformer coreinformers.PodInformer, jobInformer batchinformers.JobInformer, kubeClient clientset.Interface, clock clock.WithTicker) (*Controller, error) { 152 eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx)) 153 logger := klog.FromContext(ctx) 154 155 jm := &Controller{ 156 kubeClient: kubeClient, 157 podControl: controller.RealPodControl{ 158 KubeClient: kubeClient, 159 Recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}), 160 }, 161 expectations: controller.NewControllerExpectations(), 162 finalizerExpectations: newUIDTrackingExpectations(), 163 queue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.NewTypedItemExponentialFailureRateLimiter[string](DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.TypedRateLimitingQueueConfig[string]{Name: "job", Clock: clock}), 164 orphanQueue: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.NewTypedItemExponentialFailureRateLimiter[string](DefaultJobApiBackOff, MaxJobApiBackOff), workqueue.TypedRateLimitingQueueConfig[string]{Name: "job_orphan_pod", Clock: clock}), 165 broadcaster: eventBroadcaster, 166 recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "job-controller"}), 167 clock: clock, 168 podBackoffStore: newBackoffStore(), 169 } 170 171 if _, err := jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 172 AddFunc: func(obj interface{}) { 173 jm.addJob(logger, obj) 174 }, 175 UpdateFunc: func(oldObj, newObj interface{}) { 176 jm.updateJob(logger, oldObj, newObj) 177 }, 178 DeleteFunc: func(obj interface{}) { 179 jm.deleteJob(logger, obj) 180 }, 181 }); err != nil { 182 return nil, fmt.Errorf("adding Job event handler: %w", err) 183 } 184 jm.jobLister = jobInformer.Lister() 185 jm.jobStoreSynced = jobInformer.Informer().HasSynced 186 187 if _, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 188 AddFunc: func(obj interface{}) { 189 jm.addPod(logger, obj) 190 }, 191 UpdateFunc: func(oldObj, newObj interface{}) { 192 jm.updatePod(logger, oldObj, newObj) 193 }, 194 DeleteFunc: func(obj interface{}) { 195 jm.deletePod(logger, obj, true) 196 }, 197 }); err != nil { 198 return nil, fmt.Errorf("adding Pod event handler: %w", err) 199 } 200 jm.podStore = podInformer.Lister() 201 jm.podStoreSynced = podInformer.Informer().HasSynced 202 203 jm.updateStatusHandler = jm.updateJobStatus 204 jm.patchJobHandler = jm.patchJob 205 jm.syncHandler = jm.syncJob 206 207 metrics.Register() 208 209 return jm, nil 210 } 211 212 // Run the main goroutine responsible for watching and syncing jobs. 213 func (jm *Controller) Run(ctx context.Context, workers int) { 214 defer utilruntime.HandleCrash() 215 logger := klog.FromContext(ctx) 216 217 // Start events processing pipeline. 
218 jm.broadcaster.StartStructuredLogging(3) 219 jm.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: jm.kubeClient.CoreV1().Events("")}) 220 defer jm.broadcaster.Shutdown() 221 222 defer jm.queue.ShutDown() 223 defer jm.orphanQueue.ShutDown() 224 225 logger.Info("Starting job controller") 226 defer logger.Info("Shutting down job controller") 227 228 if !cache.WaitForNamedCacheSync("job", ctx.Done(), jm.podStoreSynced, jm.jobStoreSynced) { 229 return 230 } 231 232 for i := 0; i < workers; i++ { 233 go wait.UntilWithContext(ctx, jm.worker, time.Second) 234 } 235 236 go wait.UntilWithContext(ctx, jm.orphanWorker, time.Second) 237 238 <-ctx.Done() 239 } 240 241 // getPodJobs returns a list of Jobs that potentially match a Pod. 242 func (jm *Controller) getPodJobs(pod *v1.Pod) []*batch.Job { 243 jobs, err := jm.jobLister.GetPodJobs(pod) 244 if err != nil { 245 return nil 246 } 247 if len(jobs) > 1 { 248 // ControllerRef will ensure we don't do anything crazy, but more than one 249 // item in this list nevertheless constitutes user error. 250 utilruntime.HandleError(fmt.Errorf("user error! more than one job is selecting pods with labels: %+v", pod.Labels)) 251 } 252 ret := make([]*batch.Job, 0, len(jobs)) 253 for i := range jobs { 254 ret = append(ret, &jobs[i]) 255 } 256 return ret 257 } 258 259 // resolveControllerRef returns the controller referenced by a ControllerRef, 260 // or nil if the ControllerRef could not be resolved to a matching controller 261 // of the correct Kind. 262 func (jm *Controller) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *batch.Job { 263 // We can't look up by UID, so look up by Name and then verify UID. 264 // Don't even try to look up by Name if it's the wrong Kind. 265 if controllerRef.Kind != controllerKind.Kind { 266 return nil 267 } 268 job, err := jm.jobLister.Jobs(namespace).Get(controllerRef.Name) 269 if err != nil { 270 return nil 271 } 272 if job.UID != controllerRef.UID { 273 // The controller we found with this Name is not the same one that the 274 // ControllerRef points to. 275 return nil 276 } 277 return job 278 } 279 280 // When a pod is created, enqueue the controller that manages it and update its expectations. 281 func (jm *Controller) addPod(logger klog.Logger, obj interface{}) { 282 pod := obj.(*v1.Pod) 283 recordFinishedPodWithTrackingFinalizer(nil, pod) 284 if pod.DeletionTimestamp != nil { 285 // on a restart of the controller, it's possible a new pod shows up in a state that 286 // is already pending deletion. Prevent the pod from being a creation observation. 287 jm.deletePod(logger, pod, false) 288 return 289 } 290 291 // If it has a ControllerRef, that's all that matters. 292 if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil { 293 job := jm.resolveControllerRef(pod.Namespace, controllerRef) 294 if job == nil { 295 return 296 } 297 jobKey, err := controller.KeyFunc(job) 298 if err != nil { 299 return 300 } 301 jm.expectations.CreationObserved(logger, jobKey) 302 jm.enqueueSyncJobBatched(logger, job) 303 return 304 } 305 306 // Otherwise, it's an orphan. 307 // Clean the finalizer. 308 if hasJobTrackingFinalizer(pod) { 309 jm.enqueueOrphanPod(pod) 310 } 311 // Get a list of all matching controllers and sync 312 // them to see if anyone wants to adopt it. 313 // DO NOT observe creation because no controller should be waiting for an 314 // orphan. 
315 for _, job := range jm.getPodJobs(pod) { 316 jm.enqueueSyncJobBatched(logger, job) 317 } 318 } 319 320 // When a pod is updated, figure out which job(s) manage it and wake them up. 321 // If the labels of the pod have changed we need to awaken both the old 322 // and new job. old and cur must be *v1.Pod types. 323 func (jm *Controller) updatePod(logger klog.Logger, old, cur interface{}) { 324 curPod := cur.(*v1.Pod) 325 oldPod := old.(*v1.Pod) 326 recordFinishedPodWithTrackingFinalizer(oldPod, curPod) 327 if curPod.ResourceVersion == oldPod.ResourceVersion { 328 // Periodic resync will send update events for all known pods. 329 // Two different versions of the same pod will always have different RVs. 330 return 331 } 332 if curPod.DeletionTimestamp != nil { 333 // when a pod is deleted gracefully, its deletion timestamp is first modified to reflect a grace period, 334 // and after such time has passed, the kubelet actually deletes it from the store. We receive an update 335 // for modification of the deletion timestamp and expect a job to create more pods asap, not wait 336 // until the kubelet actually deletes the pod. 337 jm.deletePod(logger, curPod, false) 338 return 339 } 340 341 // Don't check if oldPod has the finalizer, as during ownership transfer 342 // finalizers might be re-added and removed again on behalf of the new owner. 343 // If all those Pod updates collapse into a single event, the finalizer 344 // might be removed in oldPod and curPod. We want to record the latest 345 // state. 346 finalizerRemoved := !hasJobTrackingFinalizer(curPod) 347 curControllerRef := metav1.GetControllerOf(curPod) 348 oldControllerRef := metav1.GetControllerOf(oldPod) 349 controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef) 350 if controllerRefChanged && oldControllerRef != nil { 351 // The ControllerRef was changed. Sync the old controller, if any. 352 if job := jm.resolveControllerRef(oldPod.Namespace, oldControllerRef); job != nil { 353 if finalizerRemoved { 354 key, err := controller.KeyFunc(job) 355 if err == nil { 356 jm.finalizerExpectations.finalizerRemovalObserved(logger, key, string(curPod.UID)) 357 } 358 } 359 jm.enqueueSyncJobBatched(logger, job) 360 } 361 } 362 363 // If it has a ControllerRef, that's all that matters. 364 if curControllerRef != nil { 365 job := jm.resolveControllerRef(curPod.Namespace, curControllerRef) 366 if job == nil { 367 return 368 } 369 if finalizerRemoved { 370 key, err := controller.KeyFunc(job) 371 if err == nil { 372 jm.finalizerExpectations.finalizerRemovalObserved(logger, key, string(curPod.UID)) 373 } 374 } 375 jm.enqueueSyncJobBatched(logger, job) 376 return 377 } 378 379 // Otherwise, it's an orphan. 380 // Clean the finalizer. 381 if hasJobTrackingFinalizer(curPod) { 382 jm.enqueueOrphanPod(curPod) 383 } 384 // If anything changed, sync matching controllers 385 // to see if anyone wants to adopt it now. 386 labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels) 387 if labelChanged || controllerRefChanged { 388 for _, job := range jm.getPodJobs(curPod) { 389 jm.enqueueSyncJobBatched(logger, job) 390 } 391 } 392 } 393 394 // When a pod is deleted, enqueue the job that manages the pod and update its expectations. 395 // obj could be a *v1.Pod, or a DeletedFinalStateUnknown marker item.
396 func (jm *Controller) deletePod(logger klog.Logger, obj interface{}, final bool) { 397 pod, ok := obj.(*v1.Pod) 398 if final { 399 recordFinishedPodWithTrackingFinalizer(pod, nil) 400 } 401 402 // When a delete is dropped, the relist will notice a pod in the store not 403 // in the list, leading to the insertion of a tombstone object which contains 404 // the deleted key/value. Note that this value might be stale. If the pod 405 // changed labels the new job will not be woken up till the periodic resync. 406 if !ok { 407 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 408 if !ok { 409 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj)) 410 return 411 } 412 pod, ok = tombstone.Obj.(*v1.Pod) 413 if !ok { 414 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %+v", obj)) 415 return 416 } 417 } 418 419 controllerRef := metav1.GetControllerOf(pod) 420 hasFinalizer := hasJobTrackingFinalizer(pod) 421 if controllerRef == nil { 422 // No controller should care about orphans being deleted. 423 // But this pod might have belonged to a Job and the GC removed the reference. 424 if hasFinalizer { 425 jm.enqueueOrphanPod(pod) 426 } 427 return 428 } 429 job := jm.resolveControllerRef(pod.Namespace, controllerRef) 430 if job == nil || util.IsJobFinished(job) { 431 // syncJob will not remove this finalizer. 432 if hasFinalizer { 433 jm.enqueueOrphanPod(pod) 434 } 435 return 436 } 437 jobKey, err := controller.KeyFunc(job) 438 if err != nil { 439 return 440 } 441 jm.expectations.DeletionObserved(logger, jobKey) 442 443 // Consider the finalizer removed if this is the final delete. Otherwise, 444 // it's an update for the deletion timestamp, then check finalizer. 445 if final || !hasFinalizer { 446 jm.finalizerExpectations.finalizerRemovalObserved(logger, jobKey, string(pod.UID)) 447 } 448 449 jm.enqueueSyncJobBatched(logger, job) 450 } 451 452 func (jm *Controller) addJob(logger klog.Logger, obj interface{}) { 453 jm.enqueueSyncJobImmediately(logger, obj) 454 jobObj, ok := obj.(*batch.Job) 455 if !ok { 456 return 457 } 458 if controllerName := managedByExternalController(jobObj); controllerName != nil { 459 metrics.JobByExternalControllerTotal.WithLabelValues(*controllerName).Inc() 460 } 461 } 462 463 func (jm *Controller) updateJob(logger klog.Logger, old, cur interface{}) { 464 oldJob := old.(*batch.Job) 465 curJob := cur.(*batch.Job) 466 467 // never return error 468 key, err := controller.KeyFunc(curJob) 469 if err != nil { 470 return 471 } 472 473 if curJob.Generation == oldJob.Generation { 474 // Delay the Job sync when no generation change to batch Job status updates, 475 // typically triggered by pod events. 476 jm.enqueueSyncJobBatched(logger, curJob) 477 } else { 478 // Trigger immediate sync when spec is changed. 479 jm.enqueueSyncJobImmediately(logger, curJob) 480 } 481 482 // The job shouldn't be marked as finished until all pod finalizers are removed. 483 // This is a backup operation in this case. 
484 if util.IsJobFinished(curJob) { 485 jm.cleanupPodFinalizers(curJob) 486 } 487 488 // check if we need to add a new resync for ActiveDeadlineSeconds 489 if curJob.Status.StartTime != nil { 490 curADS := curJob.Spec.ActiveDeadlineSeconds 491 if curADS == nil { 492 return 493 } 494 oldADS := oldJob.Spec.ActiveDeadlineSeconds 495 if oldADS == nil || *oldADS != *curADS { 496 passed := jm.clock.Since(curJob.Status.StartTime.Time) 497 total := time.Duration(*curADS) * time.Second 498 // AddAfter will handle total < passed 499 jm.queue.AddAfter(key, total-passed) 500 logger.V(4).Info("job's ActiveDeadlineSeconds updated, will resync", "key", key, "interval", total-passed) 501 } 502 } 503 } 504 505 // deleteJob enqueues the job and all the pods associated with it that still 506 // have a finalizer. 507 func (jm *Controller) deleteJob(logger klog.Logger, obj interface{}) { 508 jm.enqueueSyncJobImmediately(logger, obj) 509 jobObj, ok := obj.(*batch.Job) 510 if !ok { 511 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 512 if !ok { 513 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj)) 514 return 515 } 516 jobObj, ok = tombstone.Obj.(*batch.Job) 517 if !ok { 518 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a job %+v", obj)) 519 return 520 } 521 } 522 jm.cleanupPodFinalizers(jobObj) 523 } 524 525 // enqueueSyncJobImmediately tells the Job controller to invoke syncJob 526 // immediately. 527 // It is only used for Job events (creation, deletion, spec update). 528 // obj could be a *batch.Job, or a DeletedFinalStateUnknown marker item. 529 func (jm *Controller) enqueueSyncJobImmediately(logger klog.Logger, obj interface{}) { 530 jm.enqueueSyncJobInternal(logger, obj, 0) 531 } 532 533 // enqueueSyncJobBatched tells the controller to invoke syncJob with a 534 // constant batching delay. 535 // It is used for: 536 // - Pod events (creation, deletion, update) 537 // - Job status update 538 // obj could be a *batch.Job, or a DeletedFinalStateUnknown marker item. 539 func (jm *Controller) enqueueSyncJobBatched(logger klog.Logger, obj interface{}) { 540 jm.enqueueSyncJobInternal(logger, obj, syncJobBatchPeriod) 541 } 542 543 // enqueueSyncJobWithDelay tells the controller to invoke syncJob with a 544 // custom delay, but not smaller than the batching delay. 545 // It is used when pod recreations are delayed due to pod failures. 546 // obj could be a *batch.Job, or a DeletedFinalStateUnknown marker item. 547 func (jm *Controller) enqueueSyncJobWithDelay(logger klog.Logger, obj interface{}, delay time.Duration) { 548 if delay < syncJobBatchPeriod { 549 delay = syncJobBatchPeriod 550 } 551 jm.enqueueSyncJobInternal(logger, obj, delay) 552 } 553 554 func (jm *Controller) enqueueSyncJobInternal(logger klog.Logger, obj interface{}, delay time.Duration) { 555 key, err := controller.KeyFunc(obj) 556 if err != nil { 557 utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err)) 558 return 559 } 560 561 // TODO: Handle overlapping controllers better. Either disallow them at admission time or 562 // deterministically avoid syncing controllers that fight over pods. Currently, we only 563 // ensure that the same controller is synced for a given pod. When we periodically relist 564 // all controllers there will still be some replica instability. One way to handle this is 565 // by querying the store for all controllers that this rc overlaps, as well as all 566 // controllers that overlap this rc, and sorting them.
567 logger.Info("enqueueing job", "key", key, "delay", delay) 568 jm.queue.AddAfter(key, delay) 569 } 570 571 func (jm *Controller) enqueueOrphanPod(obj *v1.Pod) { 572 key, err := controller.KeyFunc(obj) 573 if err != nil { 574 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err)) 575 return 576 } 577 jm.orphanQueue.Add(key) 578 } 579 580 // worker runs a worker thread that just dequeues items, processes them, and marks them done. 581 // It enforces that the syncHandler is never invoked concurrently with the same key. 582 func (jm *Controller) worker(ctx context.Context) { 583 for jm.processNextWorkItem(ctx) { 584 } 585 } 586 587 func (jm *Controller) processNextWorkItem(ctx context.Context) bool { 588 key, quit := jm.queue.Get() 589 if quit { 590 return false 591 } 592 defer jm.queue.Done(key) 593 594 err := jm.syncHandler(ctx, key) 595 if err == nil { 596 jm.queue.Forget(key) 597 return true 598 } 599 600 utilruntime.HandleError(fmt.Errorf("syncing job: %w", err)) 601 jm.queue.AddRateLimited(key) 602 603 return true 604 } 605 606 func (jm *Controller) orphanWorker(ctx context.Context) { 607 for jm.processNextOrphanPod(ctx) { 608 } 609 } 610 611 func (jm *Controller) processNextOrphanPod(ctx context.Context) bool { 612 key, quit := jm.orphanQueue.Get() 613 if quit { 614 return false 615 } 616 defer jm.orphanQueue.Done(key) 617 err := jm.syncOrphanPod(ctx, key) 618 if err != nil { 619 utilruntime.HandleError(fmt.Errorf("Error syncing orphan pod: %v", err)) 620 jm.orphanQueue.AddRateLimited(key) 621 } else { 622 jm.orphanQueue.Forget(key) 623 } 624 625 return true 626 } 627 628 // syncOrphanPod removes the tracking finalizer from an orphan pod if found. 629 func (jm *Controller) syncOrphanPod(ctx context.Context, key string) error { 630 startTime := jm.clock.Now() 631 logger := klog.FromContext(ctx) 632 defer func() { 633 logger.V(4).Info("Finished syncing orphan pod", "pod", key, "elapsed", jm.clock.Since(startTime)) 634 }() 635 636 ns, name, err := cache.SplitMetaNamespaceKey(key) 637 if err != nil { 638 return err 639 } 640 641 sharedPod, err := jm.podStore.Pods(ns).Get(name) 642 if err != nil { 643 if apierrors.IsNotFound(err) { 644 logger.V(4).Info("Orphan pod has been deleted", "pod", key) 645 return nil 646 } 647 return err 648 } 649 // Make sure the pod is still orphaned. 650 if controllerRef := metav1.GetControllerOf(sharedPod); controllerRef != nil { 651 if controllerRef.Kind != controllerKind.Kind || controllerRef.APIVersion != batch.SchemeGroupVersion.String() { 652 // The pod is controlled by an owner that is not a batch/v1 Job. Do not remove finalizer. 653 return nil 654 } 655 job := jm.resolveControllerRef(sharedPod.Namespace, controllerRef) 656 if job != nil { 657 // Skip cleanup of finalizers for pods owned by a job managed by an external controller 658 if controllerName := managedByExternalController(job); controllerName != nil { 659 logger.V(2).Info("Skip cleanup of the job finalizer for a pod owned by a job that is managed by an external controller", "key", key, "podUID", sharedPod.UID, "jobUID", job.UID, "controllerName", controllerName) 660 return nil 661 } 662 } 663 if job != nil && !util.IsJobFinished(job) { 664 // The pod was adopted. Do not remove finalizer. 
665 return nil 666 } 667 } 668 if patch := removeTrackingFinalizerPatch(sharedPod); patch != nil { 669 if err := jm.podControl.PatchPod(ctx, ns, name, patch); err != nil && !apierrors.IsNotFound(err) { 670 return err 671 } 672 } 673 return nil 674 } 675 676 // getPodsForJob returns the set of pods that this Job should manage. 677 // It also reconciles ControllerRef by adopting/orphaning, adding tracking 678 // finalizers. 679 // Note that the returned Pods are pointers into the cache. 680 func (jm *Controller) getPodsForJob(ctx context.Context, j *batch.Job) ([]*v1.Pod, error) { 681 selector, err := metav1.LabelSelectorAsSelector(j.Spec.Selector) 682 if err != nil { 683 return nil, fmt.Errorf("couldn't convert Job selector: %v", err) 684 } 685 // List all pods to include those that don't match the selector anymore 686 // but have a ControllerRef pointing to this controller. 687 pods, err := jm.podStore.Pods(j.Namespace).List(labels.Everything()) 688 if err != nil { 689 return nil, err 690 } 691 // If any adoptions are attempted, we should first recheck for deletion 692 // with an uncached quorum read sometime after listing Pods (see #42639). 693 canAdoptFunc := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) { 694 fresh, err := jm.kubeClient.BatchV1().Jobs(j.Namespace).Get(ctx, j.Name, metav1.GetOptions{}) 695 if err != nil { 696 return nil, err 697 } 698 if fresh.UID != j.UID { 699 return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", j.Namespace, j.Name, fresh.UID, j.UID) 700 } 701 return fresh, nil 702 }) 703 cm := controller.NewPodControllerRefManager(jm.podControl, j, selector, controllerKind, canAdoptFunc, batch.JobTrackingFinalizer) 704 // When adopting Pods, this operation adds an ownerRef and finalizers. 705 pods, err = cm.ClaimPods(ctx, pods) 706 if err != nil { 707 return pods, err 708 } 709 // Set finalizer on adopted pods for the remaining calculations. 710 for i, p := range pods { 711 adopted := true 712 for _, r := range p.OwnerReferences { 713 if r.UID == j.UID { 714 adopted = false 715 break 716 } 717 } 718 if adopted && !hasJobTrackingFinalizer(p) { 719 pods[i] = p.DeepCopy() 720 pods[i].Finalizers = append(p.Finalizers, batch.JobTrackingFinalizer) 721 } 722 } 723 return pods, err 724 } 725 726 // syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning 727 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked 728 // concurrently with the same key. 
729 func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) { 730 startTime := jm.clock.Now() 731 logger := klog.FromContext(ctx) 732 defer func() { 733 logger.V(4).Info("Finished syncing job", "key", key, "elapsed", jm.clock.Since(startTime)) 734 }() 735 736 ns, name, err := cache.SplitMetaNamespaceKey(key) 737 if err != nil { 738 return err 739 } 740 if len(ns) == 0 || len(name) == 0 { 741 return fmt.Errorf("invalid job key %q: either namespace or name is missing", key) 742 } 743 sharedJob, err := jm.jobLister.Jobs(ns).Get(name) 744 if err != nil { 745 if apierrors.IsNotFound(err) { 746 logger.V(4).Info("Job has been deleted", "key", key) 747 jm.expectations.DeleteExpectations(logger, key) 748 jm.finalizerExpectations.deleteExpectations(logger, key) 749 750 err := jm.podBackoffStore.removeBackoffRecord(key) 751 if err != nil { 752 // re-syncing here as the record has to be removed for finished/deleted jobs 753 return fmt.Errorf("error removing backoff record %w", err) 754 } 755 return nil 756 } 757 return err 758 } 759 760 // Skip syncing of the job if it is managed by another controller. 761 // We cannot rely solely on skipping of queueing such jobs for synchronization, 762 // because it is possible a synchronization task is queued for a job, without 763 // the managedBy field, but the job is quickly replaced by another job with 764 // the field. Then, the syncJob might be invoked for a job with the field. 765 if controllerName := managedByExternalController(sharedJob); controllerName != nil { 766 logger.V(2).Info("Skip syncing the job as it is managed by an external controller", "key", key, "uid", sharedJob.UID, "controllerName", controllerName) 767 return nil 768 } 769 770 // make a copy so we don't mutate the shared cache 771 job := *sharedJob.DeepCopy() 772 773 // if job was finished previously, we don't want to redo the termination 774 if util.IsJobFinished(&job) { 775 err := jm.podBackoffStore.removeBackoffRecord(key) 776 if err != nil { 777 // re-syncing here as the record has to be removed for finished/deleted jobs 778 return fmt.Errorf("error removing backoff record %w", err) 779 } 780 return nil 781 } 782 783 if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode != batch.NonIndexedCompletion && *job.Spec.CompletionMode != batch.IndexedCompletion { 784 jm.recorder.Event(&job, v1.EventTypeWarning, "UnknownCompletionMode", "Skipped Job sync because completion mode is unknown") 785 return nil 786 } 787 788 completionMode := getCompletionMode(&job) 789 action := metrics.JobSyncActionReconciling 790 791 defer func() { 792 result := "success" 793 if rErr != nil { 794 result = "error" 795 } 796 797 metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result, action).Observe(jm.clock.Since(startTime).Seconds()) 798 metrics.JobSyncNum.WithLabelValues(completionMode, result, action).Inc() 799 }() 800 801 if job.Status.UncountedTerminatedPods == nil { 802 job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} 803 } 804 805 // Check the expectations of the job before counting active pods, otherwise a new pod can sneak in 806 // and update the expectations after we've retrieved active pods from the store. If a new pod enters 807 // the store after we've checked the expectation, the job sync is just deferred till the next relist.
808 satisfiedExpectations := jm.expectations.SatisfiedExpectations(logger, key) 809 810 pods, err := jm.getPodsForJob(ctx, &job) 811 if err != nil { 812 return err 813 } 814 var terminating *int32 815 if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) { 816 terminating = ptr.To(controller.CountTerminatingPods(pods)) 817 } 818 jobCtx := &syncJobCtx{ 819 job: &job, 820 pods: pods, 821 activePods: controller.FilterActivePods(logger, pods), 822 terminating: terminating, 823 uncounted: newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods), 824 expectedRmFinalizers: jm.finalizerExpectations.getExpectedUIDs(key), 825 } 826 active := int32(len(jobCtx.activePods)) 827 newSucceededPods, newFailedPods := getNewFinishedPods(jobCtx) 828 jobCtx.succeeded = job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(jobCtx.uncounted.succeeded)) 829 jobCtx.failed = job.Status.Failed + int32(nonIgnoredFailedPodsCount(jobCtx, newFailedPods)) + int32(len(jobCtx.uncounted.failed)) 830 var ready *int32 831 if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) { 832 ready = ptr.To(countReadyPods(jobCtx.activePods)) 833 } 834 835 // Job first start. Set StartTime only if the job is not in the suspended state. 836 if job.Status.StartTime == nil && !jobSuspended(&job) { 837 now := metav1.NewTime(jm.clock.Now()) 838 job.Status.StartTime = &now 839 } 840 841 jobCtx.newBackoffRecord = jm.podBackoffStore.newBackoffRecord(key, newSucceededPods, newFailedPods) 842 843 var manageJobErr error 844 845 exceedsBackoffLimit := jobCtx.failed > *job.Spec.BackoffLimit 846 jobCtx.finishedCondition = hasSuccessCriteriaMetCondition(&job) 847 848 // If the Job already has the SuccessCriteriaMet condition, the termination condition was already confirmed in another cycle. 849 // So, the job-controller evaluates the podFailurePolicy only when the Job doesn't have the SuccessCriteriaMet condition. 850 if jobCtx.finishedCondition == nil && feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { 851 if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil { 852 jobCtx.finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition, jm.clock.Now()) 853 } else if failJobMessage := getFailJobMessage(&job, pods); failJobMessage != nil { 854 // Prepare the interim FailureTarget condition to record the failure message before the finalizers (allowing removal of the pods) are removed.
855 jobCtx.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, batch.JobReasonPodFailurePolicy, *failJobMessage, jm.clock.Now()) 856 } 857 } 858 if jobCtx.finishedCondition == nil { 859 if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) { 860 // check if the number of pod restart exceeds backoff (for restart OnFailure only) 861 // OR if the number of failed jobs increased since the last syncJob 862 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonBackoffLimitExceeded, "Job has reached the specified backoff limit", jm.clock.Now()) 863 } else if jm.pastActiveDeadline(&job) { 864 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded, "Job was active longer than specified deadline", jm.clock.Now()) 865 } else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) { 866 syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - jm.clock.Since(job.Status.StartTime.Time) 867 logger.V(2).Info("Job has activeDeadlineSeconds configuration. Will sync this job again", "key", key, "nextSyncIn", syncDuration) 868 jm.queue.AddAfter(key, syncDuration) 869 } 870 } 871 872 if isIndexedJob(&job) { 873 jobCtx.prevSucceededIndexes, jobCtx.succeededIndexes = calculateSucceededIndexes(logger, &job, pods) 874 jobCtx.succeeded = int32(jobCtx.succeededIndexes.total()) 875 if hasBackoffLimitPerIndex(&job) { 876 jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods) 877 if jobCtx.finishedCondition == nil { 878 if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) { 879 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonMaxFailedIndexesExceeded, "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now()) 880 } else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) { 881 jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonFailedIndexes, "Job has failed indexes", jm.clock.Now()) 882 } 883 } 884 jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx) 885 } 886 if jobCtx.finishedCondition == nil && hasSuccessCriteriaMetCondition(jobCtx.job) == nil { 887 if msg, met := matchSuccessPolicy(logger, job.Spec.SuccessPolicy, *job.Spec.Completions, jobCtx.succeededIndexes); met { 888 jobCtx.finishedCondition = newCondition(batch.JobSuccessCriteriaMet, v1.ConditionTrue, batch.JobReasonSuccessPolicy, msg, jm.clock.Now()) 889 } 890 } 891 } 892 suspendCondChanged := false 893 // Remove active pods if Job failed. 894 if jobCtx.finishedCondition != nil { 895 deleted, err := jm.deleteActivePods(ctx, &job, jobCtx.activePods) 896 if deleted != active || !satisfiedExpectations { 897 // Can't declare the Job as finished yet, as there might be remaining 898 // pod finalizers or pods that are not in the informer's cache yet. 899 jobCtx.finishedCondition = nil 900 } 901 active -= deleted 902 manageJobErr = err 903 } else { 904 manageJobCalled := false 905 if satisfiedExpectations && job.DeletionTimestamp == nil { 906 active, action, manageJobErr = jm.manageJob(ctx, &job, jobCtx) 907 manageJobCalled = true 908 } 909 complete := false 910 if job.Spec.Completions == nil { 911 // This type of job is complete when any pod exits with success. 
912 // Each pod is capable of 913 // determining whether or not the entire Job is done. Subsequent pods are 914 // not expected to fail, but if they do, the failure is ignored. Once any 915 // pod succeeds, the controller waits for remaining pods to finish, and 916 // then the job is complete. 917 complete = jobCtx.succeeded > 0 && active == 0 918 } else { 919 // Job specifies a number of completions. This type of job signals 920 // success by having that number of successes. Since we do not 921 // start more pods than there are remaining completions, there should 922 // not be any remaining active pods once this count is reached. 923 complete = jobCtx.succeeded >= *job.Spec.Completions && active == 0 924 } 925 if complete { 926 jobCtx.finishedCondition = newCondition(batch.JobComplete, v1.ConditionTrue, "", "", jm.clock.Now()) 927 } else if manageJobCalled { 928 // Update the conditions / emit events only if manageJob was called in 929 // this syncJob. Otherwise wait for the right syncJob call to make 930 // updates. 931 if job.Spec.Suspend != nil && *job.Spec.Suspend { 932 // Job can be in the suspended state only if it is NOT completed. 933 var isUpdated bool 934 job.Status.Conditions, isUpdated = ensureJobConditionStatus(job.Status.Conditions, batch.JobSuspended, v1.ConditionTrue, "JobSuspended", "Job suspended", jm.clock.Now()) 935 if isUpdated { 936 suspendCondChanged = true 937 jm.recorder.Event(&job, v1.EventTypeNormal, "Suspended", "Job suspended") 938 } 939 } else { 940 // Job not suspended. 941 var isUpdated bool 942 job.Status.Conditions, isUpdated = ensureJobConditionStatus(job.Status.Conditions, batch.JobSuspended, v1.ConditionFalse, "JobResumed", "Job resumed", jm.clock.Now()) 943 if isUpdated { 944 suspendCondChanged = true 945 jm.recorder.Event(&job, v1.EventTypeNormal, "Resumed", "Job resumed") 946 // Resumed jobs will always reset StartTime to current time. This is 947 // done because the ActiveDeadlineSeconds timer shouldn't go off 948 // whilst the Job is still suspended and resetting StartTime is 949 // consistent with resuming a Job created in the suspended state. 950 // (ActiveDeadlineSeconds is interpreted as the number of seconds a 951 // Job is continuously active.) 952 now := metav1.NewTime(jm.clock.Now()) 953 job.Status.StartTime = &now 954 } 955 } 956 } 957 } 958 959 needsStatusUpdate := suspendCondChanged || active != job.Status.Active || !ptr.Equal(ready, job.Status.Ready) 960 needsStatusUpdate = needsStatusUpdate || !ptr.Equal(job.Status.Terminating, jobCtx.terminating) 961 job.Status.Active = active 962 job.Status.Ready = ready 963 job.Status.Terminating = jobCtx.terminating 964 err = jm.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, needsStatusUpdate) 965 if err != nil { 966 return fmt.Errorf("tracking status: %w", err) 967 } 968 969 return manageJobErr 970 } 971 972 // deleteActivePods issues deletion for active Pods, preserving finalizers. 973 // This is done through DELETE calls that set deletion timestamps. 974 // The method trackJobStatusAndRemoveFinalizers removes the finalizers, after 975 // which the objects can actually be deleted. 976 // Returns the number of successful deletions issued.
977 func (jm *Controller) deleteActivePods(ctx context.Context, job *batch.Job, pods []*v1.Pod) (int32, error) { 978 errCh := make(chan error, len(pods)) 979 successfulDeletes := int32(len(pods)) 980 wg := sync.WaitGroup{} 981 wg.Add(len(pods)) 982 for i := range pods { 983 go func(pod *v1.Pod) { 984 defer wg.Done() 985 if err := jm.podControl.DeletePod(ctx, job.Namespace, pod.Name, job); err != nil && !apierrors.IsNotFound(err) { 986 atomic.AddInt32(&successfulDeletes, -1) 987 errCh <- err 988 utilruntime.HandleError(err) 989 } 990 }(pods[i]) 991 } 992 wg.Wait() 993 return successfulDeletes, errorFromChannel(errCh) 994 } 995 996 func nonIgnoredFailedPodsCount(jobCtx *syncJobCtx, failedPods []*v1.Pod) int { 997 result := len(failedPods) 998 if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil { 999 for _, p := range failedPods { 1000 _, countFailed, _ := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, p) 1001 if !countFailed { 1002 result-- 1003 } 1004 } 1005 } 1006 return result 1007 } 1008 1009 // deleteJobPods deletes the pods, returns the number of successful removals 1010 // and any error. 1011 func (jm *Controller) deleteJobPods(ctx context.Context, job *batch.Job, jobKey string, pods []*v1.Pod) (int32, error) { 1012 errCh := make(chan error, len(pods)) 1013 successfulDeletes := int32(len(pods)) 1014 logger := klog.FromContext(ctx) 1015 1016 failDelete := func(pod *v1.Pod, err error) { 1017 // Decrement the expected number of deletes because the informer won't observe this deletion 1018 jm.expectations.DeletionObserved(logger, jobKey) 1019 if !apierrors.IsNotFound(err) { 1020 logger.V(2).Info("Failed to delete Pod", "job", klog.KObj(job), "pod", klog.KObj(pod), "err", err) 1021 atomic.AddInt32(&successfulDeletes, -1) 1022 errCh <- err 1023 utilruntime.HandleError(err) 1024 } 1025 } 1026 1027 wg := sync.WaitGroup{} 1028 wg.Add(len(pods)) 1029 for i := range pods { 1030 go func(pod *v1.Pod) { 1031 defer wg.Done() 1032 if patch := removeTrackingFinalizerPatch(pod); patch != nil { 1033 if err := jm.podControl.PatchPod(ctx, pod.Namespace, pod.Name, patch); err != nil { 1034 failDelete(pod, fmt.Errorf("removing completion finalizer: %w", err)) 1035 return 1036 } 1037 } 1038 if err := jm.podControl.DeletePod(ctx, job.Namespace, pod.Name, job); err != nil { 1039 failDelete(pod, err) 1040 } 1041 }(pods[i]) 1042 } 1043 wg.Wait() 1044 return successfulDeletes, errorFromChannel(errCh) 1045 } 1046 1047 // trackJobStatusAndRemoveFinalizers does: 1048 // 1. Add finished Pods to .status.uncountedTerminatedPods 1049 // 2. Remove the finalizers from the Pods if they completed or were removed 1050 // or the job was removed. 1051 // 3. Increment job counters for pods that no longer have a finalizer. 1052 // 4. Add Complete condition if satisfied with current counters. 1053 // 1054 // It does this up to a limited number of Pods so that the size of .status 1055 // doesn't grow too much and this sync doesn't starve other Jobs. 1056 func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, jobCtx *syncJobCtx, needsFlush bool) error { 1057 logger := klog.FromContext(ctx) 1058 1059 isIndexed := isIndexedJob(jobCtx.job) 1060 var podsToRemoveFinalizer []*v1.Pod 1061 uncountedStatus := jobCtx.job.Status.UncountedTerminatedPods 1062 var newSucceededIndexes []int 1063 if isIndexed { 1064 // Sort to introduce completed Indexes in order. 
1065 sort.Sort(byCompletionIndex(jobCtx.pods)) 1066 } 1067 uidsWithFinalizer := make(sets.Set[string], len(jobCtx.pods)) 1068 for _, p := range jobCtx.pods { 1069 uid := string(p.UID) 1070 if hasJobTrackingFinalizer(p) && !jobCtx.expectedRmFinalizers.Has(uid) { 1071 uidsWithFinalizer.Insert(uid) 1072 } 1073 } 1074 1075 // Shallow copy, as it will only be used to detect changes in the counters. 1076 oldCounters := jobCtx.job.Status 1077 if cleanUncountedPodsWithoutFinalizers(&jobCtx.job.Status, uidsWithFinalizer) { 1078 needsFlush = true 1079 } 1080 podFailureCountByPolicyAction := map[string]int{} 1081 reachedMaxUncountedPods := false 1082 for _, pod := range jobCtx.pods { 1083 if !hasJobTrackingFinalizer(pod) || jobCtx.expectedRmFinalizers.Has(string(pod.UID)) { 1084 // This pod was processed in a previous sync. 1085 continue 1086 } 1087 considerPodFailed := isPodFailed(pod, jobCtx.job) 1088 if !canRemoveFinalizer(logger, jobCtx, pod, considerPodFailed) { 1089 continue 1090 } 1091 podsToRemoveFinalizer = append(podsToRemoveFinalizer, pod) 1092 if pod.Status.Phase == v1.PodSucceeded && !jobCtx.uncounted.failed.Has(string(pod.UID)) { 1093 if isIndexed { 1094 // The completion index is enough to avoid recounting succeeded pods. 1095 // No need to track UIDs. 1096 ix := getCompletionIndex(pod.Annotations) 1097 if ix != unknownCompletionIndex && ix < int(*jobCtx.job.Spec.Completions) && !jobCtx.prevSucceededIndexes.has(ix) { 1098 newSucceededIndexes = append(newSucceededIndexes, ix) 1099 needsFlush = true 1100 } 1101 } else if !jobCtx.uncounted.succeeded.Has(string(pod.UID)) { 1102 needsFlush = true 1103 uncountedStatus.Succeeded = append(uncountedStatus.Succeeded, pod.UID) 1104 } 1105 } else if considerPodFailed || (jobCtx.finishedCondition != nil && !isSuccessCriteriaMetCondition(jobCtx.finishedCondition)) { 1106 // When the job is considered finished, every non-terminated pod is considered failed. 1107 ix := getCompletionIndex(pod.Annotations) 1108 if !jobCtx.uncounted.failed.Has(string(pod.UID)) && (!isIndexed || (ix != unknownCompletionIndex && ix < int(*jobCtx.job.Spec.Completions))) { 1109 if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil { 1110 _, countFailed, action := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, pod) 1111 if action != nil { 1112 podFailureCountByPolicyAction[string(*action)] += 1 1113 } 1114 if countFailed { 1115 needsFlush = true 1116 uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID) 1117 } 1118 } else { 1119 needsFlush = true 1120 uncountedStatus.Failed = append(uncountedStatus.Failed, pod.UID) 1121 } 1122 } 1123 } 1124 if len(newSucceededIndexes)+len(uncountedStatus.Succeeded)+len(uncountedStatus.Failed) >= MaxUncountedPods { 1125 // The controller added enough Pods already to .status.uncountedTerminatedPods 1126 // We stop counting pods and removing finalizers here to: 1127 // 1. Ensure that the UIDs representation are under 20 KB. 1128 // 2. Cap the number of finalizer removals so that syncing of big Jobs 1129 // doesn't starve smaller ones. 1130 // 1131 // The job will be synced again because the Job status and Pod updates 1132 // will put the Job back to the work queue. 
1133 reachedMaxUncountedPods = true 1134 break 1135 } 1136 } 1137 if isIndexed { 1138 jobCtx.succeededIndexes = jobCtx.succeededIndexes.withOrderedIndexes(newSucceededIndexes) 1139 succeededIndexesStr := jobCtx.succeededIndexes.String() 1140 if succeededIndexesStr != jobCtx.job.Status.CompletedIndexes { 1141 needsFlush = true 1142 } 1143 jobCtx.job.Status.Succeeded = int32(jobCtx.succeededIndexes.total()) 1144 jobCtx.job.Status.CompletedIndexes = succeededIndexesStr 1145 var failedIndexesStr *string 1146 if jobCtx.failedIndexes != nil { 1147 failedIndexesStr = ptr.To(jobCtx.failedIndexes.String()) 1148 } 1149 if !ptr.Equal(jobCtx.job.Status.FailedIndexes, failedIndexesStr) { 1150 jobCtx.job.Status.FailedIndexes = failedIndexesStr 1151 needsFlush = true 1152 } 1153 } 1154 if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { 1155 if jobCtx.finishedCondition != nil && jobCtx.finishedCondition.Type == batch.JobFailureTarget { 1156 1157 // Append the interim FailureTarget condition to update the job status with before finalizers are removed. 1158 jobCtx.job.Status.Conditions = append(jobCtx.job.Status.Conditions, *jobCtx.finishedCondition) 1159 needsFlush = true 1160 1161 // Prepare the final Failed condition to update the job status with after the finalizers are removed. 1162 // It is also used in the enactJobFinished function for reporting. 1163 jobCtx.finishedCondition = newFailedConditionForFailureTarget(jobCtx.finishedCondition, jm.clock.Now()) 1164 } 1165 } 1166 if isSuccessCriteriaMetCondition(jobCtx.finishedCondition) { 1167 // Append the interim SuccessCriteriaMet condition to update the job status with before finalizers are removed. 1168 if hasSuccessCriteriaMetCondition(jobCtx.job) == nil { 1169 jobCtx.job.Status.Conditions = append(jobCtx.job.Status.Conditions, *jobCtx.finishedCondition) 1170 needsFlush = true 1171 } 1172 1173 // Prepare the final Complete condition to update the job status with after the finalizers are removed. 1174 // It is also used in the enactJobFinished function for reporting. 1175 jobCtx.finishedCondition = newCondition(batch.JobComplete, v1.ConditionTrue, jobCtx.finishedCondition.Reason, jobCtx.finishedCondition.Message, jm.clock.Now()) 1176 } 1177 var err error 1178 if jobCtx.job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, jobCtx, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush); err != nil { 1179 return err 1180 } 1181 jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(jobCtx.job, jobCtx.finishedCondition) 1182 if jobFinished { 1183 needsFlush = true 1184 } 1185 if needsFlush { 1186 if _, err := jm.updateStatusHandler(ctx, jobCtx.job); err != nil { 1187 return fmt.Errorf("removing uncounted pods from status: %w", err) 1188 } 1189 if jobFinished { 1190 jm.recordJobFinished(jobCtx.job, jobCtx.finishedCondition) 1191 } 1192 recordJobPodFinished(logger, jobCtx.job, oldCounters) 1193 } 1194 return nil 1195 } 1196 1197 // canRemoveFinalizer determines if the pod's finalizer can be safely removed. 1198 // The finalizer can be removed when: 1199 // - the entire Job is terminating; or 1200 // - the pod's index is succeeded; or 1201 // - the Pod is considered failed, unless its removal is delayed for the 1202 // purpose of transferring the JobIndexFailureCount annotations to the 1203 // replacement pod; if the entire Job is terminating, the finalizer can be 1204 // removed unconditionally; or 1205 // - the Job met successPolicy.
1206 func canRemoveFinalizer(logger klog.Logger, jobCtx *syncJobCtx, pod *v1.Pod, considerPodFailed bool) bool { 1207 if jobCtx.job.DeletionTimestamp != nil || jobCtx.finishedCondition != nil || pod.Status.Phase == v1.PodSucceeded { 1208 return true 1209 } 1210 if !considerPodFailed { 1211 return false 1212 } 1213 if hasBackoffLimitPerIndex(jobCtx.job) { 1214 if index := getCompletionIndex(pod.Annotations); index != unknownCompletionIndex { 1215 if p, ok := jobCtx.podsWithDelayedDeletionPerIndex[index]; ok && p.UID == pod.UID { 1216 logger.V(3).Info("Delaying pod finalizer removal to await for pod recreation within the index", "pod", klog.KObj(pod)) 1217 return false 1218 } 1219 } 1220 } 1221 return true 1222 } 1223 1224 // flushUncountedAndRemoveFinalizers does: 1225 // 1. flush the Job status that might include new uncounted Pod UIDs. 1226 // Also flush the interim FailureTarget and SuccessCriteriaMet conditions if present. 1227 // 2. perform the removal of finalizers from Pods which are in the uncounted 1228 // lists. 1229 // 3. update the counters based on the Pods for which it successfully removed 1230 // the finalizers. 1231 // 4. (if not all removals succeeded) flush Job status again. 1232 // 1233 // Returns whether there are pending changes in the Job status that need to be 1234 // flushed in subsequent calls. 1235 func (jm *Controller) flushUncountedAndRemoveFinalizers(ctx context.Context, jobCtx *syncJobCtx, podsToRemoveFinalizer []*v1.Pod, uidsWithFinalizer sets.Set[string], oldCounters *batch.JobStatus, podFailureCountByPolicyAction map[string]int, needsFlush bool) (*batch.Job, bool, error) { 1236 logger := klog.FromContext(ctx) 1237 var err error 1238 if needsFlush { 1239 if jobCtx.job, err = jm.updateStatusHandler(ctx, jobCtx.job); err != nil { 1240 return jobCtx.job, needsFlush, fmt.Errorf("adding uncounted pods to status: %w", err) 1241 } 1242 1243 err = jm.podBackoffStore.updateBackoffRecord(jobCtx.newBackoffRecord) 1244 1245 if err != nil { 1246 // this error might undercount the backoff. 1247 // re-syncing from the current state might not help to recover 1248 // the backoff information 1249 logger.Error(err, "Backoff update failed") 1250 } 1251 1252 recordJobPodFinished(logger, jobCtx.job, *oldCounters) 1253 // Shallow copy, as it will only be used to detect changes in the counters. 1254 *oldCounters = jobCtx.job.Status 1255 needsFlush = false 1256 } 1257 recordJobPodFailurePolicyActions(jobCtx.job, podFailureCountByPolicyAction) 1258 1259 jobKey, err := controller.KeyFunc(jobCtx.job) 1260 if err != nil { 1261 return jobCtx.job, needsFlush, fmt.Errorf("getting job key: %w", err) 1262 } 1263 var rmErr error 1264 if len(podsToRemoveFinalizer) > 0 { 1265 var rmSucceded []bool 1266 rmSucceded, rmErr = jm.removeTrackingFinalizerFromPods(ctx, jobKey, podsToRemoveFinalizer) 1267 for i, p := range podsToRemoveFinalizer { 1268 if rmSucceded[i] { 1269 uidsWithFinalizer.Delete(string(p.UID)) 1270 } 1271 } 1272 } 1273 // Failed to remove some finalizers. Attempt to update the status with the 1274 // partial progress. 
1275 if cleanUncountedPodsWithoutFinalizers(&jobCtx.job.Status, uidsWithFinalizer) { 1276 needsFlush = true 1277 } 1278 if rmErr != nil && needsFlush { 1279 if job, err := jm.updateStatusHandler(ctx, jobCtx.job); err != nil { 1280 return job, needsFlush, fmt.Errorf("removing uncounted pods from status: %w", err) 1281 } 1282 } 1283 return jobCtx.job, needsFlush, rmErr 1284 } 1285 1286 // cleanUncountedPodsWithoutFinalizers removes the Pod UIDs from 1287 // .status.uncountedTerminatedPods for which the finalizer was successfully 1288 // removed and increments the corresponding status counters. 1289 // Returns whether there was any status change. 1290 func cleanUncountedPodsWithoutFinalizers(status *batch.JobStatus, uidsWithFinalizer sets.Set[string]) bool { 1291 updated := false 1292 uncountedStatus := status.UncountedTerminatedPods 1293 newUncounted := filterInUncountedUIDs(uncountedStatus.Succeeded, uidsWithFinalizer) 1294 if len(newUncounted) != len(uncountedStatus.Succeeded) { 1295 updated = true 1296 status.Succeeded += int32(len(uncountedStatus.Succeeded) - len(newUncounted)) 1297 uncountedStatus.Succeeded = newUncounted 1298 } 1299 newUncounted = filterInUncountedUIDs(uncountedStatus.Failed, uidsWithFinalizer) 1300 if len(newUncounted) != len(uncountedStatus.Failed) { 1301 updated = true 1302 status.Failed += int32(len(uncountedStatus.Failed) - len(newUncounted)) 1303 uncountedStatus.Failed = newUncounted 1304 } 1305 return updated 1306 } 1307 1308 // removeTrackingFinalizerFromPods removes tracking finalizers from Pods and 1309 // returns an array of booleans where the i-th value is true if the finalizer 1310 // of the i-th Pod was successfully removed (if the pod was deleted when this 1311 // function was called, it's considered as the finalizer was removed successfully). 1312 func (jm *Controller) removeTrackingFinalizerFromPods(ctx context.Context, jobKey string, pods []*v1.Pod) ([]bool, error) { 1313 logger := klog.FromContext(ctx) 1314 errCh := make(chan error, len(pods)) 1315 succeeded := make([]bool, len(pods)) 1316 uids := make([]string, len(pods)) 1317 for i, p := range pods { 1318 uids[i] = string(p.UID) 1319 } 1320 if jobKey != "" { 1321 err := jm.finalizerExpectations.expectFinalizersRemoved(logger, jobKey, uids) 1322 if err != nil { 1323 return succeeded, fmt.Errorf("setting expected removed finalizers: %w", err) 1324 } 1325 } 1326 wg := sync.WaitGroup{} 1327 wg.Add(len(pods)) 1328 for i := range pods { 1329 go func(i int) { 1330 pod := pods[i] 1331 defer wg.Done() 1332 if patch := removeTrackingFinalizerPatch(pod); patch != nil { 1333 if err := jm.podControl.PatchPod(ctx, pod.Namespace, pod.Name, patch); err != nil { 1334 // In case of any failure, we don't expect a Pod update for the 1335 // finalizer removed. Clear expectation now. 1336 if jobKey != "" { 1337 jm.finalizerExpectations.finalizerRemovalObserved(logger, jobKey, string(pod.UID)) 1338 } 1339 if !apierrors.IsNotFound(err) { 1340 errCh <- err 1341 utilruntime.HandleError(fmt.Errorf("removing tracking finalizer: %w", err)) 1342 return 1343 } 1344 } 1345 succeeded[i] = true 1346 } 1347 }(i) 1348 } 1349 wg.Wait() 1350 1351 return succeeded, errorFromChannel(errCh) 1352 } 1353 1354 // enactJobFinished adds the Complete or Failed condition and records events. 1355 // Returns whether the Job was considered finished. 
1356 func (jm *Controller) enactJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool {
1357 	if finishedCond == nil {
1358 		return false
1359 	}
1360 	if uncounted := job.Status.UncountedTerminatedPods; uncounted != nil {
1361 		if len(uncounted.Succeeded) > 0 || len(uncounted.Failed) > 0 {
1362 			return false
1363 		}
1364 	}
1365 	job.Status.Conditions, _ = ensureJobConditionStatus(job.Status.Conditions, finishedCond.Type, finishedCond.Status, finishedCond.Reason, finishedCond.Message, jm.clock.Now())
1366 	if finishedCond.Type == batch.JobComplete {
1367 		job.Status.CompletionTime = &finishedCond.LastTransitionTime
1368 	}
1369 	return true
1370 }
1371 
1372 // recordJobFinished records events and the job_finished_total metric for a finished job.
1373 func (jm *Controller) recordJobFinished(job *batch.Job, finishedCond *batch.JobCondition) bool {
1374 	completionMode := getCompletionMode(job)
1375 	if finishedCond.Type == batch.JobComplete {
1376 		if job.Spec.Completions != nil && job.Status.Succeeded > *job.Spec.Completions {
1377 			jm.recorder.Event(job, v1.EventTypeWarning, "TooManySucceededPods", "Too many succeeded pods running after completion count reached")
1378 		}
1379 		jm.recorder.Event(job, v1.EventTypeNormal, "Completed", "Job completed")
1380 		metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded", "").Inc()
1381 	} else {
1382 		jm.recorder.Event(job, v1.EventTypeWarning, finishedCond.Reason, finishedCond.Message)
1383 		metrics.JobFinishedNum.WithLabelValues(completionMode, "failed", finishedCond.Reason).Inc()
1384 	}
1385 	return true
1386 }
1387 
1388 func filterInUncountedUIDs(uncounted []types.UID, include sets.Set[string]) []types.UID {
1389 	var newUncounted []types.UID
1390 	for _, uid := range uncounted {
1391 		if include.Has(string(uid)) {
1392 			newUncounted = append(newUncounted, uid)
1393 		}
1394 	}
1395 	return newUncounted
1396 }
1397 
1398 // newFailedConditionForFailureTarget creates a job Failed condition based on
1399 // the interim FailureTarget condition.
1400 func newFailedConditionForFailureTarget(condition *batch.JobCondition, now time.Time) *batch.JobCondition {
1401 	return newCondition(batch.JobFailed, v1.ConditionTrue, condition.Reason, condition.Message, now)
1402 }
1403 
1404 // pastBackoffLimitOnFailure checks if the sum of container restart counts exceeds BackoffLimit.
1405 // This method applies only to pods with restartPolicy == OnFailure.
1406 func pastBackoffLimitOnFailure(job *batch.Job, pods []*v1.Pod) bool {
1407 	if job.Spec.Template.Spec.RestartPolicy != v1.RestartPolicyOnFailure {
1408 		return false
1409 	}
1410 	result := int32(0)
1411 	for i := range pods {
1412 		po := pods[i]
1413 		if po.Status.Phase == v1.PodRunning || po.Status.Phase == v1.PodPending {
1414 			for j := range po.Status.InitContainerStatuses {
1415 				stat := po.Status.InitContainerStatuses[j]
1416 				result += stat.RestartCount
1417 			}
1418 			for j := range po.Status.ContainerStatuses {
1419 				stat := po.Status.ContainerStatuses[j]
1420 				result += stat.RestartCount
1421 			}
1422 		}
1423 	}
1424 	if *job.Spec.BackoffLimit == 0 {
1425 		return result > 0
1426 	}
1427 	return result >= *job.Spec.BackoffLimit
1428 }
1429 
1430 // pastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if
1431 // it is exceeded. If the job is currently suspended, the function will always
1432 // return false.
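// For example, with .status.startTime=08:00:00 and activeDeadlineSeconds=600,
// the deadline is considered exceeded from 08:10:00 onwards, measured against
// the controller clock.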
1433 func (jm *Controller) pastActiveDeadline(job *batch.Job) bool {
1434 	if job.Spec.ActiveDeadlineSeconds == nil || job.Status.StartTime == nil || jobSuspended(job) {
1435 		return false
1436 	}
1437 	duration := jm.clock.Since(job.Status.StartTime.Time)
1438 	allowedDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds) * time.Second
1439 	return duration >= allowedDuration
1440 }
1441 
1442 func newCondition(conditionType batch.JobConditionType, status v1.ConditionStatus, reason, message string, now time.Time) *batch.JobCondition {
1443 	return &batch.JobCondition{
1444 		Type:               conditionType,
1445 		Status:             status,
1446 		LastProbeTime:      metav1.NewTime(now),
1447 		LastTransitionTime: metav1.NewTime(now),
1448 		Reason:             reason,
1449 		Message:            message,
1450 	}
1451 }
1452 
1453 // getFailJobMessage returns a Job failure message if any failed pod matches a FailJob rule of the pod failure policy.
1454 func getFailJobMessage(job *batch.Job, pods []*v1.Pod) *string {
1455 	if !feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) || job.Spec.PodFailurePolicy == nil {
1456 		return nil
1457 	}
1458 	for _, p := range pods {
1459 		if isPodFailed(p, job) {
1460 			jobFailureMessage, _, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, p)
1461 			if jobFailureMessage != nil {
1462 				return jobFailureMessage
1463 			}
1464 		}
1465 	}
1466 	return nil
1467 }
1468 
1469 // getNewFinishedPods returns the list of newly succeeded and failed pods that are not yet accounted
1470 // for in the job status. The list of failed pods can be affected by the podFailurePolicy.
1471 func getNewFinishedPods(jobCtx *syncJobCtx) (succeededPods, failedPods []*v1.Pod) {
1472 	succeededPods = getValidPodsWithFilter(jobCtx, jobCtx.uncounted.Succeeded(), func(p *v1.Pod) bool {
1473 		return p.Status.Phase == v1.PodSucceeded
1474 	})
1475 	failedPods = getValidPodsWithFilter(jobCtx, jobCtx.uncounted.Failed(), func(p *v1.Pod) bool {
1476 		return isPodFailed(p, jobCtx.job)
1477 	})
1478 	return succeededPods, failedPods
1479 }
1480 
1481 // jobSuspended returns whether a Job is suspended, i.e. whether spec.suspend
1482 // is set to true.
1483 func jobSuspended(job *batch.Job) bool {
1484 	return job.Spec.Suspend != nil && *job.Spec.Suspend
1485 }
1486 
1487 // manageJob is the core method responsible for managing the number of running
1488 // pods according to what is specified in the job.Spec.
1489 // Respects back-off; it does not create new pods if the back-off time has not passed.
1490 // Does NOT modify <activePods>.
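// The target number of active pods is derived from parallelism and the number
// of remaining completions, roughly:
//
//	wantActive = min(*spec.completions - succeeded, parallelism)   (clamped at 0)
//
// For example, completions=10, parallelism=3 and 8 succeeded pods yield a
// target of 2 active pods. When completions is unset, the target is
// parallelism until the first success, after which only the pods that are
// already running are kept.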
1491 func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syncJobCtx) (int32, string, error) {
1492 	logger := klog.FromContext(ctx)
1493 	active := int32(len(jobCtx.activePods))
1494 	parallelism := *job.Spec.Parallelism
1495 	jobKey, err := controller.KeyFunc(job)
1496 	if err != nil {
1497 		utilruntime.HandleError(fmt.Errorf("couldn't get key for job %#v: %v", job, err))
1498 		return 0, metrics.JobSyncActionTracking, nil
1499 	}
1500 
1501 	if jobSuspended(job) {
1502 		logger.V(4).Info("Deleting all active pods in suspended job", "job", klog.KObj(job), "active", active)
1503 		podsToDelete := activePodsForRemoval(job, jobCtx.activePods, int(active))
1504 		jm.expectations.ExpectDeletions(logger, jobKey, len(podsToDelete))
1505 		removed, err := jm.deleteJobPods(ctx, job, jobKey, podsToDelete)
1506 		active -= removed
1507 		return active, metrics.JobSyncActionPodsDeleted, err
1508 	}
1509 
1510 	var terminating int32 = 0
1511 	if onlyReplaceFailedPods(jobCtx.job) {
1512 		// When a pod failure policy is specified but the pod replacement policy
1513 		// feature is disabled, we still need to count terminating pods for the
1514 		// replica counts, but we do not surface the count in the Job status.
1515 		if jobCtx.terminating == nil {
1516 			terminating = controller.CountTerminatingPods(jobCtx.pods)
1517 		} else {
1518 			terminating = *jobCtx.terminating
1519 		}
1520 	}
1521 	wantActive := int32(0)
1522 	if job.Spec.Completions == nil {
1523 		// Job does not specify a number of completions. Therefore, the number of
1524 		// active pods should equal parallelism, unless the job has seen at least
1525 		// one success, in which case leave whatever is running, running.
1526 		if jobCtx.succeeded > 0 {
1527 			wantActive = active
1528 		} else {
1529 			wantActive = parallelism
1530 		}
1531 	} else {
1532 		// Job specifies a specific number of completions. Therefore, the number of
1533 		// active pods should never exceed the number of remaining completions.
1534 		wantActive = *job.Spec.Completions - jobCtx.succeeded
1535 		if wantActive > parallelism {
1536 			wantActive = parallelism
1537 		}
1538 		if wantActive < 0 {
1539 			wantActive = 0
1540 		}
1541 	}
1542 
1543 	rmAtLeast := active - wantActive
1544 	if rmAtLeast < 0 {
1545 		rmAtLeast = 0
1546 	}
1547 	podsToDelete := activePodsForRemoval(job, jobCtx.activePods, int(rmAtLeast))
1548 	if len(podsToDelete) > MaxPodCreateDeletePerSync {
1549 		podsToDelete = podsToDelete[:MaxPodCreateDeletePerSync]
1550 	}
1551 	if len(podsToDelete) > 0 {
1552 		jm.expectations.ExpectDeletions(logger, jobKey, len(podsToDelete))
1553 		logger.V(4).Info("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", wantActive)
1554 		removed, err := jm.deleteJobPods(ctx, job, jobKey, podsToDelete)
1555 		active -= removed
1556 		// While it is possible for a Job to require both pod creations and
1557 		// deletions at the same time (e.g. indexed Jobs with repeated indexes), we
1558 		// restrict ourselves to either just pod deletion or pod creation in any
1559 		// given sync cycle. Of these two, pod deletion takes precedence.
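		// Any creations that are still needed happen on a subsequent sync, once
		// the expected deletions have been observed.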
1560 		return active, metrics.JobSyncActionPodsDeleted, err
1561 	}
1562 
1563 	if diff := wantActive - terminating - active; diff > 0 {
1564 		var remainingTime time.Duration
1565 		if !hasBackoffLimitPerIndex(job) {
1566 			// we compute the global remaining time for pod creation when backoffLimitPerIndex is not used
1567 			remainingTime = jobCtx.newBackoffRecord.getRemainingTime(jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff)
1568 		}
1569 		if remainingTime > 0 {
1570 			jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
1571 			return 0, metrics.JobSyncActionPodsCreated, nil
1572 		}
1573 		if diff > int32(MaxPodCreateDeletePerSync) {
1574 			diff = int32(MaxPodCreateDeletePerSync)
1575 		}
1576 
1577 		var indexesToAdd []int
1578 		if isIndexedJob(job) {
1579 			indexesToAdd = firstPendingIndexes(jobCtx, int(diff), int(*job.Spec.Completions))
1580 			if hasBackoffLimitPerIndex(job) {
1581 				indexesToAdd, remainingTime = jm.getPodCreationInfoForIndependentIndexes(logger, indexesToAdd, jobCtx.podsWithDelayedDeletionPerIndex)
1582 				if remainingTime > 0 {
1583 					jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
1584 					return 0, metrics.JobSyncActionPodsCreated, nil
1585 				}
1586 			}
1587 			diff = int32(len(indexesToAdd))
1588 		}
1589 
1590 		jm.expectations.ExpectCreations(logger, jobKey, int(diff))
1591 		errCh := make(chan error, diff)
1592 		logger.V(4).Info("Too few pods running", "key", jobKey, "need", wantActive, "creating", diff)
1593 
1594 		wait := sync.WaitGroup{}
1595 
1596 		active += diff
1597 
1598 		podTemplate := job.Spec.Template.DeepCopy()
1599 		if isIndexedJob(job) {
1600 			addCompletionIndexEnvVariables(podTemplate)
1601 		}
1602 		podTemplate.Finalizers = appendJobCompletionFinalizerIfNotFound(podTemplate.Finalizers)
1603 
1604 		// Counters for pod creation status (used by the job_pods_creation_total metric)
1605 		var creationsSucceeded, creationsFailed int32 = 0, 0
1606 
1607 		// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
1608 		// and double with each successful iteration in a kind of "slow start".
1609 		// This handles attempts to start large numbers of pods that would
1610 		// likely all fail with the same error. For example a project with a
1611 		// low quota that attempts to create a large number of pods will be
1612 		// prevented from spamming the API service with the pod create requests
1613 		// after one of its pods fails. Conveniently, this also prevents the
1614 		// event spam that those failures would generate.
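		// For example, if the initial batch size is 1 and diff is 20, pods are
		// created in batches of 1, 2, 4, 8 and finally 5; if any batch reports an
		// error, the remaining pods are skipped until the next sync.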
1615 		for batchSize := min(diff, int32(controller.SlowStartInitialBatchSize)); diff > 0; batchSize = min(2*batchSize, diff) {
1616 			errorCount := len(errCh)
1617 			wait.Add(int(batchSize))
1618 			for i := int32(0); i < batchSize; i++ {
1619 				completionIndex := unknownCompletionIndex
1620 				if len(indexesToAdd) > 0 {
1621 					completionIndex = indexesToAdd[0]
1622 					indexesToAdd = indexesToAdd[1:]
1623 				}
1624 				go func() {
1625 					template := podTemplate
1626 					generateName := ""
1627 					if completionIndex != unknownCompletionIndex {
1628 						template = podTemplate.DeepCopy()
1629 						addCompletionIndexAnnotation(template, completionIndex)
1630 
1631 						if feature.DefaultFeatureGate.Enabled(features.PodIndexLabel) {
1632 							addCompletionIndexLabel(template, completionIndex)
1633 						}
1634 						template.Spec.Hostname = fmt.Sprintf("%s-%d", job.Name, completionIndex)
1635 						generateName = podGenerateNameWithIndex(job.Name, completionIndex)
1636 						if hasBackoffLimitPerIndex(job) {
1637 							addIndexFailureCountAnnotation(logger, template, job, jobCtx.podsWithDelayedDeletionPerIndex[completionIndex])
1638 						}
1639 					}
1640 					defer wait.Done()
1641 					err := jm.podControl.CreatePodsWithGenerateName(ctx, job.Namespace, template, job, metav1.NewControllerRef(job, controllerKind), generateName)
1642 					if err != nil {
1643 						if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
1644 							// If the namespace is being torn down, we can safely ignore
1645 							// this error since all subsequent creations will fail.
1646 							return
1647 						}
1648 					}
1649 					if err != nil {
1650 						defer utilruntime.HandleError(err)
1651 						// Decrement the expected number of creates because the informer won't observe this pod
1652 						logger.V(2).Info("Failed creation, decrementing expectations", "job", klog.KObj(job))
1653 						jm.expectations.CreationObserved(logger, jobKey)
1654 						atomic.AddInt32(&active, -1)
1655 						errCh <- err
1656 						atomic.AddInt32(&creationsFailed, 1)
						return
1657 					}
1658 					atomic.AddInt32(&creationsSucceeded, 1)
1659 				}()
1660 			}
1661 			wait.Wait()
1662 			// any skipped pods that we never attempted to start shouldn't be expected.
1663 			skippedPods := diff - batchSize
1664 			if errorCount < len(errCh) && skippedPods > 0 {
1665 				logger.V(2).Info("Slow-start failure. Skipping creating pods, decrementing expectations", "skippedCount", skippedPods, "job", klog.KObj(job))
1666 				active -= skippedPods
1667 				for i := int32(0); i < skippedPods; i++ {
1668 					// Decrement the expected number of creates because the informer won't observe this pod
1669 					jm.expectations.CreationObserved(logger, jobKey)
1670 				}
1671 				// The skipped pods will be retried later. The next controller resync will
1672 				// retry the slow start process.
1673 				break
1674 			}
1675 			diff -= batchSize
1676 		}
1677 		recordJobPodsCreationTotal(job, jobCtx, creationsSucceeded, creationsFailed)
1678 		return active, metrics.JobSyncActionPodsCreated, errorFromChannel(errCh)
1679 	}
1680 
1681 	return active, metrics.JobSyncActionTracking, nil
1682 }
1683 
1684 // getPodCreationInfoForIndependentIndexes returns the subset of indexesToAdd
1685 // for which pods can be created immediately. If no indexes are ready to
1686 // create pods, it returns the minimum remaining per-index backoff time
1687 // across all the indexes.
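// For example, if indexes 2 and 5 are the only candidates and both are still
// within their per-index failure backoff, the returned list is empty and the
// returned duration is the shorter of the two remaining backoffs.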
1688 func (jm *Controller) getPodCreationInfoForIndependentIndexes(logger klog.Logger, indexesToAdd []int, podsWithDelayedDeletionPerIndex map[int]*v1.Pod) ([]int, time.Duration) {
1689 	var indexesToAddNow []int
1690 	var minRemainingTimePerIndex *time.Duration
1691 	for _, indexToAdd := range indexesToAdd {
1692 		if remainingTimePerIndex := getRemainingTimePerIndex(logger, jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff, podsWithDelayedDeletionPerIndex[indexToAdd]); remainingTimePerIndex == 0 {
1693 			indexesToAddNow = append(indexesToAddNow, indexToAdd)
1694 		} else if minRemainingTimePerIndex == nil || remainingTimePerIndex < *minRemainingTimePerIndex {
1695 			minRemainingTimePerIndex = &remainingTimePerIndex
1696 		}
1697 	}
1698 	if len(indexesToAddNow) > 0 {
1699 		return indexesToAddNow, 0
1700 	}
1701 	return indexesToAddNow, ptr.Deref(minRemainingTimePerIndex, 0)
1702 }
1703 
1704 // activePodsForRemoval returns Pods that should be removed because there
1705 // are too many pods running or, if this is an indexed job, there are repeated
1706 // indexes or invalid indexes or some pods don't have indexes.
1707 // Sorts candidate pods in the order such that not-ready < ready, unscheduled
1708 // < scheduled, and pending < running. This ensures that we delete pods
1709 // in the earlier stages whenever possible.
1710 func activePodsForRemoval(job *batch.Job, pods []*v1.Pod, rmAtLeast int) []*v1.Pod {
1711 	var rm, left []*v1.Pod
1712 
1713 	if isIndexedJob(job) {
1714 		rm = make([]*v1.Pod, 0, rmAtLeast)
1715 		left = make([]*v1.Pod, 0, len(pods)-rmAtLeast)
1716 		rm, left = appendDuplicatedIndexPodsForRemoval(rm, left, pods, int(*job.Spec.Completions))
1717 	} else {
1718 		left = pods
1719 	}
1720 
1721 	if len(rm) < rmAtLeast {
1722 		sort.Sort(controller.ActivePods(left))
1723 		rm = append(rm, left[:rmAtLeast-len(rm)]...)
1724 	}
1725 	return rm
1726 }
1727 
1728 // updateJobStatus calls the API to update the job status.
1729 func (jm *Controller) updateJobStatus(ctx context.Context, job *batch.Job) (*batch.Job, error) {
1730 	return jm.kubeClient.BatchV1().Jobs(job.Namespace).UpdateStatus(ctx, job, metav1.UpdateOptions{})
1731 }
1732 
1733 func (jm *Controller) patchJob(ctx context.Context, job *batch.Job, data []byte) error {
1734 	_, err := jm.kubeClient.BatchV1().Jobs(job.Namespace).Patch(
1735 		ctx, job.Name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
1736 	return err
1737 }
1738 
1739 // getValidPodsWithFilter returns the valid pods that pass the filter.
1740 // Pods are valid if they still have the tracking finalizer, are not listed as uncounted
1741 // or expected to lose the finalizer and, for Indexed Jobs, have a valid completion index.
1742 func getValidPodsWithFilter(jobCtx *syncJobCtx, uncounted sets.Set[string], filter func(*v1.Pod) bool) []*v1.Pod {
1743 	var result []*v1.Pod
1744 	for _, p := range jobCtx.pods {
1745 		uid := string(p.UID)
1746 
1747 		// Skip pods without the tracking finalizer, pods already listed as uncounted, and pods
1748 		// whose finalizer removal is already expected; they have been, or will be, accounted for in the Job status.
1749 		if !hasJobTrackingFinalizer(p) || uncounted.Has(uid) || jobCtx.expectedRmFinalizers.Has(uid) {
1750 			continue
1751 		}
1752 		if isIndexedJob(jobCtx.job) {
1753 			idx := getCompletionIndex(p.Annotations)
1754 			if idx == unknownCompletionIndex || idx >= int(*jobCtx.job.Spec.Completions) {
1755 				continue
1756 			}
1757 		}
1758 		if filter(p) {
1759 			result = append(result, p)
1760 		}
1761 	}
1762 	return result
1763 }
1764 
1765 // getCompletionMode returns a string representation of the completion mode. Used as a label value for metrics.
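// The possible values are "Indexed" and "NonIndexed".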
1766 func getCompletionMode(job *batch.Job) string {
1767 	if isIndexedJob(job) {
1768 		return string(batch.IndexedCompletion)
1769 	}
1770 	return string(batch.NonIndexedCompletion)
1771 }
1772 
1773 func appendJobCompletionFinalizerIfNotFound(finalizers []string) []string {
1774 	for _, fin := range finalizers {
1775 		if fin == batch.JobTrackingFinalizer {
1776 			return finalizers
1777 		}
1778 	}
1779 	return append(finalizers, batch.JobTrackingFinalizer)
1780 }
1781 
// removeTrackingFinalizerPatch returns a strategic merge patch that removes the
// Job tracking finalizer from the Pod, or nil if the Pod does not carry it.
1782 func removeTrackingFinalizerPatch(pod *v1.Pod) []byte {
1783 	if !hasJobTrackingFinalizer(pod) {
1784 		return nil
1785 	}
1786 	patch := map[string]interface{}{
1787 		"metadata": map[string]interface{}{
1788 			"$deleteFromPrimitiveList/finalizers": []string{batch.JobTrackingFinalizer},
1789 		},
1790 	}
1791 	patchBytes, _ := json.Marshal(patch)
1792 	return patchBytes
1793 }
1794 
// uncountedTerminatedPods holds the UIDs of Pods that have finished (succeeded
// or failed) but have not yet been reflected in the Job status counters.
1795 type uncountedTerminatedPods struct {
1796 	succeeded sets.Set[string]
1797 	failed    sets.Set[string]
1798 }
1799 
1800 func newUncountedTerminatedPods(in batch.UncountedTerminatedPods) *uncountedTerminatedPods {
1801 	obj := uncountedTerminatedPods{
1802 		succeeded: make(sets.Set[string], len(in.Succeeded)),
1803 		failed:    make(sets.Set[string], len(in.Failed)),
1804 	}
1805 	for _, v := range in.Succeeded {
1806 		obj.succeeded.Insert(string(v))
1807 	}
1808 	for _, v := range in.Failed {
1809 		obj.failed.Insert(string(v))
1810 	}
1811 	return &obj
1812 }
1813 
1814 func (u *uncountedTerminatedPods) Succeeded() sets.Set[string] {
1815 	if u == nil {
1816 		return nil
1817 	}
1818 	return u.succeeded
1819 }
1820 
1821 func (u *uncountedTerminatedPods) Failed() sets.Set[string] {
1822 	if u == nil {
1823 		return nil
1824 	}
1825 	return u.failed
1826 }
1827 
// errorFromChannel returns the first buffered error from the channel, if any,
// without blocking.
1828 func errorFromChannel(errCh <-chan error) error {
1829 	select {
1830 	case err := <-errCh:
1831 		return err
1832 	default:
1833 	}
1834 	return nil
1835 }
1836 
1837 // ensureJobConditionStatus appends or updates an existing job condition of the
1838 // given type with the given status value. Note that this function will not
1839 // append to the conditions list if the new condition's status is false
1840 // (because going from nothing to false is meaningless); it can, however,
1841 // update the status condition to false. The function returns a bool to let the
1842 // caller know if the list was changed (either appended or updated).
1843 func ensureJobConditionStatus(list []batch.JobCondition, cType batch.JobConditionType, status v1.ConditionStatus, reason, message string, now time.Time) ([]batch.JobCondition, bool) {
1844 	if condition := findConditionByType(list, cType); condition != nil {
1845 		if condition.Status != status || condition.Reason != reason || condition.Message != message {
1846 			*condition = *newCondition(cType, status, reason, message, now)
1847 			return list, true
1848 		}
1849 		return list, false
1850 	}
1851 	// A condition with that type doesn't exist in the list.
1852 	if status != v1.ConditionFalse {
1853 		return append(list, *newCondition(cType, status, reason, message, now)), true
1854 	}
1855 	return list, false
1856 }
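
// For example, ensureJobConditionStatus(list, batch.JobSuspended, v1.ConditionFalse, ...)
// leaves the list unchanged when no JobSuspended condition exists, but flips an
// existing JobSuspended=True condition to False and reports the list as changed.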
1857 
1858 func isPodFailed(p *v1.Pod, job *batch.Job) bool {
1859 	if feature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) && feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
1860 		// When PodDisruptionConditions is enabled, orphan Pods and unschedulable
1861 		// terminating Pods are marked as Failed. So we only need to check the phase.
1862 		// TODO(#113855): Stop limiting this behavior to Jobs with podFailurePolicy.
1863 		// For now, we do so to avoid affecting all running Jobs without the
1864 		// ability to opt out into the old behavior.
1865 		return p.Status.Phase == v1.PodFailed
1866 	}
1867 	if p.Status.Phase == v1.PodFailed {
1868 		return true
1869 	}
1870 	if onlyReplaceFailedPods(job) {
1871 		return p.Status.Phase == v1.PodFailed
1872 	}
1873 	// Count deleted Pods as failures to account for orphan Pods that
1874 	// never have a chance to reach the Failed phase.
1875 	return p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
1876 }
1877 
1878 func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType) *batch.JobCondition {
1879 	for i := range list {
1880 		if list[i].Type == cType {
1881 			return &list[i]
1882 		}
1883 	}
1884 	return nil
1885 }
1886 
1887 func recordJobPodFinished(logger klog.Logger, job *batch.Job, oldCounters batch.JobStatus) {
1888 	completionMode := completionModeStr(job)
1889 	var diff int
1890 
1891 	// Updating the succeeded metric must be handled differently
1892 	// for Indexed Jobs to handle the case where the job has
1893 	// been scaled down by reducing completions & parallelism
1894 	// in tandem, so that a previously completed index is
1895 	// now out of range (i.e. index >= spec.Completions).
1896 	if isIndexedJob(job) {
1897 		completions := int(*job.Spec.Completions)
1898 		if job.Status.CompletedIndexes != oldCounters.CompletedIndexes {
1899 			diff = indexesCount(logger, &job.Status.CompletedIndexes, completions) - indexesCount(logger, &oldCounters.CompletedIndexes, completions)
1900 		}
1901 		backoffLimitLabel := backoffLimitMetricsLabel(job)
1902 		metrics.JobFinishedIndexesTotal.WithLabelValues(metrics.Succeeded, backoffLimitLabel).Add(float64(diff))
1903 		if hasBackoffLimitPerIndex(job) && job.Status.FailedIndexes != oldCounters.FailedIndexes {
1904 			if failedDiff := indexesCount(logger, job.Status.FailedIndexes, completions) - indexesCount(logger, oldCounters.FailedIndexes, completions); failedDiff > 0 {
1905 				metrics.JobFinishedIndexesTotal.WithLabelValues(metrics.Failed, backoffLimitLabel).Add(float64(failedDiff))
1906 			}
1907 		}
1908 	} else {
1909 		diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded)
1910 	}
1911 	metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff))
1912 
1913 	// Update failed metric.
1914 	diff = int(job.Status.Failed - oldCounters.Failed)
1915 	metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
1916 }
1917 
1918 func indexesCount(logger klog.Logger, indexesStr *string, completions int) int {
1919 	if indexesStr == nil {
1920 		return 0
1921 	}
1922 	return parseIndexesFromString(logger, *indexesStr, completions).total()
1923 }
1924 
1925 func backoffLimitMetricsLabel(job *batch.Job) string {
1926 	if hasBackoffLimitPerIndex(job) {
1927 		return "perIndex"
1928 	}
1929 	return "global"
1930 }
1931 
1932 func recordJobPodFailurePolicyActions(job *batch.Job, podFailureCountByPolicyAction map[string]int) {
1933 	for action, count := range podFailureCountByPolicyAction {
1934 		metrics.PodFailuresHandledByFailurePolicy.WithLabelValues(action).Add(float64(count))
1935 	}
1936 }
1937 
1938 func countReadyPods(pods []*v1.Pod) int32 {
1939 	cnt := int32(0)
1940 	for _, p := range pods {
1941 		if podutil.IsPodReady(p) {
1942 			cnt++
1943 		}
1944 	}
1945 	return cnt
1946 }
1947 
1948 // onlyReplaceFailedPods reports whether pods should be recreated only when they
1949 // are fully terminated (i.e. reach the Failed phase), rather than as soon as
1950 // they are terminating. This is the behavior of PodReplacementPolicy=Failed, and it also applies to Jobs with a pod failure policy.
1951 func onlyReplaceFailedPods(job *batch.Job) bool {
1952 	// We check PodReplacementPolicy both for nil and for Failed, because
1953 	// PodReplacementPolicy may not be defaulted when the JobPodReplacementPolicy
1954 	// feature gate is disabled on the API server.
1955 	if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) && job.Spec.PodReplacementPolicy != nil && *job.Spec.PodReplacementPolicy == batch.Failed {
1956 		return true
1957 	}
1958 	return feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil
1959 }
1960 
1961 func (jm *Controller) cleanupPodFinalizers(job *batch.Job) {
1962 	// Listing pods shouldn't really fail, as we are just querying the informer cache.
1963 	selector, err := metav1.LabelSelectorAsSelector(job.Spec.Selector)
1964 	if err != nil {
1965 		utilruntime.HandleError(fmt.Errorf("parsing deleted job selector: %v", err))
1966 		return
1967 	}
1968 	pods, _ := jm.podStore.Pods(job.Namespace).List(selector)
1969 	for _, pod := range pods {
1970 		if metav1.IsControlledBy(pod, job) && hasJobTrackingFinalizer(pod) {
1971 			jm.enqueueOrphanPod(pod)
1972 		}
1973 	}
1974 }
1975 
1976 func recordJobPodsCreationTotal(job *batch.Job, jobCtx *syncJobCtx, succeeded, failed int32) {
1977 	reason := metrics.PodCreateNew
1978 	if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
1979 		if ptr.Deref(job.Spec.PodReplacementPolicy, batch.TerminatingOrFailed) == batch.Failed && jobCtx.failed > 0 {
1980 			reason = metrics.PodRecreateFailed
1981 		} else if jobCtx.failed > 0 || ptr.Deref(jobCtx.terminating, 0) > 0 {
1982 			reason = metrics.PodRecreateTerminatingOrFailed
1983 		}
1984 	}
1985 	if succeeded > 0 {
1986 		metrics.JobPodsCreationTotal.WithLabelValues(reason, metrics.Succeeded).Add(float64(succeeded))
1987 	}
1988 	if failed > 0 {
1989 		metrics.JobPodsCreationTotal.WithLabelValues(reason, metrics.Failed).Add(float64(failed))
1990 	}
1991 }
1992 
1993 func managedByExternalController(jobObj *batch.Job) *string {
1994 	if feature.DefaultFeatureGate.Enabled(features.JobManagedBy) {
1995 		if controllerName := jobObj.Spec.ManagedBy; controllerName != nil && *controllerName != batch.JobControllerName {
1996 			return controllerName
1997 		}
1998 	}
1999 	return nil
2000 }
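
// managedByExternalController returns a non-nil controller name for a Job whose
// spec.managedBy names a controller other than the built-in one
// (batch.JobControllerName); such Jobs are left to that external controller to
// reconcile, while a nil field keeps the Job managed here. The field is only
// honored while the JobManagedBy feature gate is enabled.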