k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/cronjob/cronjob_controllerv2.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cronjob 18 19 import ( 20 "context" 21 "fmt" 22 "reflect" 23 "sort" 24 "strings" 25 "time" 26 27 "github.com/robfig/cron/v3" 28 29 batchv1 "k8s.io/api/batch/v1" 30 corev1 "k8s.io/api/core/v1" 31 "k8s.io/apimachinery/pkg/api/errors" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/labels" 34 "k8s.io/apimachinery/pkg/runtime" 35 "k8s.io/apimachinery/pkg/types" 36 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 37 "k8s.io/apimachinery/pkg/util/wait" 38 batchv1informers "k8s.io/client-go/informers/batch/v1" 39 clientset "k8s.io/client-go/kubernetes" 40 "k8s.io/client-go/kubernetes/scheme" 41 corev1client "k8s.io/client-go/kubernetes/typed/core/v1" 42 batchv1listers "k8s.io/client-go/listers/batch/v1" 43 "k8s.io/client-go/tools/cache" 44 "k8s.io/client-go/tools/record" 45 ref "k8s.io/client-go/tools/reference" 46 "k8s.io/client-go/util/workqueue" 47 "k8s.io/klog/v2" 48 "k8s.io/kubernetes/pkg/controller" 49 "k8s.io/kubernetes/pkg/controller/cronjob/metrics" 50 jobutil "k8s.io/kubernetes/pkg/controller/job/util" 51 "k8s.io/utils/pointer" 52 ) 53 54 var ( 55 // controllerKind contains the schema.GroupVersionKind for this controller type. 56 controllerKind = batchv1.SchemeGroupVersion.WithKind("CronJob") 57 58 nextScheduleDelta = 100 * time.Millisecond 59 ) 60 61 // ControllerV2 is a controller for CronJobs. 62 // Refactored Cronjob controller that uses DelayingQueue and informers 63 type ControllerV2 struct { 64 queue workqueue.TypedRateLimitingInterface[string] 65 66 kubeClient clientset.Interface 67 recorder record.EventRecorder 68 broadcaster record.EventBroadcaster 69 70 jobControl jobControlInterface 71 cronJobControl cjControlInterface 72 73 jobLister batchv1listers.JobLister 74 cronJobLister batchv1listers.CronJobLister 75 76 jobListerSynced cache.InformerSynced 77 cronJobListerSynced cache.InformerSynced 78 79 // now is a function that returns current time, done to facilitate unit tests 80 now func() time.Time 81 } 82 83 // NewControllerV2 creates and initializes a new Controller. 84 func NewControllerV2(ctx context.Context, jobInformer batchv1informers.JobInformer, cronJobsInformer batchv1informers.CronJobInformer, kubeClient clientset.Interface) (*ControllerV2, error) { 85 logger := klog.FromContext(ctx) 86 eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx)) 87 88 jm := &ControllerV2{ 89 queue: workqueue.NewTypedRateLimitingQueueWithConfig( 90 workqueue.DefaultTypedControllerRateLimiter[string](), 91 workqueue.TypedRateLimitingQueueConfig[string]{ 92 Name: "cronjob", 93 }, 94 ), 95 kubeClient: kubeClient, 96 broadcaster: eventBroadcaster, 97 recorder: eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "cronjob-controller"}), 98 99 jobControl: realJobControl{KubeClient: kubeClient}, 100 cronJobControl: &realCJControl{KubeClient: kubeClient}, 101 102 jobLister: jobInformer.Lister(), 103 cronJobLister: cronJobsInformer.Lister(), 104 105 jobListerSynced: jobInformer.Informer().HasSynced, 106 cronJobListerSynced: cronJobsInformer.Informer().HasSynced, 107 now: time.Now, 108 } 109 110 jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 111 AddFunc: jm.addJob, 112 UpdateFunc: jm.updateJob, 113 DeleteFunc: jm.deleteJob, 114 }) 115 116 cronJobsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 117 AddFunc: func(obj interface{}) { 118 jm.enqueueController(obj) 119 }, 120 UpdateFunc: func(oldObj, newObj interface{}) { 121 jm.updateCronJob(logger, oldObj, newObj) 122 }, 123 DeleteFunc: func(obj interface{}) { 124 jm.enqueueController(obj) 125 }, 126 }) 127 128 metrics.Register() 129 130 return jm, nil 131 } 132 133 // Run starts the main goroutine responsible for watching and syncing jobs. 134 func (jm *ControllerV2) Run(ctx context.Context, workers int) { 135 defer utilruntime.HandleCrash() 136 137 // Start event processing pipeline. 138 jm.broadcaster.StartStructuredLogging(3) 139 jm.broadcaster.StartRecordingToSink(&corev1client.EventSinkImpl{Interface: jm.kubeClient.CoreV1().Events("")}) 140 defer jm.broadcaster.Shutdown() 141 142 defer jm.queue.ShutDown() 143 144 logger := klog.FromContext(ctx) 145 logger.Info("Starting cronjob controller v2") 146 defer logger.Info("Shutting down cronjob controller v2") 147 148 if !cache.WaitForNamedCacheSync("cronjob", ctx.Done(), jm.jobListerSynced, jm.cronJobListerSynced) { 149 return 150 } 151 152 for i := 0; i < workers; i++ { 153 go wait.UntilWithContext(ctx, jm.worker, time.Second) 154 } 155 156 <-ctx.Done() 157 } 158 159 func (jm *ControllerV2) worker(ctx context.Context) { 160 for jm.processNextWorkItem(ctx) { 161 } 162 } 163 164 func (jm *ControllerV2) processNextWorkItem(ctx context.Context) bool { 165 key, quit := jm.queue.Get() 166 if quit { 167 return false 168 } 169 defer jm.queue.Done(key) 170 171 requeueAfter, err := jm.sync(ctx, key) 172 switch { 173 case err != nil: 174 utilruntime.HandleError(fmt.Errorf("error syncing CronJobController %v, requeuing: %w", key, err)) 175 jm.queue.AddRateLimited(key) 176 case requeueAfter != nil: 177 jm.queue.Forget(key) 178 jm.queue.AddAfter(key, *requeueAfter) 179 } 180 return true 181 } 182 183 func (jm *ControllerV2) sync(ctx context.Context, cronJobKey string) (*time.Duration, error) { 184 ns, name, err := cache.SplitMetaNamespaceKey(cronJobKey) 185 if err != nil { 186 return nil, err 187 } 188 logger := klog.FromContext(ctx) 189 cronJob, err := jm.cronJobLister.CronJobs(ns).Get(name) 190 switch { 191 case errors.IsNotFound(err): 192 // may be cronjob is deleted, don't need to requeue this key 193 logger.V(4).Info("CronJob not found, may be it is deleted", "cronjob", klog.KObj(cronJob), "err", err) 194 return nil, nil 195 case err != nil: 196 // for other transient apiserver error requeue with exponential backoff 197 return nil, err 198 } 199 200 jobsToBeReconciled, err := jm.getJobsToBeReconciled(cronJob) 201 if err != nil { 202 return nil, err 203 } 204 205 // cronJobCopy is used to combine all the updates to a 206 // CronJob object and perform an actual update only once. 207 cronJobCopy := cronJob.DeepCopy() 208 209 updateStatusAfterCleanup := jm.cleanupFinishedJobs(ctx, cronJobCopy, jobsToBeReconciled) 210 211 requeueAfter, updateStatusAfterSync, syncErr := jm.syncCronJob(ctx, cronJobCopy, jobsToBeReconciled) 212 if syncErr != nil { 213 logger.V(2).Info("Error reconciling cronjob", "cronjob", klog.KObj(cronJob), "err", syncErr) 214 } 215 216 // Update the CronJob if needed 217 if updateStatusAfterCleanup || updateStatusAfterSync { 218 if _, err := jm.cronJobControl.UpdateStatus(ctx, cronJobCopy); err != nil { 219 logger.V(2).Info("Unable to update status for cronjob", "cronjob", klog.KObj(cronJob), "resourceVersion", cronJob.ResourceVersion, "err", err) 220 return nil, err 221 } 222 } 223 224 if requeueAfter != nil { 225 logger.V(4).Info("Re-queuing cronjob", "cronjob", klog.KObj(cronJob), "requeueAfter", requeueAfter) 226 return requeueAfter, nil 227 } 228 // this marks the key done, currently only happens when the cronjob is suspended or spec has invalid schedule format 229 return nil, syncErr 230 } 231 232 // resolveControllerRef returns the controller referenced by a ControllerRef, 233 // or nil if the ControllerRef could not be resolved to a matching controller 234 // of the correct Kind. 235 func (jm *ControllerV2) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *batchv1.CronJob { 236 // We can't look up by UID, so look up by Name and then verify UID. 237 // Don't even try to look up by Name if it's the wrong Kind. 238 if controllerRef.Kind != controllerKind.Kind { 239 return nil 240 } 241 cronJob, err := jm.cronJobLister.CronJobs(namespace).Get(controllerRef.Name) 242 if err != nil { 243 return nil 244 } 245 if cronJob.UID != controllerRef.UID { 246 // The controller we found with this Name is not the same one that the 247 // ControllerRef points to. 248 return nil 249 } 250 return cronJob 251 } 252 253 func (jm *ControllerV2) getJobsToBeReconciled(cronJob *batchv1.CronJob) ([]*batchv1.Job, error) { 254 // list all jobs: there may be jobs with labels that don't match the template anymore, 255 // but that still have a ControllerRef to the given cronjob 256 jobList, err := jm.jobLister.Jobs(cronJob.Namespace).List(labels.Everything()) 257 if err != nil { 258 return nil, err 259 } 260 261 jobsToBeReconciled := []*batchv1.Job{} 262 263 for _, job := range jobList { 264 // If it has a ControllerRef, that's all that matters. 265 if controllerRef := metav1.GetControllerOf(job); controllerRef != nil && controllerRef.Name == cronJob.Name { 266 // this job is needs to be reconciled 267 jobsToBeReconciled = append(jobsToBeReconciled, job) 268 } 269 } 270 return jobsToBeReconciled, nil 271 } 272 273 // When a job is created, enqueue the controller that manages it and update it's expectations. 274 func (jm *ControllerV2) addJob(obj interface{}) { 275 job := obj.(*batchv1.Job) 276 if job.DeletionTimestamp != nil { 277 // on a restart of the controller, it's possible a new job shows up in a state that 278 // is already pending deletion. Prevent the job from being a creation observation. 279 jm.deleteJob(job) 280 return 281 } 282 283 // If it has a ControllerRef, that's all that matters. 284 if controllerRef := metav1.GetControllerOf(job); controllerRef != nil { 285 cronJob := jm.resolveControllerRef(job.Namespace, controllerRef) 286 if cronJob == nil { 287 return 288 } 289 jm.enqueueController(cronJob) 290 return 291 } 292 } 293 294 // updateJob figures out what CronJob(s) manage a Job when the Job 295 // is updated and wake them up. If the anything of the Job have changed, we need to 296 // awaken both the old and new CronJob. old and cur must be *batchv1.Job 297 // types. 298 func (jm *ControllerV2) updateJob(old, cur interface{}) { 299 curJob := cur.(*batchv1.Job) 300 oldJob := old.(*batchv1.Job) 301 if curJob.ResourceVersion == oldJob.ResourceVersion { 302 // Periodic resync will send update events for all known jobs. 303 // Two different versions of the same jobs will always have different RVs. 304 return 305 } 306 307 curControllerRef := metav1.GetControllerOf(curJob) 308 oldControllerRef := metav1.GetControllerOf(oldJob) 309 controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef) 310 if controllerRefChanged && oldControllerRef != nil { 311 // The ControllerRef was changed. Sync the old controller, if any. 312 if cronJob := jm.resolveControllerRef(oldJob.Namespace, oldControllerRef); cronJob != nil { 313 jm.enqueueController(cronJob) 314 } 315 } 316 317 // If it has a ControllerRef, that's all that matters. 318 if curControllerRef != nil { 319 cronJob := jm.resolveControllerRef(curJob.Namespace, curControllerRef) 320 if cronJob == nil { 321 return 322 } 323 jm.enqueueController(cronJob) 324 return 325 } 326 } 327 328 func (jm *ControllerV2) deleteJob(obj interface{}) { 329 job, ok := obj.(*batchv1.Job) 330 331 // When a delete is dropped, the relist will notice a job in the store not 332 // in the list, leading to the insertion of a tombstone object which contains 333 // the deleted key/value. Note that this value might be stale. 334 if !ok { 335 tombstone, ok := obj.(cache.DeletedFinalStateUnknown) 336 if !ok { 337 utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj)) 338 return 339 } 340 job, ok = tombstone.Obj.(*batchv1.Job) 341 if !ok { 342 utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a Job %#v", obj)) 343 return 344 } 345 } 346 347 controllerRef := metav1.GetControllerOf(job) 348 if controllerRef == nil { 349 // No controller should care about orphans being deleted. 350 return 351 } 352 cronJob := jm.resolveControllerRef(job.Namespace, controllerRef) 353 if cronJob == nil { 354 return 355 } 356 jm.enqueueController(cronJob) 357 } 358 359 func (jm *ControllerV2) enqueueController(obj interface{}) { 360 key, err := controller.KeyFunc(obj) 361 if err != nil { 362 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err)) 363 return 364 } 365 366 jm.queue.Add(key) 367 } 368 369 func (jm *ControllerV2) enqueueControllerAfter(obj interface{}, t time.Duration) { 370 key, err := controller.KeyFunc(obj) 371 if err != nil { 372 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err)) 373 return 374 } 375 376 jm.queue.AddAfter(key, t) 377 } 378 379 // updateCronJob re-queues the CronJob for next scheduled time if there is a 380 // change in spec.schedule otherwise it re-queues it now 381 func (jm *ControllerV2) updateCronJob(logger klog.Logger, old interface{}, curr interface{}) { 382 oldCJ, okOld := old.(*batchv1.CronJob) 383 newCJ, okNew := curr.(*batchv1.CronJob) 384 385 if !okOld || !okNew { 386 // typecasting of one failed, handle this better, may be log entry 387 return 388 } 389 // if the change in schedule results in next requeue having to be sooner than it already was, 390 // it will be handled here by the queue. If the next requeue is further than previous schedule, 391 // the sync loop will essentially be a no-op for the already queued key with old schedule. 392 if oldCJ.Spec.Schedule != newCJ.Spec.Schedule || !pointer.StringEqual(oldCJ.Spec.TimeZone, newCJ.Spec.TimeZone) { 393 // schedule changed, change the requeue time, pass nil recorder so that syncCronJob will output any warnings 394 sched, err := cron.ParseStandard(formatSchedule(newCJ, nil)) 395 if err != nil { 396 // this is likely a user error in defining the spec value 397 // we should log the error and not reconcile this cronjob until an update to spec 398 logger.V(2).Info("Unparseable schedule for cronjob", "cronjob", klog.KObj(newCJ), "schedule", newCJ.Spec.Schedule, "err", err) 399 jm.recorder.Eventf(newCJ, corev1.EventTypeWarning, "UnParseableCronJobSchedule", "unparseable schedule for cronjob: %s", newCJ.Spec.Schedule) 400 return 401 } 402 now := jm.now() 403 t := nextScheduleTimeDuration(newCJ, now, sched) 404 405 jm.enqueueControllerAfter(curr, *t) 406 return 407 } 408 409 // other parameters changed, requeue this now and if this gets triggered 410 // within deadline, sync loop will work on the CJ otherwise updates will be handled 411 // during the next schedule 412 // TODO: need to handle the change of spec.JobTemplate.metadata.labels explicitly 413 // to cleanup jobs with old labels 414 jm.enqueueController(curr) 415 } 416 417 // syncCronJob reconciles a CronJob with a list of any Jobs that it created. 418 // All known jobs created by "cronJob" should be included in "jobs". 419 // The current time is passed in to facilitate testing. 420 // It returns a bool to indicate an update to api-server is needed 421 func (jm *ControllerV2) syncCronJob( 422 ctx context.Context, 423 cronJob *batchv1.CronJob, 424 jobs []*batchv1.Job) (*time.Duration, bool, error) { 425 426 now := jm.now() 427 updateStatus := false 428 429 childrenJobs := make(map[types.UID]bool) 430 for _, j := range jobs { 431 childrenJobs[j.ObjectMeta.UID] = true 432 found := inActiveList(cronJob, j.ObjectMeta.UID) 433 if !found && !jobutil.IsJobFinished(j) { 434 cjCopy, err := jm.cronJobControl.GetCronJob(ctx, cronJob.Namespace, cronJob.Name) 435 if err != nil { 436 return nil, updateStatus, err 437 } 438 if inActiveList(cjCopy, j.ObjectMeta.UID) { 439 cronJob = cjCopy 440 continue 441 } 442 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %s", j.Name) 443 // We found an unfinished job that has us as the parent, but it is not in our Active list. 444 // This could happen if we crashed right after creating the Job and before updating the status, 445 // or if our jobs list is newer than our cj status after a relist, or if someone intentionally created 446 // a job that they wanted us to adopt. 447 } else if found && jobutil.IsJobFinished(j) { 448 _, condition := jobutil.FinishedCondition(j) 449 deleteFromActiveList(cronJob, j.ObjectMeta.UID) 450 jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "SawCompletedJob", "Saw completed job: %s, condition: %v", j.Name, condition) 451 updateStatus = true 452 } else if jobutil.IsJobSucceeded(j) { 453 // a job does not have to be in active list, as long as it has completed successfully, we will process the timestamp 454 if cronJob.Status.LastSuccessfulTime == nil { 455 cronJob.Status.LastSuccessfulTime = j.Status.CompletionTime 456 updateStatus = true 457 } 458 if j.Status.CompletionTime != nil && j.Status.CompletionTime.After(cronJob.Status.LastSuccessfulTime.Time) { 459 cronJob.Status.LastSuccessfulTime = j.Status.CompletionTime 460 updateStatus = true 461 } 462 } 463 } 464 465 // Remove any job reference from the active list if the corresponding job does not exist any more. 466 // Otherwise, the cronjob may be stuck in active mode forever even though there is no matching 467 // job running. 468 for _, j := range cronJob.Status.Active { 469 _, found := childrenJobs[j.UID] 470 if found { 471 continue 472 } 473 // Explicitly try to get the job from api-server to avoid a slow watch not able to update 474 // the job lister on time, giving an unwanted miss 475 _, err := jm.jobControl.GetJob(j.Namespace, j.Name) 476 switch { 477 case errors.IsNotFound(err): 478 // The job is actually missing, delete from active list and schedule a new one if within 479 // deadline 480 jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "MissingJob", "Active job went missing: %v", j.Name) 481 deleteFromActiveList(cronJob, j.UID) 482 updateStatus = true 483 case err != nil: 484 return nil, updateStatus, err 485 } 486 // the job is missing in the lister but found in api-server 487 } 488 489 if cronJob.DeletionTimestamp != nil { 490 // The CronJob is being deleted. 491 // Don't do anything other than updating status. 492 return nil, updateStatus, nil 493 } 494 495 logger := klog.FromContext(ctx) 496 if cronJob.Spec.TimeZone != nil { 497 timeZone := pointer.StringDeref(cronJob.Spec.TimeZone, "") 498 if _, err := time.LoadLocation(timeZone); err != nil { 499 logger.V(4).Info("Not starting job because timeZone is invalid", "cronjob", klog.KObj(cronJob), "timeZone", timeZone, "err", err) 500 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "UnknownTimeZone", "invalid timeZone: %q: %s", timeZone, err) 501 return nil, updateStatus, nil 502 } 503 } 504 505 if cronJob.Spec.Suspend != nil && *cronJob.Spec.Suspend { 506 logger.V(4).Info("Not starting job because the cron is suspended", "cronjob", klog.KObj(cronJob)) 507 return nil, updateStatus, nil 508 } 509 510 sched, err := cron.ParseStandard(formatSchedule(cronJob, jm.recorder)) 511 if err != nil { 512 // this is likely a user error in defining the spec value 513 // we should log the error and not reconcile this cronjob until an update to spec 514 logger.V(2).Info("Unparseable schedule", "cronjob", klog.KObj(cronJob), "schedule", cronJob.Spec.Schedule, "err", err) 515 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "UnparseableSchedule", "unparseable schedule: %q : %s", cronJob.Spec.Schedule, err) 516 return nil, updateStatus, nil 517 } 518 519 scheduledTime, err := nextScheduleTime(logger, cronJob, now, sched, jm.recorder) 520 if err != nil { 521 // this is likely a user error in defining the spec value 522 // we should log the error and not reconcile this cronjob until an update to spec 523 logger.V(2).Info("Invalid schedule", "cronjob", klog.KObj(cronJob), "schedule", cronJob.Spec.Schedule, "err", err) 524 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "InvalidSchedule", "invalid schedule: %s : %s", cronJob.Spec.Schedule, err) 525 return nil, updateStatus, nil 526 } 527 if scheduledTime == nil { 528 // no unmet start time, return cj,. 529 // The only time this should happen is if queue is filled after restart. 530 // Otherwise, the queue is always suppose to trigger sync function at the time of 531 // the scheduled time, that will give atleast 1 unmet time schedule 532 logger.V(4).Info("No unmet start times", "cronjob", klog.KObj(cronJob)) 533 t := nextScheduleTimeDuration(cronJob, now, sched) 534 return t, updateStatus, nil 535 } 536 537 tooLate := false 538 if cronJob.Spec.StartingDeadlineSeconds != nil { 539 tooLate = scheduledTime.Add(time.Second * time.Duration(*cronJob.Spec.StartingDeadlineSeconds)).Before(now) 540 } 541 if tooLate { 542 logger.V(4).Info("Missed starting window", "cronjob", klog.KObj(cronJob)) 543 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "MissSchedule", "Missed scheduled time to start a job: %s", scheduledTime.UTC().Format(time.RFC1123Z)) 544 545 // TODO: Since we don't set LastScheduleTime when not scheduling, we are going to keep noticing 546 // the miss every cycle. In order to avoid sending multiple events, and to avoid processing 547 // the cj again and again, we could set a Status.LastMissedTime when we notice a miss. 548 // Then, when we call getRecentUnmetScheduleTimes, we can take max(creationTimestamp, 549 // Status.LastScheduleTime, Status.LastMissedTime), and then so we won't generate 550 // and event the next time we process it, and also so the user looking at the status 551 // can see easily that there was a missed execution. 552 t := nextScheduleTimeDuration(cronJob, now, sched) 553 return t, updateStatus, nil 554 } 555 if inActiveListByName(cronJob, &batchv1.Job{ 556 ObjectMeta: metav1.ObjectMeta{ 557 Name: getJobName(cronJob, *scheduledTime), 558 Namespace: cronJob.Namespace, 559 }}) || cronJob.Status.LastScheduleTime.Equal(&metav1.Time{Time: *scheduledTime}) { 560 logger.V(4).Info("Not starting job because the scheduled time is already processed", "cronjob", klog.KObj(cronJob), "schedule", scheduledTime) 561 t := nextScheduleTimeDuration(cronJob, now, sched) 562 return t, updateStatus, nil 563 } 564 if cronJob.Spec.ConcurrencyPolicy == batchv1.ForbidConcurrent && len(cronJob.Status.Active) > 0 { 565 // Regardless which source of information we use for the set of active jobs, 566 // there is some risk that we won't see an active job when there is one. 567 // (because we haven't seen the status update to the SJ or the created pod). 568 // So it is theoretically possible to have concurrency with Forbid. 569 // As long the as the invocations are "far enough apart in time", this usually won't happen. 570 // 571 // TODO: for Forbid, we could use the same name for every execution, as a lock. 572 // With replace, we could use a name that is deterministic per execution time. 573 // But that would mean that you could not inspect prior successes or failures of Forbid jobs. 574 logger.V(4).Info("Not starting job because prior execution is still running and concurrency policy is Forbid", "cronjob", klog.KObj(cronJob)) 575 jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "JobAlreadyActive", "Not starting job because prior execution is running and concurrency policy is Forbid") 576 t := nextScheduleTimeDuration(cronJob, now, sched) 577 return t, updateStatus, nil 578 } 579 if cronJob.Spec.ConcurrencyPolicy == batchv1.ReplaceConcurrent { 580 for _, j := range cronJob.Status.Active { 581 logger.V(4).Info("Deleting job that was still running at next scheduled start time", "job", klog.KRef(j.Namespace, j.Name)) 582 job, err := jm.jobControl.GetJob(j.Namespace, j.Name) 583 if err != nil { 584 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "FailedGet", "Get job: %v", err) 585 return nil, updateStatus, err 586 } 587 if !deleteJob(logger, cronJob, job, jm.jobControl, jm.recorder) { 588 return nil, updateStatus, fmt.Errorf("could not replace job %s/%s", job.Namespace, job.Name) 589 } 590 updateStatus = true 591 } 592 } 593 594 jobAlreadyExists := false 595 jobReq, err := getJobFromTemplate2(cronJob, *scheduledTime) 596 if err != nil { 597 logger.Error(err, "Unable to make Job from template", "cronjob", klog.KObj(cronJob)) 598 return nil, updateStatus, err 599 } 600 jobResp, err := jm.jobControl.CreateJob(cronJob.Namespace, jobReq) 601 switch { 602 case errors.HasStatusCause(err, corev1.NamespaceTerminatingCause): 603 // if the namespace is being terminated, we don't have to do 604 // anything because any creation will fail 605 return nil, updateStatus, err 606 case errors.IsAlreadyExists(err): 607 // If the job is created by other actor, assume it has updated the cronjob status accordingly. 608 // However, if the job was created by cronjob controller, this means we've previously created the job 609 // but failed to update the active list in the status, in which case we should reattempt to add the job 610 // into the active list and update the status. 611 jobAlreadyExists = true 612 job, err := jm.jobControl.GetJob(jobReq.GetNamespace(), jobReq.GetName()) 613 if err != nil { 614 return nil, updateStatus, err 615 } 616 jobResp = job 617 618 // check that this job is owned by cronjob controller, otherwise do nothing and assume external controller 619 // is updating the status. 620 if !metav1.IsControlledBy(job, cronJob) { 621 return nil, updateStatus, nil 622 } 623 624 // Recheck if the job is missing from the active list before attempting to update the status again. 625 found := inActiveList(cronJob, job.ObjectMeta.UID) 626 if found { 627 return nil, updateStatus, nil 628 } 629 case err != nil: 630 // default error handling 631 jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err) 632 return nil, updateStatus, err 633 } 634 635 if jobAlreadyExists { 636 logger.Info("Job already exists", "cronjob", klog.KObj(cronJob), "job", klog.KObj(jobReq)) 637 } else { 638 metrics.CronJobCreationSkew.Observe(jobResp.ObjectMeta.GetCreationTimestamp().Sub(*scheduledTime).Seconds()) 639 logger.V(4).Info("Created Job", "job", klog.KObj(jobResp), "cronjob", klog.KObj(cronJob)) 640 jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name) 641 } 642 643 // ------------------------------------------------------------------ // 644 645 // If this process restarts at this point (after posting a job, but 646 // before updating the status), then we might try to start the job on 647 // the next time. Actually, if we re-list the SJs and Jobs on the next 648 // iteration of syncAll, we might not see our own status update, and 649 // then post one again. So, we need to use the job name as a lock to 650 // prevent us from making the job twice (name the job with hash of its 651 // scheduled time). 652 653 // Add the just-started job to the status list. 654 jobRef, err := getRef(jobResp) 655 if err != nil { 656 logger.V(2).Info("Unable to make object reference", "cronjob", klog.KObj(cronJob), "err", err) 657 return nil, updateStatus, fmt.Errorf("unable to make object reference for job for %s", klog.KObj(cronJob)) 658 } 659 cronJob.Status.Active = append(cronJob.Status.Active, *jobRef) 660 cronJob.Status.LastScheduleTime = &metav1.Time{Time: *scheduledTime} 661 updateStatus = true 662 663 t := nextScheduleTimeDuration(cronJob, now, sched) 664 return t, updateStatus, nil 665 } 666 667 func getJobName(cj *batchv1.CronJob, scheduledTime time.Time) string { 668 return fmt.Sprintf("%s-%d", cj.Name, getTimeHashInMinutes(scheduledTime)) 669 } 670 671 // cleanupFinishedJobs cleanups finished jobs created by a CronJob 672 // It returns a bool to indicate an update to api-server is needed 673 func (jm *ControllerV2) cleanupFinishedJobs(ctx context.Context, cj *batchv1.CronJob, js []*batchv1.Job) bool { 674 // If neither limits are active, there is no need to do anything. 675 if cj.Spec.FailedJobsHistoryLimit == nil && cj.Spec.SuccessfulJobsHistoryLimit == nil { 676 return false 677 } 678 679 updateStatus := false 680 failedJobs := []*batchv1.Job{} 681 successfulJobs := []*batchv1.Job{} 682 683 for _, job := range js { 684 isFinished, finishedStatus := jm.getFinishedStatus(job) 685 if isFinished && finishedStatus == batchv1.JobComplete { 686 successfulJobs = append(successfulJobs, job) 687 } else if isFinished && finishedStatus == batchv1.JobFailed { 688 failedJobs = append(failedJobs, job) 689 } 690 } 691 692 if cj.Spec.SuccessfulJobsHistoryLimit != nil && 693 jm.removeOldestJobs(ctx, cj, 694 successfulJobs, 695 *cj.Spec.SuccessfulJobsHistoryLimit) { 696 updateStatus = true 697 } 698 699 if cj.Spec.FailedJobsHistoryLimit != nil && 700 jm.removeOldestJobs(ctx, cj, 701 failedJobs, 702 *cj.Spec.FailedJobsHistoryLimit) { 703 updateStatus = true 704 } 705 706 return updateStatus 707 } 708 709 func (jm *ControllerV2) getFinishedStatus(j *batchv1.Job) (bool, batchv1.JobConditionType) { 710 for _, c := range j.Status.Conditions { 711 if (c.Type == batchv1.JobComplete || c.Type == batchv1.JobFailed) && c.Status == corev1.ConditionTrue { 712 return true, c.Type 713 } 714 } 715 return false, "" 716 } 717 718 // removeOldestJobs removes the oldest jobs from a list of jobs 719 func (jm *ControllerV2) removeOldestJobs(ctx context.Context, cj *batchv1.CronJob, js []*batchv1.Job, maxJobs int32) bool { 720 updateStatus := false 721 numToDelete := len(js) - int(maxJobs) 722 if numToDelete <= 0 { 723 return updateStatus 724 } 725 logger := klog.FromContext(ctx) 726 logger.V(4).Info("Cleaning up jobs from CronJob list", "deletejobnum", numToDelete, "jobnum", len(js), "cronjob", klog.KObj(cj)) 727 728 sort.Sort(byJobStartTime(js)) 729 for i := 0; i < numToDelete; i++ { 730 logger.V(4).Info("Removing job from CronJob list", "job", js[i].Name, "cronjob", klog.KObj(cj)) 731 if deleteJob(logger, cj, js[i], jm.jobControl, jm.recorder) { 732 updateStatus = true 733 } 734 } 735 return updateStatus 736 } 737 738 // deleteJob reaps a job, deleting the job, the pods and the reference in the active list 739 func deleteJob(logger klog.Logger, cj *batchv1.CronJob, job *batchv1.Job, jc jobControlInterface, recorder record.EventRecorder) bool { 740 // delete the job itself... 741 if err := jc.DeleteJob(job.Namespace, job.Name); err != nil { 742 recorder.Eventf(cj, corev1.EventTypeWarning, "FailedDelete", "Deleted job: %v", err) 743 logger.Error(err, "Error deleting job from cronjob", "job", klog.KObj(job), "cronjob", klog.KObj(cj)) 744 return false 745 } 746 // ... and its reference from active list 747 deleteFromActiveList(cj, job.ObjectMeta.UID) 748 recorder.Eventf(cj, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted job %v", job.Name) 749 750 return true 751 } 752 753 func getRef(object runtime.Object) (*corev1.ObjectReference, error) { 754 return ref.GetReference(scheme.Scheme, object) 755 } 756 757 func formatSchedule(cj *batchv1.CronJob, recorder record.EventRecorder) string { 758 if strings.Contains(cj.Spec.Schedule, "TZ") { 759 if recorder != nil { 760 recorder.Eventf(cj, corev1.EventTypeWarning, "UnsupportedSchedule", "CRON_TZ or TZ used in schedule %q is not officially supported, see https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/ for more details", cj.Spec.Schedule) 761 } 762 763 return cj.Spec.Schedule 764 } 765 766 if cj.Spec.TimeZone != nil { 767 if _, err := time.LoadLocation(*cj.Spec.TimeZone); err != nil { 768 return cj.Spec.Schedule 769 } 770 771 return fmt.Sprintf("TZ=%s %s", *cj.Spec.TimeZone, cj.Spec.Schedule) 772 } 773 774 return cj.Spec.Schedule 775 }