k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/cronjob/cronjob_controllerv2.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cronjob
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"reflect"
    23  	"sort"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/robfig/cron/v3"
    28  
    29  	batchv1 "k8s.io/api/batch/v1"
    30  	corev1 "k8s.io/api/core/v1"
    31  	"k8s.io/apimachinery/pkg/api/errors"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/labels"
    34  	"k8s.io/apimachinery/pkg/runtime"
    35  	"k8s.io/apimachinery/pkg/types"
    36  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    37  	"k8s.io/apimachinery/pkg/util/wait"
    38  	batchv1informers "k8s.io/client-go/informers/batch/v1"
    39  	clientset "k8s.io/client-go/kubernetes"
    40  	"k8s.io/client-go/kubernetes/scheme"
    41  	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
    42  	batchv1listers "k8s.io/client-go/listers/batch/v1"
    43  	"k8s.io/client-go/tools/cache"
    44  	"k8s.io/client-go/tools/record"
    45  	ref "k8s.io/client-go/tools/reference"
    46  	"k8s.io/client-go/util/workqueue"
    47  	"k8s.io/klog/v2"
    48  	"k8s.io/kubernetes/pkg/controller"
    49  	"k8s.io/kubernetes/pkg/controller/cronjob/metrics"
    50  	jobutil "k8s.io/kubernetes/pkg/controller/job/util"
    51  	"k8s.io/utils/pointer"
    52  )
    53  
var (
	// controllerKind contains the schema.GroupVersionKind for this controller type.
	controllerKind = batchv1.SchemeGroupVersion.WithKind("CronJob")

	// nextScheduleDelta is a fixed 100ms duration.
	// NOTE(review): it is not referenced in this part of the file; presumably it
	// pads the computed requeue delay so a sync lands just after the scheduled
	// time - confirm against nextScheduleTimeDuration.
	nextScheduleDelta = 100 * time.Millisecond
)
    60  
// ControllerV2 is a controller for CronJobs.
// Refactored Cronjob controller that uses DelayingQueue and informers
type ControllerV2 struct {
	// queue holds the string keys of CronJobs waiting to be synced; keys are
	// added immediately, after a delay, or rate limited after a failed sync.
	queue workqueue.TypedRateLimitingInterface[string]

	// kubeClient is used to build the event sink in Run and is wrapped by the
	// real job / cronjob controls created in NewControllerV2.
	kubeClient clientset.Interface
	// recorder emits events about CronJobs; it is created from broadcaster.
	recorder record.EventRecorder
	// broadcaster is started in Run and shut down when Run returns.
	broadcaster record.EventBroadcaster

	// jobControl is used to get, create and delete Jobs.
	jobControl jobControlInterface
	// cronJobControl is used to fetch CronJobs and update their status.
	cronJobControl cjControlInterface

	// jobLister and cronJobLister read from the informers handed to
	// NewControllerV2.
	jobLister     batchv1listers.JobLister
	cronJobLister batchv1listers.CronJobLister

	// jobListerSynced and cronJobListerSynced report whether the corresponding
	// informer has synced; Run waits on both before starting workers.
	jobListerSynced     cache.InformerSynced
	cronJobListerSynced cache.InformerSynced

	// now is a function that returns current time, done to facilitate unit tests
	now func() time.Time
}
    82  
    83  // NewControllerV2 creates and initializes a new Controller.
    84  func NewControllerV2(ctx context.Context, jobInformer batchv1informers.JobInformer, cronJobsInformer batchv1informers.CronJobInformer, kubeClient clientset.Interface) (*ControllerV2, error) {
    85  	logger := klog.FromContext(ctx)
    86  	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
    87  
    88  	jm := &ControllerV2{
    89  		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
    90  			workqueue.DefaultTypedControllerRateLimiter[string](),
    91  			workqueue.TypedRateLimitingQueueConfig[string]{
    92  				Name: "cronjob",
    93  			},
    94  		),
    95  		kubeClient:  kubeClient,
    96  		broadcaster: eventBroadcaster,
    97  		recorder:    eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "cronjob-controller"}),
    98  
    99  		jobControl:     realJobControl{KubeClient: kubeClient},
   100  		cronJobControl: &realCJControl{KubeClient: kubeClient},
   101  
   102  		jobLister:     jobInformer.Lister(),
   103  		cronJobLister: cronJobsInformer.Lister(),
   104  
   105  		jobListerSynced:     jobInformer.Informer().HasSynced,
   106  		cronJobListerSynced: cronJobsInformer.Informer().HasSynced,
   107  		now:                 time.Now,
   108  	}
   109  
   110  	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   111  		AddFunc:    jm.addJob,
   112  		UpdateFunc: jm.updateJob,
   113  		DeleteFunc: jm.deleteJob,
   114  	})
   115  
   116  	cronJobsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   117  		AddFunc: func(obj interface{}) {
   118  			jm.enqueueController(obj)
   119  		},
   120  		UpdateFunc: func(oldObj, newObj interface{}) {
   121  			jm.updateCronJob(logger, oldObj, newObj)
   122  		},
   123  		DeleteFunc: func(obj interface{}) {
   124  			jm.enqueueController(obj)
   125  		},
   126  	})
   127  
   128  	metrics.Register()
   129  
   130  	return jm, nil
   131  }
   132  
   133  // Run starts the main goroutine responsible for watching and syncing jobs.
   134  func (jm *ControllerV2) Run(ctx context.Context, workers int) {
   135  	defer utilruntime.HandleCrash()
   136  
   137  	// Start event processing pipeline.
   138  	jm.broadcaster.StartStructuredLogging(3)
   139  	jm.broadcaster.StartRecordingToSink(&corev1client.EventSinkImpl{Interface: jm.kubeClient.CoreV1().Events("")})
   140  	defer jm.broadcaster.Shutdown()
   141  
   142  	defer jm.queue.ShutDown()
   143  
   144  	logger := klog.FromContext(ctx)
   145  	logger.Info("Starting cronjob controller v2")
   146  	defer logger.Info("Shutting down cronjob controller v2")
   147  
   148  	if !cache.WaitForNamedCacheSync("cronjob", ctx.Done(), jm.jobListerSynced, jm.cronJobListerSynced) {
   149  		return
   150  	}
   151  
   152  	for i := 0; i < workers; i++ {
   153  		go wait.UntilWithContext(ctx, jm.worker, time.Second)
   154  	}
   155  
   156  	<-ctx.Done()
   157  }
   158  
   159  func (jm *ControllerV2) worker(ctx context.Context) {
   160  	for jm.processNextWorkItem(ctx) {
   161  	}
   162  }
   163  
   164  func (jm *ControllerV2) processNextWorkItem(ctx context.Context) bool {
   165  	key, quit := jm.queue.Get()
   166  	if quit {
   167  		return false
   168  	}
   169  	defer jm.queue.Done(key)
   170  
   171  	requeueAfter, err := jm.sync(ctx, key)
   172  	switch {
   173  	case err != nil:
   174  		utilruntime.HandleError(fmt.Errorf("error syncing CronJobController %v, requeuing: %w", key, err))
   175  		jm.queue.AddRateLimited(key)
   176  	case requeueAfter != nil:
   177  		jm.queue.Forget(key)
   178  		jm.queue.AddAfter(key, *requeueAfter)
   179  	}
   180  	return true
   181  }
   182  
   183  func (jm *ControllerV2) sync(ctx context.Context, cronJobKey string) (*time.Duration, error) {
   184  	ns, name, err := cache.SplitMetaNamespaceKey(cronJobKey)
   185  	if err != nil {
   186  		return nil, err
   187  	}
   188  	logger := klog.FromContext(ctx)
   189  	cronJob, err := jm.cronJobLister.CronJobs(ns).Get(name)
   190  	switch {
   191  	case errors.IsNotFound(err):
   192  		// may be cronjob is deleted, don't need to requeue this key
   193  		logger.V(4).Info("CronJob not found, may be it is deleted", "cronjob", klog.KObj(cronJob), "err", err)
   194  		return nil, nil
   195  	case err != nil:
   196  		// for other transient apiserver error requeue with exponential backoff
   197  		return nil, err
   198  	}
   199  
   200  	jobsToBeReconciled, err := jm.getJobsToBeReconciled(cronJob)
   201  	if err != nil {
   202  		return nil, err
   203  	}
   204  
   205  	// cronJobCopy is used to combine all the updates to a
   206  	// CronJob object and perform an actual update only once.
   207  	cronJobCopy := cronJob.DeepCopy()
   208  
   209  	updateStatusAfterCleanup := jm.cleanupFinishedJobs(ctx, cronJobCopy, jobsToBeReconciled)
   210  
   211  	requeueAfter, updateStatusAfterSync, syncErr := jm.syncCronJob(ctx, cronJobCopy, jobsToBeReconciled)
   212  	if syncErr != nil {
   213  		logger.V(2).Info("Error reconciling cronjob", "cronjob", klog.KObj(cronJob), "err", syncErr)
   214  	}
   215  
   216  	// Update the CronJob if needed
   217  	if updateStatusAfterCleanup || updateStatusAfterSync {
   218  		if _, err := jm.cronJobControl.UpdateStatus(ctx, cronJobCopy); err != nil {
   219  			logger.V(2).Info("Unable to update status for cronjob", "cronjob", klog.KObj(cronJob), "resourceVersion", cronJob.ResourceVersion, "err", err)
   220  			return nil, err
   221  		}
   222  	}
   223  
   224  	if requeueAfter != nil {
   225  		logger.V(4).Info("Re-queuing cronjob", "cronjob", klog.KObj(cronJob), "requeueAfter", requeueAfter)
   226  		return requeueAfter, nil
   227  	}
   228  	// this marks the key done, currently only happens when the cronjob is suspended or spec has invalid schedule format
   229  	return nil, syncErr
   230  }
   231  
   232  // resolveControllerRef returns the controller referenced by a ControllerRef,
   233  // or nil if the ControllerRef could not be resolved to a matching controller
   234  // of the correct Kind.
   235  func (jm *ControllerV2) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *batchv1.CronJob {
   236  	// We can't look up by UID, so look up by Name and then verify UID.
   237  	// Don't even try to look up by Name if it's the wrong Kind.
   238  	if controllerRef.Kind != controllerKind.Kind {
   239  		return nil
   240  	}
   241  	cronJob, err := jm.cronJobLister.CronJobs(namespace).Get(controllerRef.Name)
   242  	if err != nil {
   243  		return nil
   244  	}
   245  	if cronJob.UID != controllerRef.UID {
   246  		// The controller we found with this Name is not the same one that the
   247  		// ControllerRef points to.
   248  		return nil
   249  	}
   250  	return cronJob
   251  }
   252  
   253  func (jm *ControllerV2) getJobsToBeReconciled(cronJob *batchv1.CronJob) ([]*batchv1.Job, error) {
   254  	// list all jobs: there may be jobs with labels that don't match the template anymore,
   255  	// but that still have a ControllerRef to the given cronjob
   256  	jobList, err := jm.jobLister.Jobs(cronJob.Namespace).List(labels.Everything())
   257  	if err != nil {
   258  		return nil, err
   259  	}
   260  
   261  	jobsToBeReconciled := []*batchv1.Job{}
   262  
   263  	for _, job := range jobList {
   264  		// If it has a ControllerRef, that's all that matters.
   265  		if controllerRef := metav1.GetControllerOf(job); controllerRef != nil && controllerRef.Name == cronJob.Name {
   266  			// this job is needs to be reconciled
   267  			jobsToBeReconciled = append(jobsToBeReconciled, job)
   268  		}
   269  	}
   270  	return jobsToBeReconciled, nil
   271  }
   272  
   273  // When a job is created, enqueue the controller that manages it and update it's expectations.
   274  func (jm *ControllerV2) addJob(obj interface{}) {
   275  	job := obj.(*batchv1.Job)
   276  	if job.DeletionTimestamp != nil {
   277  		// on a restart of the controller, it's possible a new job shows up in a state that
   278  		// is already pending deletion. Prevent the job from being a creation observation.
   279  		jm.deleteJob(job)
   280  		return
   281  	}
   282  
   283  	// If it has a ControllerRef, that's all that matters.
   284  	if controllerRef := metav1.GetControllerOf(job); controllerRef != nil {
   285  		cronJob := jm.resolveControllerRef(job.Namespace, controllerRef)
   286  		if cronJob == nil {
   287  			return
   288  		}
   289  		jm.enqueueController(cronJob)
   290  		return
   291  	}
   292  }
   293  
   294  // updateJob figures out what CronJob(s) manage a Job when the Job
   295  // is updated and wake them up. If the anything of the Job have changed, we need to
   296  // awaken both the old and new CronJob. old and cur must be *batchv1.Job
   297  // types.
   298  func (jm *ControllerV2) updateJob(old, cur interface{}) {
   299  	curJob := cur.(*batchv1.Job)
   300  	oldJob := old.(*batchv1.Job)
   301  	if curJob.ResourceVersion == oldJob.ResourceVersion {
   302  		// Periodic resync will send update events for all known jobs.
   303  		// Two different versions of the same jobs will always have different RVs.
   304  		return
   305  	}
   306  
   307  	curControllerRef := metav1.GetControllerOf(curJob)
   308  	oldControllerRef := metav1.GetControllerOf(oldJob)
   309  	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
   310  	if controllerRefChanged && oldControllerRef != nil {
   311  		// The ControllerRef was changed. Sync the old controller, if any.
   312  		if cronJob := jm.resolveControllerRef(oldJob.Namespace, oldControllerRef); cronJob != nil {
   313  			jm.enqueueController(cronJob)
   314  		}
   315  	}
   316  
   317  	// If it has a ControllerRef, that's all that matters.
   318  	if curControllerRef != nil {
   319  		cronJob := jm.resolveControllerRef(curJob.Namespace, curControllerRef)
   320  		if cronJob == nil {
   321  			return
   322  		}
   323  		jm.enqueueController(cronJob)
   324  		return
   325  	}
   326  }
   327  
   328  func (jm *ControllerV2) deleteJob(obj interface{}) {
   329  	job, ok := obj.(*batchv1.Job)
   330  
   331  	// When a delete is dropped, the relist will notice a job in the store not
   332  	// in the list, leading to the insertion of a tombstone object which contains
   333  	// the deleted key/value. Note that this value might be stale.
   334  	if !ok {
   335  		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
   336  		if !ok {
   337  			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
   338  			return
   339  		}
   340  		job, ok = tombstone.Obj.(*batchv1.Job)
   341  		if !ok {
   342  			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a Job %#v", obj))
   343  			return
   344  		}
   345  	}
   346  
   347  	controllerRef := metav1.GetControllerOf(job)
   348  	if controllerRef == nil {
   349  		// No controller should care about orphans being deleted.
   350  		return
   351  	}
   352  	cronJob := jm.resolveControllerRef(job.Namespace, controllerRef)
   353  	if cronJob == nil {
   354  		return
   355  	}
   356  	jm.enqueueController(cronJob)
   357  }
   358  
   359  func (jm *ControllerV2) enqueueController(obj interface{}) {
   360  	key, err := controller.KeyFunc(obj)
   361  	if err != nil {
   362  		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
   363  		return
   364  	}
   365  
   366  	jm.queue.Add(key)
   367  }
   368  
   369  func (jm *ControllerV2) enqueueControllerAfter(obj interface{}, t time.Duration) {
   370  	key, err := controller.KeyFunc(obj)
   371  	if err != nil {
   372  		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
   373  		return
   374  	}
   375  
   376  	jm.queue.AddAfter(key, t)
   377  }
   378  
   379  // updateCronJob re-queues the CronJob for next scheduled time if there is a
   380  // change in spec.schedule otherwise it re-queues it now
   381  func (jm *ControllerV2) updateCronJob(logger klog.Logger, old interface{}, curr interface{}) {
   382  	oldCJ, okOld := old.(*batchv1.CronJob)
   383  	newCJ, okNew := curr.(*batchv1.CronJob)
   384  
   385  	if !okOld || !okNew {
   386  		// typecasting of one failed, handle this better, may be log entry
   387  		return
   388  	}
   389  	// if the change in schedule results in next requeue having to be sooner than it already was,
   390  	// it will be handled here by the queue. If the next requeue is further than previous schedule,
   391  	// the sync loop will essentially be a no-op for the already queued key with old schedule.
   392  	if oldCJ.Spec.Schedule != newCJ.Spec.Schedule || !pointer.StringEqual(oldCJ.Spec.TimeZone, newCJ.Spec.TimeZone) {
   393  		// schedule changed, change the requeue time, pass nil recorder so that syncCronJob will output any warnings
   394  		sched, err := cron.ParseStandard(formatSchedule(newCJ, nil))
   395  		if err != nil {
   396  			// this is likely a user error in defining the spec value
   397  			// we should log the error and not reconcile this cronjob until an update to spec
   398  			logger.V(2).Info("Unparseable schedule for cronjob", "cronjob", klog.KObj(newCJ), "schedule", newCJ.Spec.Schedule, "err", err)
   399  			jm.recorder.Eventf(newCJ, corev1.EventTypeWarning, "UnParseableCronJobSchedule", "unparseable schedule for cronjob: %s", newCJ.Spec.Schedule)
   400  			return
   401  		}
   402  		now := jm.now()
   403  		t := nextScheduleTimeDuration(newCJ, now, sched)
   404  
   405  		jm.enqueueControllerAfter(curr, *t)
   406  		return
   407  	}
   408  
   409  	// other parameters changed, requeue this now and if this gets triggered
   410  	// within deadline, sync loop will work on the CJ otherwise updates will be handled
   411  	// during the next schedule
   412  	// TODO: need to handle the change of spec.JobTemplate.metadata.labels explicitly
   413  	//   to cleanup jobs with old labels
   414  	jm.enqueueController(curr)
   415  }
   416  
// syncCronJob reconciles a CronJob with a list of any Jobs that it created.
// All known jobs created by "cronJob" should be included in "jobs".
// The current time is taken from jm.now, which tests can override.
//
// The function works in two phases:
//  1. Bring cronJob.Status (the Active list and LastSuccessfulTime) in line
//     with the given jobs and with what the API server reports.
//  2. Unless the CronJob is being deleted, suspended, or has an invalid time
//     zone or schedule, work out the scheduled time to act on and create the
//     Job for it, honouring StartingDeadlineSeconds and ConcurrencyPolicy.
//
// It mutates cronJob in place and returns:
//   - the delay after which the CronJob should be synced again (nil if no
//     requeue is requested);
//   - a bool to indicate an update to api-server is needed for the status;
//   - an error if reconciliation failed.
func (jm *ControllerV2) syncCronJob(
	ctx context.Context,
	cronJob *batchv1.CronJob,
	jobs []*batchv1.Job) (*time.Duration, bool, error) {

	now := jm.now()
	updateStatus := false

	// childrenJobs records the UID of every job passed in, so the Active list
	// can be checked against it below.
	childrenJobs := make(map[types.UID]bool)
	for _, j := range jobs {
		childrenJobs[j.ObjectMeta.UID] = true
		found := inActiveList(cronJob, j.ObjectMeta.UID)
		if !found && !jobutil.IsJobFinished(j) {
			// Re-read the CronJob through cronJobControl before complaining:
			// the copy we were given may lag behind the job list.
			cjCopy, err := jm.cronJobControl.GetCronJob(ctx, cronJob.Namespace, cronJob.Name)
			if err != nil {
				return nil, updateStatus, err
			}
			if inActiveList(cjCopy, j.ObjectMeta.UID) {
				// NOTE(review): from here on cronJob refers to the freshly
				// fetched object rather than the one passed in by the caller -
				// confirm that later status changes still reach the caller.
				cronJob = cjCopy
				continue
			}
			jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %s", j.Name)
			// We found an unfinished job that has us as the parent, but it is not in our Active list.
			// This could happen if we crashed right after creating the Job and before updating the status,
			// or if our jobs list is newer than our cj status after a relist, or if someone intentionally created
			// a job that they wanted us to adopt.
		} else if found && jobutil.IsJobFinished(j) {
			// A finished job no longer belongs in the Active list.
			_, condition := jobutil.FinishedCondition(j)
			deleteFromActiveList(cronJob, j.ObjectMeta.UID)
			jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "SawCompletedJob", "Saw completed job: %s, condition: %v", j.Name, condition)
			updateStatus = true
		} else if jobutil.IsJobSucceeded(j) {
			// a job does not have to be in active list, as long as it has completed successfully, we will process the timestamp
			if cronJob.Status.LastSuccessfulTime == nil {
				cronJob.Status.LastSuccessfulTime = j.Status.CompletionTime
				updateStatus = true
			}
			// Keep the most recent completion time seen across all succeeded jobs.
			if j.Status.CompletionTime != nil && j.Status.CompletionTime.After(cronJob.Status.LastSuccessfulTime.Time) {
				cronJob.Status.LastSuccessfulTime = j.Status.CompletionTime
				updateStatus = true
			}
		}
	}

	// Remove any job reference from the active list if the corresponding job does not exist any more.
	// Otherwise, the cronjob may be stuck in active mode forever even though there is no matching
	// job running.
	for _, j := range cronJob.Status.Active {
		_, found := childrenJobs[j.UID]
		if found {
			continue
		}
		// Explicitly try to get the job from api-server to avoid a slow watch not able to update
		// the job lister on time, giving an unwanted miss
		_, err := jm.jobControl.GetJob(j.Namespace, j.Name)
		switch {
		case errors.IsNotFound(err):
			// The job is actually missing, delete from active list and schedule a new one if within
			// deadline
			jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "MissingJob", "Active job went missing: %v", j.Name)
			deleteFromActiveList(cronJob, j.UID)
			updateStatus = true
		case err != nil:
			return nil, updateStatus, err
		}
		// the job is missing in the lister but found in api-server
	}

	if cronJob.DeletionTimestamp != nil {
		// The CronJob is being deleted.
		// Don't do anything other than updating status.
		return nil, updateStatus, nil
	}

	logger := klog.FromContext(ctx)
	if cronJob.Spec.TimeZone != nil {
		timeZone := pointer.StringDeref(cronJob.Spec.TimeZone, "")
		if _, err := time.LoadLocation(timeZone); err != nil {
			// An unknown time zone is reported as a warning event; no error and
			// no requeue delay are returned.
			logger.V(4).Info("Not starting job because timeZone is invalid", "cronjob", klog.KObj(cronJob), "timeZone", timeZone, "err", err)
			jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "UnknownTimeZone", "invalid timeZone: %q: %s", timeZone, err)
			return nil, updateStatus, nil
		}
	}

	if cronJob.Spec.Suspend != nil && *cronJob.Spec.Suspend {
		logger.V(4).Info("Not starting job because the cron is suspended", "cronjob", klog.KObj(cronJob))
		return nil, updateStatus, nil
	}

	sched, err := cron.ParseStandard(formatSchedule(cronJob, jm.recorder))
	if err != nil {
		// this is likely a user error in defining the spec value
		// we should log the error and not reconcile this cronjob until an update to spec
		logger.V(2).Info("Unparseable schedule", "cronjob", klog.KObj(cronJob), "schedule", cronJob.Spec.Schedule, "err", err)
		jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "UnparseableSchedule", "unparseable schedule: %q : %s", cronJob.Spec.Schedule, err)
		return nil, updateStatus, nil
	}

	scheduledTime, err := nextScheduleTime(logger, cronJob, now, sched, jm.recorder)
	if err != nil {
		// this is likely a user error in defining the spec value
		// we should log the error and not reconcile this cronjob until an update to spec
		logger.V(2).Info("Invalid schedule", "cronjob", klog.KObj(cronJob), "schedule", cronJob.Spec.Schedule, "err", err)
		jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "InvalidSchedule", "invalid schedule: %s : %s", cronJob.Spec.Schedule, err)
		return nil, updateStatus, nil
	}
	if scheduledTime == nil {
		// No unmet start time, so only requeue for the next schedule.
		// The only time this should happen is if queue is filled after restart.
		// Otherwise, the queue is always supposed to trigger sync function at the time of
		// the scheduled time, that will give at least 1 unmet time schedule
		logger.V(4).Info("No unmet start times", "cronjob", klog.KObj(cronJob))
		t := nextScheduleTimeDuration(cronJob, now, sched)
		return t, updateStatus, nil
	}

	// The run is too late when a starting deadline is set and
	// scheduledTime + deadline lies before now.
	tooLate := false
	if cronJob.Spec.StartingDeadlineSeconds != nil {
		tooLate = scheduledTime.Add(time.Second * time.Duration(*cronJob.Spec.StartingDeadlineSeconds)).Before(now)
	}
	if tooLate {
		logger.V(4).Info("Missed starting window", "cronjob", klog.KObj(cronJob))
		jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "MissSchedule", "Missed scheduled time to start a job: %s", scheduledTime.UTC().Format(time.RFC1123Z))

		// TODO: Since we don't set LastScheduleTime when not scheduling, we are going to keep noticing
		// the miss every cycle.  In order to avoid sending multiple events, and to avoid processing
		// the cj again and again, we could set a Status.LastMissedTime when we notice a miss.
		// Then, when we call getRecentUnmetScheduleTimes, we can take max(creationTimestamp,
		// Status.LastScheduleTime, Status.LastMissedTime), and then so we won't generate
		// and event the next time we process it, and also so the user looking at the status
		// can see easily that there was a missed execution.
		t := nextScheduleTimeDuration(cronJob, now, sched)
		return t, updateStatus, nil
	}
	// Skip this scheduled time if a job with the name derived from it is
	// already in the Active list, or if it equals the recorded LastScheduleTime.
	if inActiveListByName(cronJob, &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getJobName(cronJob, *scheduledTime),
			Namespace: cronJob.Namespace,
		}}) || cronJob.Status.LastScheduleTime.Equal(&metav1.Time{Time: *scheduledTime}) {
		logger.V(4).Info("Not starting job because the scheduled time is already processed", "cronjob", klog.KObj(cronJob), "schedule", scheduledTime)
		t := nextScheduleTimeDuration(cronJob, now, sched)
		return t, updateStatus, nil
	}
	if cronJob.Spec.ConcurrencyPolicy == batchv1.ForbidConcurrent && len(cronJob.Status.Active) > 0 {
		// Regardless which source of information we use for the set of active jobs,
		// there is some risk that we won't see an active job when there is one.
		// (because we haven't seen the status update to the SJ or the created pod).
		// So it is theoretically possible to have concurrency with Forbid.
		// As long the as the invocations are "far enough apart in time", this usually won't happen.
		//
		// TODO: for Forbid, we could use the same name for every execution, as a lock.
		// With replace, we could use a name that is deterministic per execution time.
		// But that would mean that you could not inspect prior successes or failures of Forbid jobs.
		logger.V(4).Info("Not starting job because prior execution is still running and concurrency policy is Forbid", "cronjob", klog.KObj(cronJob))
		jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "JobAlreadyActive", "Not starting job because prior execution is running and concurrency policy is Forbid")
		t := nextScheduleTimeDuration(cronJob, now, sched)
		return t, updateStatus, nil
	}
	if cronJob.Spec.ConcurrencyPolicy == batchv1.ReplaceConcurrent {
		// Replace: delete every job still listed as active before starting the
		// new one; any failure aborts this sync with an error.
		for _, j := range cronJob.Status.Active {
			logger.V(4).Info("Deleting job that was still running at next scheduled start time", "job", klog.KRef(j.Namespace, j.Name))
			job, err := jm.jobControl.GetJob(j.Namespace, j.Name)
			if err != nil {
				jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "FailedGet", "Get job: %v", err)
				return nil, updateStatus, err
			}
			if !deleteJob(logger, cronJob, job, jm.jobControl, jm.recorder) {
				return nil, updateStatus, fmt.Errorf("could not replace job %s/%s", job.Namespace, job.Name)
			}
			updateStatus = true
		}
	}

	jobAlreadyExists := false
	jobReq, err := getJobFromTemplate2(cronJob, *scheduledTime)
	if err != nil {
		logger.Error(err, "Unable to make Job from template", "cronjob", klog.KObj(cronJob))
		return nil, updateStatus, err
	}
	jobResp, err := jm.jobControl.CreateJob(cronJob.Namespace, jobReq)
	switch {
	case errors.HasStatusCause(err, corev1.NamespaceTerminatingCause):
		// if the namespace is being terminated, we don't have to do
		// anything because any creation will fail
		return nil, updateStatus, err
	case errors.IsAlreadyExists(err):
		// If the job is created by other actor, assume it has updated the cronjob status accordingly.
		// However, if the job was created by cronjob controller, this means we've previously created the job
		// but failed to update the active list in the status, in which case we should reattempt to add the job
		// into the active list and update the status.
		jobAlreadyExists = true
		job, err := jm.jobControl.GetJob(jobReq.GetNamespace(), jobReq.GetName())
		if err != nil {
			return nil, updateStatus, err
		}
		jobResp = job

		// check that this job is owned by cronjob controller, otherwise do nothing and assume external controller
		// is updating the status.
		if !metav1.IsControlledBy(job, cronJob) {
			return nil, updateStatus, nil
		}

		// Recheck if the job is missing from the active list before attempting to update the status again.
		found := inActiveList(cronJob, job.ObjectMeta.UID)
		if found {
			return nil, updateStatus, nil
		}
	case err != nil:
		// default error handling
		jm.recorder.Eventf(cronJob, corev1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err)
		return nil, updateStatus, err
	}

	if jobAlreadyExists {
		logger.Info("Job already exists", "cronjob", klog.KObj(cronJob), "job", klog.KObj(jobReq))
	} else {
		// Record the difference between the job's creation timestamp and the
		// time it was scheduled for, in seconds.
		metrics.CronJobCreationSkew.Observe(jobResp.ObjectMeta.GetCreationTimestamp().Sub(*scheduledTime).Seconds())
		logger.V(4).Info("Created Job", "job", klog.KObj(jobResp), "cronjob", klog.KObj(cronJob))
		jm.recorder.Eventf(cronJob, corev1.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name)
	}

	// ------------------------------------------------------------------ //

	// If this process restarts at this point (after posting a job, but
	// before updating the status), then we might try to start the job on
	// the next time.  Actually, if we re-list the SJs and Jobs on the next
	// iteration of syncAll, we might not see our own status update, and
	// then post one again.  So, we need to use the job name as a lock to
	// prevent us from making the job twice (name the job with hash of its
	// scheduled time).

	// Add the just-started job to the status list.
	jobRef, err := getRef(jobResp)
	if err != nil {
		logger.V(2).Info("Unable to make object reference", "cronjob", klog.KObj(cronJob), "err", err)
		return nil, updateStatus, fmt.Errorf("unable to make object reference for job for %s", klog.KObj(cronJob))
	}
	cronJob.Status.Active = append(cronJob.Status.Active, *jobRef)
	cronJob.Status.LastScheduleTime = &metav1.Time{Time: *scheduledTime}
	updateStatus = true

	t := nextScheduleTimeDuration(cronJob, now, sched)
	return t, updateStatus, nil
}
   666  
   667  func getJobName(cj *batchv1.CronJob, scheduledTime time.Time) string {
   668  	return fmt.Sprintf("%s-%d", cj.Name, getTimeHashInMinutes(scheduledTime))
   669  }
   670  
   671  // cleanupFinishedJobs cleanups finished jobs created by a CronJob
   672  // It returns a bool to indicate an update to api-server is needed
   673  func (jm *ControllerV2) cleanupFinishedJobs(ctx context.Context, cj *batchv1.CronJob, js []*batchv1.Job) bool {
   674  	// If neither limits are active, there is no need to do anything.
   675  	if cj.Spec.FailedJobsHistoryLimit == nil && cj.Spec.SuccessfulJobsHistoryLimit == nil {
   676  		return false
   677  	}
   678  
   679  	updateStatus := false
   680  	failedJobs := []*batchv1.Job{}
   681  	successfulJobs := []*batchv1.Job{}
   682  
   683  	for _, job := range js {
   684  		isFinished, finishedStatus := jm.getFinishedStatus(job)
   685  		if isFinished && finishedStatus == batchv1.JobComplete {
   686  			successfulJobs = append(successfulJobs, job)
   687  		} else if isFinished && finishedStatus == batchv1.JobFailed {
   688  			failedJobs = append(failedJobs, job)
   689  		}
   690  	}
   691  
   692  	if cj.Spec.SuccessfulJobsHistoryLimit != nil &&
   693  		jm.removeOldestJobs(ctx, cj,
   694  			successfulJobs,
   695  			*cj.Spec.SuccessfulJobsHistoryLimit) {
   696  		updateStatus = true
   697  	}
   698  
   699  	if cj.Spec.FailedJobsHistoryLimit != nil &&
   700  		jm.removeOldestJobs(ctx, cj,
   701  			failedJobs,
   702  			*cj.Spec.FailedJobsHistoryLimit) {
   703  		updateStatus = true
   704  	}
   705  
   706  	return updateStatus
   707  }
   708  
   709  func (jm *ControllerV2) getFinishedStatus(j *batchv1.Job) (bool, batchv1.JobConditionType) {
   710  	for _, c := range j.Status.Conditions {
   711  		if (c.Type == batchv1.JobComplete || c.Type == batchv1.JobFailed) && c.Status == corev1.ConditionTrue {
   712  			return true, c.Type
   713  		}
   714  	}
   715  	return false, ""
   716  }
   717  
   718  // removeOldestJobs removes the oldest jobs from a list of jobs
   719  func (jm *ControllerV2) removeOldestJobs(ctx context.Context, cj *batchv1.CronJob, js []*batchv1.Job, maxJobs int32) bool {
   720  	updateStatus := false
   721  	numToDelete := len(js) - int(maxJobs)
   722  	if numToDelete <= 0 {
   723  		return updateStatus
   724  	}
   725  	logger := klog.FromContext(ctx)
   726  	logger.V(4).Info("Cleaning up jobs from CronJob list", "deletejobnum", numToDelete, "jobnum", len(js), "cronjob", klog.KObj(cj))
   727  
   728  	sort.Sort(byJobStartTime(js))
   729  	for i := 0; i < numToDelete; i++ {
   730  		logger.V(4).Info("Removing job from CronJob list", "job", js[i].Name, "cronjob", klog.KObj(cj))
   731  		if deleteJob(logger, cj, js[i], jm.jobControl, jm.recorder) {
   732  			updateStatus = true
   733  		}
   734  	}
   735  	return updateStatus
   736  }
   737  
   738  // deleteJob reaps a job, deleting the job, the pods and the reference in the active list
   739  func deleteJob(logger klog.Logger, cj *batchv1.CronJob, job *batchv1.Job, jc jobControlInterface, recorder record.EventRecorder) bool {
   740  	// delete the job itself...
   741  	if err := jc.DeleteJob(job.Namespace, job.Name); err != nil {
   742  		recorder.Eventf(cj, corev1.EventTypeWarning, "FailedDelete", "Deleted job: %v", err)
   743  		logger.Error(err, "Error deleting job from cronjob", "job", klog.KObj(job), "cronjob", klog.KObj(cj))
   744  		return false
   745  	}
   746  	// ... and its reference from active list
   747  	deleteFromActiveList(cj, job.ObjectMeta.UID)
   748  	recorder.Eventf(cj, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted job %v", job.Name)
   749  
   750  	return true
   751  }
   752  
   753  func getRef(object runtime.Object) (*corev1.ObjectReference, error) {
   754  	return ref.GetReference(scheme.Scheme, object)
   755  }
   756  
   757  func formatSchedule(cj *batchv1.CronJob, recorder record.EventRecorder) string {
   758  	if strings.Contains(cj.Spec.Schedule, "TZ") {
   759  		if recorder != nil {
   760  			recorder.Eventf(cj, corev1.EventTypeWarning, "UnsupportedSchedule", "CRON_TZ or TZ used in schedule %q is not officially supported, see https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/ for more details", cj.Spec.Schedule)
   761  		}
   762  
   763  		return cj.Spec.Schedule
   764  	}
   765  
   766  	if cj.Spec.TimeZone != nil {
   767  		if _, err := time.LoadLocation(*cj.Spec.TimeZone); err != nil {
   768  			return cj.Spec.Schedule
   769  		}
   770  
   771  		return fmt.Sprintf("TZ=%s %s", *cj.Spec.TimeZone, cj.Spec.Schedule)
   772  	}
   773  
   774  	return cj.Spec.Schedule
   775  }