k8s.io/kubernetes@v1.29.3/pkg/controller/ttlafterfinished/ttlafterfinished_controller.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ttlafterfinished

import (
	"context"
	"fmt"
	"time"

	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	batchinformers "k8s.io/client-go/informers/batch/v1"
	clientset "k8s.io/client-go/kubernetes"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	batchlisters "k8s.io/client-go/listers/batch/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/kubectl/pkg/scheme"
	"k8s.io/kubernetes/pkg/controller"
	jobutil "k8s.io/kubernetes/pkg/controller/job"
	"k8s.io/kubernetes/pkg/controller/ttlafterfinished/metrics"
	"k8s.io/utils/clock"
)
// Controller watches for changes of Jobs API objects. Triggered by Job creation
// and updates, it enqueues Jobs that have a non-nil `.spec.ttlSecondsAfterFinished`
// into the `queue`. The Controller has workers that consume `queue` and check
// whether a Job's TTL has expired; if it has not, the worker re-adds the Job to the
// queue after the TTL is expected to expire; if it has, the worker sends a request
// to the API server to delete the Job.
// This is implemented outside of the Job controller for separation of concerns, and
// because it will be extended to handle other finishable resource types.
type Controller struct {
	client   clientset.Interface
	recorder record.EventRecorder

	// jLister can list/get Jobs from the shared informer's store
	jLister batchlisters.JobLister

	// jListerSynced returns true if the Job store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	jListerSynced cache.InformerSynced

	// Jobs whose TTL the controller will check, and which it will attempt to delete
	// once the TTL expires.
	queue workqueue.RateLimitingInterface

	// The clock for tracking time
	clock clock.Clock
}

// New creates an instance of Controller.
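//
// A minimal wiring sketch (not from this file; it assumes a context `ctx`, a shared
// informer factory `sharedInformers`, and a clientset `kubeClient` have already been
// constructed):
//
//	tc := ttlafterfinished.New(ctx, sharedInformers.Batch().V1().Jobs(), kubeClient)
//	sharedInformers.Start(ctx.Done())
//	go tc.Run(ctx, 5)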
func New(ctx context.Context, jobInformer batchinformers.JobInformer, client clientset.Interface) *Controller {
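	// Set up an event broadcaster and recorder so the controller can emit Events
	// about the Jobs it manages.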
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartStructuredLogging(0)
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: client.CoreV1().Events("")})

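	// Register this controller's metrics (for example, the job deletion duration
	// observed below).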
	metrics.Register()

	tc := &Controller{
		client:   client,
		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "ttl-after-finished-controller"}),
		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "ttl_jobs_to_delete"),
	}

	logger := klog.FromContext(ctx)
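	// Enqueue Jobs on add and update; addJob/updateJob only enqueue finished Jobs
	// that have a TTL set and are not being deleted.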
	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			tc.addJob(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			tc.updateJob(logger, oldObj, newObj)
		},
	})

	tc.jLister = jobInformer.Lister()
	tc.jListerSynced = jobInformer.Informer().HasSynced

	tc.clock = clock.RealClock{}

	return tc
}

// Run starts the workers to clean up Jobs.
func (tc *Controller) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()
	defer tc.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting TTL after finished controller")
	defer logger.Info("Shutting down TTL after finished controller")

	if !cache.WaitForNamedCacheSync("TTL after finished", ctx.Done(), tc.jListerSynced) {
		return
	}

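	// Launch the requested number of workers; each one drains the workqueue until
	// the context is cancelled.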
	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, tc.worker, time.Second)
	}

	<-ctx.Done()
}

func (tc *Controller) addJob(logger klog.Logger, obj interface{}) {
	job := obj.(*batch.Job)
	logger.V(4).Info("Adding job", "job", klog.KObj(job))

	if job.DeletionTimestamp == nil && needsCleanup(job) {
		tc.enqueue(logger, job)
	}
}

func (tc *Controller) updateJob(logger klog.Logger, old, cur interface{}) {
	job := cur.(*batch.Job)
	logger.V(4).Info("Updating job", "job", klog.KObj(job))

	if job.DeletionTimestamp == nil && needsCleanup(job) {
		tc.enqueue(logger, job)
	}
}

func (tc *Controller) enqueue(logger klog.Logger, job *batch.Job) {
	logger.V(4).Info("Add job to cleanup", "job", klog.KObj(job))
	key, err := controller.KeyFunc(job)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err))
		return
	}

	tc.queue.Add(key)
}

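// enqueueAfter adds the Job's key to the queue after the given delay, so the Job is
// re-processed once its TTL is expected to have expired.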
func (tc *Controller) enqueueAfter(job *batch.Job, after time.Duration) {
	key, err := controller.KeyFunc(job)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err))
		return
	}

	tc.queue.AddAfter(key, after)
}

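// worker runs a loop that processes work items until the queue is shut down.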
func (tc *Controller) worker(ctx context.Context) {
	for tc.processNextWorkItem(ctx) {
	}
}

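// processNextWorkItem pops one key off the queue, processes the corresponding Job,
// and reports the result to handleErr, which decides whether to retry.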
func (tc *Controller) processNextWorkItem(ctx context.Context) bool {
	key, quit := tc.queue.Get()
	if quit {
		return false
	}
	defer tc.queue.Done(key)

	err := tc.processJob(ctx, key.(string))
	tc.handleErr(err, key)

	return true
}

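// handleErr forgets the key on success; on failure it logs the error and re-adds the
// key to the queue with rate limiting so the Job is retried.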
func (tc *Controller) handleErr(err error, key interface{}) {
	if err == nil {
		tc.queue.Forget(key)
		return
	}

	utilruntime.HandleError(fmt.Errorf("error cleaning up Job %v, will retry: %v", key, err))
	tc.queue.AddRateLimited(key)
}

// processJob checks the Job's state and TTL and deletes the Job once it has finished
// and its TTL after finishing has expired. If the Job has finished but its TTL hasn't
// expired yet, the Job is re-added to the queue to be checked again after the TTL is
// expected to expire.
// This function is not meant to be invoked concurrently with the same key.
func (tc *Controller) processJob(ctx context.Context, key string) error {
	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}

	// Ignore Jobs that have already been deleted or are being deleted, and Jobs
	// that don't need cleanup.
	job, err := tc.jLister.Jobs(namespace).Get(name)

	logger := klog.FromContext(ctx)
	logger.V(4).Info("Checking if Job is ready for cleanup", "job", klog.KRef(namespace, name))

	if errors.IsNotFound(err) {
		return nil
	}
	if err != nil {
		return err
	}

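	// First pass: use the (possibly stale) copy from the informer cache to decide
	// whether the TTL appears to have expired.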
	if expiredAt, err := tc.processTTL(logger, job); err != nil {
		return err
	} else if expiredAt == nil {
		return nil
	}

	// The Job's TTL is assumed to have expired, but the cached Job might be stale.
	// Before deleting the Job, do a final sanity check.
	// If the TTL is modified before we do this check, we cannot be sure whether the TTL truly expired.
	// The latest Job may have a different UID, but it's fine because the checks will be run again.
	fresh, err := tc.client.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
	if errors.IsNotFound(err) {
		return nil
	}
	if err != nil {
		return err
	}
	// Use the latest Job's TTL to check whether the TTL has truly expired.
	expiredAt, err := tc.processTTL(logger, fresh)
	if err != nil {
		return err
	} else if expiredAt == nil {
		return nil
	}
	// Cascade-delete the Job if its TTL has truly expired.
	policy := metav1.DeletePropagationForeground
	options := metav1.DeleteOptions{
		PropagationPolicy: &policy,
		Preconditions:     &metav1.Preconditions{UID: &fresh.UID},
	}
	logger.V(4).Info("Cleaning up Job", "job", klog.KObj(fresh))
	if err := tc.client.BatchV1().Jobs(fresh.Namespace).Delete(ctx, fresh.Name, options); err != nil {
		return err
	}
	metrics.JobDeletionDurationSeconds.Observe(time.Since(*expiredAt).Seconds())
	return nil
}

// processTTL checks whether a given Job's TTL has expired and, if it has not yet
// expired, adds the Job back to the queue to be checked again once the TTL is
// expected to expire.
func (tc *Controller) processTTL(logger klog.Logger, job *batch.Job) (expiredAt *time.Time, err error) {
	// We don't care about Jobs that are going to be deleted, or Jobs that don't need cleanup.
	if job.DeletionTimestamp != nil || !needsCleanup(job) {
		return nil, nil
	}

	now := tc.clock.Now()
	t, e, err := timeLeft(logger, job, &now)
	if err != nil {
		return nil, err
	}

	// TTL has expired
	if *t <= 0 {
		return e, nil
	}

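	// The TTL has not expired yet; requeue the Job to be checked again when the TTL
	// is expected to expire.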
	tc.enqueueAfter(job, *t)
	return nil, nil
}

// needsCleanup checks whether a Job has finished and has a TTL set.
func needsCleanup(j *batch.Job) bool {
	return j.Spec.TTLSecondsAfterFinished != nil && jobutil.IsJobFinished(j)
}

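// getFinishAndExpireTime returns the time the Job finished and the time at which its
// TTL expires (finish time plus `.spec.ttlSecondsAfterFinished`).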
func getFinishAndExpireTime(j *batch.Job) (*time.Time, *time.Time, error) {
	if !needsCleanup(j) {
		return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
	}
	t, err := jobFinishTime(j)
	if err != nil {
		return nil, nil, err
	}
	finishAt := t.Time
	expireAt := finishAt.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
	return &finishAt, &expireAt, nil
}

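// timeLeft returns how long remains until the Job's TTL expires, measured from
// `since`, together with the expiration time itself.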
func timeLeft(logger klog.Logger, j *batch.Job, since *time.Time) (*time.Duration, *time.Time, error) {
	finishAt, expireAt, err := getFinishAndExpireTime(j)
	if err != nil {
		return nil, nil, err
	}

	if finishAt.After(*since) {
		logger.Info("Warning: Found Job finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", "job", klog.KObj(j))
	}
	remaining := expireAt.Sub(*since)
	logger.V(4).Info("Found Job finished", "job", klog.KObj(j), "finishTime", finishAt.UTC(), "remainingTTL", remaining, "startTime", since.UTC(), "deadlineTTL", expireAt.UTC())
	return &remaining, expireAt, nil
}

// jobFinishTime takes an already finished Job and returns the time it finished.
func jobFinishTime(finishedJob *batch.Job) (metav1.Time, error) {
	for _, c := range finishedJob.Status.Conditions {
		if (c.Type == batch.JobComplete || c.Type == batch.JobFailed) && c.Status == v1.ConditionTrue {
			finishAt := c.LastTransitionTime
			if finishAt.IsZero() {
				return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
			}
			return c.LastTransitionTime, nil
		}
	}

	// This should never happen if the Job has finished
	return metav1.Time{}, fmt.Errorf("unable to find the status of the finished Job %s/%s", finishedJob.Namespace, finishedJob.Name)
}