k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/ttlafterfinished/ttlafterfinished_controller.go

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package ttlafterfinished
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	batch "k8s.io/api/batch/v1"
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    29  	"k8s.io/apimachinery/pkg/util/wait"
    30  	batchinformers "k8s.io/client-go/informers/batch/v1"
    31  	clientset "k8s.io/client-go/kubernetes"
    32  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    33  	batchlisters "k8s.io/client-go/listers/batch/v1"
    34  	"k8s.io/client-go/tools/cache"
    35  	"k8s.io/client-go/tools/record"
    36  	"k8s.io/client-go/util/workqueue"
    37  	"k8s.io/klog/v2"
    38  	"k8s.io/kubectl/pkg/scheme"
    39  	"k8s.io/kubernetes/pkg/controller"
    40  	jobutil "k8s.io/kubernetes/pkg/controller/job/util"
    41  	"k8s.io/kubernetes/pkg/controller/ttlafterfinished/metrics"
    42  	"k8s.io/utils/clock"
    43  )
    44  
    45  // Controller watches for changes to Job API objects. Triggered by Job creation
    46  // and updates, it enqueues Jobs that have a non-nil `.spec.ttlSecondsAfterFinished`
    47  // into the `queue`. The Controller has workers that consume the `queue` and check
    48  // whether the Job's TTL has expired; if the TTL hasn't expired, the worker re-adds
    49  // the Job to the queue after the TTL is expected to expire; if the TTL has expired,
    50  // the worker sends a request to the API server to delete the Job accordingly.
    51  // This is implemented outside of the Job controller for separation of concerns, and
    52  // because it will be extended to handle other finishable resource types.
    53  type Controller struct {
    54  	client   clientset.Interface
    55  	recorder record.EventRecorder
    56  
    57  	// jLister can list/get Jobs from the shared informer's store
    58  	jLister batchlisters.JobLister
    59  
    60  	// jStoreSynced returns true if the Job store has been synced at least once.
    61  	// Added as a member to the struct to allow injection for testing.
    62  	jListerSynced cache.InformerSynced
    63  
    64  	// queue holds the keys of Jobs whose TTL the controller checks, attempting to delete each Job once its TTL expires.
    65  	queue workqueue.TypedRateLimitingInterface[string]
    66  
    67  	// The clock for tracking time
    68  	clock clock.Clock
    69  }
    70  
    71  // New creates an instance of Controller
    72  func New(ctx context.Context, jobInformer batchinformers.JobInformer, client clientset.Interface) *Controller {
    73  	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
    74  	eventBroadcaster.StartStructuredLogging(3)
    75  	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: client.CoreV1().Events("")})
    76  
    77  	metrics.Register()
    78  
    79  	tc := &Controller{
    80  		client:   client,
    81  		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "ttl-after-finished-controller"}),
    82  		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
    83  			workqueue.DefaultTypedControllerRateLimiter[string](),
    84  			workqueue.TypedRateLimitingQueueConfig[string]{Name: "ttl_jobs_to_delete"},
    85  		),
    86  	}
    87  
    88  	logger := klog.FromContext(ctx)
    89  	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    90  		AddFunc: func(obj interface{}) {
    91  			tc.addJob(logger, obj)
    92  		},
    93  		UpdateFunc: func(oldObj, newObj interface{}) {
    94  			tc.updateJob(logger, oldObj, newObj)
    95  		},
    96  	})
    97  
    98  	tc.jLister = jobInformer.Lister()
    99  	tc.jListerSynced = jobInformer.Informer().HasSynced
   100  
   101  	tc.clock = clock.RealClock{}
   102  
   103  	return tc
   104  }
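        // A minimal wiring sketch (illustrative, not part of the upstream file): it
        // assumes a kubernetes.Interface clientset, a context, and a resync period are
        // available elsewhere, and uses k8s.io/client-go/informers to build the
        // JobInformer that New expects.
        //
        //	factory := informers.NewSharedInformerFactory(client, 30*time.Second)
        //	ttlController := ttlafterfinished.New(ctx, factory.Batch().V1().Jobs(), client)
        //	factory.Start(ctx.Done())
        //	go ttlController.Run(ctx, 5)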
   105  
   106  // Run starts the workers to clean up Jobs.
   107  func (tc *Controller) Run(ctx context.Context, workers int) {
   108  	defer utilruntime.HandleCrash()
   109  	defer tc.queue.ShutDown()
   110  
   111  	logger := klog.FromContext(ctx)
   112  	logger.Info("Starting TTL after finished controller")
   113  	defer logger.Info("Shutting down TTL after finished controller")
   114  
   115  	if !cache.WaitForNamedCacheSync("TTL after finished", ctx.Done(), tc.jListerSynced) {
   116  		return
   117  	}
   118  
   119  	for i := 0; i < workers; i++ {
   120  		go wait.UntilWithContext(ctx, tc.worker, time.Second)
   121  	}
   122  
   123  	<-ctx.Done()
   124  }
   125  
   126  func (tc *Controller) addJob(logger klog.Logger, obj interface{}) {
   127  	job := obj.(*batch.Job)
   128  	logger.V(4).Info("Adding job", "job", klog.KObj(job))
   129  
   130  	if job.DeletionTimestamp == nil && needsCleanup(job) {
   131  		tc.enqueue(logger, job)
   132  	}
   133  
   134  }
   135  
   136  func (tc *Controller) updateJob(logger klog.Logger, old, cur interface{}) {
   137  	job := cur.(*batch.Job)
   138  	logger.V(4).Info("Updating job", "job", klog.KObj(job))
   139  
   140  	if job.DeletionTimestamp == nil && needsCleanup(job) {
   141  		tc.enqueue(logger, job)
   142  	}
   143  }
   144  
   145  func (tc *Controller) enqueue(logger klog.Logger, job *batch.Job) {
   146  	logger.V(4).Info("Add job to cleanup", "job", klog.KObj(job))
   147  	key, err := controller.KeyFunc(job)
   148  	if err != nil {
   149  		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err))
   150  		return
   151  	}
   152  
   153  	tc.queue.Add(key)
   154  }
   155  
   156  func (tc *Controller) enqueueAfter(job *batch.Job, after time.Duration) {
   157  	key, err := controller.KeyFunc(job)
   158  	if err != nil {
   159  		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err))
   160  		return
   161  	}
   162  
   163  	tc.queue.AddAfter(key, after)
   164  }
   165  
   166  func (tc *Controller) worker(ctx context.Context) {
   167  	for tc.processNextWorkItem(ctx) {
   168  	}
   169  }
   170  
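        // processNextWorkItem pops a single key off the queue, processes the
        // corresponding Job, and returns false only when the queue is shutting down.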
   171  func (tc *Controller) processNextWorkItem(ctx context.Context) bool {
   172  	key, quit := tc.queue.Get()
   173  	if quit {
   174  		return false
   175  	}
   176  	defer tc.queue.Done(key)
   177  
   178  	err := tc.processJob(ctx, key)
   179  	tc.handleErr(err, key)
   180  
   181  	return true
   182  }
   183  
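        // handleErr forgets the key from the rate limiter when processing succeeded;
        // otherwise it records the error and re-queues the key with rate-limited backoff.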
   184  func (tc *Controller) handleErr(err error, key string) {
   185  	if err == nil {
   186  		tc.queue.Forget(key)
   187  		return
   188  	}
   189  
   190  	utilruntime.HandleError(fmt.Errorf("error cleaning up Job %v, will retry: %v", key, err))
   191  	tc.queue.AddRateLimited(key)
   192  }
   193  
   194  // processJob checks the Job's state and TTL and deletes the Job once it has
   195  // finished and its TTL after finishing has expired. If the Job has finished but
   196  // its TTL hasn't expired, it is re-added to the queue to be processed after the
   197  // TTL is expected to expire; if the Job hasn't finished, it is ignored.
   198  // This function is not meant to be invoked concurrently with the same key.
   199  func (tc *Controller) processJob(ctx context.Context, key string) error {
   200  	namespace, name, err := cache.SplitMetaNamespaceKey(key)
   201  	if err != nil {
   202  		return err
   203  	}
   204  
   205  	// Ignore the Jobs that are already deleted or being deleted, or the ones that don't need cleanup.
   206  	job, err := tc.jLister.Jobs(namespace).Get(name)
   207  
   208  	logger := klog.FromContext(ctx)
   209  	logger.V(4).Info("Checking if Job is ready for cleanup", "job", klog.KRef(namespace, name))
   210  
   211  	if errors.IsNotFound(err) {
   212  		return nil
   213  	}
   214  	if err != nil {
   215  		return err
   216  	}
   217  
   218  	if expiredAt, err := tc.processTTL(logger, job); err != nil {
   219  		return err
   220  	} else if expiredAt == nil {
   221  		return nil
   222  	}
   223  
   224  	// The Job's TTL is assumed to have expired, but the Job TTL might be stale.
   225  	// Before deleting the Job, do a final sanity check.
   226  	// If the TTL is modified before we do this check, we cannot be sure whether the TTL has truly expired.
   227  	// The latest Job may have a different UID, but it's fine because the checks will be run again.
   228  	fresh, err := tc.client.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
   229  	if errors.IsNotFound(err) {
   230  		return nil
   231  	}
   232  	if err != nil {
   233  		return err
   234  	}
   235  	// Use the latest Job TTL to see if the TTL has truly expired.
   236  	expiredAt, err := tc.processTTL(logger, fresh)
   237  	if err != nil {
   238  		return err
   239  	} else if expiredAt == nil {
   240  		return nil
   241  	}
   242  	// Cascade delete the Job if the TTL has truly expired.
   243  	policy := metav1.DeletePropagationForeground
   244  	options := metav1.DeleteOptions{
   245  		PropagationPolicy: &policy,
   246  		Preconditions:     &metav1.Preconditions{UID: &fresh.UID},
   247  	}
   248  	logger.V(4).Info("Cleaning up Job", "job", klog.KObj(fresh))
   249  	if err := tc.client.BatchV1().Jobs(fresh.Namespace).Delete(ctx, fresh.Name, options); err != nil {
   250  		return err
   251  	}
   252  	metrics.JobDeletionDurationSeconds.Observe(time.Since(*expiredAt).Seconds())
   253  	return nil
   254  }
   255  
   256  // processTTL checks whether a given Job's TTL has expired; if it has not expired yet, it adds the Job back to the
   257  // queue to be processed again after the TTL is expected to expire.
   258  func (tc *Controller) processTTL(logger klog.Logger, job *batch.Job) (expiredAt *time.Time, err error) {
   259  
   260  	// We don't care about the Jobs that are going to be deleted, or the ones that don't need cleanup.
   261  	if job.DeletionTimestamp != nil || !needsCleanup(job) {
   262  		return nil, nil
   263  	}
   264  
   265  	now := tc.clock.Now()
   266  	t, e, err := timeLeft(logger, job, &now)
   267  	if err != nil {
   268  		return nil, err
   269  	}
   270  
   271  	// TTL has expired
   272  	if *t <= 0 {
   273  		return e, nil
   274  	}
   275  
   276  	tc.enqueueAfter(job, *t)
   277  	return nil, nil
   278  }
   279  
   280  // needsCleanup checks whether a Job has finished and has a TTL set.
   281  func needsCleanup(j *batch.Job) bool {
   282  	return j.Spec.TTLSecondsAfterFinished != nil && jobutil.IsJobFinished(j)
   283  }
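        // An illustrative sketch (not part of the upstream file) of a finished Job that
        // needsCleanup selects: the TTL field is set and the Complete condition is True.
        // The names and values here are made up for the example.
        //
        //	ttl := int32(300)
        //	job := &batch.Job{
        //		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "pi"},
        //		Spec:       batch.JobSpec{TTLSecondsAfterFinished: &ttl},
        //		Status: batch.JobStatus{
        //			Conditions: []batch.JobCondition{{
        //				Type:               batch.JobComplete,
        //				Status:             v1.ConditionTrue,
        //				LastTransitionTime: metav1.Now(),
        //			}},
        //		},
        //	}
        //
        // needsCleanup(job) reports true for such a Job, and jobFinishTime(job) returns
        // the condition's LastTransitionTime.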
   284  
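        // getFinishAndExpireTime returns the time the Job finished and the time its TTL
        // expires (finish time plus TTLSecondsAfterFinished). It returns an error if the
        // Job does not need cleanup or its finish time cannot be determined.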
   285  func getFinishAndExpireTime(j *batch.Job) (*time.Time, *time.Time, error) {
   286  	if !needsCleanup(j) {
   287  		return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
   288  	}
   289  	t, err := jobFinishTime(j)
   290  	if err != nil {
   291  		return nil, nil, err
   292  	}
   293  	finishAt := t.Time
   294  	expireAt := finishAt.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
   295  	return &finishAt, &expireAt, nil
   296  }
   297  
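        // timeLeft returns how long remains until the Job's TTL expires, measured from
        // `since`, together with the expiration time itself; a non-positive remaining
        // duration means the TTL has already expired.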
   298  func timeLeft(logger klog.Logger, j *batch.Job, since *time.Time) (*time.Duration, *time.Time, error) {
   299  	finishAt, expireAt, err := getFinishAndExpireTime(j)
   300  	if err != nil {
   301  		return nil, nil, err
   302  	}
   303  
   304  	if finishAt.After(*since) {
   305  		logger.Info("Warning: Found Job finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", "job", klog.KObj(j))
   306  	}
   307  	remaining := expireAt.Sub(*since)
   308  	logger.V(4).Info("Found Job finished", "job", klog.KObj(j), "finishTime", finishAt.UTC(), "remainingTTL", remaining, "startTime", since.UTC(), "deadlineTTL", expireAt.UTC())
   309  	return &remaining, expireAt, nil
   310  }
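        // Worked example (illustrative values): a Job with ttlSecondsAfterFinished=300
        // whose Complete condition became True at 10:00:00 expires at 10:05:00. Calling
        // timeLeft with since=10:03:00 yields a remaining duration of 2m0s, so processTTL
        // re-enqueues the Job roughly two minutes later; with since=10:06:00 the remaining
        // duration is -1m0s, which is <= 0, so processTTL reports the Job as expired and
        // processJob proceeds to delete it.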
   311  
   312  // jobFinishTime takes an already finished Job and returns the time it finished.
   313  func jobFinishTime(finishedJob *batch.Job) (metav1.Time, error) {
   314  	for _, c := range finishedJob.Status.Conditions {
   315  		if (c.Type == batch.JobComplete || c.Type == batch.JobFailed) && c.Status == v1.ConditionTrue {
   316  			finishAt := c.LastTransitionTime
   317  			if finishAt.IsZero() {
   318  				return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
   319  			}
   320  			return c.LastTransitionTime, nil
   321  		}
   322  	}
   323  
   324  	// This should never happen if the Job has finished
   325  	return metav1.Time{}, fmt.Errorf("unable to find the status of the finished Job %s/%s", finishedJob.Namespace, finishedJob.Name)
   326  }