volcano.sh/volcano@v1.9.0/pkg/controllers/garbagecollector/garbagecollector.go (about)

     1  /*
     2  Copyright 2019 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package garbagecollector
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	"k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/util/wait"
    27  	"k8s.io/client-go/tools/cache"
    28  	"k8s.io/client-go/util/workqueue"
    29  	"k8s.io/klog/v2"
    30  
    31  	"volcano.sh/apis/pkg/apis/batch/v1alpha1"
    32  	vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
    33  	informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
    34  	vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
    35  	batchinformers "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
    36  	batchlisters "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
    37  	"volcano.sh/volcano/pkg/controllers/framework"
    38  )
    39  
    40  func init() {
    41  	framework.RegisterController(&gccontroller{})
    42  }
    43  
    44  // gccontroller runs reflectors to watch for changes of managed API
    45  // objects. Currently it only watches Jobs. Triggered by Job creation
    46  // and updates, it enqueues Jobs that have non-nil `.spec.ttlSecondsAfterFinished`
    47  // to the `queue`. The gccontroller has workers who consume `queue`, check whether
    48  // the Job TTL has expired or not; if the Job TTL hasn't expired, it will add the
    49  // Job to the queue after the TTL is expected to expire; if the TTL has expired, the
    50  // worker will send requests to the API server to delete the Jobs accordingly.
    51  // This is implemented outside of Job controller for separation of concerns, and
    52  // because it will be extended to handle other finishable resource types.
    53  type gccontroller struct {
    54  	vcClient vcclientset.Interface
    55  
    56  	jobInformer batchinformers.JobInformer
    57  
    58  	vcInformerFactory vcinformer.SharedInformerFactory
    59  
    60  	// A store of jobs
    61  	jobLister batchlisters.JobLister
    62  	jobSynced func() bool
    63  
    64  	// queues that need to be updated.
    65  	queue workqueue.RateLimitingInterface
    66  }
    67  
    68  func (gc *gccontroller) Name() string {
    69  	return "gc-controller"
    70  }
    71  
    72  // Initialize creates an instance of gccontroller.
    73  func (gc *gccontroller) Initialize(opt *framework.ControllerOption) error {
    74  	gc.vcClient = opt.VolcanoClient
    75  
    76  	factory := informerfactory.NewSharedInformerFactory(gc.vcClient, 0)
    77  	jobInformer := factory.Batch().V1alpha1().Jobs()
    78  
    79  	gc.vcInformerFactory = factory
    80  	gc.jobInformer = jobInformer
    81  	gc.jobLister = jobInformer.Lister()
    82  	gc.jobSynced = jobInformer.Informer().HasSynced
    83  	gc.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
    84  
    85  	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    86  		AddFunc:    gc.addJob,
    87  		UpdateFunc: gc.updateJob,
    88  	})
    89  
    90  	return nil
    91  }
    92  
    93  // Run starts the worker to clean up Jobs.
    94  func (gc *gccontroller) Run(stopCh <-chan struct{}) {
    95  	defer gc.queue.ShutDown()
    96  
    97  	klog.Infof("Starting garbage collector")
    98  	defer klog.Infof("Shutting down garbage collector")
    99  
   100  	gc.vcInformerFactory.Start(stopCh)
   101  	for informerType, ok := range gc.vcInformerFactory.WaitForCacheSync(stopCh) {
   102  		if !ok {
   103  			klog.Errorf("caches failed to sync: %v", informerType)
   104  			return
   105  		}
   106  	}
   107  
   108  	go wait.Until(gc.worker, time.Second, stopCh)
   109  
   110  	<-stopCh
   111  }
   112  
   113  func (gc *gccontroller) addJob(obj interface{}) {
   114  	job := obj.(*v1alpha1.Job)
   115  	klog.V(4).Infof("Adding job %s/%s", job.Namespace, job.Name)
   116  
   117  	if job.DeletionTimestamp == nil && needsCleanup(job) {
   118  		gc.enqueue(job)
   119  	}
   120  }
   121  
   122  func (gc *gccontroller) updateJob(old, cur interface{}) {
   123  	job := cur.(*v1alpha1.Job)
   124  	klog.V(4).Infof("Updating job %s/%s", job.Namespace, job.Name)
   125  
   126  	if job.DeletionTimestamp == nil && needsCleanup(job) {
   127  		gc.enqueue(job)
   128  	}
   129  }
   130  
   131  func (gc *gccontroller) enqueue(job *v1alpha1.Job) {
   132  	klog.V(4).Infof("Add job %s/%s to cleanup", job.Namespace, job.Name)
   133  	key, err := cache.MetaNamespaceKeyFunc(job)
   134  	if err != nil {
   135  		klog.Errorf("couldn't get key for object %#v: %v", job, err)
   136  		return
   137  	}
   138  
   139  	gc.queue.Add(key)
   140  }
   141  
   142  func (gc *gccontroller) enqueueAfter(job *v1alpha1.Job, after time.Duration) {
   143  	key, err := cache.MetaNamespaceKeyFunc(job)
   144  	if err != nil {
   145  		klog.Errorf("couldn't get key for object %#v: %v", job, err)
   146  		return
   147  	}
   148  
   149  	gc.queue.AddAfter(key, after)
   150  }
   151  
   152  func (gc *gccontroller) worker() {
   153  	for gc.processNextWorkItem() {
   154  	}
   155  }
   156  
   157  func (gc *gccontroller) processNextWorkItem() bool {
   158  	key, quit := gc.queue.Get()
   159  	if quit {
   160  		return false
   161  	}
   162  	defer gc.queue.Done(key)
   163  
   164  	err := gc.processJob(key.(string))
   165  	gc.handleErr(err, key)
   166  
   167  	return true
   168  }
   169  
   170  func (gc *gccontroller) handleErr(err error, key interface{}) {
   171  	if err == nil {
   172  		gc.queue.Forget(key)
   173  		return
   174  	}
   175  
   176  	klog.Errorf("error cleaning up Job %v, will retry: %v", key, err)
   177  	gc.queue.AddRateLimited(key)
   178  }
   179  
   180  // processJob will check the Job's state and TTL and delete the Job when it
   181  // finishes and its TTL after finished has expired. If the Job hasn't finished or
   182  // its TTL hasn't expired, it will be added to the queue after the TTL is expected
   183  // to expire.
   184  // This function is not meant to be invoked concurrently with the same key.
   185  func (gc *gccontroller) processJob(key string) error {
   186  	namespace, name, err := cache.SplitMetaNamespaceKey(key)
   187  	if err != nil {
   188  		return err
   189  	}
   190  
   191  	klog.V(4).Infof("Checking if Job %s/%s is ready for cleanup", namespace, name)
   192  	// Ignore the Jobs that are already deleted or being deleted, or the ones that don't need clean up.
   193  	job, err := gc.jobLister.Jobs(namespace).Get(name)
   194  	if errors.IsNotFound(err) {
   195  		return nil
   196  	}
   197  	if err != nil {
   198  		return err
   199  	}
   200  
   201  	if expired, err := gc.processTTL(job); err != nil {
   202  		return err
   203  	} else if !expired {
   204  		return nil
   205  	}
   206  
   207  	// The Job's TTL is assumed to have expired, but the Job TTL might be stale.
   208  	// Before deleting the Job, do a final sanity check.
   209  	// If TTL is modified before we do this check, we cannot be sure if the TTL truly expires.
   210  	// The latest Job may have a different UID, but it's fine because the checks will be run again.
   211  	fresh, err := gc.vcClient.BatchV1alpha1().Jobs(namespace).Get(context.TODO(), name, metav1.GetOptions{})
   212  	if errors.IsNotFound(err) {
   213  		return nil
   214  	}
   215  	if err != nil {
   216  		return err
   217  	}
   218  	// Use the latest Job TTL to see if the TTL truly expires.
   219  	if expired, err := gc.processTTL(fresh); err != nil {
   220  		return err
   221  	} else if !expired {
   222  		return nil
   223  	}
   224  	// Cascade deletes the Jobs if TTL truly expires.
   225  	policy := metav1.DeletePropagationForeground
   226  	options := metav1.DeleteOptions{
   227  		PropagationPolicy: &policy,
   228  		Preconditions:     &metav1.Preconditions{UID: &fresh.UID},
   229  	}
   230  	klog.V(4).Infof("Cleaning up Job %s/%s", namespace, name)
   231  	return gc.vcClient.BatchV1alpha1().Jobs(fresh.Namespace).Delete(context.TODO(), fresh.Name, options)
   232  }
   233  
   234  // processTTL checks whether a given Job's TTL has expired, and add it to the queue after the TTL is expected to expire
   235  // if the TTL will expire later.
   236  func (gc *gccontroller) processTTL(job *v1alpha1.Job) (expired bool, err error) {
   237  	// We don't care about the Jobs that are going to be deleted, or the ones that don't need clean up.
   238  	if job.DeletionTimestamp != nil || !needsCleanup(job) {
   239  		return false, nil
   240  	}
   241  
   242  	now := time.Now()
   243  	t, err := timeLeft(job, &now)
   244  	if err != nil {
   245  		return false, err
   246  	}
   247  
   248  	// TTL has expired
   249  	if *t <= 0 {
   250  		return true, nil
   251  	}
   252  
   253  	gc.enqueueAfter(job, *t)
   254  	return false, nil
   255  }
   256  
   257  // needsCleanup checks whether a Job has finished and has a TTL set.
   258  func needsCleanup(j *v1alpha1.Job) bool {
   259  	return j.Spec.TTLSecondsAfterFinished != nil && isJobFinished(j)
   260  }
   261  
   262  func isJobFinished(job *v1alpha1.Job) bool {
   263  	return job.Status.State.Phase == v1alpha1.Completed ||
   264  		job.Status.State.Phase == v1alpha1.Failed ||
   265  		job.Status.State.Phase == v1alpha1.Terminated
   266  }
   267  
   268  func getFinishAndExpireTime(j *v1alpha1.Job) (*time.Time, *time.Time, error) {
   269  	if !needsCleanup(j) {
   270  		return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
   271  	}
   272  	finishAt, err := jobFinishTime(j)
   273  	if err != nil {
   274  		return nil, nil, err
   275  	}
   276  	finishAtUTC := finishAt.UTC()
   277  	expireAtUTC := finishAtUTC.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
   278  	return &finishAtUTC, &expireAtUTC, nil
   279  }
   280  
   281  func timeLeft(j *v1alpha1.Job, since *time.Time) (*time.Duration, error) {
   282  	finishAt, expireAt, err := getFinishAndExpireTime(j)
   283  	if err != nil {
   284  		return nil, err
   285  	}
   286  	if finishAt.UTC().After(since.UTC()) {
   287  		klog.Warningf("Warning: Found Job %s/%s finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", j.Namespace, j.Name)
   288  	}
   289  	remaining := expireAt.UTC().Sub(since.UTC())
   290  	klog.V(4).Infof("Found Job %s/%s finished at %v, remaining TTL %v since %v, TTL will expire at %v", j.Namespace, j.Name, finishAt.UTC(), remaining, since.UTC(), expireAt.UTC())
   291  	return &remaining, nil
   292  }
   293  
   294  // jobFinishTime takes an already finished Job and returns the time it finishes.
   295  func jobFinishTime(finishedJob *v1alpha1.Job) (metav1.Time, error) {
   296  	if finishedJob.Status.State.LastTransitionTime.IsZero() {
   297  		return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
   298  	}
   299  	return finishedJob.Status.State.LastTransitionTime, nil
   300  }