volcano.sh/volcano@v1.9.0/pkg/controllers/garbagecollector/garbagecollector.go (about) 1 /* 2 Copyright 2019 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package garbagecollector 18 19 import ( 20 "context" 21 "fmt" 22 "time" 23 24 "k8s.io/apimachinery/pkg/api/errors" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/util/wait" 27 "k8s.io/client-go/tools/cache" 28 "k8s.io/client-go/util/workqueue" 29 "k8s.io/klog/v2" 30 31 "volcano.sh/apis/pkg/apis/batch/v1alpha1" 32 vcclientset "volcano.sh/apis/pkg/client/clientset/versioned" 33 informerfactory "volcano.sh/apis/pkg/client/informers/externalversions" 34 vcinformer "volcano.sh/apis/pkg/client/informers/externalversions" 35 batchinformers "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1" 36 batchlisters "volcano.sh/apis/pkg/client/listers/batch/v1alpha1" 37 "volcano.sh/volcano/pkg/controllers/framework" 38 ) 39 40 func init() { 41 framework.RegisterController(&gccontroller{}) 42 } 43 44 // gccontroller runs reflectors to watch for changes of managed API 45 // objects. Currently it only watches Jobs. Triggered by Job creation 46 // and updates, it enqueues Jobs that have non-nil `.spec.ttlSecondsAfterFinished` 47 // to the `queue`. The gccontroller has workers who consume `queue`, check whether 48 // the Job TTL has expired or not; if the Job TTL hasn't expired, it will add the 49 // Job to the queue after the TTL is expected to expire; if the TTL has expired, the 50 // worker will send requests to the API server to delete the Jobs accordingly. 51 // This is implemented outside of Job controller for separation of concerns, and 52 // because it will be extended to handle other finishable resource types. 53 type gccontroller struct { 54 vcClient vcclientset.Interface 55 56 jobInformer batchinformers.JobInformer 57 58 vcInformerFactory vcinformer.SharedInformerFactory 59 60 // A store of jobs 61 jobLister batchlisters.JobLister 62 jobSynced func() bool 63 64 // queues that need to be updated. 65 queue workqueue.RateLimitingInterface 66 } 67 68 func (gc *gccontroller) Name() string { 69 return "gc-controller" 70 } 71 72 // Initialize creates an instance of gccontroller. 73 func (gc *gccontroller) Initialize(opt *framework.ControllerOption) error { 74 gc.vcClient = opt.VolcanoClient 75 76 factory := informerfactory.NewSharedInformerFactory(gc.vcClient, 0) 77 jobInformer := factory.Batch().V1alpha1().Jobs() 78 79 gc.vcInformerFactory = factory 80 gc.jobInformer = jobInformer 81 gc.jobLister = jobInformer.Lister() 82 gc.jobSynced = jobInformer.Informer().HasSynced 83 gc.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) 84 85 jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 86 AddFunc: gc.addJob, 87 UpdateFunc: gc.updateJob, 88 }) 89 90 return nil 91 } 92 93 // Run starts the worker to clean up Jobs. 94 func (gc *gccontroller) Run(stopCh <-chan struct{}) { 95 defer gc.queue.ShutDown() 96 97 klog.Infof("Starting garbage collector") 98 defer klog.Infof("Shutting down garbage collector") 99 100 gc.vcInformerFactory.Start(stopCh) 101 for informerType, ok := range gc.vcInformerFactory.WaitForCacheSync(stopCh) { 102 if !ok { 103 klog.Errorf("caches failed to sync: %v", informerType) 104 return 105 } 106 } 107 108 go wait.Until(gc.worker, time.Second, stopCh) 109 110 <-stopCh 111 } 112 113 func (gc *gccontroller) addJob(obj interface{}) { 114 job := obj.(*v1alpha1.Job) 115 klog.V(4).Infof("Adding job %s/%s", job.Namespace, job.Name) 116 117 if job.DeletionTimestamp == nil && needsCleanup(job) { 118 gc.enqueue(job) 119 } 120 } 121 122 func (gc *gccontroller) updateJob(old, cur interface{}) { 123 job := cur.(*v1alpha1.Job) 124 klog.V(4).Infof("Updating job %s/%s", job.Namespace, job.Name) 125 126 if job.DeletionTimestamp == nil && needsCleanup(job) { 127 gc.enqueue(job) 128 } 129 } 130 131 func (gc *gccontroller) enqueue(job *v1alpha1.Job) { 132 klog.V(4).Infof("Add job %s/%s to cleanup", job.Namespace, job.Name) 133 key, err := cache.MetaNamespaceKeyFunc(job) 134 if err != nil { 135 klog.Errorf("couldn't get key for object %#v: %v", job, err) 136 return 137 } 138 139 gc.queue.Add(key) 140 } 141 142 func (gc *gccontroller) enqueueAfter(job *v1alpha1.Job, after time.Duration) { 143 key, err := cache.MetaNamespaceKeyFunc(job) 144 if err != nil { 145 klog.Errorf("couldn't get key for object %#v: %v", job, err) 146 return 147 } 148 149 gc.queue.AddAfter(key, after) 150 } 151 152 func (gc *gccontroller) worker() { 153 for gc.processNextWorkItem() { 154 } 155 } 156 157 func (gc *gccontroller) processNextWorkItem() bool { 158 key, quit := gc.queue.Get() 159 if quit { 160 return false 161 } 162 defer gc.queue.Done(key) 163 164 err := gc.processJob(key.(string)) 165 gc.handleErr(err, key) 166 167 return true 168 } 169 170 func (gc *gccontroller) handleErr(err error, key interface{}) { 171 if err == nil { 172 gc.queue.Forget(key) 173 return 174 } 175 176 klog.Errorf("error cleaning up Job %v, will retry: %v", key, err) 177 gc.queue.AddRateLimited(key) 178 } 179 180 // processJob will check the Job's state and TTL and delete the Job when it 181 // finishes and its TTL after finished has expired. If the Job hasn't finished or 182 // its TTL hasn't expired, it will be added to the queue after the TTL is expected 183 // to expire. 184 // This function is not meant to be invoked concurrently with the same key. 185 func (gc *gccontroller) processJob(key string) error { 186 namespace, name, err := cache.SplitMetaNamespaceKey(key) 187 if err != nil { 188 return err 189 } 190 191 klog.V(4).Infof("Checking if Job %s/%s is ready for cleanup", namespace, name) 192 // Ignore the Jobs that are already deleted or being deleted, or the ones that don't need clean up. 193 job, err := gc.jobLister.Jobs(namespace).Get(name) 194 if errors.IsNotFound(err) { 195 return nil 196 } 197 if err != nil { 198 return err 199 } 200 201 if expired, err := gc.processTTL(job); err != nil { 202 return err 203 } else if !expired { 204 return nil 205 } 206 207 // The Job's TTL is assumed to have expired, but the Job TTL might be stale. 208 // Before deleting the Job, do a final sanity check. 209 // If TTL is modified before we do this check, we cannot be sure if the TTL truly expires. 210 // The latest Job may have a different UID, but it's fine because the checks will be run again. 211 fresh, err := gc.vcClient.BatchV1alpha1().Jobs(namespace).Get(context.TODO(), name, metav1.GetOptions{}) 212 if errors.IsNotFound(err) { 213 return nil 214 } 215 if err != nil { 216 return err 217 } 218 // Use the latest Job TTL to see if the TTL truly expires. 219 if expired, err := gc.processTTL(fresh); err != nil { 220 return err 221 } else if !expired { 222 return nil 223 } 224 // Cascade deletes the Jobs if TTL truly expires. 225 policy := metav1.DeletePropagationForeground 226 options := metav1.DeleteOptions{ 227 PropagationPolicy: &policy, 228 Preconditions: &metav1.Preconditions{UID: &fresh.UID}, 229 } 230 klog.V(4).Infof("Cleaning up Job %s/%s", namespace, name) 231 return gc.vcClient.BatchV1alpha1().Jobs(fresh.Namespace).Delete(context.TODO(), fresh.Name, options) 232 } 233 234 // processTTL checks whether a given Job's TTL has expired, and add it to the queue after the TTL is expected to expire 235 // if the TTL will expire later. 236 func (gc *gccontroller) processTTL(job *v1alpha1.Job) (expired bool, err error) { 237 // We don't care about the Jobs that are going to be deleted, or the ones that don't need clean up. 238 if job.DeletionTimestamp != nil || !needsCleanup(job) { 239 return false, nil 240 } 241 242 now := time.Now() 243 t, err := timeLeft(job, &now) 244 if err != nil { 245 return false, err 246 } 247 248 // TTL has expired 249 if *t <= 0 { 250 return true, nil 251 } 252 253 gc.enqueueAfter(job, *t) 254 return false, nil 255 } 256 257 // needsCleanup checks whether a Job has finished and has a TTL set. 258 func needsCleanup(j *v1alpha1.Job) bool { 259 return j.Spec.TTLSecondsAfterFinished != nil && isJobFinished(j) 260 } 261 262 func isJobFinished(job *v1alpha1.Job) bool { 263 return job.Status.State.Phase == v1alpha1.Completed || 264 job.Status.State.Phase == v1alpha1.Failed || 265 job.Status.State.Phase == v1alpha1.Terminated 266 } 267 268 func getFinishAndExpireTime(j *v1alpha1.Job) (*time.Time, *time.Time, error) { 269 if !needsCleanup(j) { 270 return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name) 271 } 272 finishAt, err := jobFinishTime(j) 273 if err != nil { 274 return nil, nil, err 275 } 276 finishAtUTC := finishAt.UTC() 277 expireAtUTC := finishAtUTC.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second) 278 return &finishAtUTC, &expireAtUTC, nil 279 } 280 281 func timeLeft(j *v1alpha1.Job, since *time.Time) (*time.Duration, error) { 282 finishAt, expireAt, err := getFinishAndExpireTime(j) 283 if err != nil { 284 return nil, err 285 } 286 if finishAt.UTC().After(since.UTC()) { 287 klog.Warningf("Warning: Found Job %s/%s finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", j.Namespace, j.Name) 288 } 289 remaining := expireAt.UTC().Sub(since.UTC()) 290 klog.V(4).Infof("Found Job %s/%s finished at %v, remaining TTL %v since %v, TTL will expire at %v", j.Namespace, j.Name, finishAt.UTC(), remaining, since.UTC(), expireAt.UTC()) 291 return &remaining, nil 292 } 293 294 // jobFinishTime takes an already finished Job and returns the time it finishes. 295 func jobFinishTime(finishedJob *v1alpha1.Job) (metav1.Time, error) { 296 if finishedJob.Status.State.LastTransitionTime.IsZero() { 297 return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name) 298 } 299 return finishedJob.Status.State.LastTransitionTime, nil 300 }