k8s.io/kubernetes@v1.29.3/pkg/controller/ttlafterfinished/ttlafterfinished_controller.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ttlafterfinished

import (
	"context"
	"fmt"
	"time"

	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	batchinformers "k8s.io/client-go/informers/batch/v1"
	clientset "k8s.io/client-go/kubernetes"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	batchlisters "k8s.io/client-go/listers/batch/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/kubectl/pkg/scheme"
	"k8s.io/kubernetes/pkg/controller"
	jobutil "k8s.io/kubernetes/pkg/controller/job"
	"k8s.io/kubernetes/pkg/controller/ttlafterfinished/metrics"
	"k8s.io/utils/clock"
)

// Controller watches for changes to Job API objects. Triggered by Job creation
// and updates, it enqueues Jobs that have a non-nil `.spec.ttlSecondsAfterFinished`
// to the `queue`. The Controller has workers that consume `queue` and check whether
// the Job's TTL has expired; if it hasn't, the worker re-adds the Job to the queue
// at the time the TTL is expected to expire; if it has, the worker sends a request
// to the API server to delete the Job.
// This is implemented outside of the Job controller for separation of concerns, and
// because it will be extended to handle other finishable resource types.
type Controller struct {
	client   clientset.Interface
	recorder record.EventRecorder

	// jLister can list/get Jobs from the shared informer's store.
	jLister batchlisters.JobLister

	// jListerSynced returns true if the Job store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	jListerSynced cache.InformerSynced

	// Jobs whose TTL the controller will check and attempt to delete once the TTL expires.
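	// Keys are namespace/name strings produced by controller.KeyFunc; a Job whose TTL
	// has not yet expired is re-added with AddAfter, and a failed deletion is retried
	// via AddRateLimited.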
	queue workqueue.RateLimitingInterface

	// The clock for tracking time.
	clock clock.Clock
}

// New creates an instance of Controller.
func New(ctx context.Context, jobInformer batchinformers.JobInformer, client clientset.Interface) *Controller {
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartStructuredLogging(0)
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: client.CoreV1().Events("")})

	metrics.Register()

	tc := &Controller{
		client:   client,
		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "ttl-after-finished-controller"}),
		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "ttl_jobs_to_delete"),
	}

	logger := klog.FromContext(ctx)
	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			tc.addJob(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			tc.updateJob(logger, oldObj, newObj)
		},
	})

	tc.jLister = jobInformer.Lister()
	tc.jListerSynced = jobInformer.Informer().HasSynced

	tc.clock = clock.RealClock{}

	return tc
}

// Run starts the workers to clean up Jobs.
func (tc *Controller) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()
	defer tc.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting TTL after finished controller")
	defer logger.Info("Shutting down TTL after finished controller")

	if !cache.WaitForNamedCacheSync("TTL after finished", ctx.Done(), tc.jListerSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, tc.worker, time.Second)
	}

	<-ctx.Done()
}

func (tc *Controller) addJob(logger klog.Logger, obj interface{}) {
	job := obj.(*batch.Job)
	logger.V(4).Info("Adding job", "job", klog.KObj(job))

	if job.DeletionTimestamp == nil && needsCleanup(job) {
		tc.enqueue(logger, job)
	}
}

func (tc *Controller) updateJob(logger klog.Logger, old, cur interface{}) {
	job := cur.(*batch.Job)
	logger.V(4).Info("Updating job", "job", klog.KObj(job))

	if job.DeletionTimestamp == nil && needsCleanup(job) {
		tc.enqueue(logger, job)
	}
}

func (tc *Controller) enqueue(logger klog.Logger, job *batch.Job) {
	logger.V(4).Info("Add job to cleanup", "job", klog.KObj(job))
	key, err := controller.KeyFunc(job)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err))
		return
	}

	tc.queue.Add(key)
}

func (tc *Controller) enqueueAfter(job *batch.Job, after time.Duration) {
	key, err := controller.KeyFunc(job)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err))
		return
	}

	tc.queue.AddAfter(key, after)
}

func (tc *Controller) worker(ctx context.Context) {
	for tc.processNextWorkItem(ctx) {
	}
}

func (tc *Controller) processNextWorkItem(ctx context.Context) bool {
	key, quit := tc.queue.Get()
	if quit {
		return false
	}
	defer tc.queue.Done(key)

	err := tc.processJob(ctx, key.(string))
	tc.handleErr(err, key)

	return true
}

func (tc *Controller) handleErr(err error, key interface{}) {
	if err == nil {
		tc.queue.Forget(key)
		return
	}

	utilruntime.HandleError(fmt.Errorf("error cleaning up Job %v, will retry: %v", key, err))
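	// Re-add the key with rate limiting so the failed cleanup is retried with backoff.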
	tc.queue.AddRateLimited(key)
}

// processJob checks the Job's state and TTL, and deletes the Job once it has
// finished and its TTL after finished has expired. If the Job hasn't finished or
// its TTL hasn't expired, the Job is re-added to the queue at the time the TTL is
// expected to expire.
// This function is not meant to be invoked concurrently with the same key.
func (tc *Controller) processJob(ctx context.Context, key string) error {
	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}

	// Ignore Jobs that are already deleted or being deleted, or Jobs that don't need clean up.
	job, err := tc.jLister.Jobs(namespace).Get(name)

	logger := klog.FromContext(ctx)
	logger.V(4).Info("Checking if Job is ready for cleanup", "job", klog.KRef(namespace, name))

	if errors.IsNotFound(err) {
		return nil
	}
	if err != nil {
		return err
	}

	if expiredAt, err := tc.processTTL(logger, job); err != nil {
		return err
	} else if expiredAt == nil {
		return nil
	}

	// The Job's TTL is assumed to have expired, but the Job TTL might be stale.
	// Before deleting the Job, do a final sanity check.
	// If the TTL is modified before we do this check, we cannot be sure whether the TTL truly expired.
	// The latest Job may have a different UID, but it's fine because the checks will be run again.
	fresh, err := tc.client.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
	if errors.IsNotFound(err) {
		return nil
	}
	if err != nil {
		return err
	}
	// Use the latest Job TTL to see if the TTL truly expired.
	expiredAt, err := tc.processTTL(logger, fresh)
	if err != nil {
		return err
	} else if expiredAt == nil {
		return nil
	}
	// Cascade-delete the Job if the TTL truly expired.
	policy := metav1.DeletePropagationForeground
	options := metav1.DeleteOptions{
		PropagationPolicy: &policy,
		Preconditions:     &metav1.Preconditions{UID: &fresh.UID},
	}
	logger.V(4).Info("Cleaning up Job", "job", klog.KObj(fresh))
	if err := tc.client.BatchV1().Jobs(fresh.Namespace).Delete(ctx, fresh.Name, options); err != nil {
		return err
	}
	metrics.JobDeletionDurationSeconds.Observe(time.Since(*expiredAt).Seconds())
	return nil
}

// processTTL checks whether a given Job's TTL has expired; if it has not yet expired,
// the Job is added back to the queue at the time the TTL is expected to expire.
func (tc *Controller) processTTL(logger klog.Logger, job *batch.Job) (expiredAt *time.Time, err error) {
	// We don't care about Jobs that are going to be deleted, or Jobs that don't need clean up.
	if job.DeletionTimestamp != nil || !needsCleanup(job) {
		return nil, nil
	}

	now := tc.clock.Now()
	t, e, err := timeLeft(logger, job, &now)
	if err != nil {
		return nil, err
	}

	// TTL has expired
	if *t <= 0 {
		return e, nil
	}

	tc.enqueueAfter(job, *t)
	return nil, nil
}

// needsCleanup checks whether a Job has finished and has a TTL set.
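// "Finished" here means the Job has a Complete or Failed condition with status True
// (see jobFinishTime below).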
func needsCleanup(j *batch.Job) bool {
	return j.Spec.TTLSecondsAfterFinished != nil && jobutil.IsJobFinished(j)
}

func getFinishAndExpireTime(j *batch.Job) (*time.Time, *time.Time, error) {
	if !needsCleanup(j) {
		return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
	}
	t, err := jobFinishTime(j)
	if err != nil {
		return nil, nil, err
	}
	finishAt := t.Time
	expireAt := finishAt.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
	return &finishAt, &expireAt, nil
}

func timeLeft(logger klog.Logger, j *batch.Job, since *time.Time) (*time.Duration, *time.Time, error) {
	finishAt, expireAt, err := getFinishAndExpireTime(j)
	if err != nil {
		return nil, nil, err
	}

	if finishAt.After(*since) {
		logger.Info("Warning: Found Job finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", "job", klog.KObj(j))
	}
	remaining := expireAt.Sub(*since)
	logger.V(4).Info("Found Job finished", "job", klog.KObj(j), "finishTime", finishAt.UTC(), "remainingTTL", remaining, "startTime", since.UTC(), "deadlineTTL", expireAt.UTC())
	return &remaining, expireAt, nil
}

// jobFinishTime takes an already finished Job and returns the time it finished.
func jobFinishTime(finishedJob *batch.Job) (metav1.Time, error) {
	for _, c := range finishedJob.Status.Conditions {
		if (c.Type == batch.JobComplete || c.Type == batch.JobFailed) && c.Status == v1.ConditionTrue {
			finishAt := c.LastTransitionTime
			if finishAt.IsZero() {
				return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
			}
			return c.LastTransitionTime, nil
		}
	}

	// This should never happen if the Job has finished.
	return metav1.Time{}, fmt.Errorf("unable to find the status of the finished Job %s/%s", finishedJob.Namespace, finishedJob.Name)
}
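
// A minimal wiring sketch, assuming a shared informer factory and client have been
// constructed elsewhere (sharedInformers, kubeClient and the worker count are
// illustrative names, not part of this package):
//
//	tc := ttlafterfinished.New(ctx, sharedInformers.Batch().V1().Jobs(), kubeClient)
//	sharedInformers.Start(ctx.Done())
//	go tc.Run(ctx, 5)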