k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/ttlafterfinished/ttlafterfinished_controller.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package ttlafterfinished 18 19 import ( 20 "context" 21 "fmt" 22 "time" 23 24 batch "k8s.io/api/batch/v1" 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/apimachinery/pkg/api/errors" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 29 "k8s.io/apimachinery/pkg/util/wait" 30 batchinformers "k8s.io/client-go/informers/batch/v1" 31 clientset "k8s.io/client-go/kubernetes" 32 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 33 batchlisters "k8s.io/client-go/listers/batch/v1" 34 "k8s.io/client-go/tools/cache" 35 "k8s.io/client-go/tools/record" 36 "k8s.io/client-go/util/workqueue" 37 "k8s.io/klog/v2" 38 "k8s.io/kubectl/pkg/scheme" 39 "k8s.io/kubernetes/pkg/controller" 40 jobutil "k8s.io/kubernetes/pkg/controller/job/util" 41 "k8s.io/kubernetes/pkg/controller/ttlafterfinished/metrics" 42 "k8s.io/utils/clock" 43 ) 44 45 // Controller watches for changes of Jobs API objects. Triggered by Job creation 46 // and updates, it enqueues Jobs that have non-nil `.spec.ttlSecondsAfterFinished` 47 // to the `queue`. The Controller has workers who consume `queue`, check whether 48 // the Job TTL has expired or not; if the Job TTL hasn't expired, it will add the 49 // Job to the queue after the TTL is expected to expire; if the TTL has expired, the 50 // worker will send requests to the API server to delete the Jobs accordingly. 51 // This is implemented outside of Job controller for separation of concerns, and 52 // because it will be extended to handle other finishable resource types. 53 type Controller struct { 54 client clientset.Interface 55 recorder record.EventRecorder 56 57 // jLister can list/get Jobs from the shared informer's store 58 jLister batchlisters.JobLister 59 60 // jStoreSynced returns true if the Job store has been synced at least once. 61 // Added as a member to the struct to allow injection for testing. 62 jListerSynced cache.InformerSynced 63 64 // Jobs that the controller will check its TTL and attempt to delete when the TTL expires. 65 queue workqueue.TypedRateLimitingInterface[string] 66 67 // The clock for tracking time 68 clock clock.Clock 69 } 70 71 // New creates an instance of Controller 72 func New(ctx context.Context, jobInformer batchinformers.JobInformer, client clientset.Interface) *Controller { 73 eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx)) 74 eventBroadcaster.StartStructuredLogging(3) 75 eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: client.CoreV1().Events("")}) 76 77 metrics.Register() 78 79 tc := &Controller{ 80 client: client, 81 recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "ttl-after-finished-controller"}), 82 queue: workqueue.NewTypedRateLimitingQueueWithConfig( 83 workqueue.DefaultTypedControllerRateLimiter[string](), 84 workqueue.TypedRateLimitingQueueConfig[string]{Name: "ttl_jobs_to_delete"}, 85 ), 86 } 87 88 logger := klog.FromContext(ctx) 89 jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 90 AddFunc: func(obj interface{}) { 91 tc.addJob(logger, obj) 92 }, 93 UpdateFunc: func(oldObj, newObj interface{}) { 94 tc.updateJob(logger, oldObj, newObj) 95 }, 96 }) 97 98 tc.jLister = jobInformer.Lister() 99 tc.jListerSynced = jobInformer.Informer().HasSynced 100 101 tc.clock = clock.RealClock{} 102 103 return tc 104 } 105 106 // Run starts the workers to clean up Jobs. 107 func (tc *Controller) Run(ctx context.Context, workers int) { 108 defer utilruntime.HandleCrash() 109 defer tc.queue.ShutDown() 110 111 logger := klog.FromContext(ctx) 112 logger.Info("Starting TTL after finished controller") 113 defer logger.Info("Shutting down TTL after finished controller") 114 115 if !cache.WaitForNamedCacheSync("TTL after finished", ctx.Done(), tc.jListerSynced) { 116 return 117 } 118 119 for i := 0; i < workers; i++ { 120 go wait.UntilWithContext(ctx, tc.worker, time.Second) 121 } 122 123 <-ctx.Done() 124 } 125 126 func (tc *Controller) addJob(logger klog.Logger, obj interface{}) { 127 job := obj.(*batch.Job) 128 logger.V(4).Info("Adding job", "job", klog.KObj(job)) 129 130 if job.DeletionTimestamp == nil && needsCleanup(job) { 131 tc.enqueue(logger, job) 132 } 133 134 } 135 136 func (tc *Controller) updateJob(logger klog.Logger, old, cur interface{}) { 137 job := cur.(*batch.Job) 138 logger.V(4).Info("Updating job", "job", klog.KObj(job)) 139 140 if job.DeletionTimestamp == nil && needsCleanup(job) { 141 tc.enqueue(logger, job) 142 } 143 } 144 145 func (tc *Controller) enqueue(logger klog.Logger, job *batch.Job) { 146 logger.V(4).Info("Add job to cleanup", "job", klog.KObj(job)) 147 key, err := controller.KeyFunc(job) 148 if err != nil { 149 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err)) 150 return 151 } 152 153 tc.queue.Add(key) 154 } 155 156 func (tc *Controller) enqueueAfter(job *batch.Job, after time.Duration) { 157 key, err := controller.KeyFunc(job) 158 if err != nil { 159 utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", job, err)) 160 return 161 } 162 163 tc.queue.AddAfter(key, after) 164 } 165 166 func (tc *Controller) worker(ctx context.Context) { 167 for tc.processNextWorkItem(ctx) { 168 } 169 } 170 171 func (tc *Controller) processNextWorkItem(ctx context.Context) bool { 172 key, quit := tc.queue.Get() 173 if quit { 174 return false 175 } 176 defer tc.queue.Done(key) 177 178 err := tc.processJob(ctx, key) 179 tc.handleErr(err, key) 180 181 return true 182 } 183 184 func (tc *Controller) handleErr(err error, key string) { 185 if err == nil { 186 tc.queue.Forget(key) 187 return 188 } 189 190 utilruntime.HandleError(fmt.Errorf("error cleaning up Job %v, will retry: %v", key, err)) 191 tc.queue.AddRateLimited(key) 192 } 193 194 // processJob will check the Job's state and TTL and delete the Job when it 195 // finishes and its TTL after finished has expired. If the Job hasn't finished or 196 // its TTL hasn't expired, it will be added to the queue after the TTL is expected 197 // to expire. 198 // This function is not meant to be invoked concurrently with the same key. 199 func (tc *Controller) processJob(ctx context.Context, key string) error { 200 namespace, name, err := cache.SplitMetaNamespaceKey(key) 201 if err != nil { 202 return err 203 } 204 205 // Ignore the Jobs that are already deleted or being deleted, or the ones that don't need clean up. 206 job, err := tc.jLister.Jobs(namespace).Get(name) 207 208 logger := klog.FromContext(ctx) 209 logger.V(4).Info("Checking if Job is ready for cleanup", "job", klog.KRef(namespace, name)) 210 211 if errors.IsNotFound(err) { 212 return nil 213 } 214 if err != nil { 215 return err 216 } 217 218 if expiredAt, err := tc.processTTL(logger, job); err != nil { 219 return err 220 } else if expiredAt == nil { 221 return nil 222 } 223 224 // The Job's TTL is assumed to have expired, but the Job TTL might be stale. 225 // Before deleting the Job, do a final sanity check. 226 // If TTL is modified before we do this check, we cannot be sure if the TTL truly expires. 227 // The latest Job may have a different UID, but it's fine because the checks will be run again. 228 fresh, err := tc.client.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) 229 if errors.IsNotFound(err) { 230 return nil 231 } 232 if err != nil { 233 return err 234 } 235 // Use the latest Job TTL to see if the TTL truly expires. 236 expiredAt, err := tc.processTTL(logger, fresh) 237 if err != nil { 238 return err 239 } else if expiredAt == nil { 240 return nil 241 } 242 // Cascade deletes the Jobs if TTL truly expires. 243 policy := metav1.DeletePropagationForeground 244 options := metav1.DeleteOptions{ 245 PropagationPolicy: &policy, 246 Preconditions: &metav1.Preconditions{UID: &fresh.UID}, 247 } 248 logger.V(4).Info("Cleaning up Job", "job", klog.KObj(fresh)) 249 if err := tc.client.BatchV1().Jobs(fresh.Namespace).Delete(ctx, fresh.Name, options); err != nil { 250 return err 251 } 252 metrics.JobDeletionDurationSeconds.Observe(time.Since(*expiredAt).Seconds()) 253 return nil 254 } 255 256 // processTTL checks whether a given Job's TTL has expired, and add it to the queue after the TTL is expected to expire 257 // if the TTL will expire later. 258 func (tc *Controller) processTTL(logger klog.Logger, job *batch.Job) (expiredAt *time.Time, err error) { 259 260 // We don't care about the Jobs that are going to be deleted, or the ones that don't need clean up. 261 if job.DeletionTimestamp != nil || !needsCleanup(job) { 262 return nil, nil 263 } 264 265 now := tc.clock.Now() 266 t, e, err := timeLeft(logger, job, &now) 267 if err != nil { 268 return nil, err 269 } 270 271 // TTL has expired 272 if *t <= 0 { 273 return e, nil 274 } 275 276 tc.enqueueAfter(job, *t) 277 return nil, nil 278 } 279 280 // needsCleanup checks whether a Job has finished and has a TTL set. 281 func needsCleanup(j *batch.Job) bool { 282 return j.Spec.TTLSecondsAfterFinished != nil && jobutil.IsJobFinished(j) 283 } 284 285 func getFinishAndExpireTime(j *batch.Job) (*time.Time, *time.Time, error) { 286 if !needsCleanup(j) { 287 return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name) 288 } 289 t, err := jobFinishTime(j) 290 if err != nil { 291 return nil, nil, err 292 } 293 finishAt := t.Time 294 expireAt := finishAt.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second) 295 return &finishAt, &expireAt, nil 296 } 297 298 func timeLeft(logger klog.Logger, j *batch.Job, since *time.Time) (*time.Duration, *time.Time, error) { 299 finishAt, expireAt, err := getFinishAndExpireTime(j) 300 if err != nil { 301 return nil, nil, err 302 } 303 304 if finishAt.After(*since) { 305 logger.Info("Warning: Found Job finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", "job", klog.KObj(j)) 306 } 307 remaining := expireAt.Sub(*since) 308 logger.V(4).Info("Found Job finished", "job", klog.KObj(j), "finishTime", finishAt.UTC(), "remainingTTL", remaining, "startTime", since.UTC(), "deadlineTTL", expireAt.UTC()) 309 return &remaining, expireAt, nil 310 } 311 312 // jobFinishTime takes an already finished Job and returns the time it finishes. 313 func jobFinishTime(finishedJob *batch.Job) (metav1.Time, error) { 314 for _, c := range finishedJob.Status.Conditions { 315 if (c.Type == batch.JobComplete || c.Type == batch.JobFailed) && c.Status == v1.ConditionTrue { 316 finishAt := c.LastTransitionTime 317 if finishAt.IsZero() { 318 return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name) 319 } 320 return c.LastTransitionTime, nil 321 } 322 } 323 324 // This should never happen if the Jobs has finished 325 return metav1.Time{}, fmt.Errorf("unable to find the status of the finished Job %s/%s", finishedJob.Namespace, finishedJob.Name) 326 }