sigs.k8s.io/kubebuilder/v3@v3.14.0/hack/docs/internal/cronjob-tutorial/controller_implementation.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cronjob

const ControllerIntro = `
// +kubebuilder:docs-gen:collapse=Apache License

/*
We'll start out with some imports. You'll see below that we'll need a few more imports
than those scaffolded for us. We'll talk about each one when we use it.
*/`

const ControllerImport = `import (
	"context"
	"fmt"
	"sort"
	"time"

	"github.com/robfig/cron"
	kbatch "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ref "k8s.io/client-go/tools/reference"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"

	batchv1 "tutorial.kubebuilder.io/project/api/v1"
)

/*
Next, we'll need a Clock, which will allow us to fake timing in our tests.
*/
`

const ControllerMockClock = `
/*
We'll mock out the clock to make it easier to jump around in time while testing;
the "real" clock just calls` + " `" + `time.Now` + "`" + `.
*/
type realClock struct{}

func (_ realClock) Now() time.Time { return time.Now() }

// Clock knows how to get the current time.
// It can be used to fake out timing for testing.
type Clock interface {
	Now() time.Time
}

// +kubebuilder:docs-gen:collapse=Clock

/*
Notice that we need a few more RBAC permissions -- since we're creating and
managing jobs now, we'll need permissions for those, which means adding
a couple more [markers](/reference/markers/rbac.md).
*/
`

const ControllerReconcile = `
//+kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=batch,resources=jobs/status,verbs=get

/*
Now, we get to the heart of the controller -- the reconciler logic.
*/
var (
	scheduledTimeAnnotation = "batch.tutorial.kubebuilder.io/scheduled-at"
)
`

const ControllerReconcileLogic = `log := log.FromContext(ctx)

	/*
		### 1: Load the CronJob by name

		We'll fetch the CronJob using our client. All client methods take a
		context (to allow for cancellation) as their first argument, and the object
		in question as their last. Get is a bit special, in that it takes a
		[` + "`" + `NamespacedName` + "`" + `](https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/client?tab=doc#ObjectKey)
		as the middle argument (most don't have a middle argument, as we'll see
		below).

		Many client methods also take variadic options at the end.
	*/
	var cronJob batchv1.CronJob
	if err := r.Get(ctx, req.NamespacedName, &cronJob); err != nil {
		log.Error(err, "unable to fetch CronJob")
		// we'll ignore not-found errors, since they can't be fixed by an immediate
		// requeue (we'll need to wait for a new notification), and we can get them
		// on deleted requests.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	/*
		### 2: List all active jobs, and update the status

		To fully update our status, we'll need to list all child jobs in this namespace that belong to this CronJob.
		Similarly to Get, we can use the List method to list the child jobs. Notice that we use variadic options to
		set the namespace and field match (which is actually an index lookup that we set up below).
	*/
	var childJobs kbatch.JobList
	if err := r.List(ctx, &childJobs, client.InNamespace(req.Namespace), client.MatchingFields{jobOwnerKey: req.Name}); err != nil {
		log.Error(err, "unable to list child Jobs")
		return ctrl.Result{}, err
	}

	/*

		<aside class="note">

		<h1>What is this index about?</h1>

		<p>The reconciler fetches all jobs owned by the cronjob for the status. As our number of cronjobs increases,
		looking these up can become quite slow as we have to filter through all of them. For a more efficient lookup,
		these jobs will be indexed locally on the controller's name. A jobOwnerKey field is added to the
		cached job objects. This key references the owning controller and functions as the index. Later in this
		document we will configure the manager to actually index this field.</p>

		</aside>

		Once we have all the jobs we own, we'll split them into active, successful,
		and failed jobs, keeping track of the most recent run so that we can record it
		in status. Remember, status should be able to be reconstituted from the state
		of the world, so it's generally not a good idea to read from the status of the
		root object. Instead, you should reconstruct it every run. That's what we'll
		do here.

		We can check if a job is "finished" and whether it succeeded or failed using status
		conditions. We'll put that logic in a helper to make our code cleaner.
	*/

	// find the active list of jobs
	var activeJobs []*kbatch.Job
	var successfulJobs []*kbatch.Job
	var failedJobs []*kbatch.Job
	var mostRecentTime *time.Time // find the last run so we can update the status

	/*
		We consider a job "finished" if it has a "Complete" or "Failed" condition marked as true.
		Status conditions allow us to add extensible status information to our objects that other
		humans and controllers can examine to check things like completion and health.
	*/
	isJobFinished := func(job *kbatch.Job) (bool, kbatch.JobConditionType) {
		for _, c := range job.Status.Conditions {
			if (c.Type == kbatch.JobComplete || c.Type == kbatch.JobFailed) && c.Status == corev1.ConditionTrue {
				return true, c.Type
			}
		}

		return false, ""
	}
	// +kubebuilder:docs-gen:collapse=isJobFinished

	/*
		We'll use a helper to extract the scheduled time from the annotation that
		we added during job creation.
	*/
	getScheduledTimeForJob := func(job *kbatch.Job) (*time.Time, error) {
		timeRaw := job.Annotations[scheduledTimeAnnotation]
		if len(timeRaw) == 0 {
			return nil, nil
		}

		timeParsed, err := time.Parse(time.RFC3339, timeRaw)
		if err != nil {
			return nil, err
		}
		return &timeParsed, nil
	}
	// +kubebuilder:docs-gen:collapse=getScheduledTimeForJob

	for i, job := range childJobs.Items {
		_, finishedType := isJobFinished(&job)
		switch finishedType {
		case "": // ongoing
			activeJobs = append(activeJobs, &childJobs.Items[i])
		case kbatch.JobFailed:
			failedJobs = append(failedJobs, &childJobs.Items[i])
		case kbatch.JobComplete:
			successfulJobs = append(successfulJobs, &childJobs.Items[i])
		}

		// We'll store the launch time in an annotation, so we'll reconstitute that from
		// the active jobs themselves.
		scheduledTimeForJob, err := getScheduledTimeForJob(&job)
		if err != nil {
			log.Error(err, "unable to parse schedule time for child job", "job", &job)
			continue
		}
		if scheduledTimeForJob != nil {
			if mostRecentTime == nil || mostRecentTime.Before(*scheduledTimeForJob) {
				mostRecentTime = scheduledTimeForJob
			}
		}
	}

	if mostRecentTime != nil {
		cronJob.Status.LastScheduleTime = &metav1.Time{Time: *mostRecentTime}
	} else {
		cronJob.Status.LastScheduleTime = nil
	}
	cronJob.Status.Active = nil
	for _, activeJob := range activeJobs {
		jobRef, err := ref.GetReference(r.Scheme, activeJob)
		if err != nil {
			log.Error(err, "unable to make reference to active job", "job", activeJob)
			continue
		}
		cronJob.Status.Active = append(cronJob.Status.Active, *jobRef)
	}

	/*
		Here, we'll log how many jobs we observed at a slightly higher logging level,
		for debugging. Notice how instead of using a format string, we use a fixed message,
		and attach key-value pairs with the extra information. This makes it easier to
		filter and query log lines.
	*/
	log.V(1).Info("job count", "active jobs", len(activeJobs), "successful jobs", len(successfulJobs), "failed jobs", len(failedJobs))

	/*
		Using the data we've gathered, we'll update the status of our CRD.
		Just like before, we use our client. To specifically update the status
		subresource, we'll use the` + " `" + `Status` + "`" + ` part of the client, with the` + " `" + `Update` + "`" + `
		method.

		The status subresource ignores changes to spec, so it's less likely to conflict
		with any other updates, and can have separate permissions.
	*/
	if err := r.Status().Update(ctx, &cronJob); err != nil {
		log.Error(err, "unable to update CronJob status")
		return ctrl.Result{}, err
	}

	/*
		Once we've updated our status, we can move on to ensuring that the state of
		the world matches what we want in our spec.

		### 3: Clean up old jobs according to the history limit

		First, we'll try to clean up old jobs, so that we don't leave too many lying
		around.
	*/

	// NB: deleting these is "best effort" -- if we fail on a particular one,
	// we won't requeue just to finish the deleting.
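	// Sort the failed jobs by start time (entries without a start time sort first) and
	// delete the oldest ones until only .spec.failedJobsHistoryLimit of them remain.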
	if cronJob.Spec.FailedJobsHistoryLimit != nil {
		sort.Slice(failedJobs, func(i, j int) bool {
			if failedJobs[i].Status.StartTime == nil {
				return failedJobs[j].Status.StartTime != nil
			}
			return failedJobs[i].Status.StartTime.Before(failedJobs[j].Status.StartTime)
		})
		for i, job := range failedJobs {
			if int32(i) >= int32(len(failedJobs))-*cronJob.Spec.FailedJobsHistoryLimit {
				break
			}
			if err := r.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil {
				log.Error(err, "unable to delete old failed job", "job", job)
			} else {
				log.V(0).Info("deleted old failed job", "job", job)
			}
		}
	}

	if cronJob.Spec.SuccessfulJobsHistoryLimit != nil {
		sort.Slice(successfulJobs, func(i, j int) bool {
			if successfulJobs[i].Status.StartTime == nil {
				return successfulJobs[j].Status.StartTime != nil
			}
			return successfulJobs[i].Status.StartTime.Before(successfulJobs[j].Status.StartTime)
		})
		for i, job := range successfulJobs {
			if int32(i) >= int32(len(successfulJobs))-*cronJob.Spec.SuccessfulJobsHistoryLimit {
				break
			}
			if err := r.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
				log.Error(err, "unable to delete old successful job", "job", job)
			} else {
				log.V(0).Info("deleted old successful job", "job", job)
			}
		}
	}

	/*
		### 4: Check if we're suspended

		If this object is suspended, we don't want to run any jobs, so we'll stop now.
		This is useful if something's broken with the job we're running and we want to
		pause runs to investigate or putz with the cluster, without deleting the object.
	*/

	if cronJob.Spec.Suspend != nil && *cronJob.Spec.Suspend {
		log.V(1).Info("cronjob suspended, skipping")
		return ctrl.Result{}, nil
	}

	/*
		### 5: Get the next scheduled run

		If we're not paused, we'll need to calculate the next scheduled run, and whether
		or not we've got a run that we haven't processed yet.
	*/

	/*
		We'll calculate the next scheduled time using our helpful cron library.
		We'll start calculating appropriate times from our last run, or the creation
		of the CronJob if we can't find a last run.

		If there are too many missed runs and we don't have any deadlines set, we'll
		bail so that we don't cause issues on controller restarts or wedges.

		Otherwise, we'll just return the missed runs (of which we'll just use the latest),
		and the next run, so that we can know when it's time to reconcile again.
	*/
	getNextSchedule := func(cronJob *batchv1.CronJob, now time.Time) (lastMissed time.Time, next time.Time, err error) {
		sched, err := cron.ParseStandard(cronJob.Spec.Schedule)
		if err != nil {
			return time.Time{}, time.Time{}, fmt.Errorf("Unparseable schedule %q: %v", cronJob.Spec.Schedule, err)
		}

		// for optimization purposes, cheat a bit and start from our last observed run time
		// we could reconstitute this here, but there's not much point, since we've
		// just updated it.
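		// Start from the last recorded schedule time, falling back to the CronJob's
		// creation timestamp if it has never run.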
		var earliestTime time.Time
		if cronJob.Status.LastScheduleTime != nil {
			earliestTime = cronJob.Status.LastScheduleTime.Time
		} else {
			earliestTime = cronJob.ObjectMeta.CreationTimestamp.Time
		}
		if cronJob.Spec.StartingDeadlineSeconds != nil {
			// controller is not going to schedule anything below this point
			schedulingDeadline := now.Add(-time.Second * time.Duration(*cronJob.Spec.StartingDeadlineSeconds))

			if schedulingDeadline.After(earliestTime) {
				earliestTime = schedulingDeadline
			}
		}
		if earliestTime.After(now) {
			return time.Time{}, sched.Next(now), nil
		}

		starts := 0
		for t := sched.Next(earliestTime); !t.After(now); t = sched.Next(t) {
			lastMissed = t
			// An object might miss several starts. For example, if
			// the controller gets wedged on Friday at 5:01pm when everyone has
			// gone home, and someone comes in on Tuesday AM and discovers
			// the problem and restarts the controller, then all the hourly
			// jobs, more than 80 of them for one hourly scheduledJob, should
			// all start running with no further intervention (if the scheduledJob
			// allows concurrency and late starts).
			//
			// However, if there is a bug somewhere, or an incorrect clock
			// on the controller's server or apiservers (for setting creationTimestamp),
			// then there could be so many missed start times (it could be off
			// by decades or more) that it would eat up all the CPU and memory
			// of this controller. In that case, we want to not try to list
			// all the missed start times.
			starts++
			if starts > 100 {
				// We can't get the most recent times, so just bail out with an error.
				return time.Time{}, time.Time{}, fmt.Errorf("Too many missed start times (> 100). Set or decrease .spec.startingDeadlineSeconds or check clock skew.")
			}
		}
		return lastMissed, sched.Next(now), nil
	}
	// +kubebuilder:docs-gen:collapse=getNextSchedule

	// figure out the next times that we need to create
	// jobs at (or anything we missed).
	missedRun, nextRun, err := getNextSchedule(&cronJob, r.Now())
	if err != nil {
		log.Error(err, "unable to figure out CronJob schedule")
		// we don't really care about requeuing until we get an update that
		// fixes the schedule, so don't return an error
		return ctrl.Result{}, nil
	}

	/*
		We'll prep our eventual request to requeue until the next job, and then figure
		out if we actually need to run.
	*/
	scheduledResult := ctrl.Result{RequeueAfter: nextRun.Sub(r.Now())} // save this so we can re-use it elsewhere
	log = log.WithValues("now", r.Now(), "next run", nextRun)

	/*
		### 6: Run a new job if it's on schedule, not past the deadline, and not blocked by our concurrency policy

		If we've missed a run, and we're still within the deadline to start it, we'll need to run a job.
	*/
	if missedRun.IsZero() {
		log.V(1).Info("no upcoming scheduled times, sleeping until next")
		return scheduledResult, nil
	}

	// make sure we're not too late to start the run
	log = log.WithValues("current run", missedRun)
	tooLate := false
	if cronJob.Spec.StartingDeadlineSeconds != nil {
		tooLate = missedRun.Add(time.Duration(*cronJob.Spec.StartingDeadlineSeconds) * time.Second).Before(r.Now())
	}
	if tooLate {
		log.V(1).Info("missed starting deadline for last run, sleeping till next")
		// TODO(directxman12): events
		return scheduledResult, nil
	}

	/*
		If we actually have to run a job, we'll need to either wait till existing ones finish,
		replace the existing ones, or just add new ones. If our information is out of date due
		to cache delay, we'll get a requeue when we get up-to-date information.
	*/
	// figure out how to run this job -- concurrency policy might forbid us from running
	// multiple at the same time...
	if cronJob.Spec.ConcurrencyPolicy == batchv1.ForbidConcurrent && len(activeJobs) > 0 {
		log.V(1).Info("concurrency policy blocks concurrent runs, skipping", "num active", len(activeJobs))
		return scheduledResult, nil
	}

	// ...or instruct us to replace existing ones...
	if cronJob.Spec.ConcurrencyPolicy == batchv1.ReplaceConcurrent {
		for _, activeJob := range activeJobs {
			// we don't care if the job was already deleted
			if err := r.Delete(ctx, activeJob, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil {
				log.Error(err, "unable to delete active job", "job", activeJob)
				return ctrl.Result{}, err
			}
		}
	}

	/*
		Once we've figured out what to do with existing jobs, we'll actually create our desired job.
	*/

	/*
		We need to construct a job based on our CronJob's template. We'll copy over the spec
		from the template and copy some basic object meta.

		Then, we'll set the "scheduled time" annotation so that we can reconstitute our
		` + "`" + `LastScheduleTime` + "`" + ` field each reconcile.

		Finally, we'll need to set an owner reference. This allows the Kubernetes garbage collector
		to clean up jobs when we delete the CronJob, and allows controller-runtime to figure out
		which cronjob needs to be reconciled when a given job changes (is added, deleted, completes, etc).
	*/
	constructJobForCronJob := func(cronJob *batchv1.CronJob, scheduledTime time.Time) (*kbatch.Job, error) {
		// We want job names for a given nominal start time to be deterministic, to avoid creating the same job twice.
		name := fmt.Sprintf("%s-%d", cronJob.Name, scheduledTime.Unix())

		job := &kbatch.Job{
			ObjectMeta: metav1.ObjectMeta{
				Labels:      make(map[string]string),
				Annotations: make(map[string]string),
				Name:        name,
				Namespace:   cronJob.Namespace,
			},
			Spec: *cronJob.Spec.JobTemplate.Spec.DeepCopy(),
		}
		for k, v := range cronJob.Spec.JobTemplate.Annotations {
			job.Annotations[k] = v
		}
		job.Annotations[scheduledTimeAnnotation] = scheduledTime.Format(time.RFC3339)
		for k, v := range cronJob.Spec.JobTemplate.Labels {
			job.Labels[k] = v
		}
		if err := ctrl.SetControllerReference(cronJob, job, r.Scheme); err != nil {
			return nil, err
		}

		return job, nil
	}
	// +kubebuilder:docs-gen:collapse=constructJobForCronJob

	// actually make the job...
	job, err := constructJobForCronJob(&cronJob, missedRun)
	if err != nil {
		log.Error(err, "unable to construct job from template")
		// don't bother requeuing until we get a change to the spec
		return scheduledResult, nil
	}

	// ...and create it on the cluster
	if err := r.Create(ctx, job); err != nil {
		log.Error(err, "unable to create Job for CronJob", "job", job)
		return ctrl.Result{}, err
	}

	log.V(1).Info("created Job for CronJob run", "job", job)

	/*
		### 7: Requeue when we either see a running job or it's time for the next scheduled run

		Finally, we'll return the result that we prepped above, which says we want to requeue
		when our next run would need to occur. This is taken as a maximum deadline -- if something
		else changes in between, like our job starting or finishing, or our object being modified,
		we might reconcile again sooner.
	*/
	// we'll requeue once we see the running job, and update our status
	return scheduledResult, nil
}

/*
### Setup

Finally, we'll update our setup. In order to allow our reconciler to quickly
look up Jobs by their owner, we'll need an index. We declare an index key that
we can later use with the client as a pseudo-field name, and then describe how to
extract the indexed value from the Job object. The indexer will automatically take
care of namespaces for us, so we just have to extract the owner name if the Job has
a CronJob owner.

Additionally, we'll inform the manager that this controller owns some Jobs, so that it
will automatically call Reconcile on the underlying CronJob when a Job changes, is
deleted, etc.
*/
var (
	jobOwnerKey = ".metadata.controller"
	apiGVStr    = batchv1.GroupVersion.String()
)
`
const ControllerSetupWithManager = `
	// set up a real clock, since we're not in a test
	if r.Clock == nil {
		r.Clock = realClock{}
	}

	if err := mgr.GetFieldIndexer().IndexField(context.Background(), &kbatch.Job{}, jobOwnerKey, func(rawObj client.Object) []string {
		// grab the job object, extract the owner...
		job := rawObj.(*kbatch.Job)
		owner := metav1.GetControllerOf(job)
		if owner == nil {
			return nil
		}
		// ...make sure it's a CronJob...
		if owner.APIVersion != apiGVStr || owner.Kind != "CronJob" {
			return nil
		}

		// ...and if so, return it
		return []string{owner.Name}
	}); err != nil {
		return err
	}
`
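
/*
The constant above only wires up the field indexer. For reference, SetupWithManager
typically finishes by registering the controller with the manager and declaring that it
owns Jobs -- a minimal sketch, assuming the usual controller-runtime builder (the exact
chain used by this scaffold lives outside this excerpt and may differ):

	return ctrl.NewControllerManagedBy(mgr).
		For(&batchv1.CronJob{}).
		Owns(&kbatch.Job{}).
		Complete(r)

Declaring Owns(&kbatch.Job{}) is what lets controller-runtime map a changed Job back to
its owning CronJob and enqueue a reconcile for it, as described in the Setup section above.
*/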