github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allochealth/tracker.go

package allochealth

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/serviceregistration"
	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// AllocHealthEventSource is the source used for emitting task events
	AllocHealthEventSource = "Alloc Unhealthy"

	// checkLookupInterval is the pace at which we check if the Consul or Nomad
	// checks for an allocation are healthy or unhealthy.
	checkLookupInterval = 500 * time.Millisecond
)

// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// checkLookupInterval is the repeated interval after which we check
	// if the Consul checks are healthy or unhealthy.
	checkLookupInterval time.Duration

	// useChecks specifies whether to consider Consul and Nomad service checks.
	useChecks bool

	// consulCheckCount is the total number of Consul service checks in the task
	// group, including task level checks.
	consulCheckCount int

	// nomadCheckCount is the total number of Nomad service checks in the task
	// group, including task level checks.
	nomadCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the status of Consul service checks
	consulClient serviceregistration.Handler

	// checkStore is used to look up the status of Nomad service checks
	checkStore checkstore.Shim

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
	// These tasks may terminate without affecting alloc health
	lifecycleTasks map[string]string

	// lock is used to lock shared fields listed below
	lock sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul and Nomad checks)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task in the allocation
	// name -> state
	taskHealth map[string]*taskHealthState

	// logger is for logging things
	logger hclog.Logger
}

// NewTracker returns a health tracker for the given allocation.
//
// Depending on job configuration, an allocation's health takes into consideration:
// - An alloc listener
// - Consul checks (via consul API)
// - Nomad checks (via client state)
func NewTracker(
	parentCtx context.Context,
	logger hclog.Logger,
	alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener,
	consulClient serviceregistration.Handler,
	checkStore checkstore.Shim,
	minHealthyTime time.Duration,
	useChecks bool,
) *Tracker {

	t := &Tracker{
		healthy:             make(chan bool, 1),
		allocStopped:        make(chan struct{}),
		alloc:               alloc,
		tg:                  alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime:      minHealthyTime,
		useChecks:           useChecks,
		allocUpdates:        allocUpdates,
		consulClient:        consulClient,
		checkStore:          checkStore,
		checkLookupInterval: checkLookupInterval,
		logger:              logger,
		lifecycleTasks:      map[string]string{},
	}

	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
	for _, task := range t.tg.Tasks {
		t.taskHealth[task.Name] = &taskHealthState{task: task}

		if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
			t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
		}

		c, n := countChecks(task.Services)
		t.consulCheckCount += c
		t.nomadCheckCount += n
	}

	c, n := countChecks(t.tg.Services)
	t.consulCheckCount += c
	t.nomadCheckCount += n

	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
	return t
}

func countChecks(services []*structs.Service) (consul, nomad int) {
	for _, service := range services {
		switch service.Provider {
		case structs.ServiceProviderNomad:
			nomad += len(service.Checks)
		default:
			consul += len(service.Checks)
		}
	}
	return
}

// Start starts the watcher.
func (t *Tracker) Start() {
	go t.watchTaskEvents()

	switch {
	case !t.useChecks:
		return
	case t.consulCheckCount > 0:
		go t.watchConsulEvents()
	case t.nomadCheckCount > 0:
		go t.watchNomadEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}

// AllocStoppedCh returns a channel that will be fired if the allocation is
// stopped. This means that health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
	t.lock.Lock()
	defer t.lock.Unlock()

	// Nothing to do since the failure wasn't task related
	if t.allocFailed {
		return nil
	}

	deadline, _ := t.ctx.Deadline()
	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range t.taskHealth {
		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, t.tg.Update.HealthyDeadline, t.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
		}
	}

	return events
}
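
// The sketch below is not part of the original file. It is a minimal,
// hypothetical illustration of how a caller might drive the Tracker API
// defined above: build a tracker, start its watchers, then wait for either a
// health verdict or a stop signal. The function name and the hard-coded
// minHealthyTime are assumptions made for this example only.
func exampleAwaitHealth(
	ctx context.Context,
	logger hclog.Logger,
	alloc *structs.Allocation,
	updates *cstructs.AllocListener,
	consul serviceregistration.Handler,
	checks checkstore.Shim,
) bool {
	tracker := NewTracker(ctx, logger, alloc, updates, consul, checks, 10*time.Second, true)
	tracker.Start()

	select {
	case healthy := <-tracker.HealthyCh():
		if !healthy {
			// TaskEvents explains which tasks contributed to the failure.
			for task, event := range tracker.TaskEvents() {
				logger.Warn("task unhealthy", "task", task, "event", event)
			}
		}
		return healthy
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped; no health verdict will be emitted.
		return false
	case <-ctx.Done():
		return false
	}
}
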
// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
	t.lock.Lock()
	defer t.lock.Unlock()

	t.tasksHealthy = healthy

	// if unhealthy, force waiting for new checks health status
	if !terminal && !healthy {
		t.checksHealthy = false
		return
	}

	// If we are marked healthy but we also require Consul checks to be healthy
	// and they are not yet, return, unless the task is terminal.
	usesConsulChecks := t.useChecks && t.consulCheckCount > 0
	if !terminal && healthy && usesConsulChecks && !t.checksHealthy {
		return
	}

	// If we are marked healthy but also require Nomad checks to be healthy and
	// they are not yet, return, unless the task is terminal.
	usesNomadChecks := t.useChecks && t.nomadCheckCount > 0
	if !terminal && healthy && usesNomadChecks && !t.checksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
		// nothing
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
// It returns true if health is propagated and no more health monitoring is needed.
//
// todo: this is currently being shared by watchConsulEvents and watchNomadEvents,
// and must be split up if/when we support registering services (and thus checks)
// of different providers.
func (t *Tracker) setCheckHealth(healthy bool) bool {
	t.lock.Lock()
	defer t.lock.Unlock()

	// check health should always be false if tasks are unhealthy
	// as checks might be missing from unhealthy tasks
	t.checksHealthy = healthy && t.tasksHealthy

	// Only signal if we are healthy and so are the tasks
	if !t.checksHealthy {
		return false
	}

	select {
	case t.healthy <- healthy:
		// nothing
	default:
	}

	// Shutdown the tracker, things are healthy so nothing to do
	t.cancelFn()
	return true
}

// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}

// watchTaskEvents is a long lived watcher that watches for the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
	alloc := t.alloc
	allStartedTime := time.Time{}

	waiter := newHealthyFuture()

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
			t.markAllocStopped()
			return
		}

		// Store the task states
		t.lock.Lock()
		for task, state := range alloc.TaskStates {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.state = state
			}
		}
		t.lock.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for taskName, state := range alloc.TaskStates {
			// If the task is a poststop task we do not want to evaluate it
			// since it will remain pending until the main task has finished
			// or exited.
			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
				continue
			}

			// If this is a poststart task which has already succeeded, we
			// should skip evaluation.
			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
				continue
			}

			// One of the tasks has failed so we can exit watching
			if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
				t.setTaskHealth(false, true)
				return
			}

			if state.State == structs.TaskStatePending {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				// task is either running or exited successfully
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			t.lock.Lock()
			t.allocFailed = true
			t.lock.Unlock()

			t.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// reset task health
			t.setTaskHealth(false, false)

			// Prevent the timer from firing at the old start time
			waiter.disable()

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				waiter.wait(t.minHealthyTime)
			}
		}

		select {
		case <-t.ctx.Done():
			return
		case newAlloc, ok := <-t.allocUpdates.Ch():
			if !ok {
				return
			}
			alloc = newAlloc
		case <-waiter.C():
			t.setTaskHealth(true, false)
		}
	}
}

// healthyFuture is used to fire after checks have been healthy for MinHealthyTime
type healthyFuture struct {
	timer *time.Timer
}

// newHealthyFuture will create a healthyFuture in a disabled state, and
// will do nothing until a call to wait takes place
func newHealthyFuture() *healthyFuture {
	timer := time.NewTimer(0)
	ht := &healthyFuture{timer: timer}
	ht.disable()
	return ht
}

// disable the healthyFuture from triggering
func (h *healthyFuture) disable() {
	if !h.timer.Stop() {
		// must ensure channel is clear
		// https://pkg.go.dev/time#Timer.Stop
		select {
		case <-h.timer.C:
		default:
		}
	}
}

// wait will reset the healthyFuture to trigger after dur passes.
func (h *healthyFuture) wait(dur time.Duration) {
	// must ensure timer is stopped
	// https://pkg.go.dev/time#Timer.Reset
	h.disable()
	h.timer.Reset(dur)
}

// C returns a channel on which the future will send when ready.
func (h *healthyFuture) C() <-chan time.Time {
	return h.timer.C
}

// watchConsulEvents is a watcher for the health of the allocation's Consul
// checks. If all checks report healthy the watcher will exit after the
// MinHealthyTime has been reached, otherwise the watcher will continue to
// check unhealthy checks until the ctx is cancelled.
//
// Does not watch Nomad service checks; see watchNomadEvents for those.
func (t *Tracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in Consul
	checkTicker := time.NewTicker(t.checkLookupInterval)
	defer checkTicker.Stop()

	// waiter is used to fire when the checks have been healthy for the MinHealthyTime
	waiter := newHealthyFuture()

	// primed marks whether the healthy waiter has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg are the registered objects in Consul for the allocation
	var allocReg *serviceregistration.AllocRegistration

OUTER:
	for {
		select {

		// we are shutting down
		case <-t.ctx.Done():
			return

		// it is time to check the checks
		case <-checkTicker.C:
			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}

		// enough time has passed with healthy checks
		case <-waiter.C():
			if t.setCheckHealth(true) {
				// final health set and propagated
				return
			}
			// checks are healthy but tasks are unhealthy,
			// reset and wait until all is healthy
			primed = false
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		t.lock.Lock()
		for task, reg := range allocReg.Tasks {
			if v, ok := t.taskHealth[task]; ok {
				v.taskRegistrations = reg
			}
		}
		t.lock.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					onUpdate := sreg.CheckOnUpdate[check.CheckID]
					switch check.Status {
					case api.HealthPassing:
						continue
					case api.HealthWarning:
						if onUpdate == structs.OnUpdateIgnoreWarn || onUpdate == structs.OnUpdateIgnore {
							continue
						}
					case api.HealthCritical:
						if onUpdate == structs.OnUpdateIgnore {
							continue
						}
					default:
					}

					passed = false
					t.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				primed = false
				waiter.disable()
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			primed = true
			waiter.disable()
			waiter.wait(t.minHealthyTime)
		}
	}
}

// watchNomadEvents is a watcher for the health of the allocation's Nomad checks.
// If all checks report healthy the watcher will exit after the MinHealthyTime has
// been reached, otherwise the watcher will continue to check unhealthy checks until
// the ctx is cancelled.
//
// Does not watch Consul service checks; see watchConsulEvents for those.
func (t *Tracker) watchNomadEvents() {
	// checkTicker is the timer that triggers us to look at the checks in Nomad
	checkTicker, cancel := helper.NewSafeTimer(t.checkLookupInterval)
	defer cancel()

	// waiter is used to fire when the checks have been healthy for the MinHealthyTime
	waiter := newHealthyFuture()

	// allocID of the allocation we are watching checks for
	allocID := t.alloc.ID

	// primed marks whether the healthy waiter has been set
	primed := false

	// latest set of nomad check results
	var results map[structs.CheckID]*structs.CheckQueryResult

	for {
		select {

		// tracker has been canceled, so stop waiting
		case <-t.ctx.Done():
			return

		// it is time to check the checks
		case <-checkTicker.C:
			results = t.checkStore.List(allocID)
			checkTicker.Reset(t.checkLookupInterval)

		// enough time has passed with healthy checks
		case <-waiter.C():
			if t.setCheckHealth(true) { // todo(shoenig) this needs to be split between Consul and Nomad
				return // final health set and propagated
			}
			// checks are healthy but tasks are unhealthy, reset and wait
			primed = false
		}

		// scan to see if any checks are failing
		passing := true
		for _, result := range results {
			switch result.Status {
			case structs.CheckSuccess:
				continue
			case structs.CheckFailure:
				if result.Mode == structs.Readiness {
					continue
				}
				passing = false
			default:
				// i.e. pending check; do not consider healthy or ready
				passing = false
			}

			if !passing {
				break // 1+ check is failing; no need to continue
			}
		}

		if !passing {
			// at least one check is failing, transition to unhealthy
			t.setCheckHealth(false)
			primed = false
			waiter.disable()
		}

		if passing && !primed {
			// healthy but not yet primed, set timer to wait
			primed = true
			waiter.wait(t.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *serviceregistration.ServiceRegistrations
}

// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns true if the task has contributed to the
// allocation being unhealthy and if so, an event description of why.
func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			desiredChecks += nc
		}
	}
	requireChecks := (desiredChecks > 0) && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}

		switch t.state.State {
		case structs.TaskStatePending:
			return fmt.Sprintf("Task not running by healthy_deadline of %v", healthyDeadline), true
		case structs.TaskStateDead:
			// non-sidecar hook lifecycle tasks are healthy if they exit with success
			if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
				return "Unhealthy because of dead task", true
			}
		case structs.TaskStateRunning:
			// We are running so check if we have been running long enough
			if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
				return fmt.Sprintf("Task not running for min_healthy_time of %v by healthy_deadline of %v", minHealthyTime, healthyDeadline), true
			}
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
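
// The sketch below is not part of the original file. It is a hypothetical
// illustration of how taskHealthState.event distills a failure reason: a task
// that is still pending at the healthy deadline yields a descriptive message.
// The function name and the literal durations are assumptions for this example.
func exampleExplainPendingTask() (string, bool) {
	ths := &taskHealthState{
		task:  &structs.Task{Name: "web"},
		state: &structs.TaskState{State: structs.TaskStatePending},
	}

	// With a pending task, event reports that the task was not running by the
	// healthy_deadline, regardless of check state.
	deadline := time.Now().Add(5 * time.Minute)
	return ths.event(deadline, 5*time.Minute, 10*time.Second, false)
	// -> "Task not running by healthy_deadline of 5m0s", true
}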