github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allochealth/tracker.go

package allochealth

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	hclog "github.com/hashicorp/go-hclog"
	cconsul "github.com/hashicorp/nomad/client/consul"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// AllocHealthEventSource is the source used for emitting task events
	AllocHealthEventSource = "Alloc Unhealthy"

	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// checkLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	checkLookupInterval time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient cconsul.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
	// These tasks may terminate without affecting alloc health
	lifecycleTasks map[string]string

	// l is used to lock the shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState

	logger hclog.Logger
}
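// exampleWatch is an illustrative sketch (hypothetical, not wired into any
// hook in this package) of how a caller might drive the Tracker defined
// above: construct it with NewTracker, start the watchers, then wait for a
// health verdict, a stop notification, or caller cancellation. The 10-second
// minimum healthy time and the decision to use Consul checks are assumptions
// made only for this example.
func exampleWatch(ctx context.Context, logger hclog.Logger, alloc *structs.Allocation,
	listener *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI) (healthy, reported bool) {

	tracker := NewTracker(ctx, logger, alloc, listener, consulClient, 10*time.Second, true)
	tracker.Start()

	select {
	case h := <-tracker.HealthyCh():
		// The tracker reached a verdict and has already shut itself down.
		return h, true
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped; health will never be set.
		return false, false
	case <-ctx.Done():
		// Caller gave up waiting.
		return false, false
	}
}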
// NewTracker returns a health tracker for the given allocation. An alloc
// listener and consul API object are given so that the watcher can detect
// health changes.
func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *Tracker {

	// Do not create a named sub-logger as the hook controlling
	// this struct should pass in an appropriately named
	// sub-logger.
	t := &Tracker{
		healthy:             make(chan bool, 1),
		allocStopped:        make(chan struct{}),
		alloc:               alloc,
		tg:                  alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime:      minHealthyTime,
		useChecks:           useChecks,
		allocUpdates:        allocUpdates,
		consulClient:        consulClient,
		checkLookupInterval: consulCheckLookupInterval,
		logger:              logger,
		lifecycleTasks:      map[string]string{},
	}

	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
	for _, task := range t.tg.Tasks {
		t.taskHealth[task.Name] = &taskHealthState{task: task}

		if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
			t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
		}

		for _, s := range task.Services {
			t.consulCheckCount += len(s.Checks)
		}
	}

	for _, s := range t.tg.Services {
		t.consulCheckCount += len(s.Checks)
	}

	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
	return t
}

// Start starts the watcher.
func (t *Tracker) Start() {
	go t.watchTaskEvents()
	if t.useChecks {
		go t.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}

// AllocStoppedCh returns a channel that will be fired if the allocation is
// stopped. This means that health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
	t.l.Lock()
	defer t.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if t.allocFailed {
		return nil
	}

	deadline, _ := t.ctx.Deadline()
	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))

	// Go through the task information and build the event map
	for task, state := range t.taskHealth {
		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If
// the allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
	t.l.Lock()
	defer t.l.Unlock()
	t.tasksHealthy = healthy

	// if unhealthy, force waiting for new checks health status
	if !terminal && !healthy {
		t.checksHealthy = false
		return
	}

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := t.useChecks && t.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !t.checksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}
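// notifyOnce is an illustrative sketch (hypothetical helper, not called by the
// tracker) of the signalling pattern setTaskHealth above and setCheckHealth
// below rely on: healthy is a buffered channel of size 1, so a non-blocking
// send either records the verdict or is dropped if one is already pending,
// and the tracker never blocks waiting on a listener.
func notifyOnce(ch chan bool, verdict bool) {
	select {
	case ch <- verdict:
		// The verdict stays buffered until the listener reads it.
	default:
		// A verdict is already pending; drop this one.
	}
}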
// setCheckHealth is used to mark the checks as either healthy or unhealthy.
// It returns true if health is propagated and no more health monitoring is
// needed.
func (t *Tracker) setCheckHealth(healthy bool) bool {
	t.l.Lock()
	defer t.l.Unlock()

	// check health should always be false if tasks are unhealthy
	// as checks might be missing from unhealthy tasks
	t.checksHealthy = healthy && t.tasksHealthy

	// Only signal if the checks are healthy and so are the tasks
	if !t.checksHealthy {
		return false
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
	return true
}

// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}
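// stopAndDrainTimer is an illustrative sketch (hypothetical helper, not called
// by the watchers below) of the timer idiom both watchers repeat inline: stop
// the timer and, if it has already fired, drain its channel so a stale tick
// cannot be read later and mistaken for MinHealthyTime having elapsed.
func stopAndDrainTimer(timer *time.Timer) {
	if !timer.Stop() {
		select {
		case <-timer.C:
		default:
		}
	}
}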
// watchTaskEvents is a long lived watcher that watches for the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
	alloc := t.alloc
	allStartedTime := time.Time{}
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
			t.markAllocStopped()
			return
		}

		// Store the task states
		t.l.Lock()
		for task, state := range alloc.TaskStates {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.state = state
			}
		}
		t.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started yet
		latestStartTime := time.Time{}
		for taskName, state := range alloc.TaskStates {
			// If the task is a poststop task we do not want to evaluate it
			// since it will remain pending until the main task has finished
			// or exited.
			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
				continue
			}

			// One of the tasks has failed so we can exit watching
			if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
				t.setTaskHealth(false, true)
				return
			}

			if state.State == structs.TaskStatePending {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				// task is either running or exited successfully
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			t.l.Lock()
			t.allocFailed = true
			t.l.Unlock()

			t.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// reset task health
			t.setTaskHealth(false, false)

			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(t.minHealthyTime)
			}
		}

		select {
		case <-t.ctx.Done():
			return
		case newAlloc, ok := <-t.allocUpdates.Ch():
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			t.setTaskHealth(true, false)
		}
	}
}
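// checkPassesSketch is an illustrative sketch (hypothetical helper, mirroring
// the CHECKS loop in watchConsulEvents below rather than being called by it)
// of how a Consul check status combines with the service's on_update setting:
// warnings count as passing under ignore_warnings or ignore, criticals only
// under ignore, and any other status counts as failing.
func checkPassesSketch(status, onupdate string) bool {
	switch status {
	case api.HealthPassing:
		return true
	case api.HealthWarning:
		return onupdate == structs.OnUpdateIgnoreWarn || onupdate == structs.OnUpdateIgnore
	case api.HealthCritical:
		return onupdate == structs.OnUpdateIgnore
	default:
		return false
	}
}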
// watchConsulEvents is a watcher for the health of the allocation's Consul
// checks. If all checks report healthy the watcher will exit after the
// MinHealthyTime has been reached. Otherwise the watcher will continue to
// poll the unhealthy checks until the ctx is cancelled.
func (t *Tracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(t.checkLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg are the registered objects in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-t.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			if t.setCheckHealth(true) {
				// final health set and propagated
				return
			}
			// tasks are unhealthy, reset and wait until all is healthy
			primed = false
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		t.l.Lock()
		for task, reg := range allocReg.Tasks {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.taskRegistrations = reg
			}
		}
		t.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					onupdate := sreg.CheckOnUpdate[check.CheckID]
					switch check.Status {
					case api.HealthPassing:
						continue
					case api.HealthWarning:
						if onupdate == structs.OnUpdateIgnoreWarn || onupdate == structs.OnUpdateIgnore {
							continue
						}
					case api.HealthCritical:
						if onupdate == structs.OnUpdateIgnore {
							continue
						}
					default:
					}

					passed = false
					t.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(t.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.ServiceRegistrations
}

// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns an event description and true if the task
// has contributed to the allocation being unhealthy.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}

		switch t.state.State {
		case structs.TaskStatePending:
			return "Task not running by deadline", true
		case structs.TaskStateDead:
			// hook (non-sidecar lifecycle) tasks are healthy when they exit successfully
			if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
				return "Unhealthy because of dead task", true
			}
		case structs.TaskStateRunning:
			// We are running so check if we have been running long enough
			if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
				return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
			}
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
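// Illustrative sketch (hypothetical values, not executed anywhere in this
// package) of the kind of message event produces, here for a task that never
// left pending before the health deadline:
//
//	ths := &taskHealthState{task: task, state: &structs.TaskState{State: structs.TaskStatePending}}
//	msg, unhealthy := ths.event(deadline, 10*time.Second, true)
//	// msg == "Task not running by deadline", unhealthy == true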