github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allochealth/tracker.go

package allochealth

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	hclog "github.com/hashicorp/go-hclog"
	cconsul "github.com/hashicorp/nomad/client/consul"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// AllocHealthEventSource is the source used for emitting task events
	AllocHealthEventSource = "Alloc Unhealthy"

	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// checkLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	checkLookupInterval time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient cconsul.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// lifecycleTasks is a set of tasks with a lifecycle hook that may
	// terminate without affecting alloc health
	lifecycleTasks map[string]bool

	// l is used to lock shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the task's Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState

	logger hclog.Logger
}
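
// A minimal usage sketch (illustrative only; in Nomad the allocation health
// hook is the real caller, and the surrounding variables below are assumed to
// already exist):
//
//	tracker := NewTracker(ctx, logger, alloc, allocUpdates, consulClient,
//		minHealthyTime, useChecks)
//	tracker.Start()
//
//	select {
//	case healthy := <-tracker.HealthyCh():
//		// Record deployment health for the allocation.
//		logger.Info("allocation health determined", "healthy", healthy)
//	case <-tracker.AllocStoppedCh():
//		// The allocation was stopped before health could be determined.
//	}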

// NewTracker returns a health tracker for the given allocation. An alloc
// listener and consul API object are given so that the watcher can detect
// health changes.
func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *Tracker {

	// Do not create a named sub-logger as the hook controlling
	// this struct should pass in an appropriately named
	// sub-logger.
	t := &Tracker{
		healthy:             make(chan bool, 1),
		allocStopped:        make(chan struct{}),
		alloc:               alloc,
		tg:                  alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime:      minHealthyTime,
		useChecks:           useChecks,
		allocUpdates:        allocUpdates,
		consulClient:        consulClient,
		checkLookupInterval: consulCheckLookupInterval,
		logger:              logger,
		lifecycleTasks:      map[string]bool{},
	}

	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
	for _, task := range t.tg.Tasks {
		t.taskHealth[task.Name] = &taskHealthState{task: task}

		if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
			t.lifecycleTasks[task.Name] = true
		}

		for _, s := range task.Services {
			t.consulCheckCount += len(s.Checks)
		}
	}

	for _, s := range t.tg.Services {
		t.consulCheckCount += len(s.Checks)
	}

	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
	return t
}

// Start starts the watcher.
func (t *Tracker) Start() {
	go t.watchTaskEvents()
	if t.useChecks {
		go t.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}

// AllocStoppedCh returns a channel that is closed if the allocation is
// stopped. This means that health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
	t.l.Lock()
	defer t.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if t.allocFailed {
		return nil
	}

	deadline, _ := t.ctx.Deadline()
	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range t.taskHealth {
		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
	t.l.Lock()
	defer t.l.Unlock()
	t.tasksHealthy = healthy

	// If unhealthy, force waiting for a new checks health status
	if !terminal && !healthy {
		t.checksHealthy = false
		return
	}

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := t.useChecks && t.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !t.checksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shut down the tracker
	t.cancelFn()
}
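
// Both setTaskHealth and setCheckHealth publish the result with a
// non-blocking send on the buffered healthy channel, so the first decision is
// retained and a slow (or absent) receiver never blocks the tracker. A
// minimal sketch of that pattern in isolation (names are illustrative, not
// part of the original file):
//
//	healthy := make(chan bool, 1)
//
//	report := func(ok bool) {
//		select {
//		case healthy <- ok: // buffer empty: retain this result
//		default: // a result is already pending: drop this one
//		}
//	}
//
//	report(true)
//	report(false)          // dropped; the receiver still sees the first value
//	fmt.Println(<-healthy) // true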

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
// It returns true if health is propagated and no more health monitoring is needed.
func (t *Tracker) setCheckHealth(healthy bool) bool {
	t.l.Lock()
	defer t.l.Unlock()

	// Check health should always be false if the tasks are unhealthy,
	// as checks might be missing from unhealthy tasks
	t.checksHealthy = healthy && t.tasksHealthy

	// Only signal if the checks are healthy and so are the tasks
	if !t.checksHealthy {
		return false
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shut down the tracker
	t.cancelFn()
	return true
}

// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}

// watchTaskEvents is a long-lived watcher that watches for the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
	alloc := t.alloc
	allStartedTime := time.Time{}
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
			t.markAllocStopped()
			return
		}

		// Store the task states
		t.l.Lock()
		for task, state := range alloc.TaskStates {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.state = state
			}
		}
		t.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for taskName, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || (!state.FinishedAt.IsZero() && !t.lifecycleTasks[taskName]) {
				t.setTaskHealth(false, true)
				return
			}

			if state.State == structs.TaskStatePending {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				// task is either running or exited successfully
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			t.l.Lock()
			t.allocFailed = true
			t.l.Unlock()
			t.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// reset task health
			t.setTaskHealth(false, false)

			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(t.minHealthyTime)
			}
		}

		select {
		case <-t.ctx.Done():
			return
		case newAlloc, ok := <-t.allocUpdates.Ch():
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			t.setTaskHealth(true, false)
		}
	}
}
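
// Both watchers arm their healthy timer with the stop-and-drain idiom so a
// Reset never observes a stale tick left in the channel from an earlier
// deadline. A minimal sketch of the idiom in isolation (variable names are
// illustrative, not part of the original file):
//
//	timer := time.NewTimer(0)
//	if !timer.Stop() {
//		select {
//		case <-timer.C: // drain a tick that already fired
//		default:
//		}
//	}
//	// ... later, once the condition being timed actually begins ...
//	timer.Reset(minHealthyTime)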

// watchConsulEvents is a long-lived watcher for the health of the allocation's
// Consul checks.
func (t *Tracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(t.checkLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// consulChecksErr stores whether the last Consul checks lookup failed
	consulChecksErr := false

	// allocReg holds the objects registered in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-t.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			if t.setCheckHealth(true) {
				// final health set and propagated
				return
			}
			// tasks are unhealthy, reset and wait until all are healthy
			primed = false
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		t.l.Lock()
		for task, reg := range allocReg.Tasks {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.taskRegistrations = reg
			}
		}
		t.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					t.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Stop and drain the timer since we have transitioned back to
			// unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(t.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.ServiceRegistrations
}

// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns a description of why the task has
// contributed to the allocation being unhealthy, along with true if it has.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}

		switch t.state.State {
		case structs.TaskStatePending:
			return "Task not running by deadline", true
		case structs.TaskStateDead:
			// non-sidecar lifecycle hook tasks are allowed to be dead
			if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
				return "Unhealthy because of dead task", true
			}
		case structs.TaskStateRunning:
			// We are running so check if we have been running long enough
			if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
				return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
			}
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
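
// A minimal sketch of exercising taskHealthState.event directly, for example
// from a package test. The test name, task name, and use of testify's require
// package are assumptions for illustration, not part of the original file:
//
//	func TestTaskHealthState_PendingTask(t *testing.T) {
//		ths := &taskHealthState{
//			task:  &structs.Task{Name: "web"},
//			state: &structs.TaskState{State: structs.TaskStatePending},
//		}
//
//		// A task still pending at the deadline contributes to the
//		// allocation being unhealthy.
//		desc, unhealthy := ths.event(time.Now(), 10*time.Second, false)
//		require.True(t, unhealthy)
//		require.Equal(t, "Task not running by deadline", desc)
//	}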