github.com/bigcommerce/nomad@v0.9.3-bc/client/allochealth/tracker.go

package allochealth

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	hclog "github.com/hashicorp/go-hclog"
	cconsul "github.com/hashicorp/nomad/client/consul"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// AllocHealthEventSource is the source used for emitting task events
	AllocHealthEventSource = "Alloc Unhealthy"

	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient cconsul.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// l is used to lock shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState

	logger hclog.Logger
}

// NewTracker returns a health tracker for the given allocation. An alloc
// listener and consul API object are given so that the watcher can detect
// health changes.
func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *Tracker {

	// Do not create a named sub-logger as the hook controlling
	// this struct should pass in an appropriately named
	// sub-logger.
	t := &Tracker{
		healthy:        make(chan bool, 1),
		allocStopped:   make(chan struct{}),
		alloc:          alloc,
		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime: minHealthyTime,
		useChecks:      useChecks,
		allocUpdates:   allocUpdates,
		consulClient:   consulClient,
		logger:         logger,
	}

	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
	for _, task := range t.tg.Tasks {
		t.taskHealth[task.Name] = &taskHealthState{task: task}
	}

	for _, task := range t.tg.Tasks {
		for _, s := range task.Services {
			t.consulCheckCount += len(s.Checks)
		}
	}

	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
	return t
}

// Start starts the watcher.
func (t *Tracker) Start() {
	go t.watchTaskEvents()
	if t.useChecks {
		go t.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}

// AllocStoppedCh returns a channel that will be closed if the allocation is
// stopped. This means that health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
	t.l.Lock()
	defer t.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if t.allocFailed {
		return nil
	}

	deadline, _ := t.ctx.Deadline()
	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range t.taskHealth {
		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
	t.l.Lock()
	defer t.l.Unlock()
	t.tasksHealthy = healthy

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := t.useChecks && t.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !t.checksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (t *Tracker) setCheckHealth(healthy bool) {
	t.l.Lock()
	defer t.l.Unlock()
	t.checksHealthy = healthy

	// Only signal if we are healthy and so are the tasks
	if !healthy || !t.tasksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}
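
// The sketch below is not part of the original file; it illustrates how a
// caller (for example a deployment health hook) might drive the tracker API
// defined above. The helper name waitForHealth and the hard-coded 10 second
// minHealthyTime are assumptions for illustration only.
func waitForHealth(ctx context.Context, logger hclog.Logger, alloc *structs.Allocation,
	updates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI) (healthy, stopped bool) {

	// Track Consul checks in addition to task states.
	tracker := NewTracker(ctx, logger, alloc, updates, consulClient, 10*time.Second, true)
	tracker.Start()

	select {
	case h := <-tracker.HealthyCh():
		// Health was determined; TaskEvents may now be consulted for details.
		return h, false
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped before health could be determined.
		return false, true
	case <-ctx.Done():
		return false, false
	}
}
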
// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}

// watchTaskEvents is a long-lived watcher that watches for the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
	alloc := t.alloc
	allStartedTime := time.Time{}
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
			t.markAllocStopped()
			return
		}

		// Store the task states
		t.l.Lock()
		for task, state := range alloc.TaskStates {
			t.taskHealth[task].state = state
		}
		t.l.Unlock()

		// Detect whether the alloc is unhealthy and whether all tasks have started
		latestStartTime := time.Time{}
		for _, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || !state.FinishedAt.IsZero() {
				t.setTaskHealth(false, true)
				return
			}

			if state.State != structs.TaskStateRunning {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			t.l.Lock()
			t.allocFailed = true
			t.l.Unlock()
			t.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(t.minHealthyTime)
			}
		}

		select {
		case <-t.ctx.Done():
			return
		case newAlloc, ok := <-t.allocUpdates.Ch():
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			t.setTaskHealth(true, false)
		}
	}
}
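
// resetHealthyTimer is an illustrative helper, not part of the original file:
// it captures the stop/drain/reset idiom repeated in the watchers in this file.
// Timer.Stop reports false when the timer has already fired, so any pending
// tick must be drained before Reset, otherwise a later receive on the channel
// could observe the stale expiry.
func resetHealthyTimer(t *time.Timer, d time.Duration) {
	if !t.Stop() {
		select {
		case <-t.C:
		default:
		}
	}
	t.Reset(d)
}
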
// watchConsulEvents is a long-lived watcher for the health of the allocation's
// Consul checks.
func (t *Tracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(consulCheckLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// consulChecksErr stores whether the last Consul checks call was successful
	consulChecksErr := false

	// allocReg holds the registered objects in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-t.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			t.setCheckHealth(true)
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		t.l.Lock()
		for task, reg := range allocReg.Tasks {
			t.taskHealth[task].taskRegistrations = reg
		}
		t.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					t.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(t.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.TaskRegistration
}

// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. If the task has contributed to the allocation being
// unhealthy, it returns an event description of why along with true.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}
		if t.state.State != structs.TaskStateRunning {
			return "Task not running by deadline", true
		}

		// We are running so check if we have been running long enough
		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
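
// formatUnhealthyReason is an illustrative sketch, not part of the original
// file: it shows one way a caller could condense the per-task events returned
// by TaskEvents into a single human-readable reason string. The function name
// is hypothetical and the TaskEvent Message field is assumed to carry the text
// set via SetMessage above.
func formatUnhealthyReason(events map[string]*structs.TaskEvent) string {
	if len(events) == 0 {
		return "allocation unhealthy"
	}

	parts := make([]string, 0, len(events))
	for task, ev := range events {
		parts = append(parts, fmt.Sprintf("%s: %s", task, ev.Message))
	}
	return strings.Join(parts, "; ")
}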