github.com/manicqin/nomad@v0.9.5/client/allochealth/tracker.go

package allochealth

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	hclog "github.com/hashicorp/go-hclog"
	cconsul "github.com/hashicorp/nomad/client/consul"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// AllocHealthEventSource is the source used for emitting task events
	AllocHealthEventSource = "Alloc Unhealthy"

	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient cconsul.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// l is used to lock shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState

	logger hclog.Logger
}

// NewTracker returns a health tracker for the given allocation. An alloc
// listener and consul API object are given so that the watcher can detect
// health changes.
func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *Tracker {

	// Do not create a named sub-logger as the hook controlling
	// this struct should pass in an appropriately named
	// sub-logger.
	t := &Tracker{
		healthy:        make(chan bool, 1),
		allocStopped:   make(chan struct{}),
		alloc:          alloc,
		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime: minHealthyTime,
		useChecks:      useChecks,
		allocUpdates:   allocUpdates,
		consulClient:   consulClient,
		logger:         logger,
	}

	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
	for _, task := range t.tg.Tasks {
		t.taskHealth[task.Name] = &taskHealthState{task: task}
	}

	for _, task := range t.tg.Tasks {
		for _, s := range task.Services {
			t.consulCheckCount += len(s.Checks)
		}
	}

	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
	return t
}

// Start starts the watcher.
func (t *Tracker) Start() {
	go t.watchTaskEvents()
	if t.useChecks {
		go t.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}

// AllocStoppedCh returns a channel that will be closed if the allocation is
// stopped. This means that health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
	t.l.Lock()
	defer t.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if t.allocFailed {
		return nil
	}

	deadline, _ := t.ctx.Deadline()
	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))

	// Go through the task information and build the event map
	for task, state := range t.taskHealth {
		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
	t.l.Lock()
	defer t.l.Unlock()
	t.tasksHealthy = healthy

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := t.useChecks && t.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !t.checksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (t *Tracker) setCheckHealth(healthy bool) {
	t.l.Lock()
	defer t.l.Unlock()
	t.checksHealthy = healthy

	// Only signal if we are healthy and so are the tasks
	if !healthy || !t.tasksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}

// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}

// watchTaskEvents is a long-lived watcher that watches for the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
	alloc := t.alloc
	allStartedTime := time.Time{}
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
			t.markAllocStopped()
			return
		}

		// Store the task states
		t.l.Lock()
		for task, state := range alloc.TaskStates {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.state = state
			}
		}
		t.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for _, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || !state.FinishedAt.IsZero() {
				t.setTaskHealth(false, true)
				return
			}

			if state.State != structs.TaskStateRunning {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			t.l.Lock()
			t.allocFailed = true
			t.l.Unlock()
			t.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(t.minHealthyTime)
			}
		}

		select {
		case <-t.ctx.Done():
			return
		case newAlloc, ok := <-t.allocUpdates.Ch():
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			t.setTaskHealth(true, false)
		}
	}
}

// watchConsulEvents is a long-lived watcher for the health of the allocation's
// Consul checks.
func (t *Tracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(consulCheckLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg holds the objects registered in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-t.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			t.setCheckHealth(true)
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		t.l.Lock()
		for task, reg := range allocReg.Tasks {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.taskRegistrations = reg
			}
		}
		t.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					t.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(t.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.ServiceRegistrations
}

// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns true if the task has contributed to the
// allocation being unhealthy and if so, an event description of why.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}
		if t.state.State != structs.TaskStateRunning {
			return "Task not running by deadline", true
		}

		// We are running so check if we have been running long enough
		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
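
// Usage sketch (illustrative only): a caller such as the client's alloc health
// watching hook would typically construct a Tracker, start it, and then select
// on its channels until a result arrives or shutdown begins. The surrounding
// context, variable names, and where the result is recorded are assumptions,
// not code from this package.
//
//	useChecks := tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
//	tracker := NewTracker(ctx, logger, alloc, allocListener, consulClient,
//		tg.Update.MinHealthyTime, useChecks)
//	tracker.Start()
//
//	select {
//	case <-ctx.Done():
//		// Shutting down; stop watching without recording a result.
//	case <-tracker.AllocStoppedCh():
//		// The allocation was stopped, so no health result will be set.
//	case healthy := <-tracker.HealthyCh():
//		// Record the deployment health; when unhealthy, tracker.TaskEvents()
//		// supplies per-task explanations that can be emitted as task events.
//		_ = healthy
//	}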