github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/alloc_runner_health_watcher.go

package client

import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the interval at which we check whether the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond

	// allocHealthEventSource is the source used for emitting task events
	allocHealthEventSource = "Alloc Unhealthy"
)

// watchHealth is responsible for watching an allocation's task status and
// potentially Consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {

	// See if we should watch the alloc's health
	alloc := r.Alloc()
	if alloc.DeploymentID == "" || alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
		return
	} else if tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual {
		return
	}

	// Get an allocation listener to watch for alloc events
	l := r.allocBroadcast.Listen()
	defer l.Close()

	// Create a new context with the health deadline
	deadline := time.Now().Add(tg.Update.HealthyDeadline)
	healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline)
	defer healthCtxCancel()
	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline (%v) for alloc %q is at %v", tg.Update.HealthyDeadline, alloc.ID, deadline)

	// Create the health tracker object
	tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient)
	tracker.Start()

	allocHealthy := false
	select {
	case <-healthCtx.Done():
		// We were cancelled which means we are no longer needed
		if healthCtx.Err() == context.Canceled {
			return
		}

		// Since the deadline has been reached we are not healthy
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped so nothing to do
		return
	case healthy := <-tracker.HealthyCh():
		allocHealthy = healthy
	}

	r.allocLock.Lock()
	r.allocHealth = helper.BoolToPtr(allocHealthy)
	r.allocLock.Unlock()

	// We are unhealthy so emit task events explaining why
	if !allocHealthy {
		r.taskLock.RLock()
		for task, event := range tracker.TaskEvents() {
			if tr, ok := r.tasks[task]; ok {
				tr.EmitEvent(allocHealthEventSource, event)
			}
		}
		r.taskLock.RUnlock()
	}

	r.syncStatus()
}
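// Editorial note on the select in watchHealth above: it resolves in exactly
// one of four ways.
//
//	healthCtx canceled          -> the watcher is no longer needed; no health recorded
//	healthCtx deadline exceeded -> falls through with allocHealthy = false (unhealthy)
//	tracker.AllocStoppedCh()    -> allocation stopped; no health recorded
//	tracker.HealthyCh()         -> health is whatever the tracker reported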
// allocHealthTracker tracks the health of an allocation and makes health
// events watchable via channels.
type allocHealthTracker struct {
	// logger is used to log
	logger *log.Logger

	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// no longer needed
	allocStopped chan struct{}

	// l is used to lock the shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the task's Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState
}

// newAllocHealthTracker returns a health tracker for the given allocation. An
// alloc listener and Consul API object are given so that the watcher can
// detect health changes.
func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI) *allocHealthTracker {

	a := &allocHealthTracker{
		logger:       logger,
		healthy:      make(chan bool, 1),
		allocStopped: make(chan struct{}),
		alloc:        alloc,
		tg:           alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		allocUpdates: allocUpdates,
		consulClient: consulClient,
	}

	a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks))
	for _, task := range a.tg.Tasks {
		a.taskHealth[task.Name] = &taskHealthState{task: task}
	}

	for _, task := range a.tg.Tasks {
		for _, s := range task.Services {
			a.consulCheckCount += len(s.Checks)
		}
	}

	a.ctx, a.cancelFn = context.WithCancel(parentCtx)
	return a
}

// Start starts the watcher.
func (a *allocHealthTracker) Start() {
	go a.watchTaskEvents()
	if a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks {
		go a.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health
// of the allocation.
func (a *allocHealthTracker) HealthyCh() <-chan bool {
	return a.healthy
}

// AllocStoppedCh returns a channel that is closed if the allocation is
// stopped. This means that health will not be set.
func (a *allocHealthTracker) AllocStoppedCh() <-chan struct{} {
	return a.allocStopped
}
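// newStoppedTimer is a hypothetical helper, not part of the original file,
// added only to illustrate the timer idiom that watchTaskEvents and
// watchConsulEvents below inline verbatim: create a timer, then stop and
// drain it so that the first Reset arms it cleanly without a stale tick
// already buffered on its channel.
func newStoppedTimer() *time.Timer {
	t := time.NewTimer(0)
	if !t.Stop() {
		// Stop returned false, so the timer already fired; drain the
		// pending tick so a later receive on t.C cannot observe it.
		select {
		case <-t.C:
		default:
		}
	}
	return t
}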
// TaskEvents returns a map of events by task. This should only be called
// after health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (a *allocHealthTracker) TaskEvents() map[string]string {
	a.l.Lock()
	defer a.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if a.allocFailed {
		return nil
	}

	deadline, _ := a.ctx.Deadline()
	events := make(map[string]string, len(a.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range a.taskHealth {
		if e, ok := state.event(deadline, a.tg.Update); ok {
			events[task] = e
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If
// the allocation is terminal, health is immediately broadcast.
func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.tasksHealthy = healthy

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks && a.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !a.checksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shut down the tracker
	a.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (a *allocHealthTracker) setCheckHealth(healthy bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.checksHealthy = healthy

	// Only signal if we are healthy and so are the tasks
	if !healthy || !a.tasksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shut down the tracker
	a.cancelFn()
}

// markAllocStopped is used to mark the allocation as having stopped.
func (a *allocHealthTracker) markAllocStopped() {
	close(a.allocStopped)
	a.cancelFn()
}

// watchTaskEvents is a long-lived watcher that watches the health of the
// allocation's tasks.
func (a *allocHealthTracker) watchTaskEvents() {
	alloc := a.alloc
	allStartedTime := time.Time{}
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			a.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			a.markAllocStopped()
			return
		}

		// Store the task states
		a.l.Lock()
		for task, state := range alloc.TaskStates {
			a.taskHealth[task].state = state
		}
		a.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for _, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || !state.FinishedAt.IsZero() {
				a.setTaskHealth(false, true)
				return
			}

			if state.State != structs.TaskStateRunning {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				latestStartTime = state.StartedAt
			}
		}
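		// At this point latestStartTime is either the zero time (at least
		// one task is not yet running) or the most recent StartedAt across
		// the group's tasks. A task restart bumps its StartedAt, which
		// restarts the MinHealthyTime window armed below.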
		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			a.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
			a.l.Lock()
			a.allocFailed = true
			a.l.Unlock()
			a.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(a.tg.Update.MinHealthyTime)
			}
		}

		select {
		case <-a.ctx.Done():
			return
		case newAlloc, ok := <-a.allocUpdates.Ch:
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			a.setTaskHealth(true, false)
		}
	}
}

// watchConsulEvents is a long-lived watcher that watches the health of the
// allocation's Consul checks.
func (a *allocHealthTracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(consulCheckLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg are the registered objects in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-a.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := a.consulClient.AllocRegistrations(a.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					a.logger.Printf("[WARN] client.alloc_watcher: failed to lookup Consul registrations for allocation %q: %v", a.alloc.ID, err)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			a.setCheckHealth(true)
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		a.l.Lock()
		for task, reg := range allocReg.Tasks {
			a.taskHealth[task].taskRegistrations = reg
		}
		a.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					a.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(a.tg.Update.MinHealthyTime)
		}
	}
}
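// allChecksPassing is a hypothetical condensation of the CHECKS loop above,
// not part of the original file and included for illustration only: the
// allocation's Consul checks count as healthy only when every check on every
// service of every task reports api.HealthPassing.
func allChecksPassing(reg *consul.AllocRegistration) bool {
	for _, treg := range reg.Tasks {
		for _, sreg := range treg.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					return false
				}
			}
		}
	}
	return true
}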
// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation
// being unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.TaskRegistration
}

// event takes the deadline by which the allocation must be healthy and the
// update strategy of the group. It returns whether the task has contributed
// to the allocation being unhealthy and, if so, an event description of why.
func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrategy) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}
		if t.state.State != structs.TaskStateRunning {
			return "Task not running by deadline", true
		}

		// We are running so check if we have been running long enough
		if t.state.StartedAt.Add(update.MinHealthyTime).After(deadline) {
			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", update.MinHealthyTime), true
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
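// exampleFailedTaskEvent is a hypothetical sketch, not part of the original
// file, showing how taskHealthState.event reports a failed task. The task
// name and the zero-valued update strategy are made up for illustration.
func exampleFailedTaskEvent() {
	ths := &taskHealthState{
		task:  &structs.Task{Name: "web"},
		state: &structs.TaskState{Failed: true},
	}

	// A failed task is reported as unhealthy regardless of the deadline or
	// the group's update strategy.
	if desc, unhealthy := ths.event(time.Now(), &structs.UpdateStrategy{}); unhealthy {
		fmt.Println(desc) // Unhealthy because of failed task
	}
}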