github.com/djenriquez/nomad-1@v0.8.1/client/alloc_runner_health_watcher.go

package client

import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the interval at which we check whether the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond

	// allocHealthEventSource is the source used for emitting task events
	allocHealthEventSource = "Alloc Unhealthy"
)

// watchHealth is responsible for watching an allocation's task status and
// potentially Consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {

	// See if we should watch the alloc's health
	alloc := r.Alloc()

	// Neither deployments nor migrations care about the health of
	// non-service jobs, so never watch their health
	if alloc.Job.Type != structs.JobTypeService {
		return
	}

	// No need to watch health as it's already set
	if alloc.DeploymentStatus.HasHealth() {
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
			alloc.ID, alloc.TaskGroup)
		return
	}

	isDeploy := alloc.DeploymentID != ""

	// No need to watch allocs for deployments that rely on operators
	// manually setting health
	if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
		return
	}

	// Get an allocation listener to watch for alloc events
	l := r.allocBroadcast.Listen()
	defer l.Close()

	// Define the deadline, health method, and min healthy time from the
	// deployment if this is a deployment; otherwise from the migration
	// strategy.
	var deadline time.Time
	var useChecks bool
	var minHealthyTime time.Duration

	if isDeploy {
		deadline = time.Now().Add(tg.Update.HealthyDeadline)
		minHealthyTime = tg.Update.MinHealthyTime
		useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
	} else {
		strategy := tg.Migrate
		if strategy == nil {
			// For backwards compat with pre-0.8 allocations that
			// don't have a migrate strategy set.
			strategy = structs.DefaultMigrateStrategy()
		}
		deadline = time.Now().Add(strategy.HealthyDeadline)
		minHealthyTime = strategy.MinHealthyTime
		useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
	}

	// Create a new context with the health deadline
	healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline)
	defer healthCtxCancel()
	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline for alloc %q is at %v (deploy=%t checks=%t)", alloc.ID, deadline, isDeploy, useChecks)

	// Create the health tracker object
	tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient, minHealthyTime, useChecks)
	tracker.Start()

	allocHealthy := false
	select {
	case <-healthCtx.Done():
		// We were cancelled, which means we are no longer needed
		if healthCtx.Err() == context.Canceled {
			return
		}

		// Since the deadline has been reached we are not healthy
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped so there is nothing to do
		return
	case healthy := <-tracker.HealthyCh():
		allocHealthy = healthy
	}

	r.allocLock.Lock()
	r.allocHealth = helper.BoolToPtr(allocHealthy)
	r.allocLock.Unlock()

	// If the deployment is unhealthy, emit task events explaining why
	if !allocHealthy && isDeploy {
		r.taskLock.RLock()
		for task, event := range tracker.TaskEvents() {
			if tr, ok := r.tasks[task]; ok {
				tr.EmitEvent(allocHealthEventSource, event)
			}
		}
		r.taskLock.RUnlock()
	}

	r.syncStatus()
}
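
// The tracker below runs up to two watchers concurrently: watchTaskEvents,
// which observes task states from alloc updates, and, when useChecks is set,
// watchConsulEvents, which polls Consul registrations. Both signal through the
// healthy channel, which is buffered with capacity one and written with a
// non-blocking send, and both exit once cancelFn shuts the tracker down.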

// allocHealthTracker tracks the health of an allocation and makes health
// events watchable via channels.
type allocHealthTracker struct {
	// logger is used to log
	logger *log.Logger

	// ctx and cancelFn are used to shutdown the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to
	// be healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// l is used to lock the shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the task's Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState
}

// newAllocHealthTracker returns a health tracker for the given allocation. An
// alloc listener and Consul API object are given so that the watcher can
// detect health changes.
func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *allocHealthTracker {

	a := &allocHealthTracker{
		logger:         logger,
		healthy:        make(chan bool, 1),
		allocStopped:   make(chan struct{}),
		alloc:          alloc,
		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime: minHealthyTime,
		useChecks:      useChecks,
		allocUpdates:   allocUpdates,
		consulClient:   consulClient,
	}

	a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks))
	for _, task := range a.tg.Tasks {
		a.taskHealth[task.Name] = &taskHealthState{task: task}
	}

	for _, task := range a.tg.Tasks {
		for _, s := range task.Services {
			a.consulCheckCount += len(s.Checks)
		}
	}

	a.ctx, a.cancelFn = context.WithCancel(parentCtx)
	return a
}

// Start starts the watcher.
func (a *allocHealthTracker) Start() {
	go a.watchTaskEvents()
	if a.useChecks {
		go a.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health
// of the allocation.
func (a *allocHealthTracker) HealthyCh() <-chan bool {
	return a.healthy
}

// AllocStoppedCh returns a channel that will be closed if the allocation is
// stopped. This means that health will not be set.
func (a *allocHealthTracker) AllocStoppedCh() <-chan struct{} {
	return a.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (a *allocHealthTracker) TaskEvents() map[string]string {
	a.l.Lock()
	defer a.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if a.allocFailed {
		return nil
	}

	deadline, _ := a.ctx.Deadline()
	events := make(map[string]string, len(a.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range a.taskHealth {
		useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = e
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If
// the allocation is terminal, health is immediately broadcast.
func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.tasksHealthy = healthy

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := a.useChecks && a.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !a.checksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	a.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (a *allocHealthTracker) setCheckHealth(healthy bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.checksHealthy = healthy

	// Only signal if we are healthy and so are the tasks
	if !healthy || !a.tasksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	a.cancelFn()
}

// markAllocStopped is used to mark the allocation as having stopped.
func (a *allocHealthTracker) markAllocStopped() {
	close(a.allocStopped)
	a.cancelFn()
}

// watchTaskEvents is a long-lived watcher that watches for the health of the
// allocation's tasks.
func (a *allocHealthTracker) watchTaskEvents() {
	alloc := a.alloc
	allStartedTime := time.Time{}

	// Create the timer stopped and drained so it only fires once it is armed
	// below
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			a.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			a.markAllocStopped()
			return
		}

		// Store the task states
		a.l.Lock()
		for task, state := range alloc.TaskStates {
			a.taskHealth[task].state = state
		}
		a.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for _, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || !state.FinishedAt.IsZero() {
				a.setTaskHealth(false, true)
				return
			}

			if state.State != structs.TaskStateRunning {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			a.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
			a.l.Lock()
			a.allocFailed = true
			a.l.Unlock()
			a.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(a.minHealthyTime)
			}
		}

		select {
		case <-a.ctx.Done():
			return
		case newAlloc, ok := <-a.allocUpdates.Ch:
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			a.setTaskHealth(true, false)
		}
	}
}
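
// Note that Consul check state is polled every consulCheckLookupInterval
// rather than streamed. The primed flag below tracks whether the healthy timer
// is armed, so a regression from passing to failing stops the timer and a
// later healthy transition re-arms it for a fresh MinHealthyTime window.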

// watchConsulEvents is a long-lived watcher that watches for the health of the
// allocation's Consul checks.
func (a *allocHealthTracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(consulCheckLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// consulChecksErr stores whether the last Consul checks call was
	// successful or not
	consulChecksErr := false

	// allocReg are the registered objects in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-a.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := a.consulClient.AllocRegistrations(a.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					a.logger.Printf("[WARN] client.alloc_watcher: failed to lookup Consul registrations for allocation %q: %v", a.alloc.ID, err)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			a.setCheckHealth(true)
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		a.l.Lock()
		for task, reg := range allocReg.Tasks {
			a.taskHealth[task].taskRegistrations = reg
		}
		a.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					a.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(a.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation
// being unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.TaskRegistration
}

// event takes the deadline time for the allocation to be healthy and the
// update strategy of the group. It returns true if the task has contributed to
// the allocation being unhealthy and, if so, an event description of why.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}
		if t.state.State != structs.TaskStateRunning {
			return "Task not running by deadline", true
		}

		// We are running, so check whether we have been running long enough
		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
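
// stopAndDrainTimer illustrates, as a standalone sketch, the Stop-and-drain
// idiom repeated in watchTaskEvents and watchConsulEvents above: Stop reports
// false when the timer has already fired, in which case any pending tick must
// be drained from the channel so a subsequent Reset cannot deliver a stale
// value. It is shown for reference only and is not called within this file.
func stopAndDrainTimer(t *time.Timer) {
	if !t.Stop() {
		// Drain a pending tick if one was delivered before Stop was called;
		// the default case avoids blocking when the channel is already empty.
		select {
		case <-t.C:
		default:
		}
	}
}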