github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/client/allocrunner/alloc_runner_health_watcher.go

package allocrunner

import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	consulApi "github.com/hashicorp/nomad/client/consul"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond

	// allocHealthEventSource is the source used for emitting task events
	allocHealthEventSource = "Alloc Unhealthy"
)

// watchHealth is responsible for watching an allocation's task status and
// potentially Consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {

	// See if we should watch the alloc's health
	alloc := r.Alloc()

	// Neither deployments nor migrations care about the health of
	// non-service jobs so never watch their health
	if alloc.Job.Type != structs.JobTypeService {
		return
	}

	// No need to watch health as it's already set
	if alloc.DeploymentStatus.HasHealth() {
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
			alloc.ID, alloc.TaskGroup)
		return
	}

	isDeploy := alloc.DeploymentID != ""

	// No need to watch allocs for deployments that rely on operators
	// manually setting health
	if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
		return
	}

	// Get an allocation listener to watch for alloc events
	l := r.allocBroadcast.Listen()
	defer l.Close()

	// Define the deadline, health method, and min healthy time from the
	// deployment if this is a deployment; otherwise from the migration
	// strategy.
	var deadline time.Time
	var useChecks bool
	var minHealthyTime time.Duration

	if isDeploy {
		deadline = time.Now().Add(tg.Update.HealthyDeadline)
		minHealthyTime = tg.Update.MinHealthyTime
		useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
	} else {
		strategy := tg.Migrate
		if strategy == nil {
			// For backwards compat with pre-0.8 allocations that
			// don't have a migrate strategy set.
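			// Falling back to the default strategy ensures the code
			// below still has a deadline, a minimum healthy time,
			// and a health check mode to read.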
			strategy = structs.DefaultMigrateStrategy()
		}
		deadline = time.Now().Add(strategy.HealthyDeadline)
		minHealthyTime = strategy.MinHealthyTime
		useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
	}

	// Create a new context with the health deadline
	healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline)
	defer healthCtxCancel()
	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline for alloc %q is at %v (deploy=%t checks=%t)", alloc.ID, deadline, isDeploy, useChecks)

	// Create the health tracker object
	tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient, minHealthyTime, useChecks)
	tracker.Start()

	allocHealthy := false
	select {
	case <-healthCtx.Done():
		// We were cancelled which means we are no longer needed
		if healthCtx.Err() == context.Canceled {
			return
		}

		// Since the deadline has been reached we are not healthy
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped so nothing to do
		return
	case healthy := <-tracker.HealthyCh():
		allocHealthy = healthy
	}

	r.allocLock.Lock()
	r.allocHealth = helper.BoolToPtr(allocHealthy)
	r.allocHealthTime = time.Now()
	r.allocLock.Unlock()

	// If the deployment is unhealthy, emit task events explaining why
	if !allocHealthy && isDeploy {
		r.taskLock.RLock()
		for task, event := range tracker.TaskEvents() {
			if tr, ok := r.tasks[task]; ok {
				tr.EmitEvent(allocHealthEventSource, event)
			}
		}
		r.taskLock.RUnlock()
	}

	r.syncStatus()
}

// allocHealthTracker tracks the health of an allocation and makes health events
// watchable via channels.
type allocHealthTracker struct {
	// logger is used to log
	logger *log.Logger

	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the tasks' checks
	consulClient consulApi.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// l is used to lock the shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState
}
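// A minimal sketch of how the tracker is consumed; this mirrors watchHealth
// above and is illustrative only, not additional API (listener stands in for
// the alloc broadcast listener):
//
//	tracker := newAllocHealthTracker(ctx, logger, alloc, listener, consulClient, minHealthyTime, useChecks)
//	tracker.Start()
//	select {
//	case healthy := <-tracker.HealthyCh():
//		// record the result on the alloc runner
//	case <-tracker.AllocStoppedCh():
//		// the alloc was stopped; health will never be set
//	case <-ctx.Done():
//		// cancelled, or the healthy deadline was reached
//	}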
// newAllocHealthTracker returns a health tracker for the given allocation. An
// alloc listener and consul API object are given so that the watcher can detect
// health changes.
func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient consulApi.ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *allocHealthTracker {

	a := &allocHealthTracker{
		logger:         logger,
		healthy:        make(chan bool, 1),
		allocStopped:   make(chan struct{}),
		alloc:          alloc,
		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime: minHealthyTime,
		useChecks:      useChecks,
		allocUpdates:   allocUpdates,
		consulClient:   consulClient,
	}

	a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks))
	for _, task := range a.tg.Tasks {
		a.taskHealth[task.Name] = &taskHealthState{task: task}
	}

	for _, task := range a.tg.Tasks {
		for _, s := range task.Services {
			a.consulCheckCount += len(s.Checks)
		}
	}

	a.ctx, a.cancelFn = context.WithCancel(parentCtx)
	return a
}

// Start starts the watcher.
func (a *allocHealthTracker) Start() {
	go a.watchTaskEvents()
	if a.useChecks {
		go a.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (a *allocHealthTracker) HealthyCh() <-chan bool {
	return a.healthy
}

// AllocStoppedCh returns a channel that is closed if the allocation is
// stopped. This means that health will not be set.
func (a *allocHealthTracker) AllocStoppedCh() <-chan struct{} {
	return a.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (a *allocHealthTracker) TaskEvents() map[string]string {
	a.l.Lock()
	defer a.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if a.allocFailed {
		return nil
	}

	deadline, _ := a.ctx.Deadline()
	events := make(map[string]string, len(a.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range a.taskHealth {
		useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = e
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.tasksHealthy = healthy

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := a.useChecks && a.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !a.checksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	a.cancelFn()
}
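// Note that setTaskHealth above and setCheckHealth below publish on the
// buffered healthy channel with a non-blocking send, so neither watcher
// goroutine can block even if watchHealth has already stopped listening.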
// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (a *allocHealthTracker) setCheckHealth(healthy bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.checksHealthy = healthy

	// Only signal if we are healthy and so are the tasks
	if !healthy || !a.tasksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shutdown the tracker
	a.cancelFn()
}

// markAllocStopped is used to mark the allocation as having stopped.
func (a *allocHealthTracker) markAllocStopped() {
	close(a.allocStopped)
	a.cancelFn()
}

// watchTaskEvents is a long lived watcher that watches for the health of the
// allocation's tasks.
func (a *allocHealthTracker) watchTaskEvents() {
	alloc := a.alloc
	allStartedTime := time.Time{}
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			a.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			a.markAllocStopped()
			return
		}

		// Store the task states
		a.l.Lock()
		for task, state := range alloc.TaskStates {
			a.taskHealth[task].state = state
		}
		a.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for _, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || !state.FinishedAt.IsZero() {
				a.setTaskHealth(false, true)
				return
			}

			if state.State != structs.TaskStateRunning {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			a.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
			a.l.Lock()
			a.allocFailed = true
			a.l.Unlock()
			a.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(a.minHealthyTime)
			}
		}

		select {
		case <-a.ctx.Done():
			return
		case newAlloc, ok := <-a.allocUpdates.Ch:
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			a.setTaskHealth(true, false)
		}
	}
}
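// watchTaskEvents above and watchConsulEvents below each create their healthy
// timer with time.NewTimer(0) and immediately stop and drain it, leaving an
// inert timer that fires only after an explicit Reset(a.minHealthyTime) once
// the tasks (or checks) look healthy.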
// watchConsulEvents is a long lived watcher that watches for the health of the
// allocation's Consul checks.
func (a *allocHealthTracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(consulCheckLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg holds the objects registered in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-a.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := a.consulClient.AllocRegistrations(a.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					a.logger.Printf("[WARN] client.alloc_watcher: failed to lookup Consul registrations for allocation %q: %v", a.alloc.ID, err)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			a.setCheckHealth(true)
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		a.l.Lock()
		for task, reg := range allocReg.Tasks {
			a.taskHealth[task].taskRegistrations = reg
		}
		a.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					a.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Stop the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(a.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.TaskRegistration
}
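// state is filled in by watchTaskEvents and taskRegistrations by
// watchConsulEvents, both while holding the tracker's lock, so event below can
// explain an unhealthy result from either source.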
// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns an event description of how the task has
// contributed to the allocation being unhealthy, and true if it has.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}
		if t.state.State != structs.TaskStateRunning {
			return "Task not running by deadline", true
		}

		// We are running so check if we have been running long enough
		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
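// As a worked example of the deadline arithmetic above, using hypothetical
// values: with a min_healthy_time of 10s and a deadline 5m after the watcher
// started, a task that only reached the running state 4m55s in cannot
// accumulate 10 seconds of uptime before the deadline (StartedAt+10s lands
// past it), so event reports
// "Task not running for min_healthy_time of 10s by deadline".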