github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/alloc_runner_health_watcher.go

package client

import (
	"context"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond
)

// watchHealth is responsible for watching an allocation's task status and
// potentially consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {
	// See if we should watch the alloc's health
	alloc := r.Alloc()
	if alloc.DeploymentID == "" {
		r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc isn't part of a deployment")
		return
	} else if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
		r.logger.Printf("[TRACE] client.alloc_watcher: exiting because alloc deployment health already determined")
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
		return
	}

	// desiredChecks is the number of Consul checks we expect to be registered;
	// a non-zero value means we also watch Consul check health.
	desiredChecks := 0
	var checkTicker *time.Ticker
	var checkCh <-chan time.Time

	u := tg.Update
	switch {
	case u == nil:
		r.logger.Printf("[TRACE] client.alloc_watcher: no update block for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Manual:
		r.logger.Printf("[TRACE] client.alloc_watcher: update block has manual checks for alloc %q. exiting", alloc.ID)
		return
	case u.HealthCheck == structs.UpdateStrategyHealthCheck_Checks:
		for _, task := range tg.Tasks {
			for _, s := range task.Services {
				desiredChecks += len(s.Checks)
			}
		}

		checkTicker = time.NewTicker(consulCheckLookupInterval)
		checkCh = checkTicker.C
	}

	// Get a listener so we know when an allocation is updated.
	l := r.allocBroadcast.Listen()

	// Create a deadline timer for the health
	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline (%v) for alloc %q is at %v", u.HealthyDeadline, alloc.ID, time.Now().Add(u.HealthyDeadline))
	deadline := time.NewTimer(u.HealthyDeadline)

	// Create a healthy timer
	latestTaskHealthy := time.Unix(0, 0)
	latestChecksHealthy := time.Unix(0, 0)
	healthyTimer := time.NewTimer(0)
	healthyTime := time.Time{}
	cancelHealthyTimer := func() {
		if !healthyTimer.Stop() {
			select {
			case <-healthyTimer.C:
			default:
			}
		}
	}
	cancelHealthyTimer()

	// Cleanup function. Drain the timer channels without blocking: if a timer
	// already fired and its value was consumed in the select below, a bare
	// receive here would block forever.
	defer func() {
		if !deadline.Stop() {
			select {
			case <-deadline.C:
			default:
			}
		}
		if !healthyTimer.Stop() {
			select {
			case <-healthyTimer.C:
			default:
			}
		}
		if checkTicker != nil {
			checkTicker.Stop()
		}
		l.Close()
	}()

	setHealth := func(h bool) {
		r.allocLock.Lock()
		r.allocHealth = helper.BoolToPtr(h)
		r.allocLock.Unlock()
		r.syncStatus()
	}

	// Store whether the last consul checks call was successful or not
	consulChecksErr := false

	var checks []*api.AgentCheck
	first := true
OUTER:
	for {
		if !first {
			select {
			case <-ctx.Done():
				return
			case newAlloc, ok := <-l.Ch:
				if !ok {
					return
				}

				alloc = newAlloc
				r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
			case <-checkCh:
				newChecks, err := r.consulClient.Checks(alloc)
				if err != nil {
					if !consulChecksErr {
						consulChecksErr = true
						r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v", alloc.ID, err)
					}
				} else {
					consulChecksErr = false
					checks = newChecks
				}
			case <-deadline.C:
				// We have exceeded our deadline without being healthy.
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q hit healthy deadline", alloc.ID)
				setHealth(false)
				return
			case <-healthyTimer.C:
				r.logger.Printf("[TRACE] client.alloc_watcher: alloc %q is healthy", alloc.ID)
				setHealth(true)
				return
			}
		}
		first = false

		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			r.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			return
		}

		// If the alloc is marked as failed by the client set the status to
		// unhealthy
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			r.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
			setHealth(false)
			return
		}

		if len(alloc.TaskStates) != len(tg.Tasks) {
			r.logger.Printf("[TRACE] client.alloc_watcher: all task runners haven't started")
			continue OUTER
		}

		// If the task is dead or has restarted, fail
		for _, tstate := range alloc.TaskStates {
			if tstate.Failed || !tstate.FinishedAt.IsZero() || tstate.Restarts != 0 {
				r.logger.Printf("[TRACE] client.alloc_watcher: setting health to false for alloc %q", alloc.ID)
				setHealth(false)
				return
			}
		}

		// If we should have checks and they aren't all healthy continue
		if len(checks) != desiredChecks {
			r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, len(checks), alloc.ID)
			cancelHealthyTimer()
			continue OUTER
		}

		// Check if all the checks are passing
		for _, check := range checks {
			if check.Status != api.HealthPassing {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
				latestChecksHealthy = time.Time{}
				cancelHealthyTimer()
				continue OUTER
			}
		}
		if latestChecksHealthy.IsZero() {
			latestChecksHealthy = time.Now()
		}

		// Determine if the allocation is healthy
		for task, tstate := range alloc.TaskStates {
			if tstate.State != structs.TaskStateRunning {
				r.logger.Printf("[TRACE] client.alloc_watcher: continuing since task %q hasn't started for alloc %q", task, alloc.ID)
				continue OUTER
			}

			if tstate.StartedAt.After(latestTaskHealthy) {
				latestTaskHealthy = tstate.StartedAt
			}
		}

		// Determine when we can mark ourselves as healthy.
		totalHealthy := latestTaskHealthy
		if totalHealthy.Before(latestChecksHealthy) {
			totalHealthy = latestChecksHealthy
		}

		// Nothing to do since we are already waiting for the healthy timer to
		// fire at the same time.
		if totalHealthy.Equal(healthyTime) {
			continue OUTER
		}

		healthyTime = totalHealthy
		cancelHealthyTimer()
		d := time.Until(totalHealthy.Add(u.MinHealthyTime))
		healthyTimer.Reset(d)
		r.logger.Printf("[TRACE] client.alloc_watcher: setting healthy timer to %v for alloc %q", d, alloc.ID)
	}
}
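
As an aside, cancelHealthyTimer and the deferred cleanup above rely on Go's stop-and-drain timer idiom: stop the timer, and if it had already fired, clear any tick left in its buffered channel before re-arming it. The following is a minimal, self-contained sketch of that pattern using only the standard library; resetTimer and the durations are illustrative and not part of this file.

package main

import (
	"fmt"
	"time"
)

// resetTimer stops t, drains any tick already sitting in its channel without
// blocking, and then re-arms it for d. This mirrors the cancelHealthyTimer /
// Reset pattern used by the health watcher.
func resetTimer(t *time.Timer, d time.Duration) {
	if !t.Stop() {
		// The timer already fired; discard the stale tick if nobody has
		// received it yet so the next receive only sees the new deadline.
		select {
		case <-t.C:
		default:
		}
	}
	t.Reset(d)
}

func main() {
	t := time.NewTimer(10 * time.Millisecond)
	time.Sleep(20 * time.Millisecond) // let the timer fire without reading it

	resetTimer(t, 50*time.Millisecond)
	fmt.Println("waiting for the re-armed timer")
	<-t.C
	fmt.Println("fired once, with no stale tick delivered first")
}

Without the non-blocking drain, a receive on t.C after the tick has already been consumed would block the goroutine indefinitely, which is the situation the watcher guards against each time it re-arms healthyTimer.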