// github.com/djenriquez/nomad-1@v0.8.1/command/agent/consul/check_watcher.go (about)

package consul

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// defaultPollFreq is the default rate to poll the Consul Checks API.
	// newCheckWatcher uses it as the initial pollFreq.
	defaultPollFreq = 900 * time.Millisecond
)

// ChecksAPI is the part of the Consul API the checkWatcher requires.
type ChecksAPI interface {
	// Checks returns all checks. The watcher looks results up by check ID,
	// so the map is expected to be keyed by check ID.
	Checks() (map[string]*api.AgentCheck, error)
}

// TaskRestarter allows the checkWatcher to restart tasks.
type TaskRestarter interface {
	// Restart asks for the task to be restarted. source names the
	// component requesting the restart, reason is a human readable
	// explanation, and failure marks the restart as caused by a failure.
	Restart(source, reason string, failure bool)
}

// checkRestart handles restarting a task if a check is unhealthy. It holds
// the restart configuration for one watched check plus the mutable state
// needed to decide when the restart deadline has passed.
type checkRestart struct {
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   string // composite of allocID + taskName for uniqueness

	// task is used to trigger the restart.
	task TaskRestarter
	// grace is the configured grace period; graceUntil below is the
	// absolute time derived from it.
	grace time.Duration
	// interval is the check's configured interval.
	interval time.Duration
	// timeLimit is how long a check may stay unhealthy, measured from the
	// first counted failure, before the task is restarted.
	timeLimit time.Duration
	// ignoreWarnings treats the warning status as healthy when true.
	ignoreWarnings bool

	// Mutable fields

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time

	logger *log.Logger
}

// apply restart state for check and restart task if necessary. Current
// timestamp is passed in so all check updates have the same view of time (and
// to ease testing).
//
// Returns true if a restart was triggered in which case this check should be
// removed (checks are added on task startup).
62 func (c *checkRestart) apply(now time.Time, status string) bool { 63 healthy := func() { 64 if !c.unhealthyState.IsZero() { 65 c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart", 66 c.allocID, c.taskName, c.checkName) 67 c.unhealthyState = time.Time{} 68 } 69 } 70 switch status { 71 case api.HealthCritical: 72 case api.HealthWarning: 73 if c.ignoreWarnings { 74 // Warnings are ignored, reset state and exit 75 healthy() 76 return false 77 } 78 default: 79 // All other statuses are ok, reset state and exit 80 healthy() 81 return false 82 } 83 84 if now.Before(c.graceUntil) { 85 // In grace period, exit 86 return false 87 } 88 89 if c.unhealthyState.IsZero() { 90 // First failure, set restart deadline 91 if c.timeLimit != 0 { 92 c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. Restarting in %s if not healthy", 93 c.allocID, c.taskName, c.checkName, c.timeLimit) 94 } 95 c.unhealthyState = now 96 } 97 98 // restart timeLimit after start of this check becoming unhealthy 99 restartAt := c.unhealthyState.Add(c.timeLimit) 100 101 // Must test >= because if limit=1, restartAt == first failure 102 if now.Equal(restartAt) || now.After(restartAt) { 103 // hasn't become healthy by deadline, restart! 104 c.logger.Printf("[DEBUG] consul.health: restarting alloc %q task %q due to unhealthy check %q", c.allocID, c.taskName, c.checkName) 105 106 // Tell TaskRunner to restart due to failure 107 const failure = true 108 c.task.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName), failure) 109 return true 110 } 111 112 return false 113 } 114 115 // checkWatchUpdates add or remove checks from the watcher 116 type checkWatchUpdate struct { 117 checkID string 118 remove bool 119 checkRestart *checkRestart 120 } 121 122 // checkWatcher watches Consul checks and restarts tasks when they're 123 // unhealthy. 
124 type checkWatcher struct { 125 consul ChecksAPI 126 127 // pollFreq is how often to poll the checks API and defaults to 128 // defaultPollFreq 129 pollFreq time.Duration 130 131 // checkUpdateCh is how watches (and removals) are sent to the main 132 // watching loop 133 checkUpdateCh chan checkWatchUpdate 134 135 // done is closed when Run has exited 136 done chan struct{} 137 138 // lastErr is true if the last Consul call failed. It is used to 139 // squelch repeated error messages. 140 lastErr bool 141 142 logger *log.Logger 143 } 144 145 // newCheckWatcher creates a new checkWatcher but does not call its Run method. 146 func newCheckWatcher(logger *log.Logger, consul ChecksAPI) *checkWatcher { 147 return &checkWatcher{ 148 consul: consul, 149 pollFreq: defaultPollFreq, 150 checkUpdateCh: make(chan checkWatchUpdate, 8), 151 done: make(chan struct{}), 152 logger: logger, 153 } 154 } 155 156 // Run the main Consul checks watching loop to restart tasks when their checks 157 // fail. Blocks until context is canceled. 
158 func (w *checkWatcher) Run(ctx context.Context) { 159 defer close(w.done) 160 161 // map of check IDs to their metadata 162 checks := map[string]*checkRestart{} 163 164 // timer for check polling 165 checkTimer := time.NewTimer(0) 166 defer checkTimer.Stop() // ensure timer is never leaked 167 168 stopTimer := func() { 169 checkTimer.Stop() 170 select { 171 case <-checkTimer.C: 172 default: 173 } 174 } 175 176 // disable by default 177 stopTimer() 178 179 // Main watch loop 180 for { 181 // disable polling if there are no checks 182 if len(checks) == 0 { 183 stopTimer() 184 } 185 186 select { 187 case update := <-w.checkUpdateCh: 188 if update.remove { 189 // Remove a check 190 delete(checks, update.checkID) 191 continue 192 } 193 194 // Add/update a check 195 checks[update.checkID] = update.checkRestart 196 w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", 197 update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) 198 199 // if first check was added make sure polling is enabled 200 if len(checks) == 1 { 201 stopTimer() 202 checkTimer.Reset(w.pollFreq) 203 } 204 205 case <-ctx.Done(): 206 return 207 208 case <-checkTimer.C: 209 checkTimer.Reset(w.pollFreq) 210 211 // Set "now" as the point in time the following check results represent 212 now := time.Now() 213 214 results, err := w.consul.Checks() 215 if err != nil { 216 if !w.lastErr { 217 w.lastErr = true 218 w.logger.Printf("[ERR] consul.health: error retrieving health checks: %q", err) 219 } 220 continue 221 } 222 223 w.lastErr = false 224 225 // Keep track of tasks restarted this period so they 226 // are only restarted once and all of their checks are 227 // removed. 
228 restartedTasks := map[string]struct{}{} 229 230 // Loop over watched checks and update their status from results 231 for cid, check := range checks { 232 if _, ok := restartedTasks[check.taskKey]; ok { 233 // Check for this task already restarted; remove and skip check 234 delete(checks, cid) 235 continue 236 } 237 238 result, ok := results[cid] 239 if !ok { 240 // Only warn if outside grace period to avoid races with check registration 241 if now.After(check.graceUntil) { 242 w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) 243 } 244 continue 245 } 246 247 restarted := check.apply(now, result.Status) 248 if restarted { 249 // Checks are registered+watched on 250 // startup, so it's safe to remove them 251 // whenever they're restarted 252 delete(checks, cid) 253 254 restartedTasks[check.taskKey] = struct{}{} 255 } 256 } 257 258 // Ensure even passing checks for restartedTasks are removed 259 if len(restartedTasks) > 0 { 260 for cid, check := range checks { 261 if _, ok := restartedTasks[check.taskKey]; ok { 262 delete(checks, cid) 263 } 264 } 265 } 266 } 267 } 268 } 269 270 // Watch a check and restart its task if unhealthy. 
271 func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) { 272 if !check.TriggersRestarts() { 273 // Not watched, noop 274 return 275 } 276 277 c := &checkRestart{ 278 allocID: allocID, 279 taskName: taskName, 280 checkID: checkID, 281 checkName: check.Name, 282 taskKey: fmt.Sprintf("%s%s", allocID, taskName), // unique task ID 283 task: restarter, 284 interval: check.Interval, 285 grace: check.CheckRestart.Grace, 286 graceUntil: time.Now().Add(check.CheckRestart.Grace), 287 timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), 288 ignoreWarnings: check.CheckRestart.IgnoreWarnings, 289 logger: w.logger, 290 } 291 292 update := checkWatchUpdate{ 293 checkID: checkID, 294 checkRestart: c, 295 } 296 297 select { 298 case w.checkUpdateCh <- update: 299 // sent watch 300 case <-w.done: 301 // exited; nothing to do 302 } 303 } 304 305 // Unwatch a check. 306 func (w *checkWatcher) Unwatch(cid string) { 307 c := checkWatchUpdate{ 308 checkID: cid, 309 remove: true, 310 } 311 select { 312 case w.checkUpdateCh <- c: 313 // sent remove watch 314 case <-w.done: 315 // exited; nothing to do 316 } 317 }