github.com/superfly/nomad@v0.10.5-fly/command/agent/consul/check_watcher.go

package consul

import (
	"context"
	"fmt"
	"time"

	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// defaultPollFreq is the default rate to poll the Consul Checks API
	defaultPollFreq = 900 * time.Millisecond
)

// ChecksAPI is the part of the Consul API the checkWatcher requires.
type ChecksAPI interface {
	// Checks returns a list of all checks.
	Checks() (map[string]*api.AgentCheck, error)
}

// WorkloadRestarter allows the checkWatcher to restart tasks or entire task groups.
type WorkloadRestarter interface {
	Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error
}

// checkRestart handles restarting a task if a check is unhealthy.
type checkRestart struct {
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   string // composite of allocID + taskName for uniqueness

	task           WorkloadRestarter
	grace          time.Duration
	interval       time.Duration
	timeLimit      time.Duration
	ignoreWarnings bool

	// Mutable fields

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time

	logger log.Logger
}

// apply restart state for check and restart task if necessary. Current
// timestamp is passed in so all check updates have the same view of time (and
// to ease testing).
//
// Returns true if a restart was triggered in which case this check should be
// removed (checks are added on task startup).
func (c *checkRestart) apply(ctx context.Context, now time.Time, status string) bool {
	healthy := func() {
		if !c.unhealthyState.IsZero() {
			c.logger.Debug("canceling restart because check became healthy")
			c.unhealthyState = time.Time{}
		}
	}
	switch status {
	case api.HealthCritical:
	case api.HealthWarning:
		if c.ignoreWarnings {
			// Warnings are ignored, reset state and exit
			healthy()
			return false
		}
	default:
		// All other statuses are ok, reset state and exit
		healthy()
		return false
	}

	if now.Before(c.graceUntil) {
		// In grace period, exit
		return false
	}

	if c.unhealthyState.IsZero() {
		// First failure, set restart deadline
		if c.timeLimit != 0 {
			c.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", c.timeLimit)
		}
		c.unhealthyState = now
	}

	// restart timeLimit after start of this check becoming unhealthy
	restartAt := c.unhealthyState.Add(c.timeLimit)

	// Must test >= because if limit=1, restartAt == first failure
	if now.Equal(restartAt) || now.After(restartAt) {
		// hasn't become healthy by deadline, restart!
		c.logger.Debug("restarting due to unhealthy check")

		// Tell TaskRunner to restart due to failure
		reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
		event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
		go asyncRestart(ctx, c.logger, c.task, event)
		return true
	}

	return false
}

// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
// to be called in a goroutine.
func asyncRestart(ctx context.Context, logger log.Logger, task WorkloadRestarter, event *structs.TaskEvent) {
	// Check watcher restarts are always failures
	const failure = true

	// Restarting is asynchronous so there's no reason to allow this
	// goroutine to block indefinitely.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	if err := task.Restart(ctx, event, failure); err != nil {
		// Restart errors are not actionable and only relevant when
		// debugging allocation lifecycle management.
		logger.Debug("failed to restart task", "error", err,
			"event_time", event.Time, "event_type", event.Type)
	}
}

// checkWatchUpdate adds or removes a check from the watcher
type checkWatchUpdate struct {
	checkID      string
	remove       bool
	checkRestart *checkRestart
}

// checkWatcher watches Consul checks and restarts tasks when they're
// unhealthy.
type checkWatcher struct {
	consul ChecksAPI

	// pollFreq is how often to poll the checks API and defaults to
	// defaultPollFreq
	pollFreq time.Duration

	// checkUpdateCh is how watches (and removals) are sent to the main
	// watching loop
	checkUpdateCh chan checkWatchUpdate

	// done is closed when Run has exited
	done chan struct{}

	// lastErr is true if the last Consul call failed. It is used to
	// squelch repeated error messages.
	lastErr bool

	logger log.Logger
}

// newCheckWatcher creates a new checkWatcher but does not call its Run method.
func newCheckWatcher(logger log.Logger, consul ChecksAPI) *checkWatcher {
	return &checkWatcher{
		consul:        consul,
		pollFreq:      defaultPollFreq,
		checkUpdateCh: make(chan checkWatchUpdate, 8),
		done:          make(chan struct{}),
		logger:        logger.ResetNamed("consul.health"),
	}
}

// Run the main Consul checks watching loop to restart tasks when their checks
// fail. Blocks until context is canceled.
func (w *checkWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// map of check IDs to their metadata
	checks := map[string]*checkRestart{}

	// timer for check polling
	checkTimer := time.NewTimer(0)
	defer checkTimer.Stop() // ensure timer is never leaked

	stopTimer := func() {
		checkTimer.Stop()
		select {
		case <-checkTimer.C:
		default:
		}
	}

	// disable by default
	stopTimer()

	// Main watch loop
	for {
		// disable polling if there are no checks
		if len(checks) == 0 {
			stopTimer()
		}

		select {
		case update := <-w.checkUpdateCh:
			if update.remove {
				// Remove a check
				delete(checks, update.checkID)
				continue
			}

			// Add/update a check
			checks[update.checkID] = update.checkRestart
			w.logger.Debug("watching check", "alloc_id", update.checkRestart.allocID,
				"task", update.checkRestart.taskName, "check", update.checkRestart.checkName)

			// if first check was added make sure polling is enabled
			if len(checks) == 1 {
				stopTimer()
				checkTimer.Reset(w.pollFreq)
			}

		case <-ctx.Done():
			return

		case <-checkTimer.C:
			checkTimer.Reset(w.pollFreq)

			// Set "now" as the point in time the following check results represent
			now := time.Now()

			results, err := w.consul.Checks()
			if err != nil {
				if !w.lastErr {
					w.lastErr = true
					w.logger.Error("failed retrieving health checks", "error", err)
				}
				continue
			}

			w.lastErr = false

			// Keep track of tasks restarted this period so they
			// are only restarted once and all of their checks are
			// removed.
			restartedTasks := map[string]struct{}{}

			// Loop over watched checks and update their status from results
			for cid, check := range checks {
				// Shortcircuit if told to exit
				if ctx.Err() != nil {
					return
				}

				if _, ok := restartedTasks[check.taskKey]; ok {
					// Check for this task already restarted; remove and skip check
					delete(checks, cid)
					continue
				}

				result, ok := results[cid]
				if !ok {
					// Only warn if outside grace period to avoid races with check registration
					if now.After(check.graceUntil) {
						w.logger.Warn("watched check not found in Consul", "check", check.checkName, "check_id", cid)
					}
					continue
				}

				restarted := check.apply(ctx, now, result.Status)
				if restarted {
					// Checks are registered+watched on
					// startup, so it's safe to remove them
					// whenever they're restarted
					delete(checks, cid)

					restartedTasks[check.taskKey] = struct{}{}
				}
			}

			// Ensure even passing checks for restartedTasks are removed
			if len(restartedTasks) > 0 {
				for cid, check := range checks {
					if _, ok := restartedTasks[check.taskKey]; ok {
						delete(checks, cid)
					}
				}
			}
		}
	}
}

// Watch a check and restart its task if unhealthy.
func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter WorkloadRestarter) {
	if !check.TriggersRestarts() {
		// Not watched, noop
		return
	}

	c := &checkRestart{
		allocID:        allocID,
		taskName:       taskName,
		checkID:        checkID,
		checkName:      check.Name,
		taskKey:        fmt.Sprintf("%s%s", allocID, taskName), // unique task ID
		task:           restarter,
		interval:       check.Interval,
		grace:          check.CheckRestart.Grace,
		graceUntil:     time.Now().Add(check.CheckRestart.Grace),
		timeLimit:      check.Interval * time.Duration(check.CheckRestart.Limit-1),
		ignoreWarnings: check.CheckRestart.IgnoreWarnings,
		logger:         w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name),
	}

	update := checkWatchUpdate{
		checkID:      checkID,
		checkRestart: c,
	}

	select {
	case w.checkUpdateCh <- update:
		// sent watch
	case <-w.done:
		// exited; nothing to do
	}
}

// Unwatch a check.
func (w *checkWatcher) Unwatch(cid string) {
	c := checkWatchUpdate{
		checkID: cid,
		remove:  true,
	}
	select {
	case w.checkUpdateCh <- c:
		// sent remove watch
	case <-w.done:
		// exited; nothing to do
	}
}
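
// ---------------------------------------------------------------------------
// Usage sketch (not part of the upstream file): a minimal illustration of how
// the types above fit together. Because checkWatcher and newCheckWatcher are
// unexported, code like this would have to live inside this package (e.g. in
// a _test.go file) and can rely on the imports at the top of the file.
// fakeChecksAPI, fakeRestarter, and exampleCheckWatcherUsage are hypothetical
// names used only for this sketch; only structs.ServiceCheck/CheckRestart
// fields already referenced by Watch above are assumed.

// fakeChecksAPI satisfies ChecksAPI by returning a static snapshot of checks.
type fakeChecksAPI struct {
	checks map[string]*api.AgentCheck
}

func (f *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) {
	return f.checks, nil
}

// fakeRestarter satisfies WorkloadRestarter; in the agent the real
// implementation is the allocation's task runner.
type fakeRestarter struct{}

func (fakeRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	return nil
}

func exampleCheckWatcherUsage(ctx context.Context, logger log.Logger) {
	consulAPI := &fakeChecksAPI{checks: map[string]*api.AgentCheck{}}

	w := newCheckWatcher(logger, consulAPI)
	go w.Run(ctx) // polls Consul every pollFreq until ctx is canceled

	// A check is only watched when TriggersRestarts reports true, i.e. when
	// its check_restart configuration actually requests restarts.
	check := &structs.ServiceCheck{
		Name:     "http-alive",
		Interval: 10 * time.Second,
		CheckRestart: &structs.CheckRestart{
			Limit: 3,                // unhealthy polls tolerated before restarting
			Grace: 30 * time.Second, // ignore failures right after task startup
		},
	}

	w.Watch("alloc-id", "task-name", "check-id", check, fakeRestarter{})

	// When the task stops, stop watching its check.
	w.Unwatch("check-id")
}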