github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/serviceregistration/watcher.go

package serviceregistration

import (
	"context"
	"fmt"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

// composite of allocID + taskName for uniqueness
type key string

type restarter struct {
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   key

	logger         hclog.Logger
	task           WorkloadRestarter
	grace          time.Duration
	interval       time.Duration
	timeLimit      time.Duration
	ignoreWarnings bool

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time
}

// apply restart state for check and restart task if necessary. Current
// timestamp is passed in so all check updates have the same view of time (and
// to ease testing).
//
// Returns true if a restart was triggered in which case this check should be
// removed (checks are added on task startup).
func (r *restarter) apply(ctx context.Context, now time.Time, status string) bool {
	healthy := func() {
		if !r.unhealthyState.IsZero() {
			r.logger.Debug("canceling restart because check became healthy")
			r.unhealthyState = time.Time{}
		}
	}
	switch status {
	case "critical": // consul
	case string(structs.CheckFailure): // nomad
	case string(structs.CheckPending): // nomad
	case "warning": // consul
		if r.ignoreWarnings {
			// Warnings are ignored, reset state and exit
			healthy()
			return false
		}
	default:
		// All other statuses are ok, reset state and exit
		healthy()
		return false
	}

	if now.Before(r.graceUntil) {
		// In grace period, exit
		return false
	}

	if r.unhealthyState.IsZero() {
		// First failure, set restart deadline
		if r.timeLimit != 0 {
			r.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", r.timeLimit)
		}
		r.unhealthyState = now
	}

	// restart timeLimit after start of this check becoming unhealthy
	restartAt := r.unhealthyState.Add(r.timeLimit)

	// Must test >= because if limit=1, restartAt == first failure
	if now.Equal(restartAt) || now.After(restartAt) {
		// hasn't become healthy by deadline, restart!
		r.logger.Debug("restarting due to unhealthy check")

		// Tell TaskRunner to restart due to failure
		reason := fmt.Sprintf("healthcheck: check %q unhealthy", r.checkName)
		event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
		go asyncRestart(ctx, r.logger, r.task, event)
		return true
	}

	return false
}
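
// Illustrative sketch (not part of the original source): how apply's restart
// deadline lines up with a jobspec's check_restart block. Assuming a check
// with interval = 10s and check_restart { limit = 3, grace = 30s }, Watch
// (below) derives timeLimit = interval * (limit-1) = 20s, so the task is
// restarted on the third consecutive failing poll once the grace period has
// expired. The values here are hypothetical and exist only to make the
// arithmetic concrete.
func exampleRestartDeadline() time.Time {
	const (
		interval = 10 * time.Second // hypothetical check.Interval
		limit    = 3                // hypothetical check_restart.limit
	)
	timeLimit := interval * time.Duration(limit-1) // 20s, mirroring Watch below

	firstFailure := time.Now()         // apply records this as unhealthyState
	return firstFailure.Add(timeLimit) // apply restarts once now >= this deadline
}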

// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
// to be called in a goroutine.
func asyncRestart(ctx context.Context, logger hclog.Logger, task WorkloadRestarter, event *structs.TaskEvent) {
	// Check watcher restarts are always failures
	const failure = true

	// Restarting is asynchronous so there's no reason to allow this
	// goroutine to block indefinitely.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	if err := task.Restart(ctx, event, failure); err != nil {
		// Restart errors are not actionable and only relevant when
		// debugging allocation lifecycle management.
		logger.Debug("failed to restart task", "error", err, "event_time", event.Time, "event_type", event.Type)
	}
}

// CheckStatusGetter is implemented per-provider.
type CheckStatusGetter interface {
	// Get returns a map from CheckID -> (minimal) CheckStatus
	Get() (map[string]string, error)
}

// checkWatchUpdate adds or removes a check from the watcher
type checkWatchUpdate struct {
	checkID string
	remove  bool
	restart *restarter
}

// A CheckWatcher watches for check failures and restarts tasks according to
// their check_restart policy.
type CheckWatcher interface {
	// Run the CheckWatcher. Maintains a background process to continuously
	// monitor active checks. Must be called before Watch or Unwatch. Must be
	// called as a goroutine.
	Run(ctx context.Context)

	// Watch the given check. If the check status enters a failing state, the
	// task associated with the check will be restarted according to its
	// check_restart policy via wr.
	Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter)

	// Unwatch will cause the CheckWatcher to no longer monitor the check with
	// the given checkID.
	Unwatch(checkID string)
}

// UniversalCheckWatcher is an implementation of CheckWatcher capable of watching
// checks in the Nomad or Consul service providers.
type UniversalCheckWatcher struct {
	logger hclog.Logger
	getter CheckStatusGetter

	// pollFrequency is how often to poll the checks API
	pollFrequency time.Duration

	// checkUpdateCh sends watches/removals to the main loop
	checkUpdateCh chan checkWatchUpdate

	// done is closed when Run has exited
	done chan struct{}

	// failedPreviousInterval indicates whether something went wrong during
	// the previous poll interval - if so we can silence ongoing errors
	failedPreviousInterval bool
}

func NewCheckWatcher(logger hclog.Logger, getter CheckStatusGetter) *UniversalCheckWatcher {
	return &UniversalCheckWatcher{
		logger:        logger.ResetNamed("watch.checks"),
		getter:        getter,
		pollFrequency: 1 * time.Second,
		checkUpdateCh: make(chan checkWatchUpdate, 8),
		done:          make(chan struct{}),
	}
}
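
// Illustrative sketch (not part of the original source): a minimal in-memory
// CheckStatusGetter, e.g. for tests. Each real provider (Nomad or Consul)
// translates its own check results into the minimal checkID -> status map
// consumed by the poll loop.
type staticStatusGetter map[string]string

func (g staticStatusGetter) Get() (map[string]string, error) {
	// copy so callers cannot mutate the underlying map
	out := make(map[string]string, len(g))
	for id, status := range g {
		out[id] = status
	}
	return out, nil
}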

// Watch a check and restart its task if unhealthy.
func (w *UniversalCheckWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter) {
	if !check.TriggersRestarts() {
		return // check_restart not set; no-op
	}

	c := &restarter{
		allocID:        allocID,
		taskName:       taskName,
		checkID:        checkID,
		checkName:      check.Name,
		taskKey:        key(allocID + taskName),
		task:           wr,
		interval:       check.Interval,
		grace:          check.CheckRestart.Grace,
		graceUntil:     time.Now().Add(check.CheckRestart.Grace),
		timeLimit:      check.Interval * time.Duration(check.CheckRestart.Limit-1),
		ignoreWarnings: check.CheckRestart.IgnoreWarnings,
		logger:         w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name),
	}

	select {
	case w.checkUpdateCh <- checkWatchUpdate{
		checkID: checkID,
		restart: c,
	}: // activate watch
	case <-w.done: // exited; nothing to do
	}
}

// Unwatch a check.
func (w *UniversalCheckWatcher) Unwatch(checkID string) {
	select {
	case w.checkUpdateCh <- checkWatchUpdate{
		checkID: checkID,
		remove:  true,
	}: // deactivate watch
	case <-w.done: // exited; nothing to do
	}
}

func (w *UniversalCheckWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// map of checkID to their restarter handle (contains only checks we are watching)
	watched := make(map[string]*restarter)

	checkTimer, cleanupCheckTimer := helper.NewSafeTimer(0)
	defer cleanupCheckTimer()

	stopCheckTimer := func() { // todo: refactor using that other pattern
		checkTimer.Stop()
		select {
		case <-checkTimer.C:
		default:
		}
	}

	// initialize with checkTimer disabled
	stopCheckTimer()

	for {
		// disable polling if there are no checks
		if len(watched) == 0 {
			stopCheckTimer()
		}

		select {
		// caller cancelled us; goodbye
		case <-ctx.Done():
			return

		// received an update; add or remove check
		case update := <-w.checkUpdateCh:
			if update.remove {
				delete(watched, update.checkID)
				continue
			}

			watched[update.checkID] = update.restart
			allocID := update.restart.allocID
			taskName := update.restart.taskName
			checkName := update.restart.checkName
			w.logger.Trace("now watching check", "alloc_id", allocID, "task", taskName, "check", checkName)

			// turn on the timer if we are now active
			if len(watched) == 1 {
				stopCheckTimer()
				checkTimer.Reset(w.pollFrequency)
			}

		// poll time; refresh check statuses
		case now := <-checkTimer.C:
			w.interval(ctx, now, watched)
			checkTimer.Reset(w.pollFrequency)
		}
	}
}
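
// Illustrative sketch (not part of the original source): typical wiring of the
// watcher. A task lifecycle hook would call Watch when a service's checks are
// registered and Unwatch when they are deregistered. The *structs.ServiceCheck
// literal uses assumed example values; the field names mirror the fields read
// by Watch above.
func exampleWireCheckWatcher(ctx context.Context, logger hclog.Logger, getter CheckStatusGetter, wr WorkloadRestarter) {
	watcher := NewCheckWatcher(logger, getter)
	go watcher.Run(ctx) // Run must be started before Watch/Unwatch take effect

	check := &structs.ServiceCheck{
		Name:     "alive",
		Interval: 10 * time.Second,
		CheckRestart: &structs.CheckRestart{
			Limit: 3,
			Grace: 30 * time.Second,
		},
	}

	watcher.Watch("alloc-1", "web", "check-1", check, wr)

	// ... later, when the check is deregistered:
	watcher.Unwatch("check-1")
}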

func (w *UniversalCheckWatcher) interval(ctx context.Context, now time.Time, watched map[string]*restarter) {
	statuses, err := w.getter.Get()
	if err != nil {
		// log only on the first failure of a streak to silence ongoing errors
		if !w.failedPreviousInterval {
			w.failedPreviousInterval = true
			w.logger.Error("failed to retrieve check statuses", "error", err)
		}
		return
	}
	w.failedPreviousInterval = false

	// keep track of tasks restarted this interval
	restarts := set.New[key](len(statuses))

	// iterate over the checks we are watching and apply the latest status to each
	for checkID, checkRestarter := range watched {
		if ctx.Err() != nil {
			return // short circuit; caller cancelled us
		}

		if restarts.Contains(checkRestarter.taskKey) {
			// skip; task is already being restarted
			delete(watched, checkID)
			continue
		}

		status, exists := statuses[checkID]
		if !exists {
			// warn only if outside grace period; avoiding a race with check registration
			if now.After(checkRestarter.graceUntil) {
				w.logger.Warn("watched check not found", "check_id", checkID)
			}
			continue
		}

		if checkRestarter.apply(ctx, now, status) {
			// check will be re-registered & re-watched on startup
			delete(watched, checkID)
			restarts.Insert(checkRestarter.taskKey)
		}
	}

	// purge passing checks of tasks that are being restarted
	if restarts.Size() > 0 {
		for checkID, checkRestarter := range watched {
			if restarts.Contains(checkRestarter.taskKey) {
				delete(watched, checkID)
			}
		}
	}
}
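
// Illustrative sketch (not part of the original source): a WorkloadRestarter
// stub that records restart events, e.g. for exercising the watcher in tests.
// The Restart signature is inferred from the call in asyncRestart above; the
// real interface is defined elsewhere in this package. Note this stub is not
// safe for concurrent use; a real test double would guard events with a mutex
// since asyncRestart invokes Restart from its own goroutine.
type recordingRestarter struct {
	events []*structs.TaskEvent
}

func (r *recordingRestarter) Restart(_ context.Context, event *structs.TaskEvent, _ bool) error {
	r.events = append(r.events, event)
	return nil
}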