github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/health_hook.go (about) 1 package allocrunner 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/go-hclog" 10 "github.com/hashicorp/nomad/client/allochealth" 11 "github.com/hashicorp/nomad/client/allocrunner/interfaces" 12 "github.com/hashicorp/nomad/client/serviceregistration" 13 "github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore" 14 cstructs "github.com/hashicorp/nomad/client/structs" 15 "github.com/hashicorp/nomad/nomad/structs" 16 ) 17 18 // healthSetter is able to set/clear alloc health. 19 type healthSetter interface { 20 // HasHealth returns true if health is already set. 21 HasHealth() bool 22 23 // SetHealth via the mutator. 24 SetHealth(healthy, isDeploy bool, taskEvents map[string]*structs.TaskEvent) 25 26 // ClearHealth for when the deployment ID changes. 27 ClearHealth() 28 } 29 30 // allocHealthWatcherHook is responsible for watching an allocation's task 31 // status and (optionally) Consul health check status to determine if the 32 // allocation is healthy or unhealthy. Used by deployments and migrations. 33 type allocHealthWatcherHook struct { 34 healthSetter healthSetter 35 36 // consul client used to monitor Consul service health checks 37 consul serviceregistration.Handler 38 39 // checkStore is used to monitor Nomad service health checks 40 checkStore checkstore.Shim 41 42 // listener is given to trackers to listen for alloc updates and closed 43 // when the alloc is destroyed. 44 listener *cstructs.AllocListener 45 46 // hookLock is held by hook methods to prevent concurrent access by 47 // Update and synchronous hooks. 48 hookLock sync.Mutex 49 50 // watchDone is created before calling watchHealth and is closed when 51 // watchHealth exits. Must be passed into watchHealth to avoid races. 52 // Initialized already closed as Update may be called before Prerun. 53 watchDone chan struct{} 54 55 // ranOnce is set once Prerun or Update have run at least once. This 56 // prevents Prerun from running if an Update has already been 57 // processed. Must hold hookLock to access. 58 ranOnce bool 59 60 // cancelFn stops the health watching/setting goroutine. Wait on 61 // watchLock to block until the watcher exits. 62 cancelFn context.CancelFunc 63 64 // alloc set by new func or Update. Must hold hookLock to access. 65 alloc *structs.Allocation 66 67 // isDeploy is true if monitoring a deployment. Set in init(). Must 68 // hold hookLock to access. 69 isDeploy bool 70 71 logger hclog.Logger 72 } 73 74 func newAllocHealthWatcherHook(logger hclog.Logger, alloc *structs.Allocation, hs healthSetter, 75 listener *cstructs.AllocListener, consul serviceregistration.Handler, checkStore checkstore.Shim) interfaces.RunnerHook { 76 77 // Neither deployments nor migrations care about the health of 78 // non-service jobs so never watch their health 79 if alloc.Job.Type != structs.JobTypeService { 80 return noopAllocHealthWatcherHook{} 81 } 82 83 // Initialize watchDone with a closed chan in case Update runs before Prerun 84 closedDone := make(chan struct{}) 85 close(closedDone) 86 87 h := &allocHealthWatcherHook{ 88 alloc: alloc, 89 cancelFn: func() {}, // initialize to prevent nil func panics 90 watchDone: closedDone, 91 consul: consul, 92 checkStore: checkStore, 93 healthSetter: hs, 94 listener: listener, 95 } 96 97 h.logger = logger.Named(h.Name()) 98 return h 99 } 100 101 func (h *allocHealthWatcherHook) Name() string { 102 return "alloc_health_watcher" 103 } 104 105 // init starts the allochealth.Tracker and watchHealth goroutine on either 106 // Prerun or Update. Caller must set/update alloc and logger fields. 107 // 108 // Not threadsafe so the caller should lock since Updates occur concurrently. 109 func (h *allocHealthWatcherHook) init() error { 110 // No need to watch health as it's already set 111 if h.healthSetter.HasHealth() { 112 h.logger.Trace("not watching; already has health set") 113 return nil 114 } 115 116 tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) 117 if tg == nil { 118 return fmt.Errorf("task group %q does not exist in job %q", h.alloc.TaskGroup, h.alloc.Job.ID) 119 } 120 121 h.isDeploy = h.alloc.DeploymentID != "" 122 123 // No need to watch allocs for deployments that rely on operators 124 // manually setting health 125 if h.isDeploy && (tg.Update.IsEmpty() || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) { 126 return nil 127 } 128 129 // Define the deadline, health method, min healthy time from the 130 // deployment if this is a deployment; otherwise from the migration 131 // strategy. 132 deadline, useChecks, minHealthyTime := getHealthParams(time.Now(), tg, h.isDeploy) 133 134 // Create a context that is canceled when the tracker should shutdown. 135 ctx := context.Background() 136 ctx, h.cancelFn = context.WithCancel(ctx) 137 138 h.logger.Trace("watching", "deadline", deadline, "checks", useChecks, "min_healthy_time", minHealthyTime) 139 // Create a new tracker, start it, and watch for health results. 140 tracker := allochealth.NewTracker( 141 ctx, h.logger, h.alloc, h.listener, h.consul, h.checkStore, minHealthyTime, useChecks, 142 ) 143 tracker.Start() 144 145 // Create a new done chan and start watching for health updates 146 h.watchDone = make(chan struct{}) 147 go h.watchHealth(ctx, deadline, tracker, h.watchDone) 148 return nil 149 } 150 151 func (h *allocHealthWatcherHook) Prerun() error { 152 h.hookLock.Lock() 153 defer h.hookLock.Unlock() 154 155 if h.ranOnce { 156 // An Update beat Prerun to running the watcher; noop 157 return nil 158 } 159 160 h.ranOnce = true 161 return h.init() 162 } 163 164 func (h *allocHealthWatcherHook) Update(req *interfaces.RunnerUpdateRequest) error { 165 h.hookLock.Lock() 166 defer h.hookLock.Unlock() 167 168 // Prevent Prerun from running after an Update 169 h.ranOnce = true 170 171 // Cancel the old watcher and create a new one 172 h.cancelFn() 173 174 // Wait until the watcher exits 175 <-h.watchDone 176 177 // Deployment has changed, reset status 178 if req.Alloc.DeploymentID != h.alloc.DeploymentID { 179 h.healthSetter.ClearHealth() 180 } 181 182 // Update alloc 183 h.alloc = req.Alloc 184 185 return h.init() 186 } 187 188 func (h *allocHealthWatcherHook) Postrun() error { 189 h.hookLock.Lock() 190 defer h.hookLock.Unlock() 191 192 h.cancelFn() 193 h.listener.Close() 194 195 // Wait until the watcher exits 196 <-h.watchDone 197 198 return nil 199 } 200 201 func (h *allocHealthWatcherHook) Shutdown() { 202 // Same as Postrun 203 _ = h.Postrun() 204 } 205 206 // watchHealth watches alloc health until it is set, the alloc is stopped, the 207 // deadline is reached, or the context is canceled. watchHealth will be 208 // canceled and restarted on Updates so calls are serialized with a lock. 209 func (h *allocHealthWatcherHook) watchHealth(ctx context.Context, deadline time.Time, tracker *allochealth.Tracker, done chan<- struct{}) { 210 defer close(done) 211 212 // Default to unhealthy for the deadline reached case 213 healthy := false 214 215 select { 216 case <-ctx.Done(): 217 // Graceful shutdown 218 return 219 220 case <-tracker.AllocStoppedCh(): 221 // Allocation has stopped so no need to set health 222 return 223 224 case <-time.After(time.Until(deadline)): 225 // Time is up! Fallthrough to set unhealthy. 226 h.logger.Trace("deadline reached; setting unhealthy", "deadline", deadline) 227 228 case healthy = <-tracker.HealthyCh(): 229 // Health received. Fallthrough to set it. 230 } 231 232 h.logger.Trace("health set", "healthy", healthy) 233 234 // If this is an unhealthy deployment emit events for tasks 235 var taskEvents map[string]*structs.TaskEvent 236 if !healthy && h.isDeploy { 237 taskEvents = tracker.TaskEvents() 238 } 239 240 h.healthSetter.SetHealth(healthy, h.isDeploy, taskEvents) 241 } 242 243 // getHealthParams returns the health watcher parameters which vary based on 244 // whether this allocation is in a deployment or migration. 245 func getHealthParams(now time.Time, tg *structs.TaskGroup, isDeploy bool) (deadline time.Time, useChecks bool, minHealthyTime time.Duration) { 246 if isDeploy { 247 deadline = now.Add(tg.Update.HealthyDeadline) 248 minHealthyTime = tg.Update.MinHealthyTime 249 useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks 250 } else { 251 strategy := tg.Migrate 252 if strategy == nil { 253 // For backwards compat with pre-0.8 allocations that 254 // don't have a migrate strategy set. 255 strategy = structs.DefaultMigrateStrategy() 256 } 257 258 deadline = now.Add(strategy.HealthyDeadline) 259 minHealthyTime = strategy.MinHealthyTime 260 useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks 261 } 262 return 263 } 264 265 // noopAllocHealthWatcherHook is an empty hook implementation returned by 266 // newAllocHealthWatcherHook when an allocation will never need its health 267 // monitored. 268 type noopAllocHealthWatcherHook struct{} 269 270 func (noopAllocHealthWatcherHook) Name() string { 271 return "alloc_health_watcher" 272 }