github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/script_check_hook.go (about) 1 package taskrunner 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/consul/api" 10 log "github.com/hashicorp/go-hclog" 11 "github.com/hashicorp/nomad/client/allocrunner/interfaces" 12 tinterfaces "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces" 13 "github.com/hashicorp/nomad/client/consul" 14 "github.com/hashicorp/nomad/client/taskenv" 15 agentconsul "github.com/hashicorp/nomad/command/agent/consul" 16 "github.com/hashicorp/nomad/nomad/structs" 17 ) 18 19 var _ interfaces.TaskPoststartHook = &scriptCheckHook{} 20 var _ interfaces.TaskUpdateHook = &scriptCheckHook{} 21 var _ interfaces.TaskStopHook = &scriptCheckHook{} 22 23 // default max amount of time to wait for all scripts on shutdown. 24 const defaultShutdownWait = time.Minute 25 26 type scriptCheckHookConfig struct { 27 alloc *structs.Allocation 28 task *structs.Task 29 consul consul.ConsulServiceAPI 30 logger log.Logger 31 shutdownWait time.Duration 32 } 33 34 // scriptCheckHook implements a task runner hook for running script 35 // checks in the context of a task 36 type scriptCheckHook struct { 37 consul consul.ConsulServiceAPI 38 alloc *structs.Allocation 39 task *structs.Task 40 logger log.Logger 41 shutdownWait time.Duration // max time to wait for scripts to shutdown 42 shutdownCh chan struct{} // closed when all scripts should shutdown 43 44 // The following fields can be changed by Update() 45 driverExec tinterfaces.ScriptExecutor 46 taskEnv *taskenv.TaskEnv 47 48 // These maintain state and are populated by Poststart() or Update() 49 scripts map[string]*scriptCheck 50 runningScripts map[string]*taskletHandle 51 52 // Since Update() may be called concurrently with any other hook all 53 // hook methods must be fully serialized 54 mu sync.Mutex 55 } 56 57 // newScriptCheckHook returns a hook without any scriptChecks. 58 // They will get created only once their task environment is ready 59 // in Poststart() or Update() 60 func newScriptCheckHook(c scriptCheckHookConfig) *scriptCheckHook { 61 h := &scriptCheckHook{ 62 consul: c.consul, 63 alloc: c.alloc, 64 task: c.task, 65 scripts: make(map[string]*scriptCheck), 66 runningScripts: make(map[string]*taskletHandle), 67 shutdownWait: defaultShutdownWait, 68 shutdownCh: make(chan struct{}), 69 } 70 71 if c.shutdownWait != 0 { 72 h.shutdownWait = c.shutdownWait // override for testing 73 } 74 h.logger = c.logger.Named(h.Name()) 75 return h 76 } 77 78 func (h *scriptCheckHook) Name() string { 79 return "script_checks" 80 } 81 82 // Prestart implements interfaces.TaskPrestartHook. It stores the 83 // initial structs.Task 84 func (h *scriptCheckHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, _ *interfaces.TaskPrestartResponse) error { 85 h.mu.Lock() 86 defer h.mu.Unlock() 87 h.task = req.Task 88 return nil 89 } 90 91 // PostStart implements interfaces.TaskPoststartHook. It creates new 92 // script checks with the current task context (driver and env), and 93 // starts up the scripts. 94 func (h *scriptCheckHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error { 95 h.mu.Lock() 96 defer h.mu.Unlock() 97 98 if req.DriverExec == nil { 99 h.logger.Debug("driver doesn't support script checks") 100 return nil 101 } 102 h.driverExec = req.DriverExec 103 h.taskEnv = req.TaskEnv 104 105 return h.upsertChecks() 106 } 107 108 // Updated implements interfaces.TaskUpdateHook. It creates new 109 // script checks with the current task context (driver and env and possibly 110 // new structs.Task), and starts up the scripts. 111 func (h *scriptCheckHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error { 112 h.mu.Lock() 113 defer h.mu.Unlock() 114 115 task := req.Alloc.LookupTask(h.task.Name) 116 if task == nil { 117 return fmt.Errorf("task %q not found in updated alloc", h.task.Name) 118 } 119 h.alloc = req.Alloc 120 h.task = task 121 h.taskEnv = req.TaskEnv 122 123 return h.upsertChecks() 124 } 125 126 func (h *scriptCheckHook) upsertChecks() error { 127 // Create new script checks struct with new task context 128 oldScriptChecks := h.scripts 129 h.scripts = h.newScriptChecks() 130 131 // Run new or replacement scripts 132 for id, script := range h.scripts { 133 // If it's already running, cancel and replace 134 if oldScript, running := h.runningScripts[id]; running { 135 oldScript.cancel() 136 } 137 // Start and store the handle 138 h.runningScripts[id] = script.run() 139 } 140 141 // Cancel scripts we no longer want 142 for id := range oldScriptChecks { 143 if _, ok := h.scripts[id]; !ok { 144 if oldScript, running := h.runningScripts[id]; running { 145 oldScript.cancel() 146 } 147 } 148 } 149 return nil 150 } 151 152 // Stop implements interfaces.TaskStopHook and blocks waiting for running 153 // scripts to finish (or for the shutdownWait timeout to expire). 154 func (h *scriptCheckHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, resp *interfaces.TaskStopResponse) error { 155 h.mu.Lock() 156 defer h.mu.Unlock() 157 close(h.shutdownCh) 158 deadline := time.After(h.shutdownWait) 159 err := fmt.Errorf("timed out waiting for script checks to exit") 160 for _, script := range h.runningScripts { 161 select { 162 case <-script.wait(): 163 case <-ctx.Done(): 164 // the caller is passing the background context, so 165 // we should never really see this outside of testing 166 case <-deadline: 167 // at this point the Consul client has been cleaned 168 // up so we don't want to hang onto this. 169 return err 170 } 171 } 172 return nil 173 } 174 175 func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck { 176 scriptChecks := make(map[string]*scriptCheck) 177 interpolatedTaskServices := taskenv.InterpolateServices(h.taskEnv, h.task.Services) 178 for _, service := range interpolatedTaskServices { 179 for _, check := range service.Checks { 180 if check.Type != structs.ServiceCheckScript { 181 continue 182 } 183 serviceID := agentconsul.MakeAllocServiceID( 184 h.alloc.ID, h.task.Name, service) 185 sc := newScriptCheck(&scriptCheckConfig{ 186 allocID: h.alloc.ID, 187 taskName: h.task.Name, 188 check: check, 189 serviceID: serviceID, 190 agent: h.consul, 191 driverExec: h.driverExec, 192 taskEnv: h.taskEnv, 193 logger: h.logger, 194 shutdownCh: h.shutdownCh, 195 }) 196 if sc != nil { 197 scriptChecks[sc.id] = sc 198 } 199 } 200 } 201 202 // Walk back through the task group to see if there are script checks 203 // associated with the task. If so, we'll create scriptCheck tasklets 204 // for them. The group-level service and any check restart behaviors it 205 // needs are entirely encapsulated within the group service hook which 206 // watches Consul for status changes. 207 tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) 208 interpolatedGroupServices := taskenv.InterpolateServices(h.taskEnv, tg.Services) 209 for _, service := range interpolatedGroupServices { 210 for _, check := range service.Checks { 211 if check.Type != structs.ServiceCheckScript { 212 continue 213 } 214 if check.TaskName != h.task.Name { 215 continue 216 } 217 groupTaskName := "group-" + tg.Name 218 serviceID := agentconsul.MakeAllocServiceID( 219 h.alloc.ID, groupTaskName, service) 220 sc := newScriptCheck(&scriptCheckConfig{ 221 allocID: h.alloc.ID, 222 taskName: groupTaskName, 223 check: check, 224 serviceID: serviceID, 225 agent: h.consul, 226 driverExec: h.driverExec, 227 taskEnv: h.taskEnv, 228 logger: h.logger, 229 shutdownCh: h.shutdownCh, 230 isGroup: true, 231 }) 232 if sc != nil { 233 scriptChecks[sc.id] = sc 234 } 235 } 236 } 237 return scriptChecks 238 } 239 240 // heartbeater is the subset of consul agent functionality needed by script 241 // checks to heartbeat 242 type heartbeater interface { 243 UpdateTTL(id, output, status string) error 244 } 245 246 // scriptCheck runs script checks via a interfaces.ScriptExecutor and updates the 247 // appropriate check's TTL when the script succeeds. 248 type scriptCheck struct { 249 id string 250 agent heartbeater 251 check *structs.ServiceCheck 252 lastCheckOk bool // true if the last check was ok; otherwise false 253 tasklet 254 } 255 256 // scriptCheckConfig is a parameter struct for newScriptCheck 257 type scriptCheckConfig struct { 258 allocID string 259 taskName string 260 serviceID string 261 check *structs.ServiceCheck 262 agent heartbeater 263 driverExec tinterfaces.ScriptExecutor 264 taskEnv *taskenv.TaskEnv 265 logger log.Logger 266 shutdownCh chan struct{} 267 isGroup bool 268 } 269 270 // newScriptCheck constructs a scriptCheck. we're only going to 271 // configure the immutable fields of scriptCheck here, with the 272 // rest being configured during the Poststart hook so that we have 273 // the rest of the task execution environment 274 func newScriptCheck(config *scriptCheckConfig) *scriptCheck { 275 276 // Guard against not having a valid taskEnv. This can be the case if the 277 // PreKilling or Exited hook is run before Poststart. 278 if config.taskEnv == nil || config.driverExec == nil { 279 return nil 280 } 281 282 orig := config.check 283 sc := &scriptCheck{ 284 agent: config.agent, 285 check: config.check.Copy(), 286 lastCheckOk: true, // start logging on first failure 287 } 288 289 // we can't use the promoted fields of tasklet in the struct literal 290 sc.Command = config.taskEnv.ReplaceEnv(config.check.Command) 291 sc.Args = config.taskEnv.ParseAndReplace(config.check.Args) 292 sc.Interval = config.check.Interval 293 sc.Timeout = config.check.Timeout 294 sc.exec = config.driverExec 295 sc.callback = newScriptCheckCallback(sc) 296 sc.logger = config.logger 297 sc.shutdownCh = config.shutdownCh 298 sc.check.Command = sc.Command 299 sc.check.Args = sc.Args 300 301 if config.isGroup { 302 // group services don't have access to a task environment 303 // at creation, so their checks get registered before the 304 // check can be interpolated here. if we don't use the 305 // original checkID, they can't be updated. 306 sc.id = agentconsul.MakeCheckID(config.serviceID, orig) 307 } else { 308 sc.id = agentconsul.MakeCheckID(config.serviceID, sc.check) 309 } 310 return sc 311 } 312 313 // Copy does a *shallow* copy of script checks. 314 func (sc *scriptCheck) Copy() *scriptCheck { 315 newSc := sc 316 return newSc 317 } 318 319 // closes over the script check and returns the taskletCallback for 320 // when the script check executes. 321 func newScriptCheckCallback(s *scriptCheck) taskletCallback { 322 323 return func(ctx context.Context, params execResult) { 324 output := params.output 325 code := params.code 326 err := params.err 327 328 state := api.HealthCritical 329 switch code { 330 case 0: 331 state = api.HealthPassing 332 case 1: 333 state = api.HealthWarning 334 } 335 336 var outputMsg string 337 if err != nil { 338 state = api.HealthCritical 339 outputMsg = err.Error() 340 } else { 341 outputMsg = string(output) 342 } 343 344 // heartbeat the check to Consul 345 err = s.updateTTL(ctx, outputMsg, state) 346 select { 347 case <-ctx.Done(): 348 // check has been removed; don't report errors 349 return 350 default: 351 } 352 353 if err != nil { 354 if s.lastCheckOk { 355 s.lastCheckOk = false 356 s.logger.Warn("updating check failed", "error", err) 357 } else { 358 s.logger.Debug("updating check still failing", "error", err) 359 } 360 361 } else if !s.lastCheckOk { 362 // Succeeded for the first time or after failing; log 363 s.lastCheckOk = true 364 s.logger.Info("updating check succeeded") 365 } 366 } 367 } 368 369 const ( 370 updateTTLBackoffBaseline = 1 * time.Second 371 updateTTLBackoffLimit = 3 * time.Second 372 ) 373 374 // updateTTL updates the state to Consul, performing an expontential backoff 375 // in the case where the check isn't registered in Consul to avoid a race between 376 // service registration and the first check. 377 func (s *scriptCheck) updateTTL(ctx context.Context, msg, state string) error { 378 for attempts := 0; ; attempts++ { 379 err := s.agent.UpdateTTL(s.id, msg, state) 380 if err == nil { 381 return nil 382 } 383 384 // Handle the retry case 385 backoff := (1 << (2 * uint64(attempts))) * updateTTLBackoffBaseline 386 if backoff > updateTTLBackoffLimit { 387 return err 388 } 389 390 // Wait till retrying 391 select { 392 case <-ctx.Done(): 393 return err 394 case <-time.After(backoff): 395 } 396 } 397 }