github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/script_check_hook.go (about) 1 package taskrunner 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/consul/api" 10 log "github.com/hashicorp/go-hclog" 11 "github.com/hashicorp/nomad/client/allocrunner/interfaces" 12 tinterfaces "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces" 13 "github.com/hashicorp/nomad/client/consul" 14 "github.com/hashicorp/nomad/client/taskenv" 15 agentconsul "github.com/hashicorp/nomad/command/agent/consul" 16 "github.com/hashicorp/nomad/nomad/structs" 17 ) 18 19 var _ interfaces.TaskPoststartHook = &scriptCheckHook{} 20 var _ interfaces.TaskUpdateHook = &scriptCheckHook{} 21 var _ interfaces.TaskStopHook = &scriptCheckHook{} 22 23 // default max amount of time to wait for all scripts on shutdown. 24 const defaultShutdownWait = time.Minute 25 26 type scriptCheckHookConfig struct { 27 alloc *structs.Allocation 28 task *structs.Task 29 consul consul.ConsulServiceAPI 30 logger log.Logger 31 shutdownWait time.Duration 32 } 33 34 // scriptCheckHook implements a task runner hook for running script 35 // checks in the context of a task 36 type scriptCheckHook struct { 37 consul consul.ConsulServiceAPI 38 alloc *structs.Allocation 39 task *structs.Task 40 logger log.Logger 41 shutdownWait time.Duration // max time to wait for scripts to shutdown 42 shutdownCh chan struct{} // closed when all scripts should shutdown 43 44 // The following fields can be changed by Update() 45 driverExec tinterfaces.ScriptExecutor 46 taskEnv *taskenv.TaskEnv 47 48 // These maintain state and are populated by Poststart() or Update() 49 scripts map[string]*scriptCheck 50 runningScripts map[string]*taskletHandle 51 52 // Since Update() may be called concurrently with any other hook all 53 // hook methods must be fully serialized 54 mu sync.Mutex 55 } 56 57 // newScriptCheckHook returns a hook without any scriptChecks. 58 // They will get created only once their task environment is ready 59 // in Poststart() or Update() 60 func newScriptCheckHook(c scriptCheckHookConfig) *scriptCheckHook { 61 h := &scriptCheckHook{ 62 consul: c.consul, 63 alloc: c.alloc, 64 task: c.task, 65 scripts: make(map[string]*scriptCheck), 66 runningScripts: make(map[string]*taskletHandle), 67 shutdownWait: defaultShutdownWait, 68 shutdownCh: make(chan struct{}), 69 } 70 71 if c.shutdownWait != 0 { 72 h.shutdownWait = c.shutdownWait // override for testing 73 } 74 h.logger = c.logger.Named(h.Name()) 75 return h 76 } 77 78 func (h *scriptCheckHook) Name() string { 79 return "script_checks" 80 } 81 82 // Prestart implements interfaces.TaskPrestartHook. It stores the 83 // initial structs.Task 84 func (h *scriptCheckHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, _ *interfaces.TaskPrestartResponse) error { 85 h.mu.Lock() 86 defer h.mu.Unlock() 87 h.task = req.Task 88 return nil 89 } 90 91 // PostStart implements interfaces.TaskPoststartHook. It creates new 92 // script checks with the current task context (driver and env), and 93 // starts up the scripts. 94 func (h *scriptCheckHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error { 95 h.mu.Lock() 96 defer h.mu.Unlock() 97 98 if req.DriverExec == nil { 99 h.logger.Debug("driver doesn't support script checks") 100 return nil 101 } 102 h.driverExec = req.DriverExec 103 h.taskEnv = req.TaskEnv 104 105 return h.upsertChecks() 106 } 107 108 // Updated implements interfaces.TaskUpdateHook. It creates new 109 // script checks with the current task context (driver and env and possibly 110 // new structs.Task), and starts up the scripts. 111 func (h *scriptCheckHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error { 112 h.mu.Lock() 113 defer h.mu.Unlock() 114 115 task := req.Alloc.LookupTask(h.task.Name) 116 if task == nil { 117 return fmt.Errorf("task %q not found in updated alloc", h.task.Name) 118 } 119 h.alloc = req.Alloc 120 h.task = task 121 h.taskEnv = req.TaskEnv 122 123 return h.upsertChecks() 124 } 125 126 func (h *scriptCheckHook) upsertChecks() error { 127 // Create new script checks struct with new task context 128 oldScriptChecks := h.scripts 129 h.scripts = h.newScriptChecks() 130 131 // Run new or replacement scripts 132 for id, script := range h.scripts { 133 // If it's already running, cancel and replace 134 if oldScript, running := h.runningScripts[id]; running { 135 oldScript.cancel() 136 } 137 // Start and store the handle 138 h.runningScripts[id] = script.run() 139 } 140 141 // Cancel scripts we no longer want 142 for id := range oldScriptChecks { 143 if _, ok := h.scripts[id]; !ok { 144 if oldScript, running := h.runningScripts[id]; running { 145 oldScript.cancel() 146 } 147 } 148 } 149 return nil 150 } 151 152 // Stop implements interfaces.TaskStopHook and blocks waiting for running 153 // scripts to finish (or for the shutdownWait timeout to expire). 154 func (h *scriptCheckHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, resp *interfaces.TaskStopResponse) error { 155 h.mu.Lock() 156 defer h.mu.Unlock() 157 close(h.shutdownCh) 158 deadline := time.After(h.shutdownWait) 159 err := fmt.Errorf("timed out waiting for script checks to exit") 160 for _, script := range h.runningScripts { 161 select { 162 case <-script.wait(): 163 case <-ctx.Done(): 164 // the caller is passing the background context, so 165 // we should never really see this outside of testing 166 case <-deadline: 167 // at this point the Consul client has been cleaned 168 // up so we don't want to hang onto this. 169 return err 170 } 171 } 172 return nil 173 } 174 175 func (h *scriptCheckHook) newScriptChecks() map[string]*scriptCheck { 176 scriptChecks := make(map[string]*scriptCheck) 177 interpolatedTaskServices := taskenv.InterpolateServices(h.taskEnv, h.task.Services) 178 for _, service := range interpolatedTaskServices { 179 for _, check := range service.Checks { 180 if check.Type != structs.ServiceCheckScript { 181 continue 182 } 183 serviceID := agentconsul.MakeAllocServiceID( 184 h.alloc.ID, h.task.Name, service) 185 sc := newScriptCheck(&scriptCheckConfig{ 186 allocID: h.alloc.ID, 187 taskName: h.task.Name, 188 check: check, 189 serviceID: serviceID, 190 agent: h.consul, 191 driverExec: h.driverExec, 192 taskEnv: h.taskEnv, 193 logger: h.logger, 194 shutdownCh: h.shutdownCh, 195 }) 196 if sc != nil { 197 scriptChecks[sc.id] = sc 198 } 199 } 200 } 201 202 // Walk back through the task group to see if there are script checks 203 // associated with the task. If so, we'll create scriptCheck tasklets 204 // for them. The group-level service and any check restart behaviors it 205 // needs are entirely encapsulated within the group service hook which 206 // watches Consul for status changes. 207 // 208 // The script check is associated with a group task if the service.task or 209 // service.check.task matches the task name. The service.check.task takes 210 // precedence. 211 tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup) 212 interpolatedGroupServices := taskenv.InterpolateServices(h.taskEnv, tg.Services) 213 for _, service := range interpolatedGroupServices { 214 for _, check := range service.Checks { 215 if check.Type != structs.ServiceCheckScript { 216 continue 217 } 218 if !h.associated(h.task.Name, service.TaskName, check.TaskName) { 219 continue 220 } 221 groupTaskName := "group-" + tg.Name 222 serviceID := agentconsul.MakeAllocServiceID( 223 h.alloc.ID, groupTaskName, service) 224 sc := newScriptCheck(&scriptCheckConfig{ 225 allocID: h.alloc.ID, 226 taskName: groupTaskName, 227 check: check, 228 serviceID: serviceID, 229 agent: h.consul, 230 driverExec: h.driverExec, 231 taskEnv: h.taskEnv, 232 logger: h.logger, 233 shutdownCh: h.shutdownCh, 234 isGroup: true, 235 }) 236 if sc != nil { 237 scriptChecks[sc.id] = sc 238 } 239 } 240 } 241 return scriptChecks 242 } 243 244 // associated returns true if the script check is associated with the task. This 245 // would be the case if the check.task is the same as task, or if the service.task 246 // is the same as the task _and_ check.task is not configured (i.e. the check 247 // inherits the task of the service). 248 func (*scriptCheckHook) associated(task, serviceTask, checkTask string) bool { 249 if checkTask == task { 250 return true 251 } 252 if serviceTask == task && checkTask == "" { 253 return true 254 } 255 return false 256 } 257 258 // heartbeater is the subset of consul agent functionality needed by script 259 // checks to heartbeat 260 type heartbeater interface { 261 UpdateTTL(id, output, status string) error 262 } 263 264 // scriptCheck runs script checks via a interfaces.ScriptExecutor and updates the 265 // appropriate check's TTL when the script succeeds. 266 type scriptCheck struct { 267 id string 268 agent heartbeater 269 check *structs.ServiceCheck 270 lastCheckOk bool // true if the last check was ok; otherwise false 271 tasklet 272 } 273 274 // scriptCheckConfig is a parameter struct for newScriptCheck 275 type scriptCheckConfig struct { 276 allocID string 277 taskName string 278 serviceID string 279 check *structs.ServiceCheck 280 agent heartbeater 281 driverExec tinterfaces.ScriptExecutor 282 taskEnv *taskenv.TaskEnv 283 logger log.Logger 284 shutdownCh chan struct{} 285 isGroup bool 286 } 287 288 // newScriptCheck constructs a scriptCheck. we're only going to 289 // configure the immutable fields of scriptCheck here, with the 290 // rest being configured during the Poststart hook so that we have 291 // the rest of the task execution environment 292 func newScriptCheck(config *scriptCheckConfig) *scriptCheck { 293 294 // Guard against not having a valid taskEnv. This can be the case if the 295 // PreKilling or Exited hook is run before Poststart. 296 if config.taskEnv == nil || config.driverExec == nil { 297 return nil 298 } 299 300 orig := config.check 301 sc := &scriptCheck{ 302 agent: config.agent, 303 check: config.check.Copy(), 304 lastCheckOk: true, // start logging on first failure 305 } 306 307 // we can't use the promoted fields of tasklet in the struct literal 308 sc.Command = config.taskEnv.ReplaceEnv(config.check.Command) 309 sc.Args = config.taskEnv.ParseAndReplace(config.check.Args) 310 sc.Interval = config.check.Interval 311 sc.Timeout = config.check.Timeout 312 sc.exec = config.driverExec 313 sc.callback = newScriptCheckCallback(sc) 314 sc.logger = config.logger 315 sc.shutdownCh = config.shutdownCh 316 sc.check.Command = sc.Command 317 sc.check.Args = sc.Args 318 319 if config.isGroup { 320 // group services don't have access to a task environment 321 // at creation, so their checks get registered before the 322 // check can be interpolated here. if we don't use the 323 // original checkID, they can't be updated. 324 sc.id = agentconsul.MakeCheckID(config.serviceID, orig) 325 } else { 326 sc.id = agentconsul.MakeCheckID(config.serviceID, sc.check) 327 } 328 return sc 329 } 330 331 // Copy does a *shallow* copy of script checks. 332 func (sc *scriptCheck) Copy() *scriptCheck { 333 newSc := sc 334 return newSc 335 } 336 337 // closes over the script check and returns the taskletCallback for 338 // when the script check executes. 339 func newScriptCheckCallback(s *scriptCheck) taskletCallback { 340 341 return func(ctx context.Context, params execResult) { 342 output := params.output 343 code := params.code 344 err := params.err 345 346 state := api.HealthCritical 347 switch code { 348 case 0: 349 state = api.HealthPassing 350 case 1: 351 state = api.HealthWarning 352 } 353 354 var outputMsg string 355 if err != nil { 356 state = api.HealthCritical 357 outputMsg = err.Error() 358 } else { 359 outputMsg = string(output) 360 } 361 362 // heartbeat the check to Consul 363 err = s.updateTTL(ctx, outputMsg, state) 364 select { 365 case <-ctx.Done(): 366 // check has been removed; don't report errors 367 return 368 default: 369 } 370 371 if err != nil { 372 if s.lastCheckOk { 373 s.lastCheckOk = false 374 s.logger.Warn("updating check failed", "error", err) 375 } else { 376 s.logger.Debug("updating check still failing", "error", err) 377 } 378 379 } else if !s.lastCheckOk { 380 // Succeeded for the first time or after failing; log 381 s.lastCheckOk = true 382 s.logger.Info("updating check succeeded") 383 } 384 } 385 } 386 387 const ( 388 updateTTLBackoffBaseline = 1 * time.Second 389 updateTTLBackoffLimit = 3 * time.Second 390 ) 391 392 // updateTTL updates the state to Consul, performing an exponential backoff 393 // in the case where the check isn't registered in Consul to avoid a race between 394 // service registration and the first check. 395 func (s *scriptCheck) updateTTL(ctx context.Context, msg, state string) error { 396 for attempts := 0; ; attempts++ { 397 err := s.agent.UpdateTTL(s.id, msg, state) 398 if err == nil { 399 return nil 400 } 401 402 // Handle the retry case 403 backoff := (1 << (2 * uint64(attempts))) * updateTTLBackoffBaseline 404 if backoff > updateTTLBackoffLimit { 405 return err 406 } 407 408 // Wait till retrying 409 select { 410 case <-ctx.Done(): 411 return err 412 case <-time.After(backoff): 413 } 414 } 415 }