github.com/smithx10/nomad@v0.9.1-rc1/command/agent/consul/script.go (about) 1 package consul 2 3 import ( 4 "context" 5 "time" 6 7 metrics "github.com/armon/go-metrics" 8 log "github.com/hashicorp/go-hclog" 9 10 "github.com/hashicorp/consul/api" 11 "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 // heartbeater is the subset of consul agent functionality needed by script 16 // checks to heartbeat 17 type heartbeater interface { 18 UpdateTTL(id, output, status string) error 19 } 20 21 // contextExec allows canceling a ScriptExecutor with a context. 22 type contextExec struct { 23 // pctx is the parent context. A subcontext will be created with Exec's 24 // timeout. 25 pctx context.Context 26 27 // exec to be wrapped in a context 28 exec interfaces.ScriptExecutor 29 } 30 31 func newContextExec(ctx context.Context, exec interfaces.ScriptExecutor) *contextExec { 32 return &contextExec{ 33 pctx: ctx, 34 exec: exec, 35 } 36 } 37 38 type execResult struct { 39 buf []byte 40 code int 41 err error 42 } 43 44 // Exec a command until the timeout expires, the context is canceled, or the 45 // underlying Exec returns. 46 func (c *contextExec) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) { 47 resCh := make(chan execResult, 1) 48 49 // Don't trust the underlying implementation to obey timeout 50 ctx, cancel := context.WithTimeout(c.pctx, timeout) 51 defer cancel() 52 53 go func() { 54 output, code, err := c.exec.Exec(timeout, cmd, args) 55 select { 56 case resCh <- execResult{output, code, err}: 57 case <-ctx.Done(): 58 } 59 }() 60 61 select { 62 case res := <-resCh: 63 return res.buf, res.code, res.err 64 case <-ctx.Done(): 65 return nil, 0, ctx.Err() 66 } 67 } 68 69 // scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and 70 // waiting for it to shutdown. 71 type scriptHandle struct { 72 // cancel the script 73 cancel func() 74 exitCh chan struct{} 75 } 76 77 // wait returns a chan that's closed when the script exits 78 func (s *scriptHandle) wait() <-chan struct{} { 79 return s.exitCh 80 } 81 82 // scriptCheck runs script checks via a ScriptExecutor and updates the 83 // appropriate check's TTL when the script succeeds. 84 type scriptCheck struct { 85 allocID string 86 taskName string 87 88 id string 89 check *structs.ServiceCheck 90 exec interfaces.ScriptExecutor 91 agent heartbeater 92 93 // lastCheckOk is true if the last check was ok; otherwise false 94 lastCheckOk bool 95 96 logger log.Logger 97 shutdownCh <-chan struct{} 98 } 99 100 // newScriptCheck creates a new scriptCheck. run() should be called once the 101 // initial check is registered with Consul. 102 func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck, 103 exec interfaces.ScriptExecutor, agent heartbeater, logger log.Logger, 104 shutdownCh <-chan struct{}) *scriptCheck { 105 106 logger = logger.ResetNamed("consul.checks").With("task", taskName, "alloc_id", allocID, "check", check.Name) 107 return &scriptCheck{ 108 allocID: allocID, 109 taskName: taskName, 110 id: checkID, 111 check: check, 112 exec: exec, 113 agent: agent, 114 lastCheckOk: true, // start logging on first failure 115 logger: logger, 116 shutdownCh: shutdownCh, 117 } 118 } 119 120 // run this script check and return its cancel func. If the shutdownCh is 121 // closed the check will be run once more before exiting. 122 func (s *scriptCheck) run() *scriptHandle { 123 ctx, cancel := context.WithCancel(context.Background()) 124 exitCh := make(chan struct{}) 125 126 // Wrap the original ScriptExecutor in one that obeys context 127 // cancelation. 128 ctxExec := newContextExec(ctx, s.exec) 129 130 go func() { 131 defer close(exitCh) 132 timer := time.NewTimer(0) 133 defer timer.Stop() 134 for { 135 // Block until check is removed, Nomad is shutting 136 // down, or the check interval is up 137 select { 138 case <-ctx.Done(): 139 // check has been removed 140 return 141 case <-s.shutdownCh: 142 // unblock but don't exit until after we heartbeat once more 143 case <-timer.C: 144 timer.Reset(s.check.Interval) 145 } 146 metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1) 147 148 // Execute check script with timeout 149 output, code, err := ctxExec.Exec(s.check.Timeout, s.check.Command, s.check.Args) 150 switch err { 151 case context.Canceled: 152 // check removed during execution; exit 153 return 154 case context.DeadlineExceeded: 155 metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1) 156 // If no error was returned, set one to make sure the task goes critical 157 if err == nil { 158 err = context.DeadlineExceeded 159 } 160 161 // Log deadline exceeded every time as it's a 162 // distinct issue from checks returning 163 // failures 164 s.logger.Warn("check timed out", "timeout", s.check.Timeout) 165 } 166 167 state := api.HealthCritical 168 switch code { 169 case 0: 170 state = api.HealthPassing 171 case 1: 172 state = api.HealthWarning 173 } 174 175 var outputMsg string 176 if err != nil { 177 state = api.HealthCritical 178 outputMsg = err.Error() 179 } else { 180 outputMsg = string(output) 181 } 182 183 // Actually heartbeat the check 184 err = s.agent.UpdateTTL(s.id, outputMsg, state) 185 select { 186 case <-ctx.Done(): 187 // check has been removed; don't report errors 188 return 189 default: 190 } 191 192 if err != nil { 193 if s.lastCheckOk { 194 s.lastCheckOk = false 195 s.logger.Warn("updating check failed", "error", err) 196 } else { 197 s.logger.Debug("updating check still failing", "error", err) 198 } 199 200 } else if !s.lastCheckOk { 201 // Succeeded for the first time or after failing; log 202 s.lastCheckOk = true 203 s.logger.Info("updating check succeeded") 204 } 205 206 select { 207 case <-s.shutdownCh: 208 // We've been told to exit and just heartbeated so exit 209 return 210 default: 211 } 212 } 213 }() 214 return &scriptHandle{cancel: cancel, exitCh: exitCh} 215 }