github.com/nir0s/nomad@v0.8.7-rc1/command/agent/consul/script.go (about) 1 package consul 2 3 import ( 4 "context" 5 "log" 6 "time" 7 8 metrics "github.com/armon/go-metrics" 9 "github.com/hashicorp/consul/api" 10 "github.com/hashicorp/nomad/client/driver" 11 "github.com/hashicorp/nomad/nomad/structs" 12 ) 13 14 // heartbeater is the subset of consul agent functionality needed by script 15 // checks to heartbeat 16 type heartbeater interface { 17 UpdateTTL(id, output, status string) error 18 } 19 20 // scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and 21 // waiting for it to shutdown. 22 type scriptHandle struct { 23 // cancel the script 24 cancel func() 25 exitCh chan struct{} 26 } 27 28 // wait returns a chan that's closed when the script exits 29 func (s *scriptHandle) wait() <-chan struct{} { 30 return s.exitCh 31 } 32 33 // scriptCheck runs script checks via a ScriptExecutor and updates the 34 // appropriate check's TTL when the script succeeds. 35 type scriptCheck struct { 36 allocID string 37 taskName string 38 39 id string 40 check *structs.ServiceCheck 41 exec driver.ScriptExecutor 42 agent heartbeater 43 44 // lastCheckOk is true if the last check was ok; otherwise false 45 lastCheckOk bool 46 47 logger *log.Logger 48 shutdownCh <-chan struct{} 49 } 50 51 // newScriptCheck creates a new scriptCheck. run() should be called once the 52 // initial check is registered with Consul. 53 func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceCheck, 54 exec driver.ScriptExecutor, agent heartbeater, logger *log.Logger, 55 shutdownCh <-chan struct{}) *scriptCheck { 56 57 return &scriptCheck{ 58 allocID: allocID, 59 taskName: taskName, 60 id: checkID, 61 check: check, 62 exec: exec, 63 agent: agent, 64 lastCheckOk: true, // start logging on first failure 65 logger: logger, 66 shutdownCh: shutdownCh, 67 } 68 } 69 70 // run this script check and return its cancel func. If the shutdownCh is 71 // closed the check will be run once more before exiting. 72 func (s *scriptCheck) run() *scriptHandle { 73 ctx, cancel := context.WithCancel(context.Background()) 74 exitCh := make(chan struct{}) 75 go func() { 76 defer close(exitCh) 77 timer := time.NewTimer(0) 78 defer timer.Stop() 79 for { 80 // Block until check is removed, Nomad is shutting 81 // down, or the check interval is up 82 select { 83 case <-ctx.Done(): 84 // check has been removed 85 return 86 case <-s.shutdownCh: 87 // unblock but don't exit until after we heartbeat once more 88 case <-timer.C: 89 timer.Reset(s.check.Interval) 90 } 91 metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1) 92 93 // Execute check script with timeout 94 execctx, cancel := context.WithTimeout(ctx, s.check.Timeout) 95 output, code, err := s.exec.Exec(execctx, s.check.Command, s.check.Args) 96 switch execctx.Err() { 97 case context.Canceled: 98 // check removed during execution; exit 99 cancel() 100 return 101 case context.DeadlineExceeded: 102 metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1) 103 // If no error was returned, set one to make sure the task goes critical 104 if err == nil { 105 err = context.DeadlineExceeded 106 } 107 108 // Log deadline exceeded every time as it's a 109 // distinct issue from checks returning 110 // failures 111 s.logger.Printf("[WARN] consul.checks: check %q for task %q alloc %q timed out (%s)", 112 s.check.Name, s.taskName, s.allocID, s.check.Timeout) 113 } 114 115 // cleanup context 116 cancel() 117 118 state := api.HealthCritical 119 switch code { 120 case 0: 121 state = api.HealthPassing 122 case 1: 123 state = api.HealthWarning 124 } 125 126 var outputMsg string 127 if err != nil { 128 state = api.HealthCritical 129 outputMsg = err.Error() 130 } else { 131 outputMsg = string(output) 132 } 133 134 // Actually heartbeat the check 135 err = s.agent.UpdateTTL(s.id, outputMsg, state) 136 select { 137 case <-ctx.Done(): 138 // check has been removed; don't report errors 139 return 140 default: 141 } 142 143 if err != nil { 144 if s.lastCheckOk { 145 s.lastCheckOk = false 146 s.logger.Printf("[WARN] consul.checks: update for task %q alloc %q check %q failed: %v", 147 s.taskName, s.allocID, s.check.Name, err) 148 } else { 149 s.logger.Printf("[DEBUG] consul.checks: update for task %q alloc %q check %q still failing: %v", 150 s.taskName, s.allocID, s.check.Name, err) 151 } 152 153 } else if !s.lastCheckOk { 154 // Succeeded for the first time or after failing; log 155 s.lastCheckOk = true 156 s.logger.Printf("[INFO] consul.checks: update for task %q alloc %q check %q succeeded", 157 s.taskName, s.allocID, s.check.Name) 158 } 159 160 select { 161 case <-s.shutdownCh: 162 // We've been told to exit and just heartbeated so exit 163 return 164 default: 165 } 166 } 167 }() 168 return &scriptHandle{cancel: cancel, exitCh: exitCh} 169 }