github.com/pquerna/agent@v2.1.8+incompatible/agent/agent_worker.go (about) 1 package agent 2 3 import ( 4 "fmt" 5 "strings" 6 "sync" 7 "time" 8 9 "github.com/buildkite/agent/api" 10 "github.com/buildkite/agent/logger" 11 "github.com/buildkite/agent/proctitle" 12 "github.com/buildkite/agent/retry" 13 ) 14 15 type AgentWorker struct { 16 // The API Client used when this agent is communicating with the API 17 APIClient *api.Client 18 19 // The endpoint that should be used when communicating with the API 20 Endpoint string 21 22 // The registred agent API record 23 Agent *api.Agent 24 25 // The configuration of the agent from the CLI 26 AgentConfiguration *AgentConfiguration 27 28 // Whether or not the agent is running 29 running bool 30 31 // Used by the Start call to control the looping of the pings 32 ticker *time.Ticker 33 34 // Stop controls 35 stop chan struct{} 36 stopping bool 37 stopMutex sync.Mutex 38 39 // When this worker runs a job, we'll store an instance of the 40 // JobRunner here 41 jobRunner *JobRunner 42 } 43 44 // Creates the agent worker and initializes it's API Client 45 func (a AgentWorker) Create() AgentWorker { 46 var endpoint string 47 if a.Agent.Endpoint != "" { 48 endpoint = a.Agent.Endpoint 49 } else { 50 endpoint = a.Endpoint 51 } 52 53 a.APIClient = APIClient{Endpoint: endpoint, Token: a.Agent.AccessToken}.Create() 54 55 return a 56 } 57 58 // Starts the agent worker 59 func (a *AgentWorker) Start() error { 60 // Mark the agent as running 61 a.running = true 62 63 // Create the intervals we'll be using 64 pingInterval := time.Second * time.Duration(a.Agent.PingInterval) 65 heartbeatInterval := time.Second * time.Duration(a.Agent.HearbeatInterval) 66 67 // Setup and start the heartbeater 68 go func() { 69 // Keep the heartbeat running as long as the agent is 70 for a.running { 71 err := a.Heartbeat() 72 if err != nil { 73 logger.Error("Failed to heartbeat %s. Will try again in %s", err, heartbeatInterval) 74 } 75 76 time.Sleep(heartbeatInterval) 77 } 78 }() 79 80 // Create the ticker and stop channels 81 a.ticker = time.NewTicker(pingInterval) 82 a.stop = make(chan struct{}) 83 84 // Continue this loop until the the ticker is stopped, and we received 85 // a message on the stop channel. 86 for { 87 a.Ping() 88 89 select { 90 case <-a.ticker.C: 91 continue 92 case <-a.stop: 93 a.ticker.Stop() 94 return nil 95 } 96 } 97 98 // Mark the agent as not running anymore 99 a.running = false 100 101 return nil 102 } 103 104 // Stops the agent from accepting new work and cancels any current work it's 105 // running 106 func (a *AgentWorker) Stop(graceful bool) { 107 // Only allow one stop to run at a time (because we're playing with 108 // channels) 109 a.stopMutex.Lock() 110 defer a.stopMutex.Unlock() 111 112 if graceful { 113 if a.stopping { 114 logger.Warn("Agent is already gracefully stopping...") 115 } else { 116 // If we have a job, tell the user that we'll wait for 117 // it to finish before disconnecting 118 if a.jobRunner != nil { 119 logger.Info("Gracefully stopping agent. Waiting for current job to finish before disconnecting...") 120 } else { 121 logger.Info("Gracefully stopping agent. Since there is no job running, the agent will disconnect immediately") 122 } 123 } 124 } else { 125 // If there's a job running, kill it, then disconnect 126 if a.jobRunner != nil { 127 logger.Info("Forcefully stopping agent. The current job will be canceled before disconnecting...") 128 129 // Kill the current job. Doesn't do anything if the job 130 // is already being killed, so it's safe to call 131 // multiple times. 132 a.jobRunner.Kill() 133 } else { 134 logger.Info("Forcefully stopping agent. Since there is no job running, the agent will disconnect immediately") 135 } 136 } 137 138 // We don't need to do the below operations again since we've already 139 // done them before 140 if a.stopping { 141 return 142 } 143 144 // Update the proc title 145 a.UpdateProcTitle("stopping") 146 147 // If we have a ticker, stop it, and send a signal to the stop channel, 148 // which will cause the agent worker to stop looping immediatly. 149 if a.ticker != nil { 150 close(a.stop) 151 } 152 153 // Mark the agent as stopping 154 a.stopping = true 155 } 156 157 // Connects the agent to the Buildkite Agent API, retrying up to 30 times if it 158 // fails. 159 func (a *AgentWorker) Connect() error { 160 // Update the proc title 161 a.UpdateProcTitle("connecting") 162 163 return retry.Do(func(s *retry.Stats) error { 164 _, err := a.APIClient.Agents.Connect() 165 if err != nil { 166 logger.Warn("%s (%s)", err, s) 167 } 168 169 return err 170 }, &retry.Config{Maximum: 10, Interval: 5 * time.Second}) 171 } 172 173 // Performs a heatbeat 174 func (a *AgentWorker) Heartbeat() error { 175 var beat *api.Heartbeat 176 var err error 177 178 // Retry the heartbeat a few times 179 err = retry.Do(func(s *retry.Stats) error { 180 beat, _, err = a.APIClient.Heartbeats.Beat() 181 if err != nil { 182 logger.Warn("%s (%s)", err, s) 183 } 184 return err 185 }, &retry.Config{Maximum: 5, Interval: 5 * time.Second}) 186 187 if err != nil { 188 return err 189 } 190 191 logger.Debug("Heartbeat sent at %s and received at %s", beat.SentAt, beat.ReceivedAt) 192 return nil 193 } 194 195 // Performs a ping, which returns what action the agent should take next. 196 func (a *AgentWorker) Ping() { 197 // Update the proc title 198 a.UpdateProcTitle("pinging") 199 200 ping, _, err := a.APIClient.Pings.Get() 201 if err != nil { 202 // If a ping fails, we don't really care, because it'll 203 // ping again after the interval. 204 logger.Warn("Failed to ping: %s", err) 205 return 206 } 207 208 // Should we switch endpoints? 209 if ping.Endpoint != "" && ping.Endpoint != a.Agent.Endpoint { 210 // Before switching to the new one, do a ping test to make sure it's 211 // valid. If it is, switch and carry on, otherwise ignore the switch 212 // for now. 213 newAPIClient := APIClient{Endpoint: ping.Endpoint, Token: a.Agent.AccessToken}.Create() 214 newPing, _, err := newAPIClient.Pings.Get() 215 if err != nil { 216 logger.Warn("Failed to ping the new endpoint %s - ignoring switch for now (%s)", ping.Endpoint, err) 217 } else { 218 // Replace the APIClient and process the new ping 219 a.APIClient = newAPIClient 220 a.Agent.Endpoint = ping.Endpoint 221 ping = newPing 222 } 223 } 224 225 // Is there a message that should be shown in the logs? 226 if ping.Message != "" { 227 logger.Info(ping.Message) 228 } 229 230 // Should the agent disconnect? 231 if ping.Action == "disconnect" { 232 a.Stop(false) 233 return 234 } 235 236 // If we don't have a job, there's nothing to do! 237 if ping.Job == nil { 238 // Update the proc title 239 a.UpdateProcTitle("idle") 240 241 return 242 } 243 244 // Update the proc title 245 a.UpdateProcTitle(fmt.Sprintf("job %s", strings.Split(ping.Job.ID, "-")[0])) 246 247 logger.Info("Assigned job %s. Accepting...", ping.Job.ID) 248 249 // Accept the job. We'll retry on connection related issues, but if 250 // Buildkite returns a 422 or 500 for example, we'll just bail out, 251 // re-ping, and try the whole process again. 252 var accepted *api.Job 253 retry.Do(func(s *retry.Stats) error { 254 accepted, _, err = a.APIClient.Jobs.Accept(ping.Job) 255 256 if err != nil { 257 if api.IsRetryableError(err) { 258 logger.Warn("%s (%s)", err, s) 259 } else { 260 logger.Warn("Buildkite rejected the call to accept the job (%s)", err) 261 s.Break() 262 } 263 } 264 265 return err 266 }, &retry.Config{Maximum: 30, Interval: 5 * time.Second}) 267 268 // If `accepted` is nil, then the job was never accepted 269 if accepted == nil { 270 logger.Error("Failed to accept job") 271 return 272 } 273 274 // Now that the job has been accepted, we can start it. 275 a.jobRunner, err = JobRunner{ 276 Endpoint: accepted.Endpoint, 277 Agent: a.Agent, 278 AgentConfiguration: a.AgentConfiguration, 279 Job: accepted, 280 }.Create() 281 282 // Was there an error creating the job runner? 283 if err != nil { 284 logger.Error("Failed to initialize job: %s", err) 285 return 286 } 287 288 // Start running the job 289 if err = a.jobRunner.Run(); err != nil { 290 logger.Error("Failed to run job: %s", err) 291 } 292 293 // No more job, no more runner. 294 a.jobRunner = nil 295 } 296 297 // Disconnects the agent from the Buildkite Agent API, doesn't bother retrying 298 // because we want to disconnect as fast as possible. 299 func (a *AgentWorker) Disconnect() error { 300 // Update the proc title 301 a.UpdateProcTitle("disconnecting") 302 303 _, err := a.APIClient.Agents.Disconnect() 304 if err != nil { 305 logger.Warn("There was an error sending the disconnect API call to Buildkite. If this agent still appears online, you may have to manually stop it (%s)", err) 306 } 307 308 return err 309 } 310 311 func (a *AgentWorker) UpdateProcTitle(action string) { 312 proctitle.Replace(fmt.Sprintf("buildkite-agent v%s [%s]", Version(), action)) 313 }