github.com/bshelton229/agent@v3.5.4+incompatible/agent/agent_worker.go (about) 1 package agent 2 3 import ( 4 "fmt" 5 "strings" 6 "sync" 7 "sync/atomic" 8 "time" 9 10 "github.com/buildkite/agent/api" 11 "github.com/buildkite/agent/logger" 12 "github.com/buildkite/agent/proctitle" 13 "github.com/buildkite/agent/retry" 14 ) 15 16 type AgentWorker struct { 17 // Tracks the last successful heartbeat and ping 18 // NOTE: to avoid alignment issues on ARM architectures when 19 // using atomic.StoreInt64 we need to keep this at the beginning 20 // of the struct 21 lastPing, lastHeartbeat int64 22 23 // The API Client used when this agent is communicating with the API 24 APIClient *api.Client 25 26 // The endpoint that should be used when communicating with the API 27 Endpoint string 28 29 // The registred agent API record 30 Agent *api.Agent 31 32 // The configuration of the agent from the CLI 33 AgentConfiguration *AgentConfiguration 34 35 // Whether or not the agent is running 36 running bool 37 38 // Used by the Start call to control the looping of the pings 39 ticker *time.Ticker 40 41 // Tracking the auto disconnect timer 42 disconnectTimeoutTimer *time.Timer 43 44 // Stop controls 45 stop chan struct{} 46 stopping bool 47 stopMutex sync.Mutex 48 49 // When this worker runs a job, we'll store an instance of the 50 // JobRunner here 51 jobRunner *JobRunner 52 } 53 54 // Creates the agent worker and initializes it's API Client 55 func (a AgentWorker) Create() AgentWorker { 56 var endpoint string 57 if a.Agent.Endpoint != "" { 58 endpoint = a.Agent.Endpoint 59 } else { 60 endpoint = a.Endpoint 61 } 62 63 a.APIClient = APIClient{Endpoint: endpoint, Token: a.Agent.AccessToken}.Create() 64 65 return a 66 } 67 68 // Starts the agent worker 69 func (a *AgentWorker) Start() error { 70 // Mark the agent as running 71 a.running = true 72 73 // Create the intervals we'll be using 74 pingInterval := time.Second * time.Duration(a.Agent.PingInterval) 75 heartbeatInterval := time.Second * time.Duration(a.Agent.HearbeatInterval) 76 77 // Setup and start the heartbeater 78 go func() { 79 // Keep the heartbeat running as long as the agent is 80 for a.running { 81 err := a.Heartbeat() 82 if err != nil { 83 // Get the last heartbeat time to the nearest microsecond 84 lastHeartbeat := time.Unix(atomic.LoadInt64(&a.lastPing), 0) 85 86 logger.Error("Failed to heartbeat %s. Will try again in %s. (Last successful was %v ago)", 87 err, heartbeatInterval, time.Now().Sub(lastHeartbeat)) 88 } 89 90 time.Sleep(heartbeatInterval) 91 } 92 }() 93 94 // Create the ticker and stop channels 95 a.ticker = time.NewTicker(pingInterval) 96 a.stop = make(chan struct{}) 97 98 // Setup a timer to automatically disconnect if no job has started 99 if a.AgentConfiguration.DisconnectAfterJob { 100 a.disconnectTimeoutTimer = time.NewTimer(time.Second * time.Duration(a.AgentConfiguration.DisconnectAfterJobTimeout)) 101 go func() { 102 <-a.disconnectTimeoutTimer.C 103 logger.Debug("[DisconnectionTimer] Reached %d seconds...", a.AgentConfiguration.DisconnectAfterJobTimeout) 104 105 // Just double check that the agent isn't running a 106 // job. The timer is stopped just after this is 107 // assigned, but there's a potential race condition 108 // where in between accepting the job, and creating the 109 // `jobRunner`, the timer pops. 110 if a.jobRunner == nil && !a.stopping { 111 logger.Debug("[DisconnectionTimer] The agent isn't running a job, going to signal a stop") 112 a.Stop(true) 113 } else { 114 logger.Debug("[DisconnectionTimer] Agent is running a job, going to just ignore and let it finish it's work") 115 } 116 }() 117 118 logger.Debug("[DisconnectionTimer] Started for %d seconds...", a.AgentConfiguration.DisconnectAfterJobTimeout) 119 } 120 121 // Continue this loop until the the ticker is stopped, and we received 122 // a message on the stop channel. 123 for { 124 if !a.stopping { 125 a.Ping() 126 } 127 128 select { 129 case <-a.ticker.C: 130 continue 131 case <-a.stop: 132 a.ticker.Stop() 133 return nil 134 } 135 } 136 137 // Mark the agent as not running anymore 138 a.running = false 139 140 return nil 141 } 142 143 // Stops the agent from accepting new work and cancels any current work it's 144 // running 145 func (a *AgentWorker) Stop(graceful bool) { 146 // Only allow one stop to run at a time (because we're playing with 147 // channels) 148 a.stopMutex.Lock() 149 defer a.stopMutex.Unlock() 150 151 if graceful { 152 if a.stopping { 153 logger.Warn("Agent is already gracefully stopping...") 154 } else { 155 // If we have a job, tell the user that we'll wait for 156 // it to finish before disconnecting 157 if a.jobRunner != nil { 158 logger.Info("Gracefully stopping agent. Waiting for current job to finish before disconnecting...") 159 } else { 160 logger.Info("Gracefully stopping agent. Since there is no job running, the agent will disconnect immediately") 161 } 162 } 163 } else { 164 // If there's a job running, kill it, then disconnect 165 if a.jobRunner != nil { 166 logger.Info("Forcefully stopping agent. The current job will be canceled before disconnecting...") 167 168 // Kill the current job. Doesn't do anything if the job 169 // is already being killed, so it's safe to call 170 // multiple times. 171 a.jobRunner.Kill() 172 } else { 173 logger.Info("Forcefully stopping agent. Since there is no job running, the agent will disconnect immediately") 174 } 175 } 176 177 // We don't need to do the below operations again since we've already 178 // done them before 179 if a.stopping { 180 return 181 } 182 183 // Update the proc title 184 a.UpdateProcTitle("stopping") 185 186 // If we have a ticker, stop it, and send a signal to the stop channel, 187 // which will cause the agent worker to stop looping immediatly. 188 if a.ticker != nil { 189 close(a.stop) 190 } 191 192 // Mark the agent as stopping 193 a.stopping = true 194 } 195 196 // Connects the agent to the Buildkite Agent API, retrying up to 30 times if it 197 // fails. 198 func (a *AgentWorker) Connect() error { 199 // Update the proc title 200 a.UpdateProcTitle("connecting") 201 202 return retry.Do(func(s *retry.Stats) error { 203 _, err := a.APIClient.Agents.Connect() 204 if err != nil { 205 logger.Warn("%s (%s)", err, s) 206 } 207 208 return err 209 }, &retry.Config{Maximum: 10, Interval: 5 * time.Second}) 210 } 211 212 // Performs a heatbeat 213 func (a *AgentWorker) Heartbeat() error { 214 var beat *api.Heartbeat 215 var err error 216 217 // Retry the heartbeat a few times 218 err = retry.Do(func(s *retry.Stats) error { 219 beat, _, err = a.APIClient.Heartbeats.Beat() 220 if err != nil { 221 logger.Warn("%s (%s)", err, s) 222 } 223 return err 224 }, &retry.Config{Maximum: 5, Interval: 5 * time.Second}) 225 226 if err != nil { 227 return err 228 } 229 230 // Track a timestamp for the successful heartbeat for better errors 231 atomic.StoreInt64(&a.lastHeartbeat, time.Now().Unix()) 232 233 logger.Debug("Heartbeat sent at %s and received at %s", beat.SentAt, beat.ReceivedAt) 234 return nil 235 } 236 237 // Performs a ping, which returns what action the agent should take next. 238 func (a *AgentWorker) Ping() { 239 // Update the proc title 240 a.UpdateProcTitle("pinging") 241 242 ping, _, err := a.APIClient.Pings.Get() 243 if err != nil { 244 // Get the last ping time to the nearest microsecond 245 lastPing := time.Unix(atomic.LoadInt64(&a.lastPing), 0) 246 247 // If a ping fails, we don't really care, because it'll 248 // ping again after the interval. 249 logger.Warn("Failed to ping: %s (Last successful was %v ago)", err, time.Now().Sub(lastPing)) 250 251 // When the ping fails, we wan't to reset our disconnection 252 // timer. It wouldnt' be very nice if we just killed the agent 253 // because Buildkite was having some connection issues. 254 if a.disconnectTimeoutTimer != nil { 255 jobTimeoutSeconds := time.Second * time.Duration(a.AgentConfiguration.DisconnectAfterJobTimeout) 256 a.disconnectTimeoutTimer.Reset(jobTimeoutSeconds) 257 258 logger.Debug("[DisconnectionTimer] Reset back to %d seconds because of ping failure...", a.AgentConfiguration.DisconnectAfterJobTimeout) 259 } 260 261 return 262 } else { 263 // Track a timestamp for the successful ping for better errors 264 atomic.StoreInt64(&a.lastPing, time.Now().Unix()) 265 } 266 267 // Should we switch endpoints? 268 if ping.Endpoint != "" && ping.Endpoint != a.Agent.Endpoint { 269 // Before switching to the new one, do a ping test to make sure it's 270 // valid. If it is, switch and carry on, otherwise ignore the switch 271 // for now. 272 newAPIClient := APIClient{Endpoint: ping.Endpoint, Token: a.Agent.AccessToken}.Create() 273 newPing, _, err := newAPIClient.Pings.Get() 274 if err != nil { 275 logger.Warn("Failed to ping the new endpoint %s - ignoring switch for now (%s)", ping.Endpoint, err) 276 } else { 277 // Replace the APIClient and process the new ping 278 a.APIClient = newAPIClient 279 a.Agent.Endpoint = ping.Endpoint 280 ping = newPing 281 } 282 } 283 284 // Is there a message that should be shown in the logs? 285 if ping.Message != "" { 286 logger.Info(ping.Message) 287 } 288 289 // Should the agent disconnect? 290 if ping.Action == "disconnect" { 291 a.Stop(false) 292 return 293 } 294 295 // If we don't have a job, there's nothing to do! 296 if ping.Job == nil { 297 // Update the proc title 298 a.UpdateProcTitle("idle") 299 300 return 301 } 302 303 // Update the proc title 304 a.UpdateProcTitle(fmt.Sprintf("job %s", strings.Split(ping.Job.ID, "-")[0])) 305 306 logger.Info("Assigned job %s. Accepting...", ping.Job.ID) 307 308 // Accept the job. We'll retry on connection related issues, but if 309 // Buildkite returns a 422 or 500 for example, we'll just bail out, 310 // re-ping, and try the whole process again. 311 var accepted *api.Job 312 retry.Do(func(s *retry.Stats) error { 313 accepted, _, err = a.APIClient.Jobs.Accept(ping.Job) 314 315 if err != nil { 316 if api.IsRetryableError(err) { 317 logger.Warn("%s (%s)", err, s) 318 } else { 319 logger.Warn("Buildkite rejected the call to accept the job (%s)", err) 320 s.Break() 321 } 322 } 323 324 return err 325 }, &retry.Config{Maximum: 30, Interval: 5 * time.Second}) 326 327 // If `accepted` is nil, then the job was never accepted 328 if accepted == nil { 329 logger.Error("Failed to accept job") 330 return 331 } 332 333 // Now that the job has been accepted, we can start it. 334 a.jobRunner, err = JobRunner{ 335 Endpoint: accepted.Endpoint, 336 Agent: a.Agent, 337 AgentConfiguration: a.AgentConfiguration, 338 Job: accepted, 339 }.Create() 340 341 // Woo! We've got a job, and successfully accepted it, let's kill our auto-disconnect timer 342 if a.disconnectTimeoutTimer != nil { 343 logger.Debug("[DisconnectionTimer] A job was assigned and accepted, stopping timer...") 344 a.disconnectTimeoutTimer.Stop() 345 } 346 347 // Was there an error creating the job runner? 348 if err != nil { 349 logger.Error("Failed to initialize job: %s", err) 350 return 351 } 352 353 // Start running the job 354 if err = a.jobRunner.Run(); err != nil { 355 logger.Error("Failed to run job: %s", err) 356 } 357 358 // No more job, no more runner. 359 a.jobRunner = nil 360 361 if a.AgentConfiguration.DisconnectAfterJob { 362 logger.Info("Job finished. Disconnecting...") 363 364 // We can just kill this timer now as well 365 if a.disconnectTimeoutTimer != nil { 366 a.disconnectTimeoutTimer.Stop() 367 } 368 369 // Tell the agent to finish up 370 a.Stop(true) 371 } 372 } 373 374 // Disconnects the agent from the Buildkite Agent API, doesn't bother retrying 375 // because we want to disconnect as fast as possible. 376 func (a *AgentWorker) Disconnect() error { 377 // Update the proc title 378 a.UpdateProcTitle("disconnecting") 379 380 _, err := a.APIClient.Agents.Disconnect() 381 if err != nil { 382 logger.Warn("There was an error sending the disconnect API call to Buildkite. If this agent still appears online, you may have to manually stop it (%s)", err) 383 } 384 385 return err 386 } 387 388 func (a *AgentWorker) UpdateProcTitle(action string) { 389 proctitle.Replace(fmt.Sprintf("buildkite-agent v%s [%s]", Version(), action)) 390 }