github.com/bshelton229/agent@v3.5.4+incompatible/agent/agent_worker.go (about)

     1  package agent
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  	"sync"
     7  	"sync/atomic"
     8  	"time"
     9  
    10  	"github.com/buildkite/agent/api"
    11  	"github.com/buildkite/agent/logger"
    12  	"github.com/buildkite/agent/proctitle"
    13  	"github.com/buildkite/agent/retry"
    14  )
    15  
    16  type AgentWorker struct {
    17  	// Tracks the last successful heartbeat and ping
    18  	// NOTE: to avoid alignment issues on ARM architectures when
    19  	// using atomic.StoreInt64 we need to keep this at the beginning
    20  	// of the struct
    21  	lastPing, lastHeartbeat int64
    22  
    23  	// The API Client used when this agent is communicating with the API
    24  	APIClient *api.Client
    25  
    26  	// The endpoint that should be used when communicating with the API
    27  	Endpoint string
    28  
    29  	// The registred agent API record
    30  	Agent *api.Agent
    31  
    32  	// The configuration of the agent from the CLI
    33  	AgentConfiguration *AgentConfiguration
    34  
    35  	// Whether or not the agent is running
    36  	running bool
    37  
    38  	// Used by the Start call to control the looping of the pings
    39  	ticker *time.Ticker
    40  
    41  	// Tracking the auto disconnect timer
    42  	disconnectTimeoutTimer *time.Timer
    43  
    44  	// Stop controls
    45  	stop      chan struct{}
    46  	stopping  bool
    47  	stopMutex sync.Mutex
    48  
    49  	// When this worker runs a job, we'll store an instance of the
    50  	// JobRunner here
    51  	jobRunner *JobRunner
    52  }
    53  
    54  // Creates the agent worker and initializes it's API Client
    55  func (a AgentWorker) Create() AgentWorker {
    56  	var endpoint string
    57  	if a.Agent.Endpoint != "" {
    58  		endpoint = a.Agent.Endpoint
    59  	} else {
    60  		endpoint = a.Endpoint
    61  	}
    62  
    63  	a.APIClient = APIClient{Endpoint: endpoint, Token: a.Agent.AccessToken}.Create()
    64  
    65  	return a
    66  }
    67  
    68  // Starts the agent worker
    69  func (a *AgentWorker) Start() error {
    70  	// Mark the agent as running
    71  	a.running = true
    72  
    73  	// Create the intervals we'll be using
    74  	pingInterval := time.Second * time.Duration(a.Agent.PingInterval)
    75  	heartbeatInterval := time.Second * time.Duration(a.Agent.HearbeatInterval)
    76  
    77  	// Setup and start the heartbeater
    78  	go func() {
    79  		// Keep the heartbeat running as long as the agent is
    80  		for a.running {
    81  			err := a.Heartbeat()
    82  			if err != nil {
    83  				// Get the last heartbeat time to the nearest microsecond
    84  				lastHeartbeat := time.Unix(atomic.LoadInt64(&a.lastPing), 0)
    85  
    86  				logger.Error("Failed to heartbeat %s. Will try again in %s. (Last successful was %v ago)",
    87  					err, heartbeatInterval, time.Now().Sub(lastHeartbeat))
    88  			}
    89  
    90  			time.Sleep(heartbeatInterval)
    91  		}
    92  	}()
    93  
    94  	// Create the ticker and stop channels
    95  	a.ticker = time.NewTicker(pingInterval)
    96  	a.stop = make(chan struct{})
    97  
    98  	// Setup a timer to automatically disconnect if no job has started
    99  	if a.AgentConfiguration.DisconnectAfterJob {
   100  		a.disconnectTimeoutTimer = time.NewTimer(time.Second * time.Duration(a.AgentConfiguration.DisconnectAfterJobTimeout))
   101  		go func() {
   102  			<-a.disconnectTimeoutTimer.C
   103  			logger.Debug("[DisconnectionTimer] Reached %d seconds...", a.AgentConfiguration.DisconnectAfterJobTimeout)
   104  
   105  			// Just double check that the agent isn't running a
   106  			// job. The timer is stopped just after this is
   107  			// assigned, but there's a potential race condition
   108  			// where in between accepting the job, and creating the
   109  			// `jobRunner`, the timer pops.
   110  			if a.jobRunner == nil && !a.stopping {
   111  				logger.Debug("[DisconnectionTimer] The agent isn't running a job, going to signal a stop")
   112  				a.Stop(true)
   113  			} else {
   114  				logger.Debug("[DisconnectionTimer] Agent is running a job, going to just ignore and let it finish it's work")
   115  			}
   116  		}()
   117  
   118  		logger.Debug("[DisconnectionTimer] Started for %d seconds...", a.AgentConfiguration.DisconnectAfterJobTimeout)
   119  	}
   120  
   121  	// Continue this loop until the the ticker is stopped, and we received
   122  	// a message on the stop channel.
   123  	for {
   124  		if !a.stopping {
   125  			a.Ping()
   126  		}
   127  
   128  		select {
   129  		case <-a.ticker.C:
   130  			continue
   131  		case <-a.stop:
   132  			a.ticker.Stop()
   133  			return nil
   134  		}
   135  	}
   136  
   137  	// Mark the agent as not running anymore
   138  	a.running = false
   139  
   140  	return nil
   141  }
   142  
   143  // Stops the agent from accepting new work and cancels any current work it's
   144  // running
   145  func (a *AgentWorker) Stop(graceful bool) {
   146  	// Only allow one stop to run at a time (because we're playing with
   147  	// channels)
   148  	a.stopMutex.Lock()
   149  	defer a.stopMutex.Unlock()
   150  
   151  	if graceful {
   152  		if a.stopping {
   153  			logger.Warn("Agent is already gracefully stopping...")
   154  		} else {
   155  			// If we have a job, tell the user that we'll wait for
   156  			// it to finish before disconnecting
   157  			if a.jobRunner != nil {
   158  				logger.Info("Gracefully stopping agent. Waiting for current job to finish before disconnecting...")
   159  			} else {
   160  				logger.Info("Gracefully stopping agent. Since there is no job running, the agent will disconnect immediately")
   161  			}
   162  		}
   163  	} else {
   164  		// If there's a job running, kill it, then disconnect
   165  		if a.jobRunner != nil {
   166  			logger.Info("Forcefully stopping agent. The current job will be canceled before disconnecting...")
   167  
   168  			// Kill the current job. Doesn't do anything if the job
   169  			// is already being killed, so it's safe to call
   170  			// multiple times.
   171  			a.jobRunner.Kill()
   172  		} else {
   173  			logger.Info("Forcefully stopping agent. Since there is no job running, the agent will disconnect immediately")
   174  		}
   175  	}
   176  
   177  	// We don't need to do the below operations again since we've already
   178  	// done them before
   179  	if a.stopping {
   180  		return
   181  	}
   182  
   183  	// Update the proc title
   184  	a.UpdateProcTitle("stopping")
   185  
   186  	// If we have a ticker, stop it, and send a signal to the stop channel,
   187  	// which will cause the agent worker to stop looping immediatly.
   188  	if a.ticker != nil {
   189  		close(a.stop)
   190  	}
   191  
   192  	// Mark the agent as stopping
   193  	a.stopping = true
   194  }
   195  
   196  // Connects the agent to the Buildkite Agent API, retrying up to 30 times if it
   197  // fails.
   198  func (a *AgentWorker) Connect() error {
   199  	// Update the proc title
   200  	a.UpdateProcTitle("connecting")
   201  
   202  	return retry.Do(func(s *retry.Stats) error {
   203  		_, err := a.APIClient.Agents.Connect()
   204  		if err != nil {
   205  			logger.Warn("%s (%s)", err, s)
   206  		}
   207  
   208  		return err
   209  	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
   210  }
   211  
   212  // Performs a heatbeat
   213  func (a *AgentWorker) Heartbeat() error {
   214  	var beat *api.Heartbeat
   215  	var err error
   216  
   217  	// Retry the heartbeat a few times
   218  	err = retry.Do(func(s *retry.Stats) error {
   219  		beat, _, err = a.APIClient.Heartbeats.Beat()
   220  		if err != nil {
   221  			logger.Warn("%s (%s)", err, s)
   222  		}
   223  		return err
   224  	}, &retry.Config{Maximum: 5, Interval: 5 * time.Second})
   225  
   226  	if err != nil {
   227  		return err
   228  	}
   229  
   230  	// Track a timestamp for the successful heartbeat for better errors
   231  	atomic.StoreInt64(&a.lastHeartbeat, time.Now().Unix())
   232  
   233  	logger.Debug("Heartbeat sent at %s and received at %s", beat.SentAt, beat.ReceivedAt)
   234  	return nil
   235  }
   236  
   237  // Performs a ping, which returns what action the agent should take next.
   238  func (a *AgentWorker) Ping() {
   239  	// Update the proc title
   240  	a.UpdateProcTitle("pinging")
   241  
   242  	ping, _, err := a.APIClient.Pings.Get()
   243  	if err != nil {
   244  		// Get the last ping time to the nearest microsecond
   245  		lastPing := time.Unix(atomic.LoadInt64(&a.lastPing), 0)
   246  
   247  		// If a ping fails, we don't really care, because it'll
   248  		// ping again after the interval.
   249  		logger.Warn("Failed to ping: %s (Last successful was %v ago)", err, time.Now().Sub(lastPing))
   250  
   251  		// When the ping fails, we wan't to reset our disconnection
   252  		// timer. It wouldnt' be very nice if we just killed the agent
   253  		// because Buildkite was having some connection issues.
   254  		if a.disconnectTimeoutTimer != nil {
   255  			jobTimeoutSeconds := time.Second * time.Duration(a.AgentConfiguration.DisconnectAfterJobTimeout)
   256  			a.disconnectTimeoutTimer.Reset(jobTimeoutSeconds)
   257  
   258  			logger.Debug("[DisconnectionTimer] Reset back to %d seconds because of ping failure...", a.AgentConfiguration.DisconnectAfterJobTimeout)
   259  		}
   260  
   261  		return
   262  	} else {
   263  		// Track a timestamp for the successful ping for better errors
   264  		atomic.StoreInt64(&a.lastPing, time.Now().Unix())
   265  	}
   266  
   267  	// Should we switch endpoints?
   268  	if ping.Endpoint != "" && ping.Endpoint != a.Agent.Endpoint {
   269  		// Before switching to the new one, do a ping test to make sure it's
   270  		// valid. If it is, switch and carry on, otherwise ignore the switch
   271  		// for now.
   272  		newAPIClient := APIClient{Endpoint: ping.Endpoint, Token: a.Agent.AccessToken}.Create()
   273  		newPing, _, err := newAPIClient.Pings.Get()
   274  		if err != nil {
   275  			logger.Warn("Failed to ping the new endpoint %s - ignoring switch for now (%s)", ping.Endpoint, err)
   276  		} else {
   277  			// Replace the APIClient and process the new ping
   278  			a.APIClient = newAPIClient
   279  			a.Agent.Endpoint = ping.Endpoint
   280  			ping = newPing
   281  		}
   282  	}
   283  
   284  	// Is there a message that should be shown in the logs?
   285  	if ping.Message != "" {
   286  		logger.Info(ping.Message)
   287  	}
   288  
   289  	// Should the agent disconnect?
   290  	if ping.Action == "disconnect" {
   291  		a.Stop(false)
   292  		return
   293  	}
   294  
   295  	// If we don't have a job, there's nothing to do!
   296  	if ping.Job == nil {
   297  		// Update the proc title
   298  		a.UpdateProcTitle("idle")
   299  
   300  		return
   301  	}
   302  
   303  	// Update the proc title
   304  	a.UpdateProcTitle(fmt.Sprintf("job %s", strings.Split(ping.Job.ID, "-")[0]))
   305  
   306  	logger.Info("Assigned job %s. Accepting...", ping.Job.ID)
   307  
   308  	// Accept the job. We'll retry on connection related issues, but if
   309  	// Buildkite returns a 422 or 500 for example, we'll just bail out,
   310  	// re-ping, and try the whole process again.
   311  	var accepted *api.Job
   312  	retry.Do(func(s *retry.Stats) error {
   313  		accepted, _, err = a.APIClient.Jobs.Accept(ping.Job)
   314  
   315  		if err != nil {
   316  			if api.IsRetryableError(err) {
   317  				logger.Warn("%s (%s)", err, s)
   318  			} else {
   319  				logger.Warn("Buildkite rejected the call to accept the job (%s)", err)
   320  				s.Break()
   321  			}
   322  		}
   323  
   324  		return err
   325  	}, &retry.Config{Maximum: 30, Interval: 5 * time.Second})
   326  
   327  	// If `accepted` is nil, then the job was never accepted
   328  	if accepted == nil {
   329  		logger.Error("Failed to accept job")
   330  		return
   331  	}
   332  
   333  	// Now that the job has been accepted, we can start it.
   334  	a.jobRunner, err = JobRunner{
   335  		Endpoint:           accepted.Endpoint,
   336  		Agent:              a.Agent,
   337  		AgentConfiguration: a.AgentConfiguration,
   338  		Job:                accepted,
   339  	}.Create()
   340  
   341  	// Woo! We've got a job, and successfully accepted it, let's kill our auto-disconnect timer
   342  	if a.disconnectTimeoutTimer != nil {
   343  		logger.Debug("[DisconnectionTimer] A job was assigned and accepted, stopping timer...")
   344  		a.disconnectTimeoutTimer.Stop()
   345  	}
   346  
   347  	// Was there an error creating the job runner?
   348  	if err != nil {
   349  		logger.Error("Failed to initialize job: %s", err)
   350  		return
   351  	}
   352  
   353  	// Start running the job
   354  	if err = a.jobRunner.Run(); err != nil {
   355  		logger.Error("Failed to run job: %s", err)
   356  	}
   357  
   358  	// No more job, no more runner.
   359  	a.jobRunner = nil
   360  
   361  	if a.AgentConfiguration.DisconnectAfterJob {
   362  		logger.Info("Job finished. Disconnecting...")
   363  
   364  		// We can just kill this timer now as well
   365  		if a.disconnectTimeoutTimer != nil {
   366  			a.disconnectTimeoutTimer.Stop()
   367  		}
   368  
   369  		// Tell the agent to finish up
   370  		a.Stop(true)
   371  	}
   372  }
   373  
   374  // Disconnects the agent from the Buildkite Agent API, doesn't bother retrying
   375  // because we want to disconnect as fast as possible.
   376  func (a *AgentWorker) Disconnect() error {
   377  	// Update the proc title
   378  	a.UpdateProcTitle("disconnecting")
   379  
   380  	_, err := a.APIClient.Agents.Disconnect()
   381  	if err != nil {
   382  		logger.Warn("There was an error sending the disconnect API call to Buildkite. If this agent still appears online, you may have to manually stop it (%s)", err)
   383  	}
   384  
   385  	return err
   386  }
   387  
   388  func (a *AgentWorker) UpdateProcTitle(action string) {
   389  	proctitle.Replace(fmt.Sprintf("buildkite-agent v%s [%s]", Version(), action))
   390  }