github.com/pquerna/agent@v2.1.8+incompatible/agent/agent_worker.go (about)

     1  package agent
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/buildkite/agent/api"
    10  	"github.com/buildkite/agent/logger"
    11  	"github.com/buildkite/agent/proctitle"
    12  	"github.com/buildkite/agent/retry"
    13  )
    14  
    15  type AgentWorker struct {
    16  	// The API Client used when this agent is communicating with the API
    17  	APIClient *api.Client
    18  
    19  	// The endpoint that should be used when communicating with the API
    20  	Endpoint string
    21  
    22  	// The registred agent API record
    23  	Agent *api.Agent
    24  
    25  	// The configuration of the agent from the CLI
    26  	AgentConfiguration *AgentConfiguration
    27  
    28  	// Whether or not the agent is running
    29  	running bool
    30  
    31  	// Used by the Start call to control the looping of the pings
    32  	ticker *time.Ticker
    33  
    34  	// Stop controls
    35  	stop      chan struct{}
    36  	stopping  bool
    37  	stopMutex sync.Mutex
    38  
    39  	// When this worker runs a job, we'll store an instance of the
    40  	// JobRunner here
    41  	jobRunner *JobRunner
    42  }
    43  
    44  // Creates the agent worker and initializes it's API Client
    45  func (a AgentWorker) Create() AgentWorker {
    46  	var endpoint string
    47  	if a.Agent.Endpoint != "" {
    48  		endpoint = a.Agent.Endpoint
    49  	} else {
    50  		endpoint = a.Endpoint
    51  	}
    52  
    53  	a.APIClient = APIClient{Endpoint: endpoint, Token: a.Agent.AccessToken}.Create()
    54  
    55  	return a
    56  }
    57  
    58  // Starts the agent worker
    59  func (a *AgentWorker) Start() error {
    60  	// Mark the agent as running
    61  	a.running = true
    62  
    63  	// Create the intervals we'll be using
    64  	pingInterval := time.Second * time.Duration(a.Agent.PingInterval)
    65  	heartbeatInterval := time.Second * time.Duration(a.Agent.HearbeatInterval)
    66  
    67  	// Setup and start the heartbeater
    68  	go func() {
    69  		// Keep the heartbeat running as long as the agent is
    70  		for a.running {
    71  			err := a.Heartbeat()
    72  			if err != nil {
    73  				logger.Error("Failed to heartbeat %s. Will try again in %s", err, heartbeatInterval)
    74  			}
    75  
    76  			time.Sleep(heartbeatInterval)
    77  		}
    78  	}()
    79  
    80  	// Create the ticker and stop channels
    81  	a.ticker = time.NewTicker(pingInterval)
    82  	a.stop = make(chan struct{})
    83  
    84  	// Continue this loop until the the ticker is stopped, and we received
    85  	// a message on the stop channel.
    86  	for {
    87  		a.Ping()
    88  
    89  		select {
    90  		case <-a.ticker.C:
    91  			continue
    92  		case <-a.stop:
    93  			a.ticker.Stop()
    94  			return nil
    95  		}
    96  	}
    97  
    98  	// Mark the agent as not running anymore
    99  	a.running = false
   100  
   101  	return nil
   102  }
   103  
   104  // Stops the agent from accepting new work and cancels any current work it's
   105  // running
   106  func (a *AgentWorker) Stop(graceful bool) {
   107  	// Only allow one stop to run at a time (because we're playing with
   108  	// channels)
   109  	a.stopMutex.Lock()
   110  	defer a.stopMutex.Unlock()
   111  
   112  	if graceful {
   113  		if a.stopping {
   114  			logger.Warn("Agent is already gracefully stopping...")
   115  		} else {
   116  			// If we have a job, tell the user that we'll wait for
   117  			// it to finish before disconnecting
   118  			if a.jobRunner != nil {
   119  				logger.Info("Gracefully stopping agent. Waiting for current job to finish before disconnecting...")
   120  			} else {
   121  				logger.Info("Gracefully stopping agent. Since there is no job running, the agent will disconnect immediately")
   122  			}
   123  		}
   124  	} else {
   125  		// If there's a job running, kill it, then disconnect
   126  		if a.jobRunner != nil {
   127  			logger.Info("Forcefully stopping agent. The current job will be canceled before disconnecting...")
   128  
   129  			// Kill the current job. Doesn't do anything if the job
   130  			// is already being killed, so it's safe to call
   131  			// multiple times.
   132  			a.jobRunner.Kill()
   133  		} else {
   134  			logger.Info("Forcefully stopping agent. Since there is no job running, the agent will disconnect immediately")
   135  		}
   136  	}
   137  
   138  	// We don't need to do the below operations again since we've already
   139  	// done them before
   140  	if a.stopping {
   141  		return
   142  	}
   143  
   144  	// Update the proc title
   145  	a.UpdateProcTitle("stopping")
   146  
   147  	// If we have a ticker, stop it, and send a signal to the stop channel,
   148  	// which will cause the agent worker to stop looping immediatly.
   149  	if a.ticker != nil {
   150  		close(a.stop)
   151  	}
   152  
   153  	// Mark the agent as stopping
   154  	a.stopping = true
   155  }
   156  
   157  // Connects the agent to the Buildkite Agent API, retrying up to 30 times if it
   158  // fails.
   159  func (a *AgentWorker) Connect() error {
   160  	// Update the proc title
   161  	a.UpdateProcTitle("connecting")
   162  
   163  	return retry.Do(func(s *retry.Stats) error {
   164  		_, err := a.APIClient.Agents.Connect()
   165  		if err != nil {
   166  			logger.Warn("%s (%s)", err, s)
   167  		}
   168  
   169  		return err
   170  	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
   171  }
   172  
   173  // Performs a heatbeat
   174  func (a *AgentWorker) Heartbeat() error {
   175  	var beat *api.Heartbeat
   176  	var err error
   177  
   178  	// Retry the heartbeat a few times
   179  	err = retry.Do(func(s *retry.Stats) error {
   180  		beat, _, err = a.APIClient.Heartbeats.Beat()
   181  		if err != nil {
   182  			logger.Warn("%s (%s)", err, s)
   183  		}
   184  		return err
   185  	}, &retry.Config{Maximum: 5, Interval: 5 * time.Second})
   186  
   187  	if err != nil {
   188  		return err
   189  	}
   190  
   191  	logger.Debug("Heartbeat sent at %s and received at %s", beat.SentAt, beat.ReceivedAt)
   192  	return nil
   193  }
   194  
   195  // Performs a ping, which returns what action the agent should take next.
   196  func (a *AgentWorker) Ping() {
   197  	// Update the proc title
   198  	a.UpdateProcTitle("pinging")
   199  
   200  	ping, _, err := a.APIClient.Pings.Get()
   201  	if err != nil {
   202  		// If a ping fails, we don't really care, because it'll
   203  		// ping again after the interval.
   204  		logger.Warn("Failed to ping: %s", err)
   205  		return
   206  	}
   207  
   208  	// Should we switch endpoints?
   209  	if ping.Endpoint != "" && ping.Endpoint != a.Agent.Endpoint {
   210  		// Before switching to the new one, do a ping test to make sure it's
   211  		// valid. If it is, switch and carry on, otherwise ignore the switch
   212  		// for now.
   213  		newAPIClient := APIClient{Endpoint: ping.Endpoint, Token: a.Agent.AccessToken}.Create()
   214  		newPing, _, err := newAPIClient.Pings.Get()
   215  		if err != nil {
   216  			logger.Warn("Failed to ping the new endpoint %s - ignoring switch for now (%s)", ping.Endpoint, err)
   217  		} else {
   218  			// Replace the APIClient and process the new ping
   219  			a.APIClient = newAPIClient
   220  			a.Agent.Endpoint = ping.Endpoint
   221  			ping = newPing
   222  		}
   223  	}
   224  
   225  	// Is there a message that should be shown in the logs?
   226  	if ping.Message != "" {
   227  		logger.Info(ping.Message)
   228  	}
   229  
   230  	// Should the agent disconnect?
   231  	if ping.Action == "disconnect" {
   232  		a.Stop(false)
   233  		return
   234  	}
   235  
   236  	// If we don't have a job, there's nothing to do!
   237  	if ping.Job == nil {
   238  		// Update the proc title
   239  		a.UpdateProcTitle("idle")
   240  
   241  		return
   242  	}
   243  
   244  	// Update the proc title
   245  	a.UpdateProcTitle(fmt.Sprintf("job %s", strings.Split(ping.Job.ID, "-")[0]))
   246  
   247  	logger.Info("Assigned job %s. Accepting...", ping.Job.ID)
   248  
   249  	// Accept the job. We'll retry on connection related issues, but if
   250  	// Buildkite returns a 422 or 500 for example, we'll just bail out,
   251  	// re-ping, and try the whole process again.
   252  	var accepted *api.Job
   253  	retry.Do(func(s *retry.Stats) error {
   254  		accepted, _, err = a.APIClient.Jobs.Accept(ping.Job)
   255  
   256  		if err != nil {
   257  			if api.IsRetryableError(err) {
   258  				logger.Warn("%s (%s)", err, s)
   259  			} else {
   260  				logger.Warn("Buildkite rejected the call to accept the job (%s)", err)
   261  				s.Break()
   262  			}
   263  		}
   264  
   265  		return err
   266  	}, &retry.Config{Maximum: 30, Interval: 5 * time.Second})
   267  
   268  	// If `accepted` is nil, then the job was never accepted
   269  	if accepted == nil {
   270  		logger.Error("Failed to accept job")
   271  		return
   272  	}
   273  
   274  	// Now that the job has been accepted, we can start it.
   275  	a.jobRunner, err = JobRunner{
   276  		Endpoint:           accepted.Endpoint,
   277  		Agent:              a.Agent,
   278  		AgentConfiguration: a.AgentConfiguration,
   279  		Job:                accepted,
   280  	}.Create()
   281  
   282  	// Was there an error creating the job runner?
   283  	if err != nil {
   284  		logger.Error("Failed to initialize job: %s", err)
   285  		return
   286  	}
   287  
   288  	// Start running the job
   289  	if err = a.jobRunner.Run(); err != nil {
   290  		logger.Error("Failed to run job: %s", err)
   291  	}
   292  
   293  	// No more job, no more runner.
   294  	a.jobRunner = nil
   295  }
   296  
   297  // Disconnects the agent from the Buildkite Agent API, doesn't bother retrying
   298  // because we want to disconnect as fast as possible.
   299  func (a *AgentWorker) Disconnect() error {
   300  	// Update the proc title
   301  	a.UpdateProcTitle("disconnecting")
   302  
   303  	_, err := a.APIClient.Agents.Disconnect()
   304  	if err != nil {
   305  		logger.Warn("There was an error sending the disconnect API call to Buildkite. If this agent still appears online, you may have to manually stop it (%s)", err)
   306  	}
   307  
   308  	return err
   309  }
   310  
   311  func (a *AgentWorker) UpdateProcTitle(action string) {
   312  	proctitle.Replace(fmt.Sprintf("buildkite-agent v%s [%s]", Version(), action))
   313  }