github.com/pquerna/agent@v2.1.8+incompatible/agent/job_runner.go

package agent

import (
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/buildkite/agent/api"
	"github.com/buildkite/agent/logger"
	"github.com/buildkite/agent/process"
	"github.com/buildkite/agent/retry"
)

type JobRunner struct {
	// The job being run
	Job *api.Job

	// The APIClient that will be used when updating the job
	APIClient *api.Client

	// The endpoint that should be used when communicating with the API
	Endpoint string

	// The registered agent API record running this job
	Agent *api.Agent

	// The configuration of the agent from the CLI
	AgentConfiguration *AgentConfiguration

	// The internal process of the job
	process *process.Process

	// The internal header time streamer
	headerTimesStreamer *HeaderTimesStreamer

	// The internal log streamer
	logStreamer *LogStreamer

	// If the job is being cancelled
	cancelled bool

	// Used to wait on various routines that we spin up
	routineWaitGroup sync.WaitGroup
}
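
// Callers populate the exported fields; Create then wires up the lower-cased
// internal ones (process and the two streamers) and replaces APIClient with
// one built from Endpoint and the agent's access token.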

// Initializes the job runner
func (r JobRunner) Create() (runner *JobRunner, err error) {
	runner = &r

	// Our own APIClient using the endpoint and the agent's access token
	runner.APIClient = APIClient{Endpoint: r.Endpoint, Token: r.Agent.AccessToken}.Create()

	// Create our header times struct
	runner.headerTimesStreamer = &HeaderTimesStreamer{UploadCallback: r.onUploadHeaderTime}

	// The log streamer that will take the output chunks, and send them to
	// the Buildkite Agent API
	runner.logStreamer = LogStreamer{MaxChunkSizeBytes: r.Job.ChunksMaxSizeBytes, Callback: r.onUploadChunk}.New()

	// The process that will run the bootstrap script
	runner.process = process.Process{
		Script:        r.AgentConfiguration.BootstrapScript,
		Env:           r.createEnvironment(),
		PTY:           r.AgentConfiguration.RunInPty,
		StartCallback: r.onProcessStartCallback,
		LineCallback:  runner.headerTimesStreamer.Scan,
	}.Create()

	return
}
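
// A minimal usage sketch of the Create/Run pair. The caller shown here is
// hypothetical; only the field names come from this file:
//
//	runner, err := JobRunner{
//		Endpoint:           endpoint,
//		Agent:              agent,
//		Job:                job,
//		AgentConfiguration: config,
//	}.Create()
//	if err != nil {
//		return err
//	}
//	return runner.Run()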

// Runs the job
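// The overall order is: start the job in the Buildkite Agent API, start the
// header time and log streamers, run the bootstrap process to completion,
// flush both streamers, then finish the job in the API.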
func (r *JobRunner) Run() error {
	logger.Info("Starting job %s", r.Job.ID)

	// Start the build in the Buildkite Agent API. This is the first thing
	// we do so if it fails, we don't have to worry about cleaning things
	// up like started log streamer workers, etc.
	if err := r.startJob(time.Now()); err != nil {
		return err
	}

	// Start the header time streamer
	if err := r.headerTimesStreamer.Start(); err != nil {
		return err
	}

	// Start the log streamer
	if err := r.logStreamer.Start(); err != nil {
		return err
	}

	// Start the process. This will block until it finishes.
	if err := r.process.Start(); err != nil {
		// Send the error as output
		r.logStreamer.Process(fmt.Sprintf("%s", err))
	} else {
		// Add the final output to the streamer
		r.logStreamer.Process(r.process.Output())
	}

	// Store the finished at time
	finishedAt := time.Now()

	// Stop the header time streamer. This will block until all the header
	// times have been uploaded
	r.headerTimesStreamer.Stop()

	// Stop the log streamer. This will block until all the chunks have
	// been uploaded
	r.logStreamer.Stop()

	// Warn about failed chunks
	if r.logStreamer.ChunksFailedCount > 0 {
		logger.Warn("%d chunks failed to upload for this job", r.logStreamer.ChunksFailedCount)
	}

	// Finish the build in the Buildkite Agent API
	r.finishJob(finishedAt, r.process.ExitStatus, int(r.logStreamer.ChunksFailedCount))

	// Wait for the routines that we spun up to finish
	logger.Debug("[JobRunner] Waiting for all other routines to finish")
	r.routineWaitGroup.Wait()

	logger.Info("Finished job %s", r.Job.ID)

	return nil
}
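
// Kill cancels the job: it marks it as cancelled and kills the underlying
// process if one is running. The cancellation-polling routine started in
// onProcessStartCallback calls this when Buildkite reports the job as
// canceling. Note that the cancelled flag isn't protected by a lock, so
// concurrent callers may race.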
func (r *JobRunner) Kill() error {
	if !r.cancelled {
		logger.Info("Canceling job %s", r.Job.ID)
		r.cancelled = true

		if r.process != nil {
			r.process.Kill()
		} else {
			logger.Error("No process to kill")
		}
	}

	return nil
}

// Creates the environment variables that will be used in the process
func (r *JobRunner) createEnvironment() []string {
	// Create a clone of our job's environment. We'll then set the
	// environment variables provided by the agent, which will override any
	// sent by Buildkite. The variables below should always take
	// precedence.
	env := make(map[string]string)
	for key, value := range r.Job.Env {
		env[key] = value
	}

	// Add agent environment variables
	env["BUILDKITE_AGENT_ENDPOINT"] = r.Endpoint
	env["BUILDKITE_AGENT_ACCESS_TOKEN"] = r.Agent.AccessToken
	env["BUILDKITE_AGENT_DEBUG"] = fmt.Sprintf("%t", logger.GetLevel() == logger.DEBUG)
	env["BUILDKITE_AGENT_PID"] = fmt.Sprintf("%d", os.Getpid())

	// We know the BUILDKITE_BIN_PATH dir, because it's the directory of
	// the currently running binary (there is only 1 binary)
	dir, _ := filepath.Abs(filepath.Dir(os.Args[0]))
	env["BUILDKITE_BIN_PATH"] = dir

	// Add misc options
	env["BUILDKITE_BUILD_PATH"] = r.AgentConfiguration.BuildPath
	env["BUILDKITE_HOOKS_PATH"] = r.AgentConfiguration.HooksPath
	env["BUILDKITE_AUTO_SSH_FINGERPRINT_VERIFICATION"] = fmt.Sprintf("%t", r.AgentConfiguration.AutoSSHFingerprintVerification)
	env["BUILDKITE_COMMAND_EVAL"] = fmt.Sprintf("%t", r.AgentConfiguration.CommandEval)

	// Convert the env map into a slice (which is what the process
	// expects)
	envSlice := []string{}
	for key, value := range env {
		envSlice = append(envSlice, fmt.Sprintf("%s=%s", key, value))
	}

	return envSlice
}
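
// For illustration, each entry in the returned slice is a KEY=value pair of
// the sort passed to a child process, e.g. "BUILDKITE_AGENT_PID=1234" (the
// value here is made up). Since Go map iteration order is randomized, the
// order of the slice varies between runs.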

// Starts the job in the Buildkite Agent API. We'll retry on connection-related
// issues, but if a connection succeeds and we get an error response back from
// Buildkite, we won't bother retrying. For example, a "no such host" will
// retry, but a 422 from Buildkite won't.
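// Retries are capped at 30 attempts, 5 seconds apart (see the retry.Config
// below).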
func (r *JobRunner) startJob(startedAt time.Time) error {
	r.Job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano)

	return retry.Do(func(s *retry.Stats) error {
		_, err := r.APIClient.Jobs.Start(r.Job)

		if err != nil {
			if api.IsRetryableError(err) {
				logger.Warn("%s (%s)", err, s)
			} else {
				logger.Warn("Buildkite rejected the call to start the job (%s)", err)
				s.Break()
			}
		}

		return err
	}, &retry.Config{Maximum: 30, Interval: 5 * time.Second})
}

// Finishes the job in the Buildkite Agent API. This call will keep on retrying
// forever until it finally gets a successful response from the API (the one
// exception being a 422, where we bail out).
func (r *JobRunner) finishJob(finishedAt time.Time, exitStatus string, failedChunkCount int) error {
	r.Job.FinishedAt = finishedAt.UTC().Format(time.RFC3339Nano)
	r.Job.ExitStatus = exitStatus
	r.Job.ChunksFailedCount = failedChunkCount

	return retry.Do(func(s *retry.Stats) error {
		response, err := r.APIClient.Jobs.Finish(r.Job)
		if err != nil {
			// If the API returns with a 422, that means that we
			// successfully tried to finish the job, but Buildkite
			// rejected the finish for some reason. This can
			// sometimes mean that Buildkite has cancelled the job
			// before we get a chance to send the final API call
			// (maybe this agent took too long to kill the
			// process). In that case, we don't want to keep trying
			// to finish the job forever so we'll just bail out and
			// go find some more work to do.
			if response != nil && response.StatusCode == 422 {
				logger.Warn("Buildkite rejected the call to finish the job (%s)", err)
				s.Break()
			} else {
				logger.Warn("%s (%s)", err, s)
			}
		}

		return err
	}, &retry.Config{Forever: true, Interval: 1 * time.Second})
}
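
// Called by the process when it starts running. It spins up two routines: one
// that forwards the process output to the log streamer every second, and one
// that polls the Buildkite Agent API every few seconds to see whether the job
// has been cancelled. Both exit once the process stops, and Run() waits on
// them via routineWaitGroup.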
func (r *JobRunner) onProcessStartCallback() {
	// Since we're spinning up 2 routines here, add them both to the
	// routine wait group up front.
	r.routineWaitGroup.Add(2)

	// Start a routine that will grab the output every few seconds and send
	// it back to Buildkite
	go func() {
		for r.process.IsRunning() {
			// Send the output of the process to the log streamer
			// for processing
			r.logStreamer.Process(r.process.Output())

			// Check the output again in another second
			time.Sleep(1 * time.Second)
		}

		// Mark this routine as done in the wait group
		r.routineWaitGroup.Done()

		logger.Debug("[JobRunner] Routine that processes the log has finished")
	}()

	// Start a routine that will constantly ping Buildkite to see if the
	// job has been canceled
	go func() {
		for r.process.IsRunning() {
			// Re-fetch the job and check its state to see if it's
			// been cancelled
			jobState, _, err := r.APIClient.Jobs.GetState(r.Job.ID)
			if err != nil {
				// We don't really care if it fails, we'll just
				// try again in a few seconds anyway
				logger.Warn("Problem with getting job state %s (%s)", r.Job.ID, err)
			} else if jobState.State == "canceling" || jobState.State == "canceled" {
				r.Kill()
			}

			// Check for cancellations every few seconds
			time.Sleep(3 * time.Second)
		}

		// Mark this routine as done in the wait group
		r.routineWaitGroup.Done()

		logger.Debug("[JobRunner] Routine that refreshes the job has finished")
	}()
}

func (r *JobRunner) onUploadHeaderTime(cursor int, total int, times map[string]string) {
	retry.Do(func(s *retry.Stats) error {
		_, err := r.APIClient.HeaderTimes.Save(r.Job.ID, &api.HeaderTimes{Times: times})
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
}

// Called when a chunk is ready for upload. It retries the chunk upload at an
// interval before giving up.
func (r *JobRunner) onUploadChunk(chunk *LogStreamerChunk) error {
	return retry.Do(func(s *retry.Stats) error {
		_, err := r.APIClient.Chunks.Upload(r.Job.ID, &api.Chunk{
			Data:     chunk.Data,
			Sequence: chunk.Order,
			Offset:   chunk.Offset,
			Size:     chunk.Size,
		})
		if err != nil {
			logger.Warn("%s (%s)", err, s)
		}

		return err
	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
}
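
// Unlike onUploadHeaderTime, this callback returns the error to the log
// streamer, which presumably is how failed chunks end up counted in
// ChunksFailedCount (reported at the end of Run).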