github.com/bshelton229/agent@v3.5.4+incompatible/agent/job_runner.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"os"
     8  	"path/filepath"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/buildkite/agent/api"
    14  	"github.com/buildkite/agent/experiments"
    15  	"github.com/buildkite/agent/logger"
    16  	"github.com/buildkite/agent/process"
    17  	"github.com/buildkite/agent/retry"
    18  	"github.com/buildkite/shellwords"
    19  )
    20  
    21  type JobRunner struct {
    22  	// The job being run
    23  	Job *api.Job
    24  
    25  	// The APIClient that will be used when updating the job
    26  	APIClient *api.Client
    27  
    28  	// The APIProxy that will be exposed to the job bootstrap
    29  	APIProxy *APIProxy
    30  
    31  	// The endpoint that should be used when communicating with the API
    32  	Endpoint string
    33  
    34  	// The registred agent API record running this job
    35  	Agent *api.Agent
    36  
    37  	// The configuration of the agent from the CLI
    38  	AgentConfiguration *AgentConfiguration
    39  
    40  	// Go context for goroutine supervision
    41  	context       context.Context
    42  	contextCancel context.CancelFunc
    43  
    44  	// The interal process of the job
    45  	process *process.Process
    46  
    47  	// The internal header time streamer
    48  	headerTimesStreamer *HeaderTimesStreamer
    49  
    50  	// The internal log streamer
    51  	logStreamer *LogStreamer
    52  
    53  	// If the job is being cancelled
    54  	cancelled bool
    55  
    56  	// Used to wait on various routines that we spin up
    57  	routineWaitGroup sync.WaitGroup
    58  
    59  	// A lock to protect concurrent calls to kill
    60  	killLock sync.Mutex
    61  
    62  	// File containing a copy of the job env
    63  	envFile *os.File
    64  }
    65  
    66  // Initializes the job runner
    67  func (r JobRunner) Create() (runner *JobRunner, err error) {
    68  	runner = &r
    69  
    70  	runner.context, runner.contextCancel = context.WithCancel(context.Background())
    71  
    72  	// Our own APIClient using the endpoint and the agents access token
    73  	runner.APIClient = APIClient{Endpoint: r.Endpoint, Token: r.Agent.AccessToken}.Create()
    74  
    75  	// A proxy for the agent API that is expose to the bootstrap
    76  	runner.APIProxy = NewAPIProxy(r.Endpoint, r.Agent.AccessToken)
    77  
    78  	// Create our header times struct
    79  	runner.headerTimesStreamer = &HeaderTimesStreamer{UploadCallback: r.onUploadHeaderTime}
    80  
    81  	// The log streamer that will take the output chunks, and send them to
    82  	// the Buildkite Agent API
    83  	runner.logStreamer = LogStreamer{MaxChunkSizeBytes: r.Job.ChunksMaxSizeBytes, Callback: r.onUploadChunk}.New()
    84  
    85  	// Start a proxy to give to the job for api operations
    86  	if experiments.IsEnabled("agent-socket") {
    87  		if err := r.APIProxy.Listen(); err != nil {
    88  			return nil, err
    89  		}
    90  	}
    91  
    92  	// Prepare a file to recieve the given job environment
    93  	if file, err := ioutil.TempFile("", fmt.Sprintf("job-env-%s", runner.Job.ID)); err != nil {
    94  		return runner, err
    95  	} else {
    96  		logger.Debug("[JobRunner] Created env file: %s", file.Name())
    97  		runner.envFile = file
    98  	}
    99  
   100  	env, err := r.createEnvironment()
   101  	if err != nil {
   102  		return nil, err
   103  	}
   104  
   105  	// The bootstrap-script gets parsed based on the operating system
   106  	cmd, err := shellwords.Split(r.AgentConfiguration.BootstrapScript)
   107  	if err != nil {
   108  		return nil, fmt.Errorf("Failed to split bootstrap-script (%q) into tokens: %v",
   109  			r.AgentConfiguration.BootstrapScript, err)
   110  	}
   111  
   112  	// The process that will run the bootstrap script
   113  	runner.process = &process.Process{
   114  		Script:             cmd,
   115  		Env:                env,
   116  		PTY:                r.AgentConfiguration.RunInPty,
   117  		Timestamp:          r.AgentConfiguration.TimestampLines,
   118  		StartCallback:      r.onProcessStartCallback,
   119  		LineCallback:       runner.headerTimesStreamer.Scan,
   120  		LinePreProcessor:   runner.headerTimesStreamer.LinePreProcessor,
   121  		LineCallbackFilter: runner.headerTimesStreamer.LineIsHeader,
   122  	}
   123  
   124  	return
   125  }
   126  
   127  // Runs the job
   128  func (r *JobRunner) Run() error {
   129  	logger.Info("Starting job %s", r.Job.ID)
   130  
   131  	// Start the build in the Buildkite Agent API. This is the first thing
   132  	// we do so if it fails, we don't have to worry about cleaning things
   133  	// up like started log streamer workers, etc.
   134  	if err := r.startJob(time.Now()); err != nil {
   135  		return err
   136  	}
   137  
   138  	// Start the header time streamer
   139  	if err := r.headerTimesStreamer.Start(); err != nil {
   140  		return err
   141  	}
   142  
   143  	// Start the log streamer
   144  	if err := r.logStreamer.Start(); err != nil {
   145  		return err
   146  	}
   147  
   148  	// Start the process. This will block until it finishes.
   149  	if err := r.process.Start(); err != nil {
   150  		// Send the error as output
   151  		r.logStreamer.Process(fmt.Sprintf("%s", err))
   152  	} else {
   153  		// Add the final output to the streamer
   154  		r.logStreamer.Process(r.process.Output())
   155  	}
   156  
   157  	// Store the finished at time
   158  	finishedAt := time.Now()
   159  
   160  	// Stop the header time streamer. This will block until all the chunks
   161  	// have been uploaded
   162  	r.headerTimesStreamer.Stop()
   163  
   164  	// Stop the log streamer. This will block until all the chunks have
   165  	// been uploaded
   166  	r.logStreamer.Stop()
   167  
   168  	// Warn about failed chunks
   169  	if r.logStreamer.ChunksFailedCount > 0 {
   170  		logger.Warn("%d chunks failed to upload for this job", r.logStreamer.ChunksFailedCount)
   171  	}
   172  
   173  	// Wait for the routines that we spun up to finish
   174  	logger.Debug("[JobRunner] Waiting for all other routines to finish")
   175  	r.contextCancel()
   176  	r.routineWaitGroup.Wait()
   177  
   178  	// Remove the env file, if any
   179  	if r.envFile != nil {
   180  		if err := os.Remove(r.envFile.Name()); err != nil {
   181  			logger.Warn("[JobRunner] Error cleaning up env file: %s", err)
   182  		}
   183  		logger.Debug("[JobRunner] Deleted env file: %s", r.envFile.Name())
   184  	}
   185  
   186  	// Destroy the proxy
   187  	if experiments.IsEnabled("agent-socket") {
   188  		if err := r.APIProxy.Close(); err != nil {
   189  			logger.Warn("[JobRunner] Failed to close API proxy: %v", err)
   190  		}
   191  	}
   192  
   193  	// Finish the build in the Buildkite Agent API
   194  	//
   195  	// Once we tell the API we're finished it might assign us new work, so make
   196  	// sure everything else is done first.
   197  	r.finishJob(finishedAt, r.process.ExitStatus, int(r.logStreamer.ChunksFailedCount))
   198  
   199  	logger.Info("Finished job %s", r.Job.ID)
   200  
   201  	return nil
   202  }
   203  
   204  func (r *JobRunner) Kill() error {
   205  	r.killLock.Lock()
   206  	defer r.killLock.Unlock()
   207  
   208  	if !r.cancelled {
   209  		logger.Info("Canceling job %s", r.Job.ID)
   210  		r.cancelled = true
   211  
   212  		if r.process != nil {
   213  			r.process.Kill()
   214  		} else {
   215  			logger.Error("No process to kill")
   216  		}
   217  	}
   218  
   219  	return nil
   220  }
   221  
   222  // Creates the environment variables that will be used in the process and writes a flat environment file
   223  func (r *JobRunner) createEnvironment() ([]string, error) {
   224  	// Create a clone of our jobs environment. We'll then set the
   225  	// environment variables provided by the agent, which will override any
   226  	// sent by Buildkite. The variables below should always take
   227  	// precedence.
   228  	env := make(map[string]string)
   229  	for key, value := range r.Job.Env {
   230  		env[key] = value
   231  	}
   232  
   233  	// Write out the job environment to a file, in k="v" format, with newlines escaped
   234  	// We present only the clean environment - i.e only variables configured
   235  	// on the job upstream - and expose the path in another environment variable.
   236  	if r.envFile != nil {
   237  		for key, value := range env {
   238  			if _, err := r.envFile.WriteString(fmt.Sprintf("%s=%q\n", key, value)); err != nil {
   239  				return nil, err
   240  			}
   241  		}
   242  		if err := r.envFile.Close(); err != nil {
   243  			return nil, err
   244  		}
   245  		env["BUILDKITE_ENV_FILE"] = r.envFile.Name()
   246  	}
   247  
   248  	// Certain env can only be set by agent configuration.
   249  	// We show the user a warning in the bootstrap if they use any of these at a job level.
   250  
   251  	var protectedEnv = []string{
   252  		`BUILDKITE_AGENT_ENDPOINT`,
   253  		`BUILDKITE_AGENT_ACCESS_TOKEN`,
   254  		`BUILDKITE_AGENT_DEBUG`,
   255  		`BUILDKITE_AGENT_PID`,
   256  		`BUILDKITE_BIN_PATH`,
   257  		`BUILDKITE_CONFIG_PATH`,
   258  		`BUILDKITE_BUILD_PATH`,
   259  		`BUILDKITE_HOOKS_PATH`,
   260  		`BUILDKITE_PLUGINS_PATH`,
   261  		`BUILDKITE_SSH_KEYSCAN`,
   262  		`BUILDKITE_GIT_SUBMODULES`,
   263  		`BUILDKITE_COMMAND_EVAL`,
   264  		`BUILDKITE_PLUGINS_ENABLED`,
   265  		`BUILDKITE_LOCAL_HOOKS_ENABLED`,
   266  		`BUILDKITE_GIT_CLONE_FLAGS`,
   267  		`BUILDKITE_GIT_CLEAN_FLAGS`,
   268  		`BUILDKITE_SHELL`,
   269  	}
   270  
   271  	var ignoredEnv []string
   272  
   273  	// Check if the user has defined any protected env
   274  	for _, p := range protectedEnv {
   275  		if _, exists := r.Job.Env[p]; exists {
   276  			ignoredEnv = append(ignoredEnv, p)
   277  		}
   278  	}
   279  
   280  	// Set BUILDKITE_IGNORED_ENV so the bootstrap can show warnings
   281  	if len(ignoredEnv) > 0 {
   282  		env["BUILDKITE_IGNORED_ENV"] = strings.Join(ignoredEnv, ",")
   283  	}
   284  
   285  	if experiments.IsEnabled("agent-socket") {
   286  		env["BUILDKITE_AGENT_ENDPOINT"] = r.APIProxy.Endpoint()
   287  		env["BUILDKITE_AGENT_ACCESS_TOKEN"] = r.APIProxy.AccessToken()
   288  	} else {
   289  		env["BUILDKITE_AGENT_ENDPOINT"] = r.Endpoint
   290  		env["BUILDKITE_AGENT_ACCESS_TOKEN"] = r.Agent.AccessToken
   291  	}
   292  
   293  	// Add agent environment variables
   294  	env["BUILDKITE_AGENT_DEBUG"] = fmt.Sprintf("%t", logger.GetLevel() == logger.DEBUG)
   295  	env["BUILDKITE_AGENT_PID"] = fmt.Sprintf("%d", os.Getpid())
   296  
   297  	// We know the BUILDKITE_BIN_PATH dir, because it's the path to the
   298  	// currently running file (there is only 1 binary)
   299  	dir, _ := filepath.Abs(filepath.Dir(os.Args[0]))
   300  	env["BUILDKITE_BIN_PATH"] = dir
   301  
   302  	// Add options from the agent configuration
   303  	env["BUILDKITE_CONFIG_PATH"] = r.AgentConfiguration.ConfigPath
   304  	env["BUILDKITE_BUILD_PATH"] = r.AgentConfiguration.BuildPath
   305  	env["BUILDKITE_HOOKS_PATH"] = r.AgentConfiguration.HooksPath
   306  	env["BUILDKITE_PLUGINS_PATH"] = r.AgentConfiguration.PluginsPath
   307  	env["BUILDKITE_SSH_KEYSCAN"] = fmt.Sprintf("%t", r.AgentConfiguration.SSHKeyscan)
   308  	env["BUILDKITE_GIT_SUBMODULES"] = fmt.Sprintf("%t", r.AgentConfiguration.GitSubmodules)
   309  	env["BUILDKITE_COMMAND_EVAL"] = fmt.Sprintf("%t", r.AgentConfiguration.CommandEval)
   310  	env["BUILDKITE_PLUGINS_ENABLED"] = fmt.Sprintf("%t", r.AgentConfiguration.PluginsEnabled)
   311  	env["BUILDKITE_LOCAL_HOOKS_ENABLED"] = fmt.Sprintf("%t", r.AgentConfiguration.LocalHooksEnabled)
   312  	env["BUILDKITE_GIT_CLONE_FLAGS"] = r.AgentConfiguration.GitCloneFlags
   313  	env["BUILDKITE_GIT_CLEAN_FLAGS"] = r.AgentConfiguration.GitCleanFlags
   314  	env["BUILDKITE_SHELL"] = r.AgentConfiguration.Shell
   315  
   316  	enablePluginValidation := r.AgentConfiguration.PluginValidation
   317  
   318  	// Allow BUILDKITE_PLUGIN_VALIDATION to be enabled from env for easier
   319  	// per-pipeline testing
   320  	if pluginValidation, ok := env["BUILDKITE_PLUGIN_VALIDATION"]; ok {
   321  		switch pluginValidation {
   322  		case "true", "1", "on":
   323  			enablePluginValidation = true
   324  		}
   325  	}
   326  
   327  	env["BUILDKITE_PLUGIN_VALIDATION"] = fmt.Sprintf("%t", enablePluginValidation)
   328  
   329  	// Convert the env map into a slice (which is what the script gear
   330  	// needs)
   331  	envSlice := []string{}
   332  	for key, value := range env {
   333  		envSlice = append(envSlice, fmt.Sprintf("%s=%s", key, value))
   334  	}
   335  
   336  	return envSlice, nil
   337  }
   338  
   339  // Starts the job in the Buildkite Agent API. We'll retry on connection-related
   340  // issues, but if a connection succeeds and we get an error response back from
   341  // Buildkite, we won't bother retrying. For example, a "no such host" will
   342  // retry, but a 422 from Buildkite won't.
   343  func (r *JobRunner) startJob(startedAt time.Time) error {
   344  	r.Job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano)
   345  
   346  	return retry.Do(func(s *retry.Stats) error {
   347  		_, err := r.APIClient.Jobs.Start(r.Job)
   348  
   349  		if err != nil {
   350  			if api.IsRetryableError(err) {
   351  				logger.Warn("%s (%s)", err, s)
   352  			} else {
   353  				logger.Warn("Buildkite rejected the call to start the job (%s)", err)
   354  				s.Break()
   355  			}
   356  		}
   357  
   358  		return err
   359  	}, &retry.Config{Maximum: 30, Interval: 5 * time.Second})
   360  }
   361  
   362  // Finishes the job in the Buildkite Agent API. This call will keep on retrying
   363  // forever until it finally gets a successfull response from the API.
   364  func (r *JobRunner) finishJob(finishedAt time.Time, exitStatus string, failedChunkCount int) error {
   365  	r.Job.FinishedAt = finishedAt.UTC().Format(time.RFC3339Nano)
   366  	r.Job.ExitStatus = exitStatus
   367  	r.Job.ChunksFailedCount = failedChunkCount
   368  
   369  	return retry.Do(func(s *retry.Stats) error {
   370  		response, err := r.APIClient.Jobs.Finish(r.Job)
   371  		if err != nil {
   372  			// If the API returns with a 422, that means that we
   373  			// succesfully tried to finish the job, but Buildkite
   374  			// rejected the finish for some reason. This can
   375  			// sometimes mean that Buildkite has cancelled the job
   376  			// before we get a chance to send the final API call
   377  			// (maybe this agent took too long to kill the
   378  			// process). In that case, we don't want to keep trying
   379  			// to finish the job forever so we'll just bail out and
   380  			// go find some more work to do.
   381  			if response != nil && response.StatusCode == 422 {
   382  				logger.Warn("Buildkite rejected the call to finish the job (%s)", err)
   383  				s.Break()
   384  			} else {
   385  				logger.Warn("%s (%s)", err, s)
   386  			}
   387  		}
   388  
   389  		return err
   390  	}, &retry.Config{Forever: true, Interval: 1 * time.Second})
   391  }
   392  
   393  func (r *JobRunner) onProcessStartCallback() {
   394  	// Since we're spinning up 2 routines here, we might as well add them
   395  	// to the routine wait group here.
   396  	r.routineWaitGroup.Add(2)
   397  
   398  	// Start a routine that will grab the output every few seconds and send
   399  	// it back to Buildkite
   400  	go func() {
   401  		for r.process.IsRunning() {
   402  			// Send the output of the process to the log streamer
   403  			// for processing
   404  			r.logStreamer.Process(r.process.Output())
   405  
   406  			// Sleep for a bit, or until the job is finished
   407  			select {
   408  			case <-time.After(1 * time.Second):
   409  			case <-r.context.Done():
   410  			}
   411  		}
   412  
   413  		// The final output after the process has finished is processed in Run()
   414  
   415  		// Mark this routine as done in the wait group
   416  		r.routineWaitGroup.Done()
   417  
   418  		logger.Debug("[JobRunner] Routine that processes the log has finished")
   419  	}()
   420  
   421  	// Start a routine that will constantly ping Buildkite to see if the
   422  	// job has been canceled
   423  	go func() {
   424  		for r.process.IsRunning() {
   425  			// Re-get the job and check it's status to see if it's been
   426  			// cancelled
   427  			jobState, _, err := r.APIClient.Jobs.GetState(r.Job.ID)
   428  			if err != nil {
   429  				// We don't really care if it fails, we'll just
   430  				// try again soon anyway
   431  				logger.Warn("Problem with getting job state %s (%s)", r.Job.ID, err)
   432  			} else if jobState.State == "canceling" || jobState.State == "canceled" {
   433  				r.Kill()
   434  			}
   435  
   436  			// Sleep for a bit, or until the job is finished
   437  			select {
   438  			case <-time.After(time.Duration(r.Agent.JobStatusInterval) * time.Second):
   439  			case <-r.context.Done():
   440  			}
   441  		}
   442  
   443  		// Mark this routine as done in the wait group
   444  		r.routineWaitGroup.Done()
   445  
   446  		logger.Debug("[JobRunner] Routine that refreshes the job has finished")
   447  	}()
   448  }
   449  
   450  func (r *JobRunner) onUploadHeaderTime(cursor int, total int, times map[string]string) {
   451  	retry.Do(func(s *retry.Stats) error {
   452  		_, err := r.APIClient.HeaderTimes.Save(r.Job.ID, &api.HeaderTimes{Times: times})
   453  		if err != nil {
   454  			logger.Warn("%s (%s)", err, s)
   455  		}
   456  
   457  		return err
   458  	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
   459  }
   460  
   461  // Call when a chunk is ready for upload. It retry the chunk upload with an
   462  // interval before giving up.
   463  func (r *JobRunner) onUploadChunk(chunk *LogStreamerChunk) error {
   464  	return retry.Do(func(s *retry.Stats) error {
   465  		_, err := r.APIClient.Chunks.Upload(r.Job.ID, &api.Chunk{
   466  			Data:     chunk.Data,
   467  			Sequence: chunk.Order,
   468  			Offset:   chunk.Offset,
   469  			Size:     chunk.Size,
   470  		})
   471  		if err != nil {
   472  			logger.Warn("%s (%s)", err, s)
   473  		}
   474  
   475  		return err
   476  	}, &retry.Config{Maximum: 10, Interval: 5 * time.Second})
   477  }