github.com/pquerna/agent@v2.1.8+incompatible/agent/job_runner.go (about) 1 package agent 2 3 import ( 4 "fmt" 5 "os" 6 "path/filepath" 7 "sync" 8 "time" 9 10 "github.com/buildkite/agent/api" 11 "github.com/buildkite/agent/logger" 12 "github.com/buildkite/agent/process" 13 "github.com/buildkite/agent/retry" 14 ) 15 16 type JobRunner struct { 17 // The job being run 18 Job *api.Job 19 20 // The APIClient that will be used when updating the job 21 APIClient *api.Client 22 23 // The endpoint that should be used when communicating with the API 24 Endpoint string 25 26 // The registred agent API record running this job 27 Agent *api.Agent 28 29 // The configuration of the agent from the CLI 30 AgentConfiguration *AgentConfiguration 31 32 // The interal process of the job 33 process *process.Process 34 35 // The internal header time streamer 36 headerTimesStreamer *HeaderTimesStreamer 37 38 // The internal log streamer 39 logStreamer *LogStreamer 40 41 // If the job is being cancelled 42 cancelled bool 43 44 // Used to wait on various routines that we spin up 45 routineWaitGroup sync.WaitGroup 46 } 47 48 // Initializes the job runner 49 func (r JobRunner) Create() (runner *JobRunner, err error) { 50 runner = &r 51 52 // Our own APIClient using the endpoint and the agents access token 53 runner.APIClient = APIClient{Endpoint: r.Endpoint, Token: r.Agent.AccessToken}.Create() 54 55 // // Create our header times struct 56 runner.headerTimesStreamer = &HeaderTimesStreamer{UploadCallback: r.onUploadHeaderTime} 57 58 // The log streamer that will take the output chunks, and send them to 59 // the Buildkite Agent API 60 runner.logStreamer = LogStreamer{MaxChunkSizeBytes: r.Job.ChunksMaxSizeBytes, Callback: r.onUploadChunk}.New() 61 62 // The process that will run the bootstrap script 63 runner.process = process.Process{ 64 Script: r.AgentConfiguration.BootstrapScript, 65 Env: r.createEnvironment(), 66 PTY: r.AgentConfiguration.RunInPty, 67 StartCallback: r.onProcessStartCallback, 68 LineCallback: runner.headerTimesStreamer.Scan, 69 }.Create() 70 71 return 72 } 73 74 // Runs the job 75 func (r *JobRunner) Run() error { 76 logger.Info("Starting job %s", r.Job.ID) 77 78 // Start the build in the Buildkite Agent API. This is the first thing 79 // we do so if it fails, we don't have to worry about cleaning things 80 // up like started log streamer workers, etc. 81 if err := r.startJob(time.Now()); err != nil { 82 return err 83 } 84 85 // Start the header time streamer 86 if err := r.headerTimesStreamer.Start(); err != nil { 87 return err 88 } 89 90 // Start the log streamer 91 if err := r.logStreamer.Start(); err != nil { 92 return err 93 } 94 95 // Start the process. This will block until it finishes. 96 if err := r.process.Start(); err != nil { 97 // Send the error as output 98 r.logStreamer.Process(fmt.Sprintf("%s", err)) 99 } else { 100 // Add the final output to the streamer 101 r.logStreamer.Process(r.process.Output()) 102 } 103 104 // Store the finished at time 105 finishedAt := time.Now() 106 107 // Stop the header time streamer. This will block until all the chunks 108 // have been uploaded 109 r.headerTimesStreamer.Stop() 110 111 // Stop the log streamer. This will block until all the chunks have 112 // been uploaded 113 r.logStreamer.Stop() 114 115 // Warn about failed chunks 116 if r.logStreamer.ChunksFailedCount > 0 { 117 logger.Warn("%d chunks failed to upload for this job", r.logStreamer.ChunksFailedCount) 118 } 119 120 // Finish the build in the Buildkite Agent API 121 r.finishJob(finishedAt, r.process.ExitStatus, int(r.logStreamer.ChunksFailedCount)) 122 123 // Wait for the routines that we spun up to finish 124 logger.Debug("[JobRunner] Waiting for all other routines to finish") 125 r.routineWaitGroup.Wait() 126 127 logger.Info("Finished job %s", r.Job.ID) 128 129 return nil 130 } 131 132 func (r *JobRunner) Kill() error { 133 if !r.cancelled { 134 logger.Info("Canceling job %s", r.Job.ID) 135 r.cancelled = true 136 137 if r.process != nil { 138 r.process.Kill() 139 } else { 140 logger.Error("No process to kill") 141 } 142 } 143 144 return nil 145 } 146 147 // Creates the environment variables that will be used in the process 148 func (r *JobRunner) createEnvironment() []string { 149 // Create a clone of our jobs environment. We'll then set the 150 // environment variables provided by the agent, which will override any 151 // sent by Buildkite. The variables below should always take 152 // precedence. 153 env := make(map[string]string) 154 for key, value := range r.Job.Env { 155 env[key] = value 156 } 157 158 // Add agent environment variables 159 env["BUILDKITE_AGENT_ENDPOINT"] = r.Endpoint 160 env["BUILDKITE_AGENT_ACCESS_TOKEN"] = r.Agent.AccessToken 161 env["BUILDKITE_AGENT_DEBUG"] = fmt.Sprintf("%t", logger.GetLevel() == logger.DEBUG) 162 env["BUILDKITE_AGENT_PID"] = fmt.Sprintf("%d", os.Getpid()) 163 164 // We know the BUILDKITE_BIN_PATH dir, because it's the path to the 165 // currently running file (there is only 1 binary) 166 dir, _ := filepath.Abs(filepath.Dir(os.Args[0])) 167 env["BUILDKITE_BIN_PATH"] = dir 168 169 // Add misc options 170 env["BUILDKITE_BUILD_PATH"] = r.AgentConfiguration.BuildPath 171 env["BUILDKITE_HOOKS_PATH"] = r.AgentConfiguration.HooksPath 172 env["BUILDKITE_AUTO_SSH_FINGERPRINT_VERIFICATION"] = fmt.Sprintf("%t", r.AgentConfiguration.AutoSSHFingerprintVerification) 173 env["BUILDKITE_COMMAND_EVAL"] = fmt.Sprintf("%t", r.AgentConfiguration.CommandEval) 174 175 // Convert the env map into a slice (which is what the script gear 176 // needs) 177 envSlice := []string{} 178 for key, value := range env { 179 envSlice = append(envSlice, fmt.Sprintf("%s=%s", key, value)) 180 } 181 182 return envSlice 183 } 184 185 // Starts the job in the Buildkite Agent API. We'll retry on connection-related 186 // issues, but if a connection succeeds and we get an error response back from 187 // Buildkite, we won't bother retrying. For example, a "no such host" will 188 // retry, but a 422 from Buildkite won't. 189 func (r *JobRunner) startJob(startedAt time.Time) error { 190 r.Job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano) 191 192 return retry.Do(func(s *retry.Stats) error { 193 _, err := r.APIClient.Jobs.Start(r.Job) 194 195 if err != nil { 196 if api.IsRetryableError(err) { 197 logger.Warn("%s (%s)", err, s) 198 } else { 199 logger.Warn("Buildkite rejected the call to start the job (%s)", err) 200 s.Break() 201 } 202 } 203 204 return err 205 }, &retry.Config{Maximum: 30, Interval: 5 * time.Second}) 206 } 207 208 // Finishes the job in the Buildkite Agent API. This call will keep on retrying 209 // forever until it finally gets a successfull response from the API. 210 func (r *JobRunner) finishJob(finishedAt time.Time, exitStatus string, failedChunkCount int) error { 211 r.Job.FinishedAt = finishedAt.UTC().Format(time.RFC3339Nano) 212 r.Job.ExitStatus = exitStatus 213 r.Job.ChunksFailedCount = failedChunkCount 214 215 return retry.Do(func(s *retry.Stats) error { 216 response, err := r.APIClient.Jobs.Finish(r.Job) 217 if err != nil { 218 // If the API returns with a 422, that means that we 219 // succesfully tried to finish the job, but Buildkite 220 // rejected the finish for some reason. This can 221 // sometimes mean that Buildkite has cancelled the job 222 // before we get a chance to send the final API call 223 // (maybe this agent took too long to kill the 224 // process). In that case, we don't want to keep trying 225 // to finish the job forever so we'll just bail out and 226 // go find some more work to do. 227 if response != nil && response.StatusCode == 422 { 228 logger.Warn("Buildkite rejected the call to finish the job (%s)", err) 229 s.Break() 230 } else { 231 logger.Warn("%s (%s)", err, s) 232 } 233 } 234 235 return err 236 }, &retry.Config{Forever: true, Interval: 1 * time.Second}) 237 } 238 239 func (r *JobRunner) onProcessStartCallback() { 240 // Since we're spinning up 2 routines here, we might as well add them 241 // to the routine wait group here. 242 r.routineWaitGroup.Add(2) 243 244 // Start a routine that will grab the output every few seconds and send 245 // it back to Buildkite 246 go func() { 247 for r.process.IsRunning() { 248 // Send the output of the process to the log streamer 249 // for processing 250 r.logStreamer.Process(r.process.Output()) 251 252 // Check the output in another second 253 time.Sleep(1 * time.Second) 254 } 255 256 // Mark this routine as done in the wait group 257 r.routineWaitGroup.Done() 258 259 logger.Debug("[JobRunner] Routine that processes the log has finished") 260 }() 261 262 // Start a routine that will constantly ping Buildkite to see if the 263 // job has been canceled 264 go func() { 265 for r.process.IsRunning() { 266 // Re-get the job and check it's status to see if it's been 267 // cancelled 268 jobState, _, err := r.APIClient.Jobs.GetState(r.Job.ID) 269 if err != nil { 270 // We don't really care if it fails, we'll just 271 // try again in a second anyway 272 logger.Warn("Problem with getting job state %s (%s)", r.Job.ID, err) 273 } else if jobState.State == "canceling" || jobState.State == "canceled" { 274 r.Kill() 275 } 276 277 // Check for cancellations every few seconds 278 time.Sleep(3 * time.Second) 279 } 280 281 // Mark this routine as done in the wait group 282 r.routineWaitGroup.Done() 283 284 logger.Debug("[JobRunner] Routine that refreshes the job has finished") 285 }() 286 } 287 288 func (r *JobRunner) onUploadHeaderTime(cursor int, total int, times map[string]string) { 289 retry.Do(func(s *retry.Stats) error { 290 _, err := r.APIClient.HeaderTimes.Save(r.Job.ID, &api.HeaderTimes{Times: times}) 291 if err != nil { 292 logger.Warn("%s (%s)", err, s) 293 } 294 295 return err 296 }, &retry.Config{Maximum: 10, Interval: 5 * time.Second}) 297 } 298 299 // Call when a chunk is ready for upload. It retry the chunk upload with an 300 // interval before giving up. 301 func (r *JobRunner) onUploadChunk(chunk *LogStreamerChunk) error { 302 return retry.Do(func(s *retry.Stats) error { 303 _, err := r.APIClient.Chunks.Upload(r.Job.ID, &api.Chunk{ 304 Data: chunk.Data, 305 Sequence: chunk.Order, 306 Offset: chunk.Offset, 307 Size: chunk.Size, 308 }) 309 if err != nil { 310 logger.Warn("%s (%s)", err, s) 311 } 312 313 return err 314 }, &retry.Config{Maximum: 10, Interval: 5 * time.Second}) 315 }