github.com/stevenmatthewt/agent@v3.5.4+incompatible/agent/job_runner.go (about) 1 package agent 2 3 import ( 4 "context" 5 "fmt" 6 "io/ioutil" 7 "os" 8 "path/filepath" 9 "strings" 10 "sync" 11 "time" 12 13 "github.com/buildkite/agent/api" 14 "github.com/buildkite/agent/experiments" 15 "github.com/buildkite/agent/logger" 16 "github.com/buildkite/agent/process" 17 "github.com/buildkite/agent/retry" 18 "github.com/buildkite/shellwords" 19 ) 20 21 type JobRunner struct { 22 // The job being run 23 Job *api.Job 24 25 // The APIClient that will be used when updating the job 26 APIClient *api.Client 27 28 // The APIProxy that will be exposed to the job bootstrap 29 APIProxy *APIProxy 30 31 // The endpoint that should be used when communicating with the API 32 Endpoint string 33 34 // The registred agent API record running this job 35 Agent *api.Agent 36 37 // The configuration of the agent from the CLI 38 AgentConfiguration *AgentConfiguration 39 40 // Go context for goroutine supervision 41 context context.Context 42 contextCancel context.CancelFunc 43 44 // The interal process of the job 45 process *process.Process 46 47 // The internal header time streamer 48 headerTimesStreamer *HeaderTimesStreamer 49 50 // The internal log streamer 51 logStreamer *LogStreamer 52 53 // If the job is being cancelled 54 cancelled bool 55 56 // Used to wait on various routines that we spin up 57 routineWaitGroup sync.WaitGroup 58 59 // A lock to protect concurrent calls to kill 60 killLock sync.Mutex 61 62 // File containing a copy of the job env 63 envFile *os.File 64 } 65 66 // Initializes the job runner 67 func (r JobRunner) Create() (runner *JobRunner, err error) { 68 runner = &r 69 70 runner.context, runner.contextCancel = context.WithCancel(context.Background()) 71 72 // Our own APIClient using the endpoint and the agents access token 73 runner.APIClient = APIClient{Endpoint: r.Endpoint, Token: r.Agent.AccessToken}.Create() 74 75 // A proxy for the agent API that is expose to the bootstrap 76 runner.APIProxy = NewAPIProxy(r.Endpoint, r.Agent.AccessToken) 77 78 // Create our header times struct 79 runner.headerTimesStreamer = &HeaderTimesStreamer{UploadCallback: r.onUploadHeaderTime} 80 81 // The log streamer that will take the output chunks, and send them to 82 // the Buildkite Agent API 83 runner.logStreamer = LogStreamer{MaxChunkSizeBytes: r.Job.ChunksMaxSizeBytes, Callback: r.onUploadChunk}.New() 84 85 // Start a proxy to give to the job for api operations 86 if experiments.IsEnabled("agent-socket") { 87 if err := r.APIProxy.Listen(); err != nil { 88 return nil, err 89 } 90 } 91 92 // Prepare a file to recieve the given job environment 93 if file, err := ioutil.TempFile("", fmt.Sprintf("job-env-%s", runner.Job.ID)); err != nil { 94 return runner, err 95 } else { 96 logger.Debug("[JobRunner] Created env file: %s", file.Name()) 97 runner.envFile = file 98 } 99 100 env, err := r.createEnvironment() 101 if err != nil { 102 return nil, err 103 } 104 105 // The bootstrap-script gets parsed based on the operating system 106 cmd, err := shellwords.Split(r.AgentConfiguration.BootstrapScript) 107 if err != nil { 108 return nil, fmt.Errorf("Failed to split bootstrap-script (%q) into tokens: %v", 109 r.AgentConfiguration.BootstrapScript, err) 110 } 111 112 // The process that will run the bootstrap script 113 runner.process = &process.Process{ 114 Script: cmd, 115 Env: env, 116 PTY: r.AgentConfiguration.RunInPty, 117 Timestamp: r.AgentConfiguration.TimestampLines, 118 StartCallback: r.onProcessStartCallback, 119 LineCallback: runner.headerTimesStreamer.Scan, 120 LinePreProcessor: runner.headerTimesStreamer.LinePreProcessor, 121 LineCallbackFilter: runner.headerTimesStreamer.LineIsHeader, 122 } 123 124 return 125 } 126 127 // Runs the job 128 func (r *JobRunner) Run() error { 129 logger.Info("Starting job %s", r.Job.ID) 130 131 // Start the build in the Buildkite Agent API. This is the first thing 132 // we do so if it fails, we don't have to worry about cleaning things 133 // up like started log streamer workers, etc. 134 if err := r.startJob(time.Now()); err != nil { 135 return err 136 } 137 138 // Start the header time streamer 139 if err := r.headerTimesStreamer.Start(); err != nil { 140 return err 141 } 142 143 // Start the log streamer 144 if err := r.logStreamer.Start(); err != nil { 145 return err 146 } 147 148 // Start the process. This will block until it finishes. 149 if err := r.process.Start(); err != nil { 150 // Send the error as output 151 r.logStreamer.Process(fmt.Sprintf("%s", err)) 152 } else { 153 // Add the final output to the streamer 154 r.logStreamer.Process(r.process.Output()) 155 } 156 157 // Store the finished at time 158 finishedAt := time.Now() 159 160 // Stop the header time streamer. This will block until all the chunks 161 // have been uploaded 162 r.headerTimesStreamer.Stop() 163 164 // Stop the log streamer. This will block until all the chunks have 165 // been uploaded 166 r.logStreamer.Stop() 167 168 // Warn about failed chunks 169 if r.logStreamer.ChunksFailedCount > 0 { 170 logger.Warn("%d chunks failed to upload for this job", r.logStreamer.ChunksFailedCount) 171 } 172 173 // Wait for the routines that we spun up to finish 174 logger.Debug("[JobRunner] Waiting for all other routines to finish") 175 r.contextCancel() 176 r.routineWaitGroup.Wait() 177 178 // Remove the env file, if any 179 if r.envFile != nil { 180 if err := os.Remove(r.envFile.Name()); err != nil { 181 logger.Warn("[JobRunner] Error cleaning up env file: %s", err) 182 } 183 logger.Debug("[JobRunner] Deleted env file: %s", r.envFile.Name()) 184 } 185 186 // Destroy the proxy 187 if experiments.IsEnabled("agent-socket") { 188 if err := r.APIProxy.Close(); err != nil { 189 logger.Warn("[JobRunner] Failed to close API proxy: %v", err) 190 } 191 } 192 193 // Finish the build in the Buildkite Agent API 194 // 195 // Once we tell the API we're finished it might assign us new work, so make 196 // sure everything else is done first. 197 r.finishJob(finishedAt, r.process.ExitStatus, int(r.logStreamer.ChunksFailedCount)) 198 199 logger.Info("Finished job %s", r.Job.ID) 200 201 return nil 202 } 203 204 func (r *JobRunner) Kill() error { 205 r.killLock.Lock() 206 defer r.killLock.Unlock() 207 208 if !r.cancelled { 209 logger.Info("Canceling job %s", r.Job.ID) 210 r.cancelled = true 211 212 if r.process != nil { 213 r.process.Kill() 214 } else { 215 logger.Error("No process to kill") 216 } 217 } 218 219 return nil 220 } 221 222 // Creates the environment variables that will be used in the process and writes a flat environment file 223 func (r *JobRunner) createEnvironment() ([]string, error) { 224 // Create a clone of our jobs environment. We'll then set the 225 // environment variables provided by the agent, which will override any 226 // sent by Buildkite. The variables below should always take 227 // precedence. 228 env := make(map[string]string) 229 for key, value := range r.Job.Env { 230 env[key] = value 231 } 232 233 // Write out the job environment to a file, in k="v" format, with newlines escaped 234 // We present only the clean environment - i.e only variables configured 235 // on the job upstream - and expose the path in another environment variable. 236 if r.envFile != nil { 237 for key, value := range env { 238 if _, err := r.envFile.WriteString(fmt.Sprintf("%s=%q\n", key, value)); err != nil { 239 return nil, err 240 } 241 } 242 if err := r.envFile.Close(); err != nil { 243 return nil, err 244 } 245 env["BUILDKITE_ENV_FILE"] = r.envFile.Name() 246 } 247 248 // Certain env can only be set by agent configuration. 249 // We show the user a warning in the bootstrap if they use any of these at a job level. 250 251 var protectedEnv = []string{ 252 `BUILDKITE_AGENT_ENDPOINT`, 253 `BUILDKITE_AGENT_ACCESS_TOKEN`, 254 `BUILDKITE_AGENT_DEBUG`, 255 `BUILDKITE_AGENT_PID`, 256 `BUILDKITE_BIN_PATH`, 257 `BUILDKITE_CONFIG_PATH`, 258 `BUILDKITE_BUILD_PATH`, 259 `BUILDKITE_HOOKS_PATH`, 260 `BUILDKITE_PLUGINS_PATH`, 261 `BUILDKITE_SSH_KEYSCAN`, 262 `BUILDKITE_GIT_SUBMODULES`, 263 `BUILDKITE_COMMAND_EVAL`, 264 `BUILDKITE_PLUGINS_ENABLED`, 265 `BUILDKITE_LOCAL_HOOKS_ENABLED`, 266 `BUILDKITE_GIT_CLONE_FLAGS`, 267 `BUILDKITE_GIT_CLEAN_FLAGS`, 268 `BUILDKITE_SHELL`, 269 } 270 271 var ignoredEnv []string 272 273 // Check if the user has defined any protected env 274 for _, p := range protectedEnv { 275 if _, exists := r.Job.Env[p]; exists { 276 ignoredEnv = append(ignoredEnv, p) 277 } 278 } 279 280 // Set BUILDKITE_IGNORED_ENV so the bootstrap can show warnings 281 if len(ignoredEnv) > 0 { 282 env["BUILDKITE_IGNORED_ENV"] = strings.Join(ignoredEnv, ",") 283 } 284 285 if experiments.IsEnabled("agent-socket") { 286 env["BUILDKITE_AGENT_ENDPOINT"] = r.APIProxy.Endpoint() 287 env["BUILDKITE_AGENT_ACCESS_TOKEN"] = r.APIProxy.AccessToken() 288 } else { 289 env["BUILDKITE_AGENT_ENDPOINT"] = r.Endpoint 290 env["BUILDKITE_AGENT_ACCESS_TOKEN"] = r.Agent.AccessToken 291 } 292 293 // Add agent environment variables 294 env["BUILDKITE_AGENT_DEBUG"] = fmt.Sprintf("%t", logger.GetLevel() == logger.DEBUG) 295 env["BUILDKITE_AGENT_PID"] = fmt.Sprintf("%d", os.Getpid()) 296 297 // We know the BUILDKITE_BIN_PATH dir, because it's the path to the 298 // currently running file (there is only 1 binary) 299 dir, _ := filepath.Abs(filepath.Dir(os.Args[0])) 300 env["BUILDKITE_BIN_PATH"] = dir 301 302 // Add options from the agent configuration 303 env["BUILDKITE_CONFIG_PATH"] = r.AgentConfiguration.ConfigPath 304 env["BUILDKITE_BUILD_PATH"] = r.AgentConfiguration.BuildPath 305 env["BUILDKITE_HOOKS_PATH"] = r.AgentConfiguration.HooksPath 306 env["BUILDKITE_PLUGINS_PATH"] = r.AgentConfiguration.PluginsPath 307 env["BUILDKITE_SSH_KEYSCAN"] = fmt.Sprintf("%t", r.AgentConfiguration.SSHKeyscan) 308 env["BUILDKITE_GIT_SUBMODULES"] = fmt.Sprintf("%t", r.AgentConfiguration.GitSubmodules) 309 env["BUILDKITE_COMMAND_EVAL"] = fmt.Sprintf("%t", r.AgentConfiguration.CommandEval) 310 env["BUILDKITE_PLUGINS_ENABLED"] = fmt.Sprintf("%t", r.AgentConfiguration.PluginsEnabled) 311 env["BUILDKITE_LOCAL_HOOKS_ENABLED"] = fmt.Sprintf("%t", r.AgentConfiguration.LocalHooksEnabled) 312 env["BUILDKITE_GIT_CLONE_FLAGS"] = r.AgentConfiguration.GitCloneFlags 313 env["BUILDKITE_GIT_CLEAN_FLAGS"] = r.AgentConfiguration.GitCleanFlags 314 env["BUILDKITE_SHELL"] = r.AgentConfiguration.Shell 315 316 enablePluginValidation := r.AgentConfiguration.PluginValidation 317 318 // Allow BUILDKITE_PLUGIN_VALIDATION to be enabled from env for easier 319 // per-pipeline testing 320 if pluginValidation, ok := env["BUILDKITE_PLUGIN_VALIDATION"]; ok { 321 switch pluginValidation { 322 case "true", "1", "on": 323 enablePluginValidation = true 324 } 325 } 326 327 env["BUILDKITE_PLUGIN_VALIDATION"] = fmt.Sprintf("%t", enablePluginValidation) 328 329 // Convert the env map into a slice (which is what the script gear 330 // needs) 331 envSlice := []string{} 332 for key, value := range env { 333 envSlice = append(envSlice, fmt.Sprintf("%s=%s", key, value)) 334 } 335 336 return envSlice, nil 337 } 338 339 // Starts the job in the Buildkite Agent API. We'll retry on connection-related 340 // issues, but if a connection succeeds and we get an error response back from 341 // Buildkite, we won't bother retrying. For example, a "no such host" will 342 // retry, but a 422 from Buildkite won't. 343 func (r *JobRunner) startJob(startedAt time.Time) error { 344 r.Job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano) 345 346 return retry.Do(func(s *retry.Stats) error { 347 _, err := r.APIClient.Jobs.Start(r.Job) 348 349 if err != nil { 350 if api.IsRetryableError(err) { 351 logger.Warn("%s (%s)", err, s) 352 } else { 353 logger.Warn("Buildkite rejected the call to start the job (%s)", err) 354 s.Break() 355 } 356 } 357 358 return err 359 }, &retry.Config{Maximum: 30, Interval: 5 * time.Second}) 360 } 361 362 // Finishes the job in the Buildkite Agent API. This call will keep on retrying 363 // forever until it finally gets a successfull response from the API. 364 func (r *JobRunner) finishJob(finishedAt time.Time, exitStatus string, failedChunkCount int) error { 365 r.Job.FinishedAt = finishedAt.UTC().Format(time.RFC3339Nano) 366 r.Job.ExitStatus = exitStatus 367 r.Job.ChunksFailedCount = failedChunkCount 368 369 return retry.Do(func(s *retry.Stats) error { 370 response, err := r.APIClient.Jobs.Finish(r.Job) 371 if err != nil { 372 // If the API returns with a 422, that means that we 373 // succesfully tried to finish the job, but Buildkite 374 // rejected the finish for some reason. This can 375 // sometimes mean that Buildkite has cancelled the job 376 // before we get a chance to send the final API call 377 // (maybe this agent took too long to kill the 378 // process). In that case, we don't want to keep trying 379 // to finish the job forever so we'll just bail out and 380 // go find some more work to do. 381 if response != nil && response.StatusCode == 422 { 382 logger.Warn("Buildkite rejected the call to finish the job (%s)", err) 383 s.Break() 384 } else { 385 logger.Warn("%s (%s)", err, s) 386 } 387 } 388 389 return err 390 }, &retry.Config{Forever: true, Interval: 1 * time.Second}) 391 } 392 393 func (r *JobRunner) onProcessStartCallback() { 394 // Since we're spinning up 2 routines here, we might as well add them 395 // to the routine wait group here. 396 r.routineWaitGroup.Add(2) 397 398 // Start a routine that will grab the output every few seconds and send 399 // it back to Buildkite 400 go func() { 401 for r.process.IsRunning() { 402 // Send the output of the process to the log streamer 403 // for processing 404 r.logStreamer.Process(r.process.Output()) 405 406 // Sleep for a bit, or until the job is finished 407 select { 408 case <-time.After(1 * time.Second): 409 case <-r.context.Done(): 410 } 411 } 412 413 // The final output after the process has finished is processed in Run() 414 415 // Mark this routine as done in the wait group 416 r.routineWaitGroup.Done() 417 418 logger.Debug("[JobRunner] Routine that processes the log has finished") 419 }() 420 421 // Start a routine that will constantly ping Buildkite to see if the 422 // job has been canceled 423 go func() { 424 for r.process.IsRunning() { 425 // Re-get the job and check it's status to see if it's been 426 // cancelled 427 jobState, _, err := r.APIClient.Jobs.GetState(r.Job.ID) 428 if err != nil { 429 // We don't really care if it fails, we'll just 430 // try again soon anyway 431 logger.Warn("Problem with getting job state %s (%s)", r.Job.ID, err) 432 } else if jobState.State == "canceling" || jobState.State == "canceled" { 433 r.Kill() 434 } 435 436 // Sleep for a bit, or until the job is finished 437 select { 438 case <-time.After(time.Duration(r.Agent.JobStatusInterval) * time.Second): 439 case <-r.context.Done(): 440 } 441 } 442 443 // Mark this routine as done in the wait group 444 r.routineWaitGroup.Done() 445 446 logger.Debug("[JobRunner] Routine that refreshes the job has finished") 447 }() 448 } 449 450 func (r *JobRunner) onUploadHeaderTime(cursor int, total int, times map[string]string) { 451 retry.Do(func(s *retry.Stats) error { 452 _, err := r.APIClient.HeaderTimes.Save(r.Job.ID, &api.HeaderTimes{Times: times}) 453 if err != nil { 454 logger.Warn("%s (%s)", err, s) 455 } 456 457 return err 458 }, &retry.Config{Maximum: 10, Interval: 5 * time.Second}) 459 } 460 461 // Call when a chunk is ready for upload. It retry the chunk upload with an 462 // interval before giving up. 463 func (r *JobRunner) onUploadChunk(chunk *LogStreamerChunk) error { 464 return retry.Do(func(s *retry.Stats) error { 465 _, err := r.APIClient.Chunks.Upload(r.Job.ID, &api.Chunk{ 466 Data: chunk.Data, 467 Sequence: chunk.Order, 468 Offset: chunk.Offset, 469 Size: chunk.Size, 470 }) 471 if err != nil { 472 logger.Warn("%s (%s)", err, s) 473 } 474 475 return err 476 }, &retry.Config{Maximum: 10, Interval: 5 * time.Second}) 477 }