github.com/brandur/modulir@v0.0.0-20240305213423-94ee82929cbd/pool.go

package modulir

import (
	"errors"
	"sort"
	"sync"
	"time"

	"golang.org/x/xerrors"
)

//////////////////////////////////////////////////////////////////////////////
//
//
//
// Public
//
//
//
//////////////////////////////////////////////////////////////////////////////

// Job is a wrapper for a piece of work that should be executed by the job
// pool.
//
//nolint:errname
type Job struct {
	// Duration is the time it took the job to run. It's set regardless of
	// whether the job finished by executing, not executing, or erroring.
	Duration time.Duration

	// Err is an error that the job produced, if any.
	Err error

	// Executed is whether the job "did work", signaled by it returning true.
	Executed bool

	// F is the function which makes up the job's workload.
	F func() (bool, error)

	// Name is a name for the job which is helpful for informational and
	// debugging purposes.
	Name string
}

// Error returns the error message of the error wrapped in the job if this was
// an errored job. Job implements the error interface so that it can return
// itself in situations where error handling is being done but job errors may
// be mixed in with other sorts of errors.
//
// It panics if the job wasn't errored, so be careful to only use this when
// iterating across something like Pool.JobsErrored.
func (j *Job) Error() string {
	if j.Err == nil {
		panic("Error called on a non-errored Job")
	}

	return j.Err.Error()
}

// NewJob initializes and returns a new Job.
func NewJob(name string, f func() (bool, error)) *Job {
	return &Job{Name: name, F: f}
}

// Pool is a worker group that runs a number of jobs at a configured
// concurrency.
type Pool struct {
	Jobs chan *Job

	// JobsAll is a slice of all the jobs that were fed into the pool on the
	// last run.
	JobsAll []*Job

	// JobsErrored is a slice of jobs that errored on the last run.
	//
	// See also JobErrors which is a shortcut for extracting all the errors
	// from the jobs.
	JobsErrored []*Job

	// JobsExecuted is a slice of jobs that were executed on the last run.
	JobsExecuted []*Job

	colorizer      *colorizer
	concurrency    int
	jobsInternal   chan *Job
	jobsErroredMu  sync.Mutex
	jobsExecutedMu sync.Mutex
	jobsFeederDone chan struct{}
	log            LoggerInterface
	roundNum       int
	roundStarted   bool
	wg             sync.WaitGroup
	workerInfos    []workerInfo
}

// NewPool initializes a new pool that will run jobs at the given concurrency.
// The returned pool is fully spun up and ready to start a round.
func NewPool(log LoggerInterface, concurrency int) *Pool {
	// By default a pool gets a no-op colorizer. NewContext may set one
	// separately for pools created within the package.
	pool := &Pool{
		colorizer:   &colorizer{LogColor: false},
		concurrency: concurrency,
		log:         log,
		workerInfos: make([]workerInfo, concurrency),
	}
	return pool
}
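
// Usage sketch (illustrative, not part of the original source): a typical
// round creates a pool, starts a round, feeds jobs over the Jobs channel, and
// then waits for the round to finish. The logger value and the renderAbout
// function below are assumptions made for the example only.
//
//	pool := NewPool(logger, 10)
//	pool.StartRound(0)
//	pool.Jobs <- NewJob("render: /about", func() (bool, error) {
//		return renderAbout()
//	})
//	if !pool.Wait() {
//		pool.LogErrors()
//	}
//	pool.LogSlowest()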

// JobErrors is a shortcut for extracting all the errors out of JobsErrored,
// the set of jobs that errored on the last round.
func (p *Pool) JobErrors() []error {
	if len(p.JobsErrored) < 1 {
		return nil
	}

	errs := make([]error, len(p.JobsErrored))
	for i, job := range p.JobsErrored {
		errs[i] = job.Err
	}
	return errs
}

// LogErrors logs a limited set of errors that occurred during a build.
func (p *Pool) LogErrors() {
	p.LogErrorsSlice(p.JobErrors())
}

// LogErrorsSlice logs a limited set of errors from the given slice.
func (p *Pool) LogErrorsSlice(errs []error) {
	if errs == nil {
		return
	}

	for i, err := range errs {
		// When dealing with an errored job (in practice, this is going to be
		// the common case), we can provide a little more detail on what went
		// wrong.
		var job *Job
		if errors.As(err, &job) {
			p.log.Errorf(
				p.colorizer.Bold(p.colorizer.Red("Job error:")).String()+
					" %v (job: '%s', time: %v)",
				job.Err, job.Name, job.Duration.Truncate(100*time.Microsecond))
		} else {
			p.log.Errorf(
				p.colorizer.Bold(p.colorizer.Red("Build error:")).String()+
					" %v",
				err)
		}

		if i >= maxMessages-1 {
			p.log.Errorf("... too many errors (limit reached)")
			break
		}
	}
}

// LogSlowest logs a limited set of executed jobs from the last build starting
// with the slowest jobs on top.
func (p *Pool) LogSlowest() {
	p.LogSlowestSlice(p.JobsExecuted)
}

// LogSlowestSlice logs a limited set of executed jobs from the given slice.
func (p *Pool) LogSlowestSlice(jobs []*Job) {
	sortJobsBySlowest(jobs)

	for i, job := range jobs {
		// Having this in the loop ensures we don't print it if zero jobs
		// executed.
		if i == 0 {
			p.log.Infof("Jobs executed (slowest first):")
		}

		p.log.Infof(
			p.colorizer.Bold(p.colorizer.Cyan(" %s")).String()+
				" (time: %v)",
			job.Name, job.Duration.Truncate(100*time.Microsecond))

		if i >= maxMessages-1 {
			p.log.Infof("... many jobs executed (limit reached)")
			break
		}
	}
}

// StartRound begins an execution round. Internal statistics and other tracking
// are all reset.
func (p *Pool) StartRound(roundNum int) {
	if p.roundStarted {
		panic("StartRound already called (call Wait before calling it again)")
	}

	p.roundNum = roundNum
	p.log.Debugf("pool: Starting round %v at concurrency %v", p.roundNum, p.concurrency)

	p.Jobs = make(chan *Job, 500)
	p.JobsAll = nil
	p.JobsErrored = nil
	p.JobsExecuted = nil
	p.jobsFeederDone = make(chan struct{}, 1)
	p.jobsInternal = make(chan *Job, 500)
	p.roundStarted = true

	for i := range p.workerInfos {
		p.workerInfos[i].reset()
	}

	// Job feeder
	go func() {
		p.log.Debugf("pool: Job feeder: Starting")

		for job := range p.Jobs {
			p.wg.Add(1)
			p.jobsInternal <- job
			p.JobsAll = append(p.JobsAll, job)
		}

		p.log.Debugf("pool: Job feeder: Finished feeding")

		// Runs after Jobs has been closed.
		close(p.jobsFeederDone)
	}()

	// Worker Goroutines
	for i := 0; i < p.concurrency; i++ {
		workerNum := i
		go func() {
			p.workForRound(workerNum)
		}()
	}
}
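
// Error-handling sketch (illustrative, not part of the original source): after
// a failed round, the jobs that errored are available directly on JobsErrored,
// with JobErrors as a shortcut for just the underlying errors. The
// handleFailure function below is an assumption made for the example only.
//
//	if !pool.Wait() {
//		for _, job := range pool.JobsErrored {
//			handleFailure(job.Name, job.Err)
//		}
//		pool.LogErrors()
//	}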

// Wait waits until all jobs are finished and stops the pool.
//
// Returns true if the round of jobs all executed successfully, and false
// otherwise. In the latter case, the caller should stop and observe the
// contents of JobsErrored.
//
// It panics if no round is currently active (call StartRound first), so pair
// each StartRound with exactly one Wait.
func (p *Pool) Wait() bool {
	if !p.roundStarted {
		panic("Can't wait on a job pool that's not primed (call StartRound first)")
	}

	p.log.Debugf("pool: Waiting on round %v", p.roundNum)

	p.roundStarted = false

	// First, close the jobs channel to signal that all work has been enqueued.
	close(p.Jobs)

	// Now wait for the job feeder to be finished so that we know all jobs have
	// been enqueued in jobsInternal.
	<-p.jobsFeederDone

	// Prints some debug information to help us in case we run into stalling
	// problems in the main job loop.
	done := make(chan struct{}, 1)
	go func() {
		select {
		case <-time.After(waitSoftTimeout):
			p.logWaitTimeoutInfo()
		case <-done:
		}
	}()

	p.log.Debugf("pool: Waiting for %v job(s) to be done", len(p.JobsAll))

	// Now wait for all those jobs to be done.
	p.wg.Wait()

	// Kill the timeout Goroutine.
	done <- struct{}{}

	// Drops workers out of their run loop. Their Goroutines return.
	close(p.jobsInternal)

	// Occasionally useful for debugging.
	// p.logWaitTimeoutInfo()

	return p.JobsErrored == nil
}

//////////////////////////////////////////////////////////////////////////////
//
//
//
// Private
//
//
//
//////////////////////////////////////////////////////////////////////////////

const (
	// When to report that a job has probably timed out. We call it a "soft"
	// timeout because we can't actually kill jobs.
	jobSoftTimeout = 15 * time.Second

	// Maximum number of errors or jobs to print on screen after a build loop.
	maxMessages = 10

	// When to report that a wait round has probably timed out. We call it a
	// "soft" timeout because no jobs are killed -- it's just for reporting and
	// debugging purposes.
	waitSoftTimeout = 60 * time.Second
)

// Keeps track of the information on a worker. Used for debugging purposes
// only.
type workerInfo struct {
	activeJob *Job
	state     workerState

	// Number of jobs finished is the total number that the worker "saw" as it
	// was running this round, so it includes errored jobs as well as jobs that
	// didn't "execute", meaning they were given a chance to run but decided no
	// changes were required and therefore didn't do any serious work.
	numJobsErrored  int
	numJobsExecuted int
	numJobsFinished int
}

// Resets statistics for the worker info.
func (wi *workerInfo) reset() {
	wi.state = workerStateWaitingOnRunOrStop
	wi.numJobsErrored = 0
	wi.numJobsExecuted = 0
	wi.numJobsFinished = 0
}

// Keeps track of the state of a worker. Used for debugging purposes only.
type workerState string

// The possible states that a worker can be in. Used for debugging purposes
// only.
const (
	workerStateJobExecuting       workerState = "job_executing"
	workerStateJobFinished        workerState = "job_finished"
	workerStatePanicked           workerState = "panicked"
	workerStateStopped            workerState = "stopped"
	workerStateWaitingOnRunOrStop workerState = "waiting_on_run_or_stop"
)
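
// Soft-timeout sketch (illustrative, not part of the original source): Wait
// and workJob both use the same watchdog shape -- a Goroutine that either
// fires after a timeout and reports, or is cancelled via a buffered done
// channel. Because the channel has capacity 1, the cancelling send never
// blocks even if the watchdog has already fired and returned. The doWork and
// reportTimeout names below are assumptions made for the example only.
//
//	done := make(chan struct{}, 1)
//	go func() {
//		select {
//		case <-time.After(jobSoftTimeout):
//			reportTimeout() // timeout elapsed; the work is still running
//		case <-done:
//			// work finished in time; nothing to report
//		}
//	}()
//	doWork()
//	done <- struct{}{} // cancel the watchdog (buffered, so never blocks)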

func (p *Pool) logWaitTimeoutInfo() {
	// We don't have an easy channel to count on for this number, so sum the
	// numbers across all workers.
	numJobsFinished := 0
	for _, info := range p.workerInfos {
		numJobsFinished += info.numJobsFinished
	}

	p.log.Errorf(
		"Wait soft timeout (jobs queued: %v, finished: %v, errored: %v, executed: %v, left: %v)",
		len(p.JobsAll),
		numJobsFinished,
		len(p.JobsErrored),
		len(p.JobsExecuted),
		len(p.jobsInternal),
	)

	for i, info := range p.workerInfos {
		jobName := "<none>"
		if info.activeJob != nil {
			jobName = info.activeJob.Name
		}

		p.log.Errorf(" Worker %v state: %v, jobs finished: %v, errored: %v, executed: %v, job: %v",
			i, info.state, info.numJobsFinished, info.numJobsErrored, info.numJobsExecuted, jobName)
	}
}

// Puts a finished job in the right result slice and adds run statistics to the
// worker's info.
func (p *Pool) setWorkerJobFinished(workerNum int, job *Job, executed bool, err error) {
	p.workerInfos[workerNum].numJobsFinished++

	if err != nil {
		job.Err = err

		p.jobsErroredMu.Lock()
		p.JobsErrored = append(p.JobsErrored, job)
		p.jobsErroredMu.Unlock()

		p.workerInfos[workerNum].numJobsErrored++
	}

	if executed {
		job.Executed = true

		p.jobsExecutedMu.Lock()
		p.JobsExecuted = append(p.JobsExecuted, job)
		p.jobsExecutedMu.Unlock()

		p.workerInfos[workerNum].numJobsExecuted++
	}

	p.wg.Done()

	p.workerInfos[workerNum].activeJob = nil
	p.workerInfos[workerNum].state = workerStateJobFinished
}

func (p *Pool) setWorkerJobExecuting(workerNum int, job *Job) {
	p.workerInfos[workerNum].activeJob = job
	p.workerInfos[workerNum].state = workerStateJobExecuting
}

// Sorts a slice of jobs with the slowest on top.
func sortJobsBySlowest(jobs []*Job) {
	sort.Slice(jobs, func(i, j int) bool {
		return jobs[j].Duration < jobs[i].Duration
	})
}

// The work loop for a single round within a single worker Goroutine.
func (p *Pool) workForRound(workerNum int) {
	for j := range p.jobsInternal {
		// Required so that we have a stable pointer that we can keep past the
		// lifetime of the loop. Don't change this.
		job := j

		p.workJob(workerNum, job)
	}

	p.workerInfos[workerNum].state = workerStateStopped
}
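
// Capture sketch (illustrative, not part of the original source): the
// workerNum := i copy in StartRound and the job := j copy above follow the
// same pattern -- before Go 1.22, a loop variable was a single variable reused
// across iterations, so anything that outlived the iteration had to capture a
// per-iteration copy. The concurrency and work names below are assumptions
// made for the example only.
//
//	for i := 0; i < concurrency; i++ {
//		workerNum := i // per-iteration copy; safe to use inside the Goroutine
//		go func() {
//			work(workerNum)
//		}()
//	}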

// A worker working a single job. Extracted this way so that we can add a defer
// that will help debug a panic.
func (p *Pool) workJob(workerNum int, job *Job) {
	p.setWorkerJobExecuting(workerNum, job)

	// Start a Goroutine to track the time taken to do this work.
	// Unfortunately, Go gives us no way to actually kill a timed-out Goroutine
	// (we rely on the user to make sure these get fixed instead), but we can
	// at least surface which job is problematic to help identify what needs to
	// be fixed.
	done := make(chan struct{}, 1)
	go func() {
		select {
		case <-time.After(jobSoftTimeout):
			p.log.Errorf("Job soft timeout (job: '%s')", job.Name)
		case <-done:
		}
	}()

	var executed bool
	var jobErr error
	start := time.Now()

	defer func() {
		job.Duration = time.Since(start)

		// Kill the timeout Goroutine.
		done <- struct{}{}

		var panicked bool
		if r := recover(); r != nil {
			if err, ok := r.(error); ok {
				jobErr = xerrors.Errorf("job panicked: %w", err)
			} else {
				// Panics are often given just a string to panic with, so make
				// sure to handle that as well.
				jobErr = xerrors.Errorf("job panicked: %v", r)
			}
			panicked = true
		}

		p.setWorkerJobFinished(workerNum, job, executed, jobErr)

		// And set the special panicked worker status if we panicked, because
		// it means that this worker is down and no longer available.
		//
		// TODO: It is possible to hit a deadlock if all workers have
		// panicked and there's still work left to do. The framework should
		// detect this condition and exit.
		if panicked {
			p.workerInfos[workerNum].state = workerStatePanicked
		}
	}()

	executed, jobErr = job.F()
}
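
// Panic-recovery sketch (illustrative, not part of the original source):
// workJob's defer converts a panic in user code into an ordinary error instead
// of letting it crash the process, handling both error and non-error panic
// values. The same shape in isolation, with runRisky assumed for the example:
//
//	func runSafely(runRisky func() error) (err error) {
//		defer func() {
//			if r := recover(); r != nil {
//				if rErr, ok := r.(error); ok {
//					err = xerrors.Errorf("panicked: %w", rErr)
//				} else {
//					err = xerrors.Errorf("panicked: %v", r)
//				}
//			}
//		}()
//		return runRisky()
//	}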