github.com/brandur/modulir@v0.0.0-20240305213423-94ee82929cbd/pool.go

     1  package modulir
     2  
     3  import (
     4  	"errors"
     5  	"sort"
     6  	"sync"
     7  	"time"
     8  
     9  	"golang.org/x/xerrors"
    10  )
    11  
    12  //////////////////////////////////////////////////////////////////////////////
    13  //
    14  //
    15  //
    16  // Public
    17  //
    18  //
    19  //
    20  //////////////////////////////////////////////////////////////////////////////
    21  
    22  // Job is a wrapper for a piece of work that should be executed by the job
    23  // pool.
    24  //
    25  //nolint:errname
    26  type Job struct {
    27  	// Duration is the time it took the job to run. It's set regardless of
    28  	// whether the job finished as executed, not executed, or errored.
    29  	Duration time.Duration
    30  
    31  	// Err is an error that the job produced, if any.
    32  	Err error
    33  
    34  	// Executed is whether the job "did work", signaled by it returning true.
    35  	Executed bool
    36  
    37  	// F is the function which makes up the job's workload.
    38  	F func() (bool, error)
    39  
    40  	// Name is a name for the job which is helpful for informational and
    41  	// debugging purposes.
    42  	Name string
    43  }
    44  
    45  // Error returns the error message of the error wrapped in the job if this was
    46  // an errored job. Job implements the error interface so that a job can be
    47  // returned as an error in situations where error handling is being done but
    48  // job errors may be mixed in with other sorts of errors.
    49  //
    50  // It panics if the job wasn't errored, so be careful to only use this when
    51  // iterating across something like Pool.JobsErrored.
    52  func (j *Job) Error() string {
    53  	if j.Err == nil {
    54  		panic("Error called on a non-errored Job")
    55  	}
    56  
    57  	return j.Err.Error()
    58  }
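
        // As an illustrative sketch only (err, log, and the surrounding code are
        // not part of this package), a caller holding a mixed bag of errors can
        // pick a job error back out with errors.As, which is exactly what
        // LogErrorsSlice below does:
        //
        //	var job *Job
        //	if errors.As(err, &job) {
        //		log.Errorf("job %q failed after %v: %v", job.Name, job.Duration, job.Err)
        //	}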
    59  
    60  // NewJob initializes and returns a new Job.
    61  func NewJob(name string, f func() (bool, error)) *Job {
    62  	return &Job{Name: name, F: f}
    63  }
    64  
    65  // Pool is a worker group that runs a number of jobs at a configured
    66  // concurrency.
    67  type Pool struct {
    68  	Jobs chan *Job
    69  
    70  	// JobsAll is a slice of all the jobs that were fed into the pool on the
    71  	// last run.
    72  	JobsAll []*Job
    73  
    74  	// JobsErrored is a slice of jobs that errored on the last run.
    75  	//
    76  	// See also JobErrors which is a shortcut for extracting all the errors
    77  	// from the jobs.
    78  	JobsErrored []*Job
    79  
    80  	// JobsExecuted is a slice of jobs that were executed on the last run.
    81  	JobsExecuted []*Job
    82  
    83  	colorizer      *colorizer
    84  	concurrency    int
    85  	jobsInternal   chan *Job
    86  	jobsErroredMu  sync.Mutex
    87  	jobsExecutedMu sync.Mutex
    88  	jobsFeederDone chan struct{}
    89  	log            LoggerInterface
    90  	roundNum       int
    91  	roundStarted   bool
    92  	wg             sync.WaitGroup
    93  	workerInfos    []workerInfo
    94  }
    95  
    96  // NewPool initializes and returns a new pool that will run jobs at the
    97  // given concurrency. The pool is fully spun up and ready for StartRound to
    98  // be called.
    99  func NewPool(log LoggerInterface, concurrency int) *Pool {
   100  	// By default a pool gets a no-op colorizer. NewContext may set one
   101  	// separately for pools created within the package.
   102  	pool := &Pool{
   103  		colorizer:   &colorizer{LogColor: false},
   104  		concurrency: concurrency,
   105  		log:         log,
   106  		workerInfos: make([]workerInfo, concurrency),
   107  	}
   108  	return pool
   109  }
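
        // A minimal lifecycle sketch (illustrative only; "logger" stands in for
        // any LoggerInterface implementation and the job body is made up):
        //
        //	pool := NewPool(logger, 10)
        //	pool.StartRound(1)
        //	pool.Jobs <- NewJob("render index", func() (bool, error) {
        //		// Return true only if the job actually did work.
        //		return true, nil
        //	})
        //	if !pool.Wait() {
        //		pool.LogErrors()
        //	}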
   110  
   111  // JobErrors is a shortcut for extracting all the errors out of JobsErrored,
   112  // the set of jobs that errored on the last round.
   113  func (p *Pool) JobErrors() []error {
   114  	if len(p.JobsErrored) < 1 {
   115  		return nil
   116  	}
   117  
   118  	errs := make([]error, len(p.JobsErrored))
   119  	for i, job := range p.JobsErrored {
   120  		errs[i] = job.Err
   121  	}
   122  	return errs
   123  }
   124  
   125  // LogErrors logs a limited set of errors that occurred during a build.
   126  func (p *Pool) LogErrors() {
   127  	p.LogErrorsSlice(p.JobErrors())
   128  }
   129  
   130  // LogErrorsSlice logs a limited set of errors from the given slice.
   131  func (p *Pool) LogErrorsSlice(errs []error) {
   132  	if errs == nil {
   133  		return
   134  	}
   135  
   136  	for i, err := range errs {
   137  		// When dealing with an errored job (in practice, this is going to be
   138  		// the common case), we can provide a little more detail on what went
   139  		// wrong.
   140  		var job *Job
   141  		if errors.As(err, &job) {
   142  			p.log.Errorf(
   143  				p.colorizer.Bold(p.colorizer.Red("Job error:")).String()+
   144  					" %v (job: '%s', time: %v)",
   145  				job.Err, job.Name, job.Duration.Truncate(100*time.Microsecond))
   146  		} else {
   147  			p.log.Errorf(
   148  				p.colorizer.Bold(p.colorizer.Red("Build error:")).String()+
   149  					" %v",
   150  				err)
   151  		}
   152  
   153  		if i >= maxMessages-1 {
   154  			p.log.Errorf("... too many errors (limit reached)")
   155  			break
   156  		}
   157  	}
   158  }
   159  
   160  // LogSlowest logs a limited set of executed jobs from the last build starting
   161  // with the slowest jobs on top.
   162  func (p *Pool) LogSlowest() {
   163  	p.LogSlowestSlice(p.JobsExecuted)
   164  }
   165  
   166  // LogSlowestSlice logs a limited set of executed jobs from the given slice.
   167  func (p *Pool) LogSlowestSlice(jobs []*Job) {
   168  	sortJobsBySlowest(jobs)
   169  
   170  	for i, job := range jobs {
   171  		// Having this in the loop ensures we don't print the header if zero
   172  		// jobs executed.
   173  		if i == 0 {
   174  			p.log.Infof("Jobs executed (slowest first):")
   175  		}
   176  
   177  		p.log.Infof(
   178  			p.colorizer.Bold(p.colorizer.Cyan("    %s")).String()+
   179  				" (time: %v)",
   180  			job.Name, job.Duration.Truncate(100*time.Microsecond))
   181  
   182  		if i >= maxMessages-1 {
   183  			p.log.Infof("... many jobs executed (limit reached)")
   184  			break
   185  		}
   186  	}
   187  }
   188  
   189  // StartRound begins an execution round. Internal statistics and other tracking
   190  // are all reset.
   191  func (p *Pool) StartRound(roundNum int) {
   192  	if p.roundStarted {
   193  		panic("StartRound already called (call Wait before calling it again)")
   194  	}
   195  
   196  	p.roundNum = roundNum
   197  	p.log.Debugf("pool: Starting round %v at concurrency %v", p.roundNum, p.concurrency)
   198  
   199  	p.Jobs = make(chan *Job, 500)
   200  	p.JobsAll = nil
   201  	p.JobsErrored = nil
   202  	p.JobsExecuted = nil
   203  	p.jobsFeederDone = make(chan struct{}, 1)
   204  	p.jobsInternal = make(chan *Job, 500)
   205  	p.roundStarted = true
   206  
   207  	for i := range p.workerInfos {
   208  		p.workerInfos[i].reset()
   209  	}
   210  
   211  	// Job feeder
   212  	go func() {
   213  		p.log.Debugf("pool: Job feeder: Starting")
   214  
   215  		for job := range p.Jobs {
   216  			p.wg.Add(1)
   217  			p.jobsInternal <- job
   218  			p.JobsAll = append(p.JobsAll, job)
   219  		}
   220  
   221  		p.log.Debugf("pool: Job feeder: Finished feeding")
   222  
   223  		// Runs after Jobs has been closed.
   224  		close(p.jobsFeederDone)
   225  	}()
   226  
   227  	// Worker Goroutines
   228  	for i := 0; i < p.concurrency; i++ {
   229  		workerNum := i
   230  		go func() {
   231  			p.workForRound(workerNum)
   232  		}()
   233  	}
   234  }
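
        // To summarize the dataflow that StartRound sets up: callers push onto the
        // public Jobs channel, the feeder Goroutine moves each job onto
        // jobsInternal while incrementing the WaitGroup and recording it in
        // JobsAll, and the worker Goroutines drain jobsInternal until Wait closes
        // it.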
   235  
   236  // Wait waits until all jobs are finished and stops the pool.
   237  //
   238  // Returns true if all of the round's jobs finished without error, and false
   239  // otherwise. In the latter case, the caller should stop and examine the
   240  // contents of JobsErrored (or use JobErrors).
   241  //
   242  // Wait panics if no round is in progress, so every call to Wait must be
   243  // paired with a preceding call to StartRound.
   244  func (p *Pool) Wait() bool {
   245  	if !p.roundStarted {
   246  		panic("Can't wait on a job pool that's not primed (call StartRound first)")
   247  	}
   248  
   249  	p.log.Debugf("pool: Finishing round %v", p.roundNum)
   250  
   251  	p.roundStarted = false
   252  
   253  	// First, signal over the jobs channel that all work has been enqueued.
   254  	close(p.Jobs)
   255  
   256  	// Now wait for the job feeder to be finished so that we know all jobs have
   257  	// been enqueued in jobsInternal.
   258  	<-p.jobsFeederDone
   259  
   260  	// Prints some debug information to help us in case we run into stalling
   261  	// problems in the main job loop.
   262  	done := make(chan struct{}, 1)
   263  	go func() {
   264  		select {
   265  		case <-time.After(waitSoftTimeout):
   266  			p.logWaitTimeoutInfo()
   267  		case <-done:
   268  		}
   269  	}()
   270  
   271  	p.log.Debugf("pool: Waiting for %v job(s) to be done", len(p.JobsAll))
   272  
   273  	// Now wait for all those jobs to be done.
   274  	p.wg.Wait()
   275  
   276  	// Kill the timeout Goroutine.
   277  	done <- struct{}{}
   278  
   279  	// Drops workers out of their run loop. With jobsInternal closed, their
   280  	// Goroutines return instead of waiting on another job.
   281  	close(p.jobsInternal)
   282  
   283  	// Occasionally useful for debugging.
   284  	// p.logWaitTimeoutInfo()
   285  
   286  	return p.JobsErrored == nil
   287  }
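
        // Because Wait leaves the pool ready for another StartRound, the same pool
        // can be reused across rounds by alternating the two calls. A rough sketch
        // of a rebuild loop (illustrative only):
        //
        //	for round := 0; ; round++ {
        //		pool.StartRound(round)
        //		// ... push jobs onto pool.Jobs ...
        //		if !pool.Wait() {
        //			pool.LogErrors()
        //		}
        //	}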
   288  
   289  //////////////////////////////////////////////////////////////////////////////
   290  //
   291  //
   292  //
   293  // Private
   294  //
   295  //
   296  //
   297  //////////////////////////////////////////////////////////////////////////////
   298  
   299  const (
   300  	// When to report that a job is probably timed out. We call it a "soft"
   301  	// timeout because we can't actually kill jobs.
   302  	jobSoftTimeout = 15 * time.Second
   303  
   304  	// Maximum number of errors or jobs to print on screen after a build loop.
   305  	maxMessages = 10
   306  
   307  	// When to report that a wait round is probably timed out. We call it a
   308  	// "soft" timeout because no jobs are killed -- it's just for reporting and
   309  	// debugging purposes.
   310  	waitSoftTimeout = 60 * time.Second
   311  )
   312  
   313  // Keeps track of the information on a worker. Used for debugging purposes
   314  // only.
   315  type workerInfo struct {
   316  	activeJob *Job
   317  	state     workerState
   318  
   319  	// numJobsFinished is the total number of jobs that the worker "saw"
   320  	// while running this round. It includes errored jobs as well as jobs
   321  	// that didn't "execute", meaning that they were given a chance to run
   322  	// but decided no changes were required and therefore didn't do any
   323  	// serious work.
   324  	numJobsErrored  int
   325  	numJobsExecuted int
   326  	numJobsFinished int
   327  }
   328  
   329  // Resets statistics for the worker info.
   330  func (wi *workerInfo) reset() {
   331  	wi.state = workerStateWaitingOnRunOrStop
   332  	wi.numJobsErrored = 0
   333  	wi.numJobsExecuted = 0
   334  	wi.numJobsFinished = 0
   335  }
   336  
   337  // Keeps track of the state of a worker. Used for debugging purposes only.
   338  type workerState string
   339  
   340  // The possible states that a worker can be in. Used for debugging purposes
   341  // only.
   342  const (
   343  	workerStateJobExecuting       workerState = "job_executing"
   344  	workerStateJobFinished        workerState = "job_finished"
   345  	workerStatePanicked           workerState = "panicked"
   346  	workerStateStopped            workerState = "stopped"
   347  	workerStateWaitingOnRunOrStop workerState = "waiting_on_run_or_stop"
   348  )
   349  
   350  func (p *Pool) logWaitTimeoutInfo() {
   351  	// We don't have an easy channel to count on for this number, so sum the
   352  	// numbers across all workers.
   353  	numJobsFinished := 0
   354  	for _, info := range p.workerInfos {
   355  		numJobsFinished += info.numJobsFinished
   356  	}
   357  
   358  	p.log.Errorf(
   359  		"Wait soft timeout (jobs queued: %v, finished: %v, errored: %v, executed: %v, left: %v)",
   360  		len(p.JobsAll),
   361  		numJobsFinished,
   362  		len(p.JobsErrored),
   363  		len(p.JobsExecuted),
   364  		len(p.jobsInternal),
   365  	)
   366  
   367  	for i, info := range p.workerInfos {
   368  		jobName := "<none>"
   369  		if info.activeJob != nil {
   370  			jobName = info.activeJob.Name
   371  		}
   372  
   373  		p.log.Errorf("    Worker %v state: %v, jobs finished: %v, errored: %v, executed: %v, job: %v",
   374  			i, info.state, info.numJobsFinished, info.numJobsErrored, info.numJobsExecuted, jobName)
   375  	}
   376  }
   377  
   378  // Puts a finished job in the right result slice(s) and adds run statistics
   379  // to the worker's info.
   380  func (p *Pool) setWorkerJobFinished(workerNum int, job *Job, executed bool, err error) {
   381  	p.workerInfos[workerNum].numJobsFinished++
   382  
   383  	if err != nil {
   384  		job.Err = err
   385  
   386  		p.jobsErroredMu.Lock()
   387  		p.JobsErrored = append(p.JobsErrored, job)
   388  		p.jobsErroredMu.Unlock()
   389  
   390  		p.workerInfos[workerNum].numJobsErrored++
   391  	}
   392  
   393  	if executed {
   394  		job.Executed = true
   395  
   396  		p.jobsExecutedMu.Lock()
   397  		p.JobsExecuted = append(p.JobsExecuted, job)
   398  		p.jobsExecutedMu.Unlock()
   399  
   400  		p.workerInfos[workerNum].numJobsExecuted++
   401  	}
   402  
   403  	p.wg.Done()
   404  
   405  	p.workerInfos[workerNum].activeJob = nil
   406  	p.workerInfos[workerNum].state = workerStateJobFinished
   407  }
   408  
   409  func (p *Pool) setWorkerJobExecuting(workerNum int, job *Job) {
   410  	p.workerInfos[workerNum].activeJob = job
   411  	p.workerInfos[workerNum].state = workerStateJobExecuting
   412  }
   413  
   414  // Sorts a slice of jobs with the slowest on top.
   415  func sortJobsBySlowest(jobs []*Job) {
   416  	sort.Slice(jobs, func(i, j int) bool {
   417  		return jobs[j].Duration < jobs[i].Duration
   418  	})
   419  }
   420  
   421  // The work loop for a single round within a single worker Goroutine.
   422  func (p *Pool) workForRound(workerNum int) {
   423  	for j := range p.jobsInternal {
   424  		// Required so that we have a stable pointer that we can keep past the
   425  		// lifetime of the loop. Don't change this.
   426  		job := j
   427  
   428  		p.workJob(workerNum, job)
   429  	}
   430  
   431  	p.workerInfos[workerNum].state = workerStateStopped
   432  }
   433  
   434  // A worker working a single job. Extracted this way so that we can add a defer
   435  // that will help debug a panic.
   436  func (p *Pool) workJob(workerNum int, job *Job) {
   437  	p.setWorkerJobExecuting(workerNum, job)
   438  
   439  	// Start a Goroutine to track the time taken to do this work.
   440  	// Unfortunately, Go gives us no way to actually kill a timed-out
   441  	// Goroutine (we rely on the user to make sure these get fixed instead),
   442  	// but we can at least surface which job is problematic to help identify
   443  	// what needs to be fixed.
   444  	done := make(chan struct{}, 1)
   445  	go func() {
   446  		select {
   447  		case <-time.After(jobSoftTimeout):
   448  			p.log.Errorf("Job soft timeout (job: '%s')", job.Name)
   449  		case <-done:
   450  		}
   451  	}()
   452  
   453  	var executed bool
   454  	var jobErr error
   455  	start := time.Now()
   456  
   457  	defer func() {
   458  		job.Duration = time.Since(start)
   459  
   460  		// Kill the timeout Goroutine.
   461  		done <- struct{}{}
   462  
   463  		var panicked bool
   464  		if r := recover(); r != nil {
   465  			if err, ok := r.(error); ok {
   466  				jobErr = xerrors.Errorf("job panicked: %w", err)
   467  			} else {
   468  				// Panics are often just given a string to panic with, so make
   469  				// sure to handle that as well
   470  				jobErr = xerrors.Errorf("job panicked: %v", r)
   471  			}
   472  			panicked = true
   473  		}
   474  
   475  		p.setWorkerJobFinished(workerNum, job, executed, jobErr)
   476  
   477  		// And set the special panicked worker status if we panicked
   478  		// because it means that this worker is down and no longer
   479  		// available.
   480  		//
   481  		// TODO: It is possible to hit a deadlock if all workers have
   482  		// panicked and there's still work left to do. The framework should
   483  		// detect this condition and exit.
   484  		if panicked {
   485  			p.workerInfos[workerNum].state = workerStatePanicked
   486  		}
   487  	}()
   488  
   489  	executed, jobErr = job.F()
   490  }
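
        // The net effect of the recover above is that a panicking job surfaces as
        // an errored job rather than crashing the process. For example (purely
        // illustrative), a job like the following ends up in JobsErrored with an
        // error whose message is "job panicked: boom":
        //
        //	pool.Jobs <- NewJob("bad job", func() (bool, error) {
        //		panic("boom")
        //	})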