github.com/Axway/agent-sdk@v1.1.101/pkg/jobs/pool.go

package jobs

import (
	"context"
	"sync"
	"time"

	"github.com/Axway/agent-sdk/pkg/util/log"
)

// Pool - represents a pool of related jobs; when any one of them is not running, none of them should be running
type Pool struct {
	jobs                    map[string]JobExecution // All jobs that are in this pool
	cronJobs                map[string]JobExecution // Jobs that run continuously, not just once
	detachedCronJobs        map[string]JobExecution // Jobs that run continuously, not just once, detached from all others
	poolStatus              PoolStatus              // Holds the current status of the pool of jobs
	failedJob               string                  // Holds the ID of the job that is the reason for a non-running status
	jobsMapLock             sync.Mutex
	cronJobsMapLock         sync.Mutex
	detachedCronJobsMapLock sync.Mutex
	poolStatusLock          sync.Mutex
	backoffLock             sync.Mutex
	failedJobLock           sync.Mutex
	failJobChan             chan string
	stopJobsChan            chan bool
	backoff                 *backoff
	logger                  log.FieldLogger
	startStopLock           sync.Mutex
	isStartStopping         bool
	isStartStopLock         sync.Mutex
}

func newPool() *Pool {
	logger := log.NewFieldLogger().
		WithComponent("Pool").
		WithPackage("sdk.jobs")

	newPool := Pool{
		jobs:             make(map[string]JobExecution),
		cronJobs:         make(map[string]JobExecution),
		detachedCronJobs: make(map[string]JobExecution),
		failedJob:        "",
		startStopLock:    sync.Mutex{},
		isStartStopLock:  sync.Mutex{},
		failJobChan:      make(chan string, 1),
		stopJobsChan:     make(chan bool, 1),
		backoff:          newBackoffTimeout(defaultRetryInterval, 10*time.Minute, 2),
		logger:           logger,
	}
	newPool.SetStatus(PoolStatusInitializing)

	return &newPool
}

// getBackoff - get the job backoff
func (p *Pool) getBackoff() *backoff {
	p.backoffLock.Lock()
	defer p.backoffLock.Unlock()
	return p.backoff
}

// setBackoff - set the job backoff
func (p *Pool) setBackoff(backoff *backoff) {
	p.backoffLock.Lock()
	defer p.backoffLock.Unlock()
	p.backoff = backoff
}

// recordJob - Adds a job to the jobs map
func (p *Pool) recordJob(job JobExecution) string {
	p.jobsMapLock.Lock()
	defer p.jobsMapLock.Unlock()
	if len(p.jobs) == 0 && p.GetStatus() == PoolStatusInitializing.String() {
		// start routine to check all job status funcs and catch any failures
		go p.jobChecker()
		// start the pool watcher
		go p.watchJobs()
	}

	p.logger.
		WithField("job-id", job.GetID()).
		WithField("job-name", job.GetName()).
		Trace("registered job")
	p.jobs[job.GetID()] = job
	return job.GetID()
}

func (p *Pool) setCronJob(job JobExecution) {
	p.cronJobsMapLock.Lock()
	defer p.cronJobsMapLock.Unlock()
	p.cronJobs[job.GetID()] = job
}

func (p *Pool) getCronJob(jobID string) (JobExecution, bool) {
	p.cronJobsMapLock.Lock()
	defer p.cronJobsMapLock.Unlock()
	value, exists := p.cronJobs[jobID]
	return value, exists
}

func (p *Pool) getCronJobs() map[string]JobExecution {
	p.cronJobsMapLock.Lock()
	defer p.cronJobsMapLock.Unlock()

	// Create the target map
	newMap := make(map[string]JobExecution)

	// Copy from the original map to the target map to avoid race conditions
	for key, value := range p.cronJobs {
		newMap[key] = value
	}
	return newMap
}

func (p *Pool) setDetachedCronJob(job JobExecution) {
	p.detachedCronJobsMapLock.Lock()
	defer p.detachedCronJobsMapLock.Unlock()
	p.detachedCronJobs[job.GetID()] = job
}

func (p *Pool) getDetachedCronJob(jobID string) (JobExecution, bool) {
	p.detachedCronJobsMapLock.Lock()
	defer p.detachedCronJobsMapLock.Unlock()
	value, exists := p.detachedCronJobs[jobID]
	return value, exists
}

// recordCronJob - Adds a job to the cron jobs map
func (p *Pool) recordCronJob(job JobExecution) string {
	p.setCronJob(job)
	p.logger.Tracef("added new cron job, now running %v cron jobs", len(p.cronJobs))
	return p.recordJob(job)
}

// recordDetachedCronJob - Adds a job to the detached cron jobs map
func (p *Pool) recordDetachedCronJob(job JobExecution) string {
	p.setDetachedCronJob(job)
	p.logger.Tracef("added new detached cron job, now running %v detached cron jobs", len(p.detachedCronJobs))
	return p.recordJob(job)
}

// removeJob - Removes the specified job from the jobs map, and from the cron and detached cron maps if present
func (p *Pool) removeJob(jobID string) {
	p.jobsMapLock.Lock()
	job, ok := p.jobs[jobID]
	if ok {
		job.stop()
		delete(p.jobs, jobID)
	}
	p.jobsMapLock.Unlock()

	// remove from cron jobs, if present
	_, found := p.getCronJob(jobID)
	p.cronJobsMapLock.Lock()
	if found {
		delete(p.cronJobs, jobID)
	}
	p.cronJobsMapLock.Unlock()

	// remove from detached cron jobs, if present
	_, found = p.getDetachedCronJob(jobID)
	p.detachedCronJobsMapLock.Lock()
	if found {
		delete(p.detachedCronJobs, jobID)
	}
	p.detachedCronJobsMapLock.Unlock()
}

// RegisterSingleRunJob - Runs a single run job
func (p *Pool) RegisterSingleRunJob(newJob Job) (string, error) {
	return p.RegisterSingleRunJobWithName(newJob, JobTypeSingleRun)
}

// RegisterSingleRunJobWithName - Runs a named single run job
func (p *Pool) RegisterSingleRunJobWithName(newJob Job, name string) (string, error) {
	job, err := newBaseJob(newJob, p.failJobChan, name)
	if err != nil {
		return "", err
	}
	return p.recordJob(job), nil
}
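
// A minimal usage sketch for single run jobs, assuming the package's Job
// interface (defined elsewhere in this package) exposes Ready() bool,
// Status() error and Execute() error; the sampleJob type is hypothetical and
// shown only for illustration, and the pool is created with newPool here for
// brevity, which is only possible inside this package.
//
//	type sampleJob struct{}
//
//	func (j *sampleJob) Ready() bool    { return true }
//	func (j *sampleJob) Status() error  { return nil }
//	func (j *sampleJob) Execute() error { return nil }
//
//	pool := newPool()
//	// the returned ID can later be passed to GetJobStatus or UnregisterJob
//	id, err := pool.RegisterSingleRunJob(&sampleJob{})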

// RegisterIntervalJob - Runs a job with a specific interval between each run
func (p *Pool) RegisterIntervalJob(newJob Job, interval time.Duration, opts ...jobOpt) (string, error) {
	return p.RegisterIntervalJobWithName(newJob, interval, JobTypeInterval, opts...)
}

// RegisterIntervalJobWithName - Runs a named job with a specific interval between each run
func (p *Pool) RegisterIntervalJobWithName(newJob Job, interval time.Duration, name string, opts ...jobOpt) (string, error) {
	job, err := newIntervalJob(newJob, interval, name, p.failJobChan, opts...)
	if err != nil {
		return "", err
	}
	return p.recordCronJob(job), nil
}
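
// A sketch of interval registration, reusing the hypothetical sampleJob type
// from the sketch above (the job name is also illustrative); the pool runs the
// job repeatedly, waiting the given interval between runs, and restarts it
// along with the other cron jobs when the pool recovers from a failure.
//
//	id, err := pool.RegisterIntervalJobWithName(&sampleJob{}, 30*time.Second, "poll-apis")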

// RegisterChannelJob - Runs a job until it is signaled to stop on the given stop channel
func (p *Pool) RegisterChannelJob(newJob Job, stopChan chan interface{}) (string, error) {
	return p.RegisterChannelJobWithName(newJob, stopChan, JobTypeChannel)
}

// RegisterChannelJobWithName - Runs a named job until it is signaled to stop on the given stop channel
func (p *Pool) RegisterChannelJobWithName(newJob Job, stopChan chan interface{}, name string) (string, error) {
	job, err := newChannelJob(newJob, stopChan, name, p.failJobChan)
	if err != nil {
		return "", err
	}
	return p.recordCronJob(job), nil
}
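
// A sketch of channel job registration, again using the hypothetical sampleJob:
// the supplied stop channel is how the long-running job is told to stop, and
// the job is tracked together with the other cron jobs.
//
//	stopCh := make(chan interface{})
//	id, err := pool.RegisterChannelJob(&sampleJob{}, stopCh)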

// RegisterDetachedChannelJob - Runs a job with a stop channel, detached from other jobs
func (p *Pool) RegisterDetachedChannelJob(newJob Job, stopChan chan interface{}) (string, error) {
	return p.RegisterDetachedChannelJobWithName(newJob, stopChan, JobTypeDetachedChannel)
}

// RegisterDetachedChannelJobWithName - Runs a named job with a stop channel, detached from other jobs
func (p *Pool) RegisterDetachedChannelJobWithName(newJob Job, stopChan chan interface{}, name string) (string, error) {
	job, err := newDetachedChannelJob(newJob, stopChan, name, p.failJobChan)
	if err != nil {
		return "", err
	}
	return p.recordDetachedCronJob(job), nil
}

// RegisterDetachedIntervalJob - Runs a job with a specific interval between each run, detached from other jobs
func (p *Pool) RegisterDetachedIntervalJob(newJob Job, interval time.Duration, opts ...jobOpt) (string, error) {
	return p.RegisterDetachedIntervalJobWithName(newJob, interval, JobTypeDetachedInterval, opts...)
}

// RegisterDetachedIntervalJobWithName - Runs a named job with a specific interval between each run, detached from other jobs
func (p *Pool) RegisterDetachedIntervalJobWithName(newJob Job, interval time.Duration, name string, opts ...jobOpt) (string, error) {
	job, err := newDetachedIntervalJob(newJob, interval, name, opts...)
	if err != nil {
		return "", err
	}
	return p.recordDetachedCronJob(job), nil
}
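
// A sketch of a detached registration (interval shown, channel is analogous):
// detached jobs are recorded in the detachedCronJobs map rather than cronJobs,
// so jobChecker does not monitor them and stopAll/startAll do not stop or
// restart them with the others. The sampleJob type remains hypothetical.
//
//	id, err := pool.RegisterDetachedIntervalJob(&sampleJob{}, time.Minute)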

// RegisterScheduledJob - Runs a job on a specific schedule
func (p *Pool) RegisterScheduledJob(newJob Job, schedule string, opts ...jobOpt) (string, error) {
	return p.RegisterScheduledJobWithName(newJob, schedule, JobTypeScheduled, opts...)
}

// RegisterScheduledJobWithName - Runs a named job on a specific schedule
func (p *Pool) RegisterScheduledJobWithName(newJob Job, schedule, name string, opts ...jobOpt) (string, error) {
	job, err := newScheduledJob(newJob, schedule, name, p.failJobChan, opts...)
	if err != nil {
		return "", err
	}
	return p.recordCronJob(job), nil
}
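
// A sketch of scheduled registration; the schedule string format is whatever
// newScheduledJob (defined elsewhere in this package) accepts, assumed here to
// be a cron-style expression, and the job is tracked with the other cron jobs.
//
//	id, err := pool.RegisterScheduledJob(&sampleJob{}, "0 * * * *")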

// RegisterRetryJob - Runs a job with a limited number of retries
func (p *Pool) RegisterRetryJob(newJob Job, retries int) (string, error) {
	return p.RegisterRetryJobWithName(newJob, retries, JobTypeRetry)
}

// RegisterRetryJobWithName - Runs a named job with a limited number of retries
func (p *Pool) RegisterRetryJobWithName(newJob Job, retries int, name string) (string, error) {
	job, err := newRetryJob(newJob, retries, name, p.failJobChan)
	if err != nil {
		return "", err
	}
	return p.recordJob(job), nil
}
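
// A sketch of retry registration: the retry behaviour itself lives in
// newRetryJob, and the job is recorded like a single run job, so it is not
// part of the cron stop/restart cycle. The sampleJob type remains hypothetical.
//
//	id, err := pool.RegisterRetryJob(&sampleJob{}, 3)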

// UnregisterJob - Removes the specified job
func (p *Pool) UnregisterJob(jobID string) {
	p.removeJob(jobID)
}

// GetJob - Returns the Job based on the id
func (p *Pool) GetJob(id string) JobExecution {
	return p.jobs[id].GetJob()
}

// JobLock - Locks the job, returns when the lock is granted
func (p *Pool) JobLock(id string) {
	p.jobs[id].Lock()
}

// JobUnlock - Unlocks the job
func (p *Pool) JobUnlock(id string) {
	p.jobs[id].Unlock()
}
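
// A sketch of coordinating external work with a registered job through its
// lock, assuming the job implementation takes the same lock around Execute;
// the id is the value returned at registration time.
//
//	pool.JobLock(id)
//	// ... work that must not overlap with the job's execution ...
//	pool.JobUnlock(id)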

func (p *Pool) getFailedJob() string {
	p.failedJobLock.Lock()
	defer p.failedJobLock.Unlock()
	return p.failedJob
}

func (p *Pool) setFailedJob(job string) {
	p.failedJobLock.Lock()
	defer p.failedJobLock.Unlock()
	p.failedJob = job
}

// GetJobStatus - Returns the Status of the Job based on the id
func (p *Pool) GetJobStatus(id string) string {
	return p.jobs[id].GetStatus().String()
}

// GetStatus - returns the status of the pool of jobs
func (p *Pool) GetStatus() string {
	p.poolStatusLock.Lock()
	defer p.poolStatusLock.Unlock()
	return p.poolStatus.String()
}

// SetStatus - Sets the status of the pool of jobs
func (p *Pool) SetStatus(status PoolStatus) {
	p.poolStatusLock.Lock()
	defer p.poolStatusLock.Unlock()
	p.poolStatus = status
}

// waitStartStop - waits, with a timeout, for all cron jobs to reach the specified status; returns false on timeout
func (p *Pool) waitStartStop(jobStatus JobStatus) bool {
	ctx, cancel := context.WithTimeout(context.Background(), getStatusCheckInterval())
	defer cancel()

	done := make(chan bool)
	go func() {
		for {
			running := true
			for _, job := range p.getCronJobs() {
				if job.GetStatus() != jobStatus {
					running = false
				}
			}
			if running {
				done <- true
				break
			}
			time.Sleep(10 * time.Millisecond)
		}
	}()

	select {
	case b := <-done:
		return b
	case <-ctx.Done():
		return false
	}
}

func (p *Pool) setIsStartStop(isStartStop bool) {
	p.isStartStopLock.Lock()
	defer p.isStartStopLock.Unlock()
	p.isStartStopping = isStartStop
}

func (p *Pool) getIsStartStop() bool {
	p.isStartStopLock.Lock()
	defer p.isStartStopLock.Unlock()
	return p.isStartStopping
}

// startAll - starts all jobs defined in the cronJobs map, used by watchJobs.
// Other jobs are single run and are never restarted.
// Returns true when successful, false when not.
func (p *Pool) startAll() bool {
	p.stopAll()

	// Check that all are ready before starting
	p.logger.Debug("Checking for all cron jobs to be ready")
	for _, job := range p.getCronJobs() {
		if !job.Ready() {
			p.logger.WithField("job-id", job.GetID()).Debugf("job is not ready")
			return false
		}
	}
	p.logger.Debug("Starting all cron jobs")
	for _, job := range p.getCronJobs() {
		go job.start()
	}

	if p.waitStartStop(JobStatusRunning) {
		p.SetStatus(PoolStatusRunning)
	}

	return true
}

// stopAll - stops all jobs defined in the cronJobs map, used by watchJobs.
// Other jobs are single run and should not need to be stopped.
func (p *Pool) stopAll() {
	p.logger.Debug("Stopping all cron jobs")

	// getCronJobs returns a copy of the cron jobs map, so stopping here cannot
	// race with jobs being registered or removed
	for _, job := range p.getCronJobs() {
		p.logger.WithField("job-name", job.GetName()).Trace("stopping job")
		job.stop()
		p.logger.WithField("job-name", job.GetName()).Tracef("finished stopping job")
	}

	if p.waitStartStop(JobStatusStopped) {
		p.SetStatus(PoolStatusStopped)
	}
}

// jobChecker - regularly checks the status of cron jobs, triggering a stop of all jobs when one is not running
func (p *Pool) jobChecker() {
	ticker := time.NewTicker(getStatusCheckInterval())
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			go func() {
				failedJob := ""
				for _, job := range p.getCronJobs() {
					job.updateStatus()
					if job.GetStatus() != JobStatusRunning {
						failedJob = job.GetID()
						break
					}
				}

				if !p.getIsStartStop() {
					if failedJob != "" {
						p.failJobChan <- failedJob
					} else {
						p.SetStatus(PoolStatusRunning)
					}
				}
			}()
		case failedJob := <-p.failJobChan:
			p.setFailedJob(failedJob) // this is the job for the current fail loop
			p.stopJobsChan <- true
			p.SetStatus(PoolStatusStopped)
		}
	}
}

func (p *Pool) stopPool() {
	p.startStopLock.Lock()
	defer p.startStopLock.Unlock()

	p.setIsStartStop(true)
	defer p.setIsStartStop(false)
	p.stopAll()
}

func (p *Pool) startPool() {
	p.startStopLock.Lock()
	defer p.startStopLock.Unlock()

	if p.GetStatus() == PoolStatusStopped.String() {
		p.setIsStartStop(true)
		defer p.setIsStartStop(false)
		// attempt to restart all jobs
		if p.startAll() {
			p.getBackoff().reset()
		} else {
			p.getBackoff().increaseTimeout()
		}
		p.setFailedJob("")
	}
}

// watchJobs - the main loop of a pool of jobs, continuously checks the status of jobs and acts accordingly
func (p *Pool) watchJobs() {
	p.SetStatus(PoolStatusRunning)
	ticker := time.NewTicker(p.getBackoff().getCurrentTimeout())
	defer ticker.Stop()
	for {
		select {
		case <-p.stopJobsChan:
			if job, found := p.getCronJob(p.getFailedJob()); found {
				p.logger.
					WithField("job-name", job.GetName()).
					WithField("failed-job", p.getFailedJob()).
					Debug("Job failed, stop all jobs")
			}
			p.stopPool()
		case <-ticker.C:
			p.startPool()
			// reset the existing ticker to the current backoff interval rather
			// than creating a new one, so the deferred Stop still applies
			ticker.Reset(p.getBackoff().getCurrentTimeout())
			p.logger.
				WithField("interval", p.getBackoff().getCurrentTimeout()).
				Trace("setting next job restart backoff interval")
		}
	}
}
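
// A sketch of the overall lifecycle implemented above: registering the first
// job starts jobChecker and watchJobs, a cron job that stops running is
// reported on failJobChan, the pool then stops all cron jobs, and watchJobs
// restarts them on the backoff interval once they report Ready again. The
// sampleJob type reused here is the hypothetical one from the earlier sketches.
//
//	pool := newPool()
//	id, _ := pool.RegisterIntervalJob(&sampleJob{}, 30*time.Second)
//
//	// the pool and job status can be inspected at any time
//	poolStatus := pool.GetStatus()
//	jobStatus := pool.GetJobStatus(id)
//
//	// unregistering stops the job and removes it from all tracking maps
//	pool.UnregisterJob(id)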