github.com/Axway/agent-sdk@v1.1.101/pkg/jobs/pool.go (about) 1 package jobs 2 3 import ( 4 "context" 5 "sync" 6 "time" 7 8 "github.com/Axway/agent-sdk/pkg/util/log" 9 ) 10 11 // Pool - represents a pool of jobs that are related in such a way that when one is not running none of them should be 12 type Pool struct { 13 jobs map[string]JobExecution // All jobs that are in this pool 14 cronJobs map[string]JobExecution // Jobs that run continuously, not just ran once 15 detachedCronJobs map[string]JobExecution // Jobs that run continuously, not just ran once, detached from all others 16 poolStatus PoolStatus // Holds the current status of the pool of jobs 17 failedJob string // Holds the ID of the job that is the reason for a non-running status 18 jobsMapLock sync.Mutex 19 cronJobsMapLock sync.Mutex 20 detachedCronJobsMapLock sync.Mutex 21 poolStatusLock sync.Mutex 22 backoffLock sync.Mutex 23 failedJobLock sync.Mutex 24 failJobChan chan string 25 stopJobsChan chan bool 26 backoff *backoff 27 logger log.FieldLogger 28 startStopLock sync.Mutex 29 isStartStopping bool 30 isStartStopLock sync.Mutex 31 } 32 33 func newPool() *Pool { 34 logger := log.NewFieldLogger(). 35 WithComponent("Pool"). 36 WithPackage("sdk.jobs") 37 38 newPool := Pool{ 39 jobs: make(map[string]JobExecution), 40 cronJobs: make(map[string]JobExecution), 41 detachedCronJobs: make(map[string]JobExecution), 42 failedJob: "", 43 startStopLock: sync.Mutex{}, 44 isStartStopLock: sync.Mutex{}, 45 failJobChan: make(chan string, 1), 46 stopJobsChan: make(chan bool, 1), 47 backoff: newBackoffTimeout(defaultRetryInterval, 10*time.Minute, 2), 48 logger: logger, 49 } 50 newPool.SetStatus(PoolStatusInitializing) 51 52 return &newPool 53 } 54 55 // getBackoff - get the job backoff 56 func (p *Pool) getBackoff() *backoff { 57 p.backoffLock.Lock() 58 defer p.backoffLock.Unlock() 59 return p.backoff 60 } 61 62 // setBackoff - set the job backoff 63 func (p *Pool) setBackoff(backoff *backoff) { 64 p.backoffLock.Lock() 65 defer p.backoffLock.Unlock() 66 p.backoff = backoff 67 } 68 69 // recordJob - Adds a job to the jobs map 70 func (p *Pool) recordJob(job JobExecution) string { 71 p.jobsMapLock.Lock() 72 defer p.jobsMapLock.Unlock() 73 if len(p.jobs) == 0 && p.GetStatus() == PoolStatusInitializing.String() { 74 // start routine to check all job status funcs and catch any failures 75 go p.jobChecker() 76 // start the pool watcher 77 go p.watchJobs() 78 } 79 80 p.logger. 81 WithField("job-id", job.GetID()). 82 WithField("job-name", job.GetName()). 83 Trace("registered job") 84 p.jobs[job.GetID()] = job 85 return job.GetID() 86 } 87 88 func (p *Pool) setCronJob(job JobExecution) { 89 p.cronJobsMapLock.Lock() 90 defer p.cronJobsMapLock.Unlock() 91 p.cronJobs[job.GetID()] = job 92 } 93 94 func (p *Pool) getCronJob(jobID string) (JobExecution, bool) { 95 p.cronJobsMapLock.Lock() 96 defer p.cronJobsMapLock.Unlock() 97 value, exists := p.cronJobs[jobID] 98 return value, exists 99 } 100 101 func (p *Pool) getCronJobs() map[string]JobExecution { 102 p.cronJobsMapLock.Lock() 103 defer p.cronJobsMapLock.Unlock() 104 105 // Create the target map 106 newMap := make(map[string]JobExecution) 107 108 // Copy from the original map to the target map to avoid race conditions 109 for key, value := range p.cronJobs { 110 newMap[key] = value 111 } 112 return newMap 113 } 114 115 func (p *Pool) setDetachedCronJob(job JobExecution) { 116 p.detachedCronJobsMapLock.Lock() 117 defer p.detachedCronJobsMapLock.Unlock() 118 p.detachedCronJobs[job.GetID()] = job 119 } 120 121 func (p *Pool) getDetachedCronJob(jobID string) (JobExecution, bool) { 122 p.detachedCronJobsMapLock.Lock() 123 defer p.detachedCronJobsMapLock.Unlock() 124 value, exists := p.detachedCronJobs[jobID] 125 return value, exists 126 } 127 128 // recordCronJob - Adds a job to the cron jobs map 129 func (p *Pool) recordCronJob(job JobExecution) string { 130 p.setCronJob(job) 131 p.logger.Tracef("added new cron job, now running %v cron jobs", len(p.cronJobs)) 132 return p.recordJob(job) 133 } 134 135 // recordDetachedCronJob - Adds a job to the detached cron jobs map 136 func (p *Pool) recordDetachedCronJob(job JobExecution) string { 137 p.setDetachedCronJob(job) 138 p.logger.Tracef("added new cron job, now running %v detached cron jobs", len(p.detachedCronJobs)) 139 return p.recordJob(job) 140 } 141 142 // recordJob - Removes the specified job from jobs map 143 func (p *Pool) removeJob(jobID string) { 144 p.jobsMapLock.Lock() 145 job, ok := p.jobs[jobID] 146 if ok { 147 job.stop() 148 delete(p.jobs, jobID) 149 } 150 p.jobsMapLock.Unlock() 151 152 // remove from cron jobs, if present 153 _, found := p.getCronJob(jobID) 154 p.cronJobsMapLock.Lock() 155 if found { 156 delete(p.cronJobs, jobID) 157 } 158 p.cronJobsMapLock.Unlock() 159 160 // remove from detached cron jobs, if present 161 _, found = p.getDetachedCronJob(jobID) 162 p.detachedCronJobsMapLock.Lock() 163 if found { 164 delete(p.detachedCronJobs, jobID) 165 } 166 p.detachedCronJobsMapLock.Unlock() 167 } 168 169 // RegisterSingleRunJob - Runs a single run job 170 func (p *Pool) RegisterSingleRunJob(newJob Job) (string, error) { 171 return p.RegisterSingleRunJobWithName(newJob, JobTypeSingleRun) 172 } 173 174 // RegisterSingleRunJobWithName - Runs a single run job 175 func (p *Pool) RegisterSingleRunJobWithName(newJob Job, name string) (string, error) { 176 job, err := newBaseJob(newJob, p.failJobChan, name) 177 if err != nil { 178 return "", err 179 } 180 return p.recordJob(job), nil 181 } 182 183 // RegisterIntervalJob - Runs a job with a specific interval between each run 184 func (p *Pool) RegisterIntervalJob(newJob Job, interval time.Duration, opts ...jobOpt) (string, error) { 185 return p.RegisterIntervalJobWithName(newJob, interval, JobTypeInterval, opts...) 186 } 187 188 // RegisterIntervalJobWithName - Runs a job with a specific interval between each run 189 func (p *Pool) RegisterIntervalJobWithName(newJob Job, interval time.Duration, name string, opts ...jobOpt) (string, error) { 190 job, err := newIntervalJob(newJob, interval, name, p.failJobChan, opts...) 191 if err != nil { 192 return "", err 193 } 194 return p.recordCronJob(job), nil 195 } 196 197 // RegisterChannelJob - Runs a job with a specific interval between each run 198 func (p *Pool) RegisterChannelJob(newJob Job, stopChan chan interface{}) (string, error) { 199 return p.RegisterChannelJobWithName(newJob, stopChan, JobTypeChannel) 200 } 201 202 // RegisterChannelJobWithName - Runs a job with a specific interval between each run 203 func (p *Pool) RegisterChannelJobWithName(newJob Job, stopChan chan interface{}, name string) (string, error) { 204 job, err := newChannelJob(newJob, stopChan, name, p.failJobChan) 205 if err != nil { 206 return "", err 207 } 208 return p.recordCronJob(job), nil 209 } 210 211 // RegisterDetachedChannelJob - Runs a job with a stop channel, detached from other jobs 212 func (p *Pool) RegisterDetachedChannelJob(newJob Job, stopChan chan interface{}) (string, error) { 213 return p.RegisterDetachedChannelJobWithName(newJob, stopChan, JobTypeDetachedChannel) 214 } 215 216 // RegisterDetachedChannelJobWithName - Runs a named job with a stop channel, detached from other jobs 217 func (p *Pool) RegisterDetachedChannelJobWithName(newJob Job, stopChan chan interface{}, name string) (string, error) { 218 job, err := newDetachedChannelJob(newJob, stopChan, name, p.failJobChan) 219 if err != nil { 220 return "", err 221 } 222 return p.recordDetachedCronJob(job), nil 223 } 224 225 // RegisterDetachedIntervalJob - Runs a job with a specific interval between each run, detached from other jobs 226 func (p *Pool) RegisterDetachedIntervalJob(newJob Job, interval time.Duration, opts ...jobOpt) (string, error) { 227 return p.RegisterDetachedIntervalJobWithName(newJob, interval, JobTypeDetachedInterval, opts...) 228 } 229 230 // RegisterDetachedIntervalJobWithName - Runs a job with a specific interval between each run, detached from other jobs 231 func (p *Pool) RegisterDetachedIntervalJobWithName(newJob Job, interval time.Duration, name string, opts ...jobOpt) (string, error) { 232 job, err := newDetachedIntervalJob(newJob, interval, name, opts...) 233 if err != nil { 234 return "", err 235 } 236 return p.recordDetachedCronJob(job), nil 237 } 238 239 // RegisterScheduledJob - Runs a job on a specific schedule 240 func (p *Pool) RegisterScheduledJob(newJob Job, schedule string, opts ...jobOpt) (string, error) { 241 return p.RegisterScheduledJobWithName(newJob, schedule, JobTypeScheduled, opts...) 242 } 243 244 // RegisterScheduledJobWithName - Runs a job on a specific schedule 245 func (p *Pool) RegisterScheduledJobWithName(newJob Job, schedule, name string, opts ...jobOpt) (string, error) { 246 job, err := newScheduledJob(newJob, schedule, name, p.failJobChan, opts...) 247 if err != nil { 248 return "", err 249 } 250 return p.recordCronJob(job), nil 251 } 252 253 // RegisterRetryJob - Runs a job with a limited number of retries 254 func (p *Pool) RegisterRetryJob(newJob Job, retries int) (string, error) { 255 return p.RegisterRetryJobWithName(newJob, retries, JobTypeRetry) 256 } 257 258 // RegisterRetryJobWithName - Runs a job with a limited number of retries 259 func (p *Pool) RegisterRetryJobWithName(newJob Job, retries int, name string) (string, error) { 260 job, err := newRetryJob(newJob, retries, name, p.failJobChan) 261 if err != nil { 262 return "", err 263 } 264 return p.recordJob(job), nil 265 } 266 267 // UnregisterJob - Removes the specified job 268 func (p *Pool) UnregisterJob(jobID string) { 269 p.removeJob(jobID) 270 } 271 272 // GetJob - Returns the Job based on the id 273 func (p *Pool) GetJob(id string) JobExecution { 274 return p.jobs[id].GetJob() 275 } 276 277 // JobLock - Locks the job, returns when the lock is granted 278 func (p *Pool) JobLock(id string) { 279 p.jobs[id].Lock() 280 } 281 282 // JobUnlock - Unlocks the job 283 func (p *Pool) JobUnlock(id string) { 284 p.jobs[id].Unlock() 285 } 286 287 func (p *Pool) getFailedJob() string { 288 p.failedJobLock.Lock() 289 defer p.failedJobLock.Unlock() 290 return p.failedJob 291 } 292 293 func (p *Pool) setFailedJob(job string) { 294 p.failedJobLock.Lock() 295 defer p.failedJobLock.Unlock() 296 p.failedJob = job 297 } 298 299 // GetJobStatus - Returns the Status of the Job based on the id 300 func (p *Pool) GetJobStatus(id string) string { 301 return p.jobs[id].GetStatus().String() 302 } 303 304 // GetStatus - returns the status of the pool of jobs 305 func (p *Pool) GetStatus() string { 306 p.poolStatusLock.Lock() 307 defer p.poolStatusLock.Unlock() 308 return p.poolStatus.String() 309 } 310 311 // SetStatus - Sets the status of the pool of jobs 312 func (p *Pool) SetStatus(status PoolStatus) { 313 p.poolStatusLock.Lock() 314 defer p.poolStatusLock.Unlock() 315 p.poolStatus = status 316 } 317 318 // waits with timeout for the specified status in all cron jobs 319 func (p *Pool) waitStartStop(jobStatus JobStatus) bool { 320 ctx, cancel := context.WithTimeout(context.Background(), getStatusCheckInterval()) 321 defer cancel() 322 323 done := make(chan bool) 324 go func() { 325 for { 326 running := true 327 for _, job := range p.getCronJobs() { 328 if job.GetStatus() != jobStatus { 329 running = false 330 } 331 } 332 if running { 333 done <- true 334 break 335 } 336 time.Sleep(10 * time.Millisecond) 337 } 338 }() 339 340 select { 341 case b := <-done: 342 return b 343 case <-ctx.Done(): 344 return false 345 } 346 } 347 348 func (p *Pool) setIsStartStop(isStartStop bool) { 349 p.isStartStopLock.Lock() 350 defer p.isStartStopLock.Unlock() 351 p.isStartStopping = isStartStop 352 } 353 354 func (p *Pool) getIsStartStop() bool { 355 p.isStartStopLock.Lock() 356 defer p.isStartStopLock.Unlock() 357 return p.isStartStopping 358 } 359 360 // startAll - starts all jobs defined in the cronJobs map, used by watchJobs 361 // 362 // other jobs are single run and never restarted 363 // returns true when successful, false when not 364 func (p *Pool) startAll() bool { 365 p.stopAll() 366 367 // Check that all are ready before starting 368 p.logger.Debug("Checking for all cron jobs to be ready") 369 for _, job := range p.getCronJobs() { 370 if !job.Ready() { 371 p.logger.WithField("job-id", job.GetID()).Debugf("job is not ready") 372 return false 373 } 374 } 375 p.logger.Debug("Starting all cron jobs") 376 for _, job := range p.getCronJobs() { 377 go job.start() 378 } 379 380 if p.waitStartStop(JobStatusRunning) { 381 p.SetStatus(PoolStatusRunning) 382 } 383 384 return true 385 } 386 387 // stopAll - stops all jobs defined in the cronJobs map, used by watchJobs 388 // 389 // other jobs are single run and should not need stopped 390 func (p *Pool) stopAll() { 391 p.logger.Debug("Stopping all cron jobs") 392 393 // Must do the map copy so that the loop can run without a race condition. 394 // Can NOT do a defer on this unlock, or will get stuck 395 mapCopy := make(map[string]JobExecution) 396 for key, value := range p.getCronJobs() { 397 mapCopy[key] = value 398 } 399 for _, job := range mapCopy { 400 p.logger.WithField("job-name", job.GetName()).Trace("stopping job") 401 job.stop() 402 p.logger.WithField("job-name", job.GetName()).Tracef("finished stopping job") 403 } 404 405 if p.waitStartStop(JobStatusStopped) { 406 p.SetStatus(PoolStatusStopped) 407 } 408 } 409 410 // jobChecker - regularly checks the status of cron jobs, stopping jobs if error returned 411 func (p *Pool) jobChecker() { 412 ticker := time.NewTicker(getStatusCheckInterval()) 413 defer ticker.Stop() 414 for { 415 select { 416 case <-ticker.C: 417 go func() { 418 failedJob := "" 419 for _, job := range p.getCronJobs() { 420 job.updateStatus() 421 if job.GetStatus() != JobStatusRunning { 422 failedJob = job.GetID() 423 break 424 } 425 } 426 427 if !p.getIsStartStop() { 428 if failedJob != "" { 429 p.failJobChan <- failedJob 430 } else { 431 p.SetStatus(PoolStatusRunning) 432 } 433 } 434 }() 435 case failedJob := <-p.failJobChan: 436 p.setFailedJob(failedJob) // this is the job for the current fail loop 437 p.stopJobsChan <- true 438 p.SetStatus(PoolStatusStopped) 439 } 440 } 441 } 442 443 func (p *Pool) stopPool() { 444 p.startStopLock.Lock() 445 defer p.startStopLock.Unlock() 446 447 p.setIsStartStop(true) 448 defer p.setIsStartStop(false) 449 p.stopAll() 450 } 451 452 func (p *Pool) startPool() { 453 p.startStopLock.Lock() 454 defer p.startStopLock.Unlock() 455 456 if p.GetStatus() == PoolStatusStopped.String() { 457 p.setIsStartStop(true) 458 defer p.setIsStartStop(false) 459 // attempt to restart all jobs 460 if p.startAll() { 461 p.getBackoff().reset() 462 } else { 463 p.getBackoff().increaseTimeout() 464 } 465 p.setFailedJob("") 466 } 467 } 468 469 // watchJobs - the main loop of a pool of jobs, constantly checks for status of jobs and acts accordingly 470 func (p *Pool) watchJobs() { 471 p.SetStatus(PoolStatusRunning) 472 ticker := time.NewTicker(p.getBackoff().getCurrentTimeout()) 473 defer ticker.Stop() 474 for { 475 select { 476 case <-p.stopJobsChan: 477 if job, found := p.getCronJob(p.getFailedJob()); found { 478 p.logger. 479 WithField("job-name", job.GetName()). 480 WithField("failed-job", p.getFailedJob()). 481 Debug("Job failed, stop all jobs") 482 } 483 p.stopPool() 484 case <-ticker.C: 485 p.startPool() 486 ticker = time.NewTicker(p.getBackoff().getCurrentTimeout()) 487 p.logger. 488 WithField("interval", p.getBackoff().getCurrentTimeout()). 489 Trace("setting next job restart backoff interval") 490 } 491 } 492 }