gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerjobgeneric.go (about)

     1  package renter
     2  
     3  import (
     4  	"container/list"
     5  	"context"
     6  	"sync"
     7  	"time"
     8  
     9  	"gitlab.com/NebulousLabs/errors"
    10  )
    11  
// errDiscardingCanceledJob is the error handed to a job's callDiscard when
// callNext pulls the job off the queue and finds that it was already canceled.
var errDiscardingCanceledJob = errors.New("callNext: skipping and discarding already canceled job")
    13  
    14  type (
	// jobGeneric implements the basic functionality for a job.
	jobGeneric struct {
		// staticCtx is the context of the job. staticCanceled reports the job
		// as canceled once this context is done.
		staticCtx context.Context

		// staticQueue is the queue that the job is associated with.
		staticQueue workerJobQueue

		// staticMetadata is a generic field on the job that can be set and
		// casted by implementations of a job
		staticMetadata interface{}

		// externExecuteTime is set when the job gets executed
		//
		// NOTE: the 'extern' prefix is used here even though the field is not
		// governed by a mutex, it can be accessed by anyone since a job is not
		// used from more than one thread - it is not static because it's not
		// set on initialization but rather by the implementation of callExecute
		externExecuteTime time.Time
	}
    33  
	// jobGenericQueue is a generic queue for a job. It has a mutex, references
	// a worker, tracks whether or not it has been killed, has a cooldown
	// timer, and keeps the queued jobs in a linked list whose element values
	// are workerJob objects.
	jobGenericQueue struct {
		// jobs holds the queued jobs in FIFO order: add pushes to the back
		// and callNext pops from the front.
		jobs *list.List

		// killed is set by callKill; once set, add refuses new jobs.
		killed bool

		// cooldownUntil is the time until which the queue is on cooldown. It
		// is set by callReportFailure when the queue is not already on
		// cooldown, and add refuses new jobs while it lies in the future.
		cooldownUntil time.Time
		// consecutiveFailures is incremented by callReportFailure and reset
		// to zero by callReportSuccess.
		consecutiveFailures uint64

		// firstFailureTime is set by a failed job in case that job was executed
		// after the current value, it is cleared by a successful job, it's used
		// to conditionally increment the consecutive failures field, rather
		// than having parallelly executed failed jobs all increment it
		firstFailureTime time.Time

		// recentErr is the most recent error passed to callReportFailure,
		// with added context, and recentErrTime is the time at which it was
		// recorded. Neither is cleared by callReportSuccess.
		recentErr     error
		recentErrTime time.Time

		staticWorkerObj *worker // name conflict with staticWorker method
		// mu is acquired by the 'call' prefixed methods before they read or
		// write the non-static fields above.
		mu sync.Mutex
	}
    58  
	// workerJob defines a job that the worker is able to perform.
	workerJob interface {
		// callDiscard will discard this job, sending an error down the response
		// channel of the job. The provided error should be part of the error
		// that gets sent.
		callDiscard(error)

		// callExecute will run the actual job.
		callExecute() error

		// callExpectedBandwidth will return the amount of bandwidth that a job
		// expects to consume.
		callExpectedBandwidth() (upload uint64, download uint64)

		// staticGetMetadata returns a metadata object.
		staticGetMetadata() interface{}

		// staticCanceled returns true if the job has been canceled, false
		// otherwise.
		staticCanceled() bool
	}
    80  
	// workerJobQueue defines the interface that a worker job queue has to
	// implement.
	workerJobQueue interface {
		// callDiscardAll will discard all of the jobs in the queue using the
		// provided error.
		callDiscardAll(error)

		// callReportFailure should be called on the queue every time that a job
		// fails, and include the error associated with the failure followed by
		// two timestamps. In the jobGenericQueue implementation these are the
		// time at which the job was executed and the time at which it failed,
		// in that order.
		callReportFailure(error, time.Time, time.Time)

		// callReportSuccess should be called on the queue every time that a job
		// succeeds.
		callReportSuccess()

		// callStatus returns the status of the queue
		callStatus() workerJobQueueStatus

		// staticWorker will return the worker of the job queue.
		staticWorker() *worker
	}
   102  
	// workerJobQueueStatus is a struct that reflects the status of the queue.
	// jobGenericQueue.callStatus fills it with a snapshot of the queue's
	// fields.
	workerJobQueueStatus struct {
		// size is the number of jobs in the queue.
		size uint64
		// cooldownUntil is the queue's cooldown deadline.
		cooldownUntil time.Time
		// consecutiveFailures is the queue's consecutive failure counter.
		consecutiveFailures uint64
		// recentErr and recentErrTime are the most recently reported failure
		// and the time at which it was recorded.
		recentErr     error
		recentErrTime time.Time
	}
   111  )
   112  
   113  // newJobGeneric returns an initialized jobGeneric. The queue that is associated
   114  // with the job should be used as the input to this function. The job will
   115  // cancel itself if the cancelChan is closed.
   116  func newJobGeneric(ctx context.Context, queue workerJobQueue, metadata interface{}) jobGeneric {
   117  	return jobGeneric{
   118  		staticCtx:      ctx,
   119  		staticQueue:    queue,
   120  		staticMetadata: metadata,
   121  	}
   122  }
   123  
   124  // newJobGenericQueue will return an initialized generic job queue.
   125  func newJobGenericQueue(w *worker) *jobGenericQueue {
   126  	return &jobGenericQueue{
   127  		jobs:            list.New(),
   128  		staticWorkerObj: w,
   129  	}
   130  }
   131  
   132  // staticCanceled returns whether or not the job has been canceled.
   133  func (j *jobGeneric) staticCanceled() bool {
   134  	select {
   135  	case <-j.staticCtx.Done():
   136  		return true
   137  	default:
   138  		return false
   139  	}
   140  }
   141  
   142  // staticGetMetadata returns the job's metadata.
   143  func (j *jobGeneric) staticGetMetadata() interface{} {
   144  	return j.staticMetadata
   145  }
   146  
   147  // add will add a job to the queue.
   148  func (jq *jobGenericQueue) add(j workerJob) bool {
   149  	if jq.killed || jq.onCooldown() {
   150  		return false
   151  	}
   152  	jq.jobs.PushBack(j)
   153  	jq.staticWorkerObj.staticWake()
   154  	return true
   155  }
   156  
   157  // callAdd will add a job to the queue.
   158  func (jq *jobGenericQueue) callAdd(j workerJob) bool {
   159  	jq.mu.Lock()
   160  	defer jq.mu.Unlock()
   161  	return jq.add(j)
   162  }
   163  
   164  // callCooldownStatus returns all necessary information to present the queues' cooldown status.
   165  func (jq *jobGenericQueue) callCooldownStatus() (bool, bool, int, time.Duration, string) {
   166  	jq.mu.Lock()
   167  	defer jq.mu.Unlock()
   168  
   169  	var coolDownErrStr string
   170  	if jq.onCooldown() && jq.recentErr != nil {
   171  		coolDownErrStr = jq.recentErr.Error()
   172  	}
   173  
   174  	var coolDownUntil time.Duration
   175  	if jq.onCooldown() {
   176  		coolDownUntil = time.Until(jq.cooldownUntil)
   177  	}
   178  
   179  	return jq.onCooldown(), jq.killed, jq.jobs.Len(), coolDownUntil, coolDownErrStr
   180  }
   181  
   182  // callDiscardAll will discard all jobs in the queue using the provided error.
   183  func (jq *jobGenericQueue) callDiscardAll(err error) {
   184  	jq.mu.Lock()
   185  	defer jq.mu.Unlock()
   186  	jq.discardAll(err)
   187  }
   188  
   189  // callKill will kill the queue, discarding all jobs and ensuring no more jobs
   190  // can be added.
   191  func (jq *jobGenericQueue) callKill() {
   192  	jq.mu.Lock()
   193  	defer jq.mu.Unlock()
   194  
   195  	err := errors.New("worker is being killed")
   196  	jq.discardAll(err)
   197  	jq.killed = true
   198  }
   199  
   200  // callIsKilled returns whether or not the jobGenericQueue was killed or not
   201  func (jq *jobGenericQueue) callIsKilled() bool {
   202  	jq.mu.Lock()
   203  	defer jq.mu.Unlock()
   204  	return jq.killed
   205  }
   206  
   207  // callLen returns the number of jobs in the queue.
   208  func (jq *jobGenericQueue) callLen() int {
   209  	jq.mu.Lock()
   210  	defer jq.mu.Unlock()
   211  	return jq.jobs.Len()
   212  }
   213  
   214  // callNext returns the next job in the worker queue. If there is no job in the
   215  // queue, 'nil' will be returned.
   216  func (jq *jobGenericQueue) callNext() workerJob {
   217  	jq.mu.Lock()
   218  	defer jq.mu.Unlock()
   219  
   220  	// Loop through the jobs, looking for the first job that hasn't yet been
   221  	// canceled. Remove jobs from the queue along the way.
   222  	for job := jq.jobs.Front(); job != nil; job = job.Next() {
   223  		// Remove the job from the list.
   224  		jq.jobs.Remove(job)
   225  
   226  		// Check if the job is already canceled.
   227  		wj := job.Value.(workerJob)
   228  		if wj.staticCanceled() {
   229  			wj.callDiscard(errDiscardingCanceledJob)
   230  			continue
   231  		}
   232  		return wj
   233  	}
   234  
   235  	// Job queue is empty, return nil.
   236  	return nil
   237  }
   238  
   239  // callOnCooldown returns whether the queue is on cooldown.
   240  func (jq *jobGenericQueue) callOnCooldown() bool {
   241  	jq.mu.Lock()
   242  	defer jq.mu.Unlock()
   243  	return jq.onCooldown()
   244  }
   245  
   246  // callReportFailure reports that a job has failed within the queue. This will
   247  // cause all remaining jobs in the queue to be discarded, and will put the queue
   248  // on cooldown.
   249  func (jq *jobGenericQueue) callReportFailure(err error, executedAt, failedAt time.Time) {
   250  	jq.mu.Lock()
   251  	defer jq.mu.Unlock()
   252  
   253  	// only update the cooldown if we're currently not on cooldown
   254  	if !jq.onCooldown() {
   255  		jq.cooldownUntil = cooldownUntil(jq.consecutiveFailures)
   256  	}
   257  
   258  	jq.recentErr = errors.AddContext(err, "discarding all jobs in this queue and going on cooldown")
   259  	jq.recentErrTime = time.Now()
   260  
   261  	// discard all jobs in the queue
   262  	jq.discardAll(jq.recentErr)
   263  
   264  	// only if the job was executed after the time of the first failure we want
   265  	// to count it as a consective failure, when that is the case we also want
   266  	// to update the time of the first failure to the current time
   267  	//
   268  	// NOTE: this is to ensure multiple concurrent jobs that fail at about the
   269  	// same time don't all count towards the consecutive failures, causing the
   270  	// cooldown to go from zero to max immediately
   271  	if executedAt.After(jq.firstFailureTime) {
   272  		jq.consecutiveFailures++
   273  		jq.firstFailureTime = failedAt
   274  	}
   275  }
   276  
   277  // callReportSuccess lets the job queue know that there was a successsful job.
   278  // Note that this will reset the consecutive failure count, but will not reset
   279  // the recentErr value - the recentErr value is left as an error so that when
   280  // debugging later, developers and users can see what errors had been caused by
   281  // past issues.
   282  func (jq *jobGenericQueue) callReportSuccess() {
   283  	jq.mu.Lock()
   284  	jq.consecutiveFailures = 0
   285  	jq.firstFailureTime = time.Time{}
   286  	jq.mu.Unlock()
   287  }
   288  
   289  // callStatus returns the queue status
   290  func (jq *jobGenericQueue) callStatus() workerJobQueueStatus {
   291  	jq.mu.Lock()
   292  	defer jq.mu.Unlock()
   293  	return workerJobQueueStatus{
   294  		size:                uint64(jq.jobs.Len()),
   295  		cooldownUntil:       jq.cooldownUntil,
   296  		consecutiveFailures: jq.consecutiveFailures,
   297  		recentErr:           jq.recentErr,
   298  		recentErrTime:       jq.recentErrTime,
   299  	}
   300  }
   301  
   302  // discardAll will drop all jobs from the queue.
   303  func (jq *jobGenericQueue) discardAll(err error) {
   304  	for job := jq.jobs.Front(); job != nil; job = job.Next() {
   305  		wj := job.Value.(workerJob)
   306  		wj.callDiscard(err)
   307  	}
   308  	jq.jobs = list.New()
   309  }
   310  
   311  // staticWorker will return the worker that is associated with this job queue.
   312  func (jq *jobGenericQueue) staticWorker() *worker {
   313  	return jq.staticWorkerObj
   314  }
   315  
   316  // onCooldown returns whether the queue is on cooldown.
   317  func (jq *jobGenericQueue) onCooldown() bool {
   318  	return time.Now().Before(jq.cooldownUntil)
   319  }