gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerloop.go (about)

     1  package renter
     2  
     3  import (
     4  	"sync/atomic"
     5  	"time"
     6  
     7  	"gitlab.com/SkynetLabs/skyd/build"
     8  
     9  	"gitlab.com/NebulousLabs/errors"
    10  )
    11  
    12  var (
    13  	errOnMaintenanceCooldown = errors.New("the worker account is on cooldown")
    14  	errInvalidPriceTable     = errors.New("price table with host is no longer valid")
    15  )
    16  
    17  type (
    18  	// workerLoopState tracks the state of the worker loop.
    19  	workerLoopState struct {
    20  		// Variables to count the number of jobs running. Note that these
    21  		// variables can only be incremented in the primary work loop of the
    22  		// worker, because there are blocking conditions within the primary work
    23  		// loop that need to know only one thread is running at a time, and
    24  		// safety is derived from knowing that no new threads are launching
    25  		// while we are waiting for all existing threads to finish.
    26  		//
    27  		// These values can be decremented in a goroutine.
    28  		atomicAsyncJobsRunning uint64
    29  		atomicSerialJobRunning uint64
    30  
    31  		// atomicSuspectRevisionMismatch indicates that the worker encountered
    32  		// some error where it believes that it needs to resync its contract
    33  		// with the host.
    34  		atomicSuspectRevisionMismatch uint64
    35  
    36  		// Variables to track the total amount of async data outstanding. This
    37  		// indicates the total amount of data that we expect to use from async
    38  		// jobs that we have submitted for the worker.
    39  		atomicReadDataOutstanding  uint64
    40  		atomicWriteDataOutstanding uint64
    41  
    42  		// The read data limit and the write data limit define how much work is
    43  		// allowed to be outstanding before new jobs will be blocked from being
    44  		// launched async.
    45  		atomicReadDataLimit  uint64
    46  		atomicWriteDataLimit uint64
    47  	}
    48  )
    49  
    50  // staticSerialJobRunning indicates whether a serial job is currently running
    51  // for the worker.
    52  func (wls *workerLoopState) staticSerialJobRunning() bool {
    53  	return atomic.LoadUint64(&wls.atomicSerialJobRunning) == 1
    54  }
    55  
    56  // externLaunchSerialJob will launch a serial job for the worker, ensuring that
    57  // exclusivity is handled correctly.
    58  //
    59  // The 'extern' indicates that this function is only allowed to be called from
    60  // 'threadedWorkLoop', and it is expected that only one instance of
    61  // 'threadedWorkLoop' is ever created per-worker.
    62  func (w *worker) externLaunchSerialJob(job func() error) {
    63  	// Mark that there is now a job running. Only one job may be running at a
    64  	// time.
    65  	ok := atomic.CompareAndSwapUint64(&w.staticLoopState.atomicSerialJobRunning, 0, 1)
    66  	if !ok {
    67  		// There already is a job running. This is not allowed.
    68  		w.staticRenter.staticLog.Critical("running a job when another job is already running")
    69  	}
    70  
    71  	fn := func() {
    72  		// Execute the job in a goroutine.
    73  		err := job()
    74  		if err != nil {
    75  			w.staticHandleError(err)
    76  		}
    77  
    78  		// After the job has executed, update to indicate that no serial job
    79  		// is running.
    80  		atomic.StoreUint64(&w.staticLoopState.atomicSerialJobRunning, 0)
    81  		// After updating to indicate that no serial job is running, wake the
    82  		// worker to check for a new serial job.
    83  		w.staticWake()
    84  	}
    85  	err := w.staticTG.Launch(fn)
    86  	if err != nil {
    87  		// Renter has closed, job will not be executed.
    88  		atomic.StoreUint64(&w.staticLoopState.atomicSerialJobRunning, 0)
    89  		return
    90  	}
    91  }
    92  
    93  // externTryLaunchSerialJob will attempt to launch a serial job on the worker.
    94  // Only one serial job is allowed to be running at a time (each serial job
    95  // requires exclusive access to the worker's contract). If there is already a
    96  // serial job running, nothing will happen.
    97  //
    98  // The 'extern' indicates that this function is only allowed to be called from
    99  // 'threadedWorkLoop', and it is expected that only one instance of
   100  // 'threadedWorkLoop' is ever created per-worker.
   101  func (w *worker) externTryLaunchSerialJob() {
   102  	// Return if the renter's shutting down
   103  	if w.staticIsShuttingDown() {
   104  		return
   105  	}
   106  
   107  	// If there is already a serial job running, that job has exclusivity, do
   108  	// nothing.
   109  	if w.staticLoopState.staticSerialJobRunning() {
   110  		return
   111  	}
   112  
   113  	// Perform a disrupt for testing. See the implementation in
   114  	// workerloop_test.go for more info.
   115  	if w.staticRenter.staticDeps.Disrupt("TestJobSerialExecution") {
   116  		return
   117  	}
   118  
   119  	// Check every potential serial job that the worker may be required to
   120  	// perform. This scheduling allows a flood of jobs earlier in the list to
   121  	// starve out jobs later in the list. At some point we will probably
   122  	// revisit this to try and address the starvation issue.
   123  	job := w.staticJobRenewQueue.callNext()
   124  	if job != nil {
   125  		w.externLaunchSerialJob(job.callExecute)
   126  		return
   127  	}
   128  	if w.managedNeedsToUpdatePriceTable() {
   129  		w.externLaunchSerialJob(w.staticUpdatePriceTable)
   130  		return
   131  	}
   132  	if w.managedNeedsToRefillAccount() {
   133  		w.externLaunchSerialJob(w.managedRefillAccount)
   134  		return
   135  	}
   136  	job = w.staticJobUploadSnapshotQueue.callNext()
   137  	if job != nil {
   138  		w.externLaunchSerialJob(job.callExecute)
   139  		return
   140  	}
   141  	job = w.staticJobDownloadSnapshotQueue.callNext()
   142  	if job != nil {
   143  		w.externLaunchSerialJob(job.callExecute)
   144  		return
   145  	}
   146  	job = w.staticJobUploadSnapshotQueue.callNext()
   147  	if job != nil {
   148  		w.externLaunchSerialJob(job.callExecute)
   149  		return
   150  	}
   151  	if w.managedHasUploadJob() {
   152  		w.externLaunchSerialJob(w.managedPerformUploadChunkJob)
   153  		return
   154  	}
   155  }
   156  
   157  // externLaunchAsyncJob accepts a function to retrieve a job and then uses that
   158  // to retrieve a job and launch it. The bandwidth consumption will be updated as
   159  // the job starts and finishes.
   160  func (w *worker) externLaunchAsyncJob(job workerJob) bool {
   161  	// Add the resource requirements to the worker loop state. Also add this
   162  	// thread to the number of jobs running.
   163  	uploadBandwidth, downloadBandwidth := job.callExpectedBandwidth()
   164  	atomic.AddUint64(&w.staticLoopState.atomicReadDataOutstanding, downloadBandwidth)
   165  	atomic.AddUint64(&w.staticLoopState.atomicWriteDataOutstanding, uploadBandwidth)
   166  	atomic.AddUint64(&w.staticLoopState.atomicAsyncJobsRunning, 1)
   167  	fn := func() {
   168  		err := job.callExecute()
   169  		if err != nil {
   170  			w.staticHandleError(err)
   171  		}
   172  
   173  		// Subtract the outstanding data now that the job is complete. Atomic
   174  		// subtraction works by adding and using some bit tricks.
   175  		atomic.AddUint64(&w.staticLoopState.atomicReadDataOutstanding, -downloadBandwidth)
   176  		atomic.AddUint64(&w.staticLoopState.atomicWriteDataOutstanding, -uploadBandwidth)
   177  		atomic.AddUint64(&w.staticLoopState.atomicAsyncJobsRunning, ^uint64(0)) // subtract 1
   178  		// Wake the worker to run any additional async jobs that may have been
   179  		// blocked / ignored because there was not enough bandwidth available.
   180  		w.staticWake()
   181  	}
   182  	err := w.staticTG.Launch(fn)
   183  	if err != nil {
   184  		// Renter has closed, but we want to represent that the work was
   185  		// processed anyway - returning true indicates that the worker should
   186  		// continue processing jobs.
   187  		atomic.AddUint64(&w.staticLoopState.atomicReadDataOutstanding, -downloadBandwidth)
   188  		atomic.AddUint64(&w.staticLoopState.atomicWriteDataOutstanding, -uploadBandwidth)
   189  		atomic.AddUint64(&w.staticLoopState.atomicAsyncJobsRunning, ^uint64(0)) // subtract 1
   190  		return true
   191  	}
   192  	return true
   193  }
   194  
   195  // managedAsyncReady will return 'false' if any of the key requirements for
   196  // performing async work have not been met. 'true' will be returned if the
   197  // worker is ready for async work.
   198  func (w *worker) managedAsyncReady() bool {
   199  	// A valid price table is required to perform async tasks.
   200  	if wpt := w.staticPriceTable(); !wpt.staticValid() {
   201  		w.managedDiscardAsyncJobs(errInvalidPriceTable)
   202  		return false
   203  	}
   204  
   205  	// RHP3 must not be on cooldown to perform async tasks.
   206  	if w.managedOnMaintenanceCooldown() {
   207  		w.managedDiscardAsyncJobs(errOnMaintenanceCooldown)
   208  		return false
   209  	}
   210  	return true
   211  }
   212  
   213  // externTryLaunchAsyncJob will look at the async jobs which are in the worker
   214  // queue and attempt to launch any that are ready. The job launcher will fail if
   215  // the price table is out of date or if the worker account is empty.
   216  //
   217  // The job launcher will also fail if the worker has too much work in jobs
   218  // already queued. Every time a job is launched, a bandwidth estimate is made.
   219  // The worker will not allow more than a certain amount of bandwidth to be
   220  // queued at once to prevent jobs from being spread too thin and sharing too
   221  // much bandwidth.
   222  func (w *worker) externTryLaunchAsyncJob() bool {
   223  	// Return if the renter's shutting down
   224  	if w.staticIsShuttingDown() {
   225  		return false
   226  	}
   227  
   228  	// Exit if the worker is not currently equipped to perform async tasks.
   229  	if !w.managedAsyncReady() {
   230  		return false
   231  	}
   232  
   233  	// Verify that the worker has not reached its limits for doing multiple
   234  	// jobs at once.
   235  	readLimit := atomic.LoadUint64(&w.staticLoopState.atomicReadDataLimit)
   236  	writeLimit := atomic.LoadUint64(&w.staticLoopState.atomicWriteDataLimit)
   237  	readOutstanding := atomic.LoadUint64(&w.staticLoopState.atomicReadDataOutstanding)
   238  	writeOutstanding := atomic.LoadUint64(&w.staticLoopState.atomicWriteDataOutstanding)
   239  	if readOutstanding > readLimit || writeOutstanding > writeLimit {
   240  		// Worker does not need to discard jobs, it is making progress, it's
   241  		// just not launching any new jobs until its current jobs finish up.
   242  		return false
   243  	}
   244  
   245  	// Perform a disrupt for testing. This is some code that ensures async job
   246  	// launches are controlled correctly. The disrupt operates on a mock worker,
   247  	// so it needs to happen after the ratelimit checks but before the cache,
   248  	// price table, and account checks.
   249  	if w.staticRenter.staticDeps.Disrupt("TestAsyncJobLaunches") {
   250  		return true
   251  	}
   252  
   253  	// Check every potential async job that can be launched.
   254  	job := w.staticJobHasSectorQueue.callNext()
   255  	if job != nil {
   256  		w.externLaunchAsyncJob(job)
   257  		return true
   258  	}
   259  	// Check if registry jobs are supported.
   260  	cache := w.staticCache()
   261  	if build.VersionCmp(cache.staticHostVersion, minRegistryVersion) >= 0 {
   262  		job = w.staticJobUpdateRegistryQueue.callNext()
   263  		if job != nil {
   264  			w.externLaunchAsyncJob(job)
   265  			return true
   266  		}
   267  		job = w.staticJobReadRegistryQueue.callNext()
   268  		if job != nil {
   269  			w.externLaunchAsyncJob(job)
   270  			return true
   271  		}
   272  	}
   273  	job = w.staticJobReadQueue.callNext()
   274  	if job != nil {
   275  		w.externLaunchAsyncJob(job)
   276  		return true
   277  	}
   278  	job = w.staticJobLowPrioReadQueue.callNext()
   279  	if job != nil {
   280  		w.externLaunchAsyncJob(job)
   281  		return true
   282  	}
   283  	return false
   284  }
   285  
   286  // managedBlockUntilReady will block until the worker has internet connectivity.
   287  // 'false' will be returned if a kill signal is received or if the renter is
   288  // shut down before internet connectivity is restored. 'true' will be returned
   289  // if internet connectivity is successfully restored.
   290  func (w *worker) managedBlockUntilReady() bool {
   291  	// Check internet connectivity. If the worker does not have internet
   292  	// connectivity, block until connectivity is restored.
   293  	for !w.staticRenter.staticGateway.Online() {
   294  		select {
   295  		case <-w.staticTG.StopChan():
   296  			return false
   297  		case <-time.After(offlineCheckFrequency):
   298  		}
   299  	}
   300  	return true
   301  }
   302  
// managedDiscardAsyncJobs will drop all of the worker's async jobs because the
// worker has not met sufficient conditions to retain async jobs. The provided
// error is forwarded to every async queue: has-sector, update-registry,
// read-registry, read and low-priority read.
func (w *worker) managedDiscardAsyncJobs(err error) {
	w.staticJobHasSectorQueue.callDiscardAll(err)
	w.staticJobUpdateRegistryQueue.callDiscardAll(err)
	w.staticJobReadRegistryQueue.callDiscardAll(err)
	w.staticJobReadQueue.callDiscardAll(err)
	w.staticJobLowPrioReadQueue.callDiscardAll(err)
}
   312  
   313  // threadedWorkLoop is a perpetual loop run by the worker that accepts new jobs
   314  // and performs them. Work is divided into two types of work, serial work and
   315  // async work. Serial work requires exclusive access to the worker's contract,
   316  // meaning that only one of these tasks can be performed at a time.  Async work
   317  // can be performed with high parallelism.
   318  func (w *worker) threadedWorkLoop() {
   319  	// Perform a disrupt for testing.
   320  	if w.staticRenter.staticDeps.Disrupt("DisableWorkerLoop") {
   321  		return
   322  	}
   323  
   324  	// Upon shutdown, release all jobs.
   325  	defer w.managedKillUploading()
   326  	defer w.staticJobLowPrioReadQueue.callKill()
   327  	defer w.staticJobHasSectorQueue.callKill()
   328  	defer w.staticJobUpdateRegistryQueue.callKill()
   329  	defer w.staticJobReadQueue.callKill()
   330  	defer w.staticJobDownloadSnapshotQueue.callKill()
   331  	defer w.staticJobUploadSnapshotQueue.callKill()
   332  
   333  	// Ensure the renter's revision number of the underlying file contract
   334  	// is in sync with the host's revision number. This check must happen at
   335  	// the top as consecutive checks make use of the file contract for
   336  	// payment.
   337  	w.externTryFixRevisionMismatch()
   338  
   339  	// The worker cannot execute any async tasks unless the price table of
   340  	// the host is known, the balance of the worker account is known, and
   341  	// the account has sufficient funds in it. This update is done as a
   342  	// blocking update to ensure nothing else runs until the price table is
   343  	// available.
   344  	w.staticUpdatePriceTable()
   345  
   346  	// Perform a balance check on the host and sync it to his version if
   347  	// necessary. This avoids running into MaxBalanceExceeded errors upon
   348  	// refill after an unclean shutdown.
   349  	if w.staticPriceTable().staticValid() {
   350  		w.staticHandleError(w.externSyncAccountBalanceToHost(false))
   351  	}
   352  
   353  	// This update is done as a blocking update to ensure nothing else runs
   354  	// until the account has filled.
   355  	if w.managedNeedsToRefillAccount() {
   356  		w.staticHandleError(w.managedRefillAccount())
   357  	}
   358  
   359  	// The worker will continuously perform jobs in a loop.
   360  	for {
   361  		// Ensure the threadgroup is not stopped at the start of the iteration.
   362  		select {
   363  		case <-w.staticTG.StopChan():
   364  			return
   365  		default:
   366  		}
   367  
   368  		// There are certain conditions under which the worker should either
   369  		// block or exit. This function will block until those conditions are
   370  		// met, returning 'true' when the worker can proceed and 'false' if the
   371  		// worker should exit.
   372  		if !w.managedBlockUntilReady() {
   373  			return
   374  		}
   375  
   376  		// Try and fix a revision number mismatch if the flag is set. This will
   377  		// be the case if other processes errored out with an error indicating a
   378  		// mismatch.
   379  		if w.staticSuspectRevisionMismatch() {
   380  			w.externTryFixRevisionMismatch()
   381  		}
   382  
   383  		// Update the worker cache object, note that we do this after trying to
   384  		// sync the revision as that might influence the contract, which is used
   385  		// to build the cache object.
   386  		w.staticTryUpdateCache()
   387  
   388  		// If the worker needs to sync the account balance, perform a sync
   389  		// operation. This should be attempted before launching any jobs.
   390  		needsToSync, forced := w.managedNeedsToSyncAccountBalanceToHost()
   391  		if needsToSync {
   392  			w.staticHandleError(w.externSyncAccountBalanceToHost(forced))
   393  		}
   394  
   395  		// Attempt to launch a serial job. If there is already a job running,
   396  		// this will no-op. If no job is running, a goroutine will be spun up
   397  		// to run a job, this call is non-blocking.
   398  		w.externTryLaunchSerialJob()
   399  
   400  		// Attempt to launch an async job. If the async job launches
   401  		// successfully, skip the blocking phase and attempt to launch another
   402  		// async job.
   403  		//
   404  		// The worker will only allow a handful of async jobs to be running at
   405  		// once, to protect the total usage of the network connection. The
   406  		// worker wants to avoid a situation where 1,000 jobs each requiring a
   407  		// large amount of bandwidth are all running simultaneously. If the
   408  		// jobs are tiny in terms of resource footprints, the worker will allow
   409  		// more of them to be running at once.
   410  		if w.externTryLaunchAsyncJob() {
   411  			continue
   412  		}
   413  
   414  		// Block until:
   415  		//    + New work has been submitted
   416  		//    + The renter is stopped
   417  		select {
   418  		case <-w.wakeChan:
   419  			continue
   420  		case <-w.staticTG.StopChan():
   421  			return
   422  		}
   423  	}
   424  }