gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/worker.go (about)

     1  package renter
     2  
     3  // worker.go defines a worker with a work loop. Each worker is connected to a
     4  // single host, and the work loop will listen for jobs and then perform them.
     5  //
     6  // The worker has a set of jobs that it is capable of performing. The standard
     7  // functions for a job are Queue, Kill, and Perform. Queue will add a job to the
     8  // queue of work of that type. Kill will empty the queue and close out any work
     9  // that will not be completed. Perform will grab a job from the queue if one
    10  // exists and complete that piece of work.
    11  //
    12  // The worker has an ephemeral account on the host. It can use this account to
    13  // pay for downloads and uploads. In order to ensure the account's balance does
    14  // not run out, it maintains a balance target by refilling it when necessary.
    15  
    16  import (
    17  	"container/list"
    18  	"sync"
    19  	"time"
    20  	"unsafe"
    21  
    22  	"gitlab.com/NebulousLabs/threadgroup"
    23  	"gitlab.com/SkynetLabs/skyd/build"
    24  	"gitlab.com/SkynetLabs/skyd/skymodules"
    25  	"go.sia.tech/siad/modules"
    26  	"go.sia.tech/siad/types"
    27  
    28  	"gitlab.com/NebulousLabs/errors"
    29  )
    30  
const (
	// minRegistryVersion defines the minimum version that is required for a
	// host to support the registry.
	minRegistryVersion = "1.5.5"

	// registryCacheSize is the cache size used by a single worker for the
	// registry cache. newWorker passes it to newRegistryCache when it creates
	// the worker's registry cache.
	registryCacheSize = 1 << 20 // 1 MiB
)
    40  
var (
	// These variables define the total amount of data that a worker is willing
	// to queue at once when performing async tasks. If the worker has more data
	// queued in its async queue than this, it will stop launching jobs so that
	// the jobs it does launch have more breathing room to complete.
	//
	// The values are float64 literals: 10e6 is 10,000,000 and 10e4 is 100,000
	// (presumably bytes - TODO confirm the unit). newWorker converts them to
	// uint64 to seed the read and write data limits of the worker's loop
	// state.
	//
	// The worker may adjust these values dynamically as it starts to run and
	// determines how much stuff it can do simultaneously before its jobs start
	// to have significant latency impact.
	//
	// NOTE: these variables are currently lowered in the testing environment
	// to avoid a large amount of parallel downloads. We've found that the host
	// is currently facing a locking issue causing slow reads on the CI when
	// there are a lot of parallel reads taking place. This issue is
	// (partially) tackled by the following PR
	// https://github.com/SiaFoundation/siad/pull/50 and thus this build var
	// should be removed again when that is merged and rolled out fully.
	initialConcurrentAsyncReadData = build.Select(build.Var{
		Standard: 10e6,
		Dev:      10e6,
		Testing:  10e4,
	}).(float64)
	initialConcurrentAsyncWriteData = build.Select(build.Var{
		Standard: 10e6,
		Dev:      10e6,
		Testing:  10e4,
	}).(float64)
)
    69  
type (
	// A worker listens for work on a certain host.
	//
	// The mutex of the worker only protects the 'unprocessedChunks' field of
	// the worker. The rest of the fields are only interacted with exclusively
	// by the primary worker thread, and only one of those ever exists at a
	// time.
	//
	// NOTE(review): this comment previously also listed a 'standbyChunks'
	// field as protected by the mutex, but the struct declares no such field.
	// Confirm which of the upload variables 'mu' is expected to guard.
	//
	// Workers have a concept of 'cooldown' for the jobs they perform. If a
	// job fails, the assumption is that future attempts are also likely to
	// fail, because whatever condition resulted in the failure will still be
	// present until some time has passed.
	worker struct {
		// Atomics are used to minimize lock contention on the worker object.
		atomicAccountBalanceCheckRunning uint64         // used for a sanity check
		atomicCache                      unsafe.Pointer // points to a workerCache object
		atomicCacheUpdating              uint64         // ensures only one cache update happens at a time
		atomicPriceTable                 unsafe.Pointer // points to a workerPriceTable object
		atomicPriceTableUpdateRunning    uint64         // used for a sanity check

		// accountSyncMu is a special mutex used when syncing the
		// worker's account balance with the host's. During the sync,
		// the worker can't have any pending withdrawals or deposits. To
		// avoid that, externSyncAccountBalanceToHost waits for all
		// serial and async jobs to finish before doing the sync.
		// Unfortunately that won't work for the subscription background
		// loop since it's always running. That's why accountSyncMu
		// needs to be locked by the subscription loop every time before
		// it starts a pending deposit/withdrawal and unlocked after
		// committing that deposit/withdrawal. That way
		// externSyncAccountBalanceToHost only executes when the pending
		// deposits/withdrawals are 0 and vice versa the subscription
		// loop is blocked for a short period of time while the worker
		// and host sync up on their balance.
		accountSyncMu sync.Mutex

		// The host pub key also serves as an id for the worker, as there is
		// only one worker per host. staticHostPubKeyStr holds the result of
		// calling String() on the key; it is set once in newWorker.
		staticHostPubKey    types.SiaPublicKey
		staticHostPubKeyStr string

		// Job queues for the worker. callReadQueue selects between
		// staticJobReadQueue and staticJobLowPrioReadQueue.
		staticJobDownloadSnapshotQueue *jobDownloadSnapshotQueue
		staticJobHasSectorQueue        *jobHasSectorQueue
		staticJobReadQueue             *jobReadQueue
		staticJobLowPrioReadQueue      *jobReadQueue
		staticJobReadRegistryQueue     *jobReadRegistryQueue
		staticJobRenewQueue            *jobRenewQueue
		staticJobUpdateRegistryQueue   *jobUpdateRegistryQueue
		staticJobUploadSnapshotQueue   *jobUploadSnapshotQueue

		// Stats. staticJobReadRegistryDT is the distribution tracker that
		// ReadRegCutoffEstimate reads its estimate from.
		staticJobReadRegistryDT *skymodules.DistributionTracker

		// Upload variables.
		unprocessedChunks         *uploadChunks // Yet unprocessed work items.
		uploadConsecutiveFailures int           // How many times in a row uploading has failed.
		uploadRecentFailure       time.Time     // How recent was the last failure?
		uploadRecentFailureErr    error         // What was the reason for the last failure?
		uploadTerminated          bool          // Have we stopped uploading?

		// The staticAccount represents the renter's ephemeral account on the
		// host. It keeps track of the available balance in the account. The
		// worker has a refill mechanism that keeps the account balance filled
		// up until the staticAccountBalanceTarget configured on the renter.
		staticAccount *account

		// The loop state contains information about the worker loop. It is
		// mostly atomic variables that the worker uses to ratelimit the
		// launching of async jobs.
		staticLoopState *workerLoopState

		// The maintenance state contains information about the worker's RHP3
		// related state. It is used to determine whether or not the worker's
		// maintenance cooldown can be reset.
		staticMaintenanceState *workerMaintenanceState

		// staticRegistryCache caches information about the worker's host's
		// registry entries.
		staticRegistryCache *registryRevisionCache

		// staticSetInitialEstimates is an object that ensures the initial queue
		// estimates of the HS and RJ queues (presumably the has-sector and
		// read-job queues) are only set once.
		staticSetInitialEstimates sync.Once

		// subscription-related fields
		staticSubscriptionInfo *subscriptionInfos

		// Utilities.
		staticTG     threadgroup.ThreadGroup // Stopped by managedKill.
		mu           sync.Mutex
		staticRenter *Renter
		wakeChan     chan struct{} // Worker will check queues if given a wake signal.
	}
)
   165  
   166  // callReadQueue returns the appropriate read queue depending on the priority of
   167  // the download.
   168  func (w *worker) callReadQueue(lowPrio bool) *jobReadQueue {
   169  	if lowPrio {
   170  		return w.staticJobLowPrioReadQueue
   171  	}
   172  	return w.staticJobReadQueue
   173  }
   174  
   175  // uploadChunks is a queue of upload chunks.
   176  type uploadChunks struct {
   177  	*list.List
   178  }
   179  
   180  // newUploadChunks initializes a new queue.
   181  func newUploadChunks() *uploadChunks {
   182  	return &uploadChunks{
   183  		List: list.New(),
   184  	}
   185  }
   186  
   187  // Pop removes the first element of the queue.
   188  func (queue *uploadChunks) Pop() *unfinishedUploadChunk {
   189  	mr := queue.Front()
   190  	if mr == nil {
   191  		return nil
   192  	}
   193  	return queue.List.Remove(mr).(*unfinishedUploadChunk)
   194  }
   195  
   196  // managedKill will kill the worker.
   197  func (w *worker) managedKill() {
   198  	err := w.staticTG.Stop()
   199  	if err != nil && !errors.Contains(err, threadgroup.ErrStopped) {
   200  		w.staticRenter.staticLog.Printf("Worker %v: kill failed: %v", w.staticHostPubKeyStr, err)
   201  	}
   202  }
   203  
   204  // staticIsShuttingDown returns true if the worker's threadgroup stopped,
   205  // indicating the renter is shutting down
   206  func (w *worker) staticIsShuttingDown() bool {
   207  	select {
   208  	case <-w.staticTG.StopChan():
   209  		return true
   210  	default:
   211  	}
   212  	return false
   213  }
   214  
   215  // staticWake will wake the worker from sleeping. This should be called any time
   216  // that a job is queued or a job completes.
   217  func (w *worker) staticWake() {
   218  	select {
   219  	case w.wakeChan <- struct{}{}:
   220  	default:
   221  	}
   222  }
   223  
   224  // newWorker will create and return a worker that is ready to receive jobs.
   225  func (r *Renter) newWorker(hostPubKey types.SiaPublicKey) (*worker, error) {
   226  	_, ok, err := r.staticHostDB.Host(hostPubKey)
   227  	if err != nil {
   228  		return nil, errors.AddContext(err, "could not find host entry")
   229  	}
   230  	if !ok {
   231  		return nil, errors.New("host does not exist")
   232  	}
   233  
   234  	// open the account
   235  	account, err := r.staticAccountManager.managedOpenAccount(hostPubKey)
   236  	if err != nil {
   237  		return nil, errors.AddContext(err, "could not open account")
   238  	}
   239  
   240  	w := &worker{
   241  		staticHostPubKey:    hostPubKey,
   242  		staticHostPubKeyStr: hostPubKey.String(),
   243  
   244  		staticAccount: account,
   245  
   246  		staticRegistryCache: newRegistryCache(registryCacheSize, hostPubKey),
   247  
   248  		staticSubscriptionInfo: &subscriptionInfos{
   249  			subscriptions:  make(map[modules.RegistryEntryID]*subscription),
   250  			staticWakeChan: make(chan struct{}, 1),
   251  			staticManager:  r.staticSubscriptionManager,
   252  		},
   253  
   254  		// Initialize the read and write limits for the async worker tasks.
   255  		// These may be updated in real time as the worker collects metrics
   256  		// about itself.
   257  		staticLoopState: &workerLoopState{
   258  			atomicReadDataLimit:  uint64(initialConcurrentAsyncReadData),
   259  			atomicWriteDataLimit: uint64(initialConcurrentAsyncWriteData),
   260  		},
   261  
   262  		unprocessedChunks: newUploadChunks(),
   263  		wakeChan:          make(chan struct{}, 1),
   264  		staticRenter:      r,
   265  	}
   266  	// Share the read stats between the read queues. That way a repair
   267  	// download will contribute to user download estimations and vice versa.
   268  	jrs := NewJobReadStats()
   269  
   270  	// staticJobReadRegistryDT will be seeded when the first price table is
   271  	// fetched.
   272  	w.staticJobReadRegistryDT = skymodules.NewDistributionTrackerStandard()
   273  
   274  	w.newPriceTable()
   275  	w.newMaintenanceState()
   276  	w.initJobHasSectorQueue()
   277  	w.initJobReadQueue(jrs)
   278  	w.initJobLowPrioReadQueue(jrs)
   279  	w.initJobRenewQueue()
   280  	w.initJobDownloadSnapshotQueue()
   281  	w.initJobReadRegistryQueue()
   282  	w.initJobUpdateRegistryQueue()
   283  	w.initJobUploadSnapshotQueue()
   284  
   285  	// Close the worker when the renter is stopped.
   286  	err = r.tg.OnStop(func() error {
   287  		w.managedKill()
   288  		return nil
   289  	})
   290  	if err != nil {
   291  		return nil, errors.AddContext(err, "failed to register OnStop for worker threadgroup")
   292  	}
   293  
   294  	// Get the worker cache set up before returning the worker. This prevents a
   295  	// race condition in some tests.
   296  	w.managedUpdateCache()
   297  	if w.staticCache() == nil {
   298  		return nil, errors.New("unable to build a cache for the worker")
   299  	}
   300  	return w, nil
   301  }
   302  
   303  // ReadRegCutoffEstimate is the estimate to use for deciding whether a worker is
   304  // good enough to be part of the regread cutoff estimate.
   305  func (w *worker) ReadRegCutoffEstimate() time.Duration {
   306  	return w.staticJobReadRegistryDT.Percentiles()[0][1] // p90
   307  }