gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/worker.go (about) 1 package renter 2 3 // worker.go defines a worker with a work loop. Each worker is connected to a 4 // single host, and the work loop will listen for jobs and then perform them. 5 // 6 // The worker has a set of jobs that it is capable of performing. The standard 7 // functions for a job are Queue, Kill, and Perform. Queue will add a job to the 8 // queue of work of that type. Kill will empty the queue and close out any work 9 // that will not be completed. Perform will grab a job from the queue if one 10 // exists and complete that piece of work. 11 // 12 // The worker has an ephemeral account on the host. It can use this account to 13 // pay for downloads and uploads. In order to ensure the account's balance does 14 // not run out, it maintains a balance target by refilling it when necessary. 15 16 import ( 17 "container/list" 18 "sync" 19 "time" 20 "unsafe" 21 22 "gitlab.com/NebulousLabs/threadgroup" 23 "gitlab.com/SkynetLabs/skyd/build" 24 "gitlab.com/SkynetLabs/skyd/skymodules" 25 "go.sia.tech/siad/modules" 26 "go.sia.tech/siad/types" 27 28 "gitlab.com/NebulousLabs/errors" 29 ) 30 31 const ( 32 // minRegistryVersion defines the minimum version that is required for a 33 // host to support the registry. 34 minRegistryVersion = "1.5.5" 35 36 // registryCacheSize is the cache size used by a single worker for the 37 // registry cache. 38 registryCacheSize = 1 << 20 // 1 MiB 39 ) 40 41 var ( 42 // These variables define the total amount of data that a worker is willing 43 // to queue at once when performing async tasks. If the worker has more data 44 // queued in its async queue than this, it will stop launching jobs so that 45 // the jobs it does launch have more breathing room to complete. 46 // 47 // The worker may adjust these values dynamically as it starts to run and 48 // determines how much stuff it can do simultaneously before its jobs start 49 // to have significant latency impact. 50 // 51 // NOTE: these variables are lowered in test environment currently to avoid 52 // a large amount of parallel downloads. We've found that the host is 53 // currently facing a locking issue causing slow reads on the CI when 54 // there's a lot of parallel reads taking place. This issue is tackled by 55 // the following PR https://github.com/SiaFoundation/siad/pull/50 56 // (partially) and thus this build var should be removed again when that is 57 // merged and rolled out fully. 58 initialConcurrentAsyncReadData = build.Select(build.Var{ 59 Standard: 10e6, 60 Dev: 10e6, 61 Testing: 10e4, 62 }).(float64) 63 initialConcurrentAsyncWriteData = build.Select(build.Var{ 64 Standard: 10e6, 65 Dev: 10e6, 66 Testing: 10e4, 67 }).(float64) 68 ) 69 70 type ( 71 // A worker listens for work on a certain host. 72 // 73 // The mutex of the worker only protects the 'unprocessedChunks' and the 74 // 'standbyChunks' fields of the worker. The rest of the fields are only 75 // interacted with exclusively by the primary worker thread, and only one of 76 // those ever exists at a time. 77 // 78 // The workers have a concept of 'cooldown' for the jobs it performs. If a 79 // job fails, the assumption is that future attempts are also likely to 80 // fail, because whatever condition resulted in the failure will still be 81 // present until some time has passed. 82 worker struct { 83 // Atomics are used to minimize lock contention on the worker object. 84 atomicAccountBalanceCheckRunning uint64 // used for a sanity check 85 atomicCache unsafe.Pointer // points to a workerCache object 86 atomicCacheUpdating uint64 // ensures only one cache update happens at a time 87 atomicPriceTable unsafe.Pointer // points to a workerPriceTable object 88 atomicPriceTableUpdateRunning uint64 // used for a sanity check 89 90 // accountSyncMu is a special mutex used when syncing the 91 // worker's account balance with the host's. During the sync, 92 // the worker can't have any pending withdrawals or deposits. To 93 // avoid that, externSyncAccountBalanceToHost waits for all 94 // serial and async jobs to finish before doing the sync. 95 // Unfortunately that won't work for the subscription background 96 // loop since it's always running. That's why accountSyncMu 97 // needs to be locked by the subscription loop every time before 98 // it starts a pending deposit/withdrawal and unlocked after 99 // committing that deposit/withdrawal. That way 100 // externSyncAccountBalanceToHost only executes when the pending 101 // deposits/withdrawals are 0 and vice versa the subscription 102 // loop is blocked for a short period of time while the worker 103 // and host sync up on their balance. 104 accountSyncMu sync.Mutex 105 106 // The host pub key also serves as an id for the worker, as there is 107 // only one worker per host. 108 staticHostPubKey types.SiaPublicKey 109 staticHostPubKeyStr string 110 111 // Job queues for the worker. 112 staticJobDownloadSnapshotQueue *jobDownloadSnapshotQueue 113 staticJobHasSectorQueue *jobHasSectorQueue 114 staticJobReadQueue *jobReadQueue 115 staticJobLowPrioReadQueue *jobReadQueue 116 staticJobReadRegistryQueue *jobReadRegistryQueue 117 staticJobRenewQueue *jobRenewQueue 118 staticJobUpdateRegistryQueue *jobUpdateRegistryQueue 119 staticJobUploadSnapshotQueue *jobUploadSnapshotQueue 120 121 // Stats 122 staticJobReadRegistryDT *skymodules.DistributionTracker 123 124 // Upload variables. 125 unprocessedChunks *uploadChunks // Yet unprocessed work items. 126 uploadConsecutiveFailures int // How many times in a row uploading has failed. 127 uploadRecentFailure time.Time // How recent was the last failure? 128 uploadRecentFailureErr error // What was the reason for the last failure? 129 uploadTerminated bool // Have we stopped uploading? 130 131 // The staticAccount represent the renter's ephemeral account on the 132 // host. It keeps track of the available balance in the account, the 133 // worker has a refill mechanism that keeps the account balance filled 134 // up until the staticAccountBalanceTarget configured on the renter. 135 staticAccount *account 136 137 // The loop state contains information about the worker loop. It is 138 // mostly atomic variables that the worker uses to ratelimit the 139 // launching of async jobs. 140 staticLoopState *workerLoopState 141 142 // The maintenance state contains information about the worker's RHP3 143 // related state. It is used to determine whether or not the worker's 144 // maintenance cooldown can be reset. 145 staticMaintenanceState *workerMaintenanceState 146 147 // staticRegistryCache caches information about the worker's host's 148 // registry entries. 149 staticRegistryCache *registryRevisionCache 150 151 // staticSetInitialEstimates is an object that ensures the initial queue 152 // estimates of the HS and RJ queues are only set once. 153 staticSetInitialEstimates sync.Once 154 155 // subscription-related fields 156 staticSubscriptionInfo *subscriptionInfos 157 158 // Utilities. 159 staticTG threadgroup.ThreadGroup 160 mu sync.Mutex 161 staticRenter *Renter 162 wakeChan chan struct{} // Worker will check queues if given a wake signal. 163 } 164 ) 165 166 // callReadQueue returns the appropriate read queue depending on the priority of 167 // the download. 168 func (w *worker) callReadQueue(lowPrio bool) *jobReadQueue { 169 if lowPrio { 170 return w.staticJobLowPrioReadQueue 171 } 172 return w.staticJobReadQueue 173 } 174 175 // uploadChunks is a queue of upload chunks. 176 type uploadChunks struct { 177 *list.List 178 } 179 180 // newUploadChunks initializes a new queue. 181 func newUploadChunks() *uploadChunks { 182 return &uploadChunks{ 183 List: list.New(), 184 } 185 } 186 187 // Pop removes the first element of the queue. 188 func (queue *uploadChunks) Pop() *unfinishedUploadChunk { 189 mr := queue.Front() 190 if mr == nil { 191 return nil 192 } 193 return queue.List.Remove(mr).(*unfinishedUploadChunk) 194 } 195 196 // managedKill will kill the worker. 197 func (w *worker) managedKill() { 198 err := w.staticTG.Stop() 199 if err != nil && !errors.Contains(err, threadgroup.ErrStopped) { 200 w.staticRenter.staticLog.Printf("Worker %v: kill failed: %v", w.staticHostPubKeyStr, err) 201 } 202 } 203 204 // staticIsShuttingDown returns true if the worker's threadgroup stopped, 205 // indicating the renter is shutting down 206 func (w *worker) staticIsShuttingDown() bool { 207 select { 208 case <-w.staticTG.StopChan(): 209 return true 210 default: 211 } 212 return false 213 } 214 215 // staticWake will wake the worker from sleeping. This should be called any time 216 // that a job is queued or a job completes. 217 func (w *worker) staticWake() { 218 select { 219 case w.wakeChan <- struct{}{}: 220 default: 221 } 222 } 223 224 // newWorker will create and return a worker that is ready to receive jobs. 225 func (r *Renter) newWorker(hostPubKey types.SiaPublicKey) (*worker, error) { 226 _, ok, err := r.staticHostDB.Host(hostPubKey) 227 if err != nil { 228 return nil, errors.AddContext(err, "could not find host entry") 229 } 230 if !ok { 231 return nil, errors.New("host does not exist") 232 } 233 234 // open the account 235 account, err := r.staticAccountManager.managedOpenAccount(hostPubKey) 236 if err != nil { 237 return nil, errors.AddContext(err, "could not open account") 238 } 239 240 w := &worker{ 241 staticHostPubKey: hostPubKey, 242 staticHostPubKeyStr: hostPubKey.String(), 243 244 staticAccount: account, 245 246 staticRegistryCache: newRegistryCache(registryCacheSize, hostPubKey), 247 248 staticSubscriptionInfo: &subscriptionInfos{ 249 subscriptions: make(map[modules.RegistryEntryID]*subscription), 250 staticWakeChan: make(chan struct{}, 1), 251 staticManager: r.staticSubscriptionManager, 252 }, 253 254 // Initialize the read and write limits for the async worker tasks. 255 // These may be updated in real time as the worker collects metrics 256 // about itself. 257 staticLoopState: &workerLoopState{ 258 atomicReadDataLimit: uint64(initialConcurrentAsyncReadData), 259 atomicWriteDataLimit: uint64(initialConcurrentAsyncWriteData), 260 }, 261 262 unprocessedChunks: newUploadChunks(), 263 wakeChan: make(chan struct{}, 1), 264 staticRenter: r, 265 } 266 // Share the read stats between the read queues. That way a repair 267 // download will contribute to user download estimations and vice versa. 268 jrs := NewJobReadStats() 269 270 // staticJobReadRegistryDT will be seeded when the first price table is 271 // fetched. 272 w.staticJobReadRegistryDT = skymodules.NewDistributionTrackerStandard() 273 274 w.newPriceTable() 275 w.newMaintenanceState() 276 w.initJobHasSectorQueue() 277 w.initJobReadQueue(jrs) 278 w.initJobLowPrioReadQueue(jrs) 279 w.initJobRenewQueue() 280 w.initJobDownloadSnapshotQueue() 281 w.initJobReadRegistryQueue() 282 w.initJobUpdateRegistryQueue() 283 w.initJobUploadSnapshotQueue() 284 285 // Close the worker when the renter is stopped. 286 err = r.tg.OnStop(func() error { 287 w.managedKill() 288 return nil 289 }) 290 if err != nil { 291 return nil, errors.AddContext(err, "failed to register OnStop for worker threadgroup") 292 } 293 294 // Get the worker cache set up before returning the worker. This prevents a 295 // race condition in some tests. 296 w.managedUpdateCache() 297 if w.staticCache() == nil { 298 return nil, errors.New("unable to build a cache for the worker") 299 } 300 return w, nil 301 } 302 303 // ReadRegCutoffEstimate is the estimate to use for deciding whether a worker is 304 // good enough to be part of the regread cutoff estimate. 305 func (w *worker) ReadRegCutoffEstimate() time.Duration { 306 return w.staticJobReadRegistryDT.Percentiles()[0][1] // p90 307 }