gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerloop.go (about) 1 package renter 2 3 import ( 4 "sync/atomic" 5 "time" 6 7 "gitlab.com/SkynetLabs/skyd/build" 8 9 "gitlab.com/NebulousLabs/errors" 10 ) 11 12 var ( 13 errOnMaintenanceCooldown = errors.New("the worker account is on cooldown") 14 errInvalidPriceTable = errors.New("price table with host is no longer valid") 15 ) 16 17 type ( 18 // workerLoopState tracks the state of the worker loop. 19 workerLoopState struct { 20 // Variables to count the number of jobs running. Note that these 21 // variables can only be incremented in the primary work loop of the 22 // worker, because there are blocking conditions within the primary work 23 // loop that need to know only one thread is running at a time, and 24 // safety is derived from knowing that no new threads are launching 25 // while we are waiting for all existing threads to finish. 26 // 27 // These values can be decremented in a goroutine. 28 atomicAsyncJobsRunning uint64 29 atomicSerialJobRunning uint64 30 31 // atomicSuspectRevisionMismatch indicates that the worker encountered 32 // some error where it believes that it needs to resync its contract 33 // with the host. 34 atomicSuspectRevisionMismatch uint64 35 36 // Variables to track the total amount of async data outstanding. This 37 // indicates the total amount of data that we expect to use from async 38 // jobs that we have submitted for the worker. 39 atomicReadDataOutstanding uint64 40 atomicWriteDataOutstanding uint64 41 42 // The read data limit and the write data limit define how much work is 43 // allowed to be outstanding before new jobs will be blocked from being 44 // launched async. 45 atomicReadDataLimit uint64 46 atomicWriteDataLimit uint64 47 } 48 ) 49 50 // staticSerialJobRunning indicates whether a serial job is currently running 51 // for the worker. 52 func (wls *workerLoopState) staticSerialJobRunning() bool { 53 return atomic.LoadUint64(&wls.atomicSerialJobRunning) == 1 54 } 55 56 // externLaunchSerialJob will launch a serial job for the worker, ensuring that 57 // exclusivity is handled correctly. 58 // 59 // The 'extern' indicates that this function is only allowed to be called from 60 // 'threadedWorkLoop', and it is expected that only one instance of 61 // 'threadedWorkLoop' is ever created per-worker. 62 func (w *worker) externLaunchSerialJob(job func() error) { 63 // Mark that there is now a job running. Only one job may be running at a 64 // time. 65 ok := atomic.CompareAndSwapUint64(&w.staticLoopState.atomicSerialJobRunning, 0, 1) 66 if !ok { 67 // There already is a job running. This is not allowed. 68 w.staticRenter.staticLog.Critical("running a job when another job is already running") 69 } 70 71 fn := func() { 72 // Execute the job in a goroutine. 73 err := job() 74 if err != nil { 75 w.staticHandleError(err) 76 } 77 78 // After the job has executed, update to indicate that no serial job 79 // is running. 80 atomic.StoreUint64(&w.staticLoopState.atomicSerialJobRunning, 0) 81 // After updating to indicate that no serial job is running, wake the 82 // worker to check for a new serial job. 83 w.staticWake() 84 } 85 err := w.staticTG.Launch(fn) 86 if err != nil { 87 // Renter has closed, job will not be executed. 88 atomic.StoreUint64(&w.staticLoopState.atomicSerialJobRunning, 0) 89 return 90 } 91 } 92 93 // externTryLaunchSerialJob will attempt to launch a serial job on the worker. 94 // Only one serial job is allowed to be running at a time (each serial job 95 // requires exclusive access to the worker's contract). If there is already a 96 // serial job running, nothing will happen. 97 // 98 // The 'extern' indicates that this function is only allowed to be called from 99 // 'threadedWorkLoop', and it is expected that only one instance of 100 // 'threadedWorkLoop' is ever created per-worker. 101 func (w *worker) externTryLaunchSerialJob() { 102 // Return if the renter's shutting down 103 if w.staticIsShuttingDown() { 104 return 105 } 106 107 // If there is already a serial job running, that job has exclusivity, do 108 // nothing. 109 if w.staticLoopState.staticSerialJobRunning() { 110 return 111 } 112 113 // Perform a disrupt for testing. See the implementation in 114 // workerloop_test.go for more info. 115 if w.staticRenter.staticDeps.Disrupt("TestJobSerialExecution") { 116 return 117 } 118 119 // Check every potential serial job that the worker may be required to 120 // perform. This scheduling allows a flood of jobs earlier in the list to 121 // starve out jobs later in the list. At some point we will probably 122 // revisit this to try and address the starvation issue. 123 job := w.staticJobRenewQueue.callNext() 124 if job != nil { 125 w.externLaunchSerialJob(job.callExecute) 126 return 127 } 128 if w.managedNeedsToUpdatePriceTable() { 129 w.externLaunchSerialJob(w.staticUpdatePriceTable) 130 return 131 } 132 if w.managedNeedsToRefillAccount() { 133 w.externLaunchSerialJob(w.managedRefillAccount) 134 return 135 } 136 job = w.staticJobUploadSnapshotQueue.callNext() 137 if job != nil { 138 w.externLaunchSerialJob(job.callExecute) 139 return 140 } 141 job = w.staticJobDownloadSnapshotQueue.callNext() 142 if job != nil { 143 w.externLaunchSerialJob(job.callExecute) 144 return 145 } 146 job = w.staticJobUploadSnapshotQueue.callNext() 147 if job != nil { 148 w.externLaunchSerialJob(job.callExecute) 149 return 150 } 151 if w.managedHasUploadJob() { 152 w.externLaunchSerialJob(w.managedPerformUploadChunkJob) 153 return 154 } 155 } 156 157 // externLaunchAsyncJob accepts a function to retrieve a job and then uses that 158 // to retrieve a job and launch it. The bandwidth consumption will be updated as 159 // the job starts and finishes. 160 func (w *worker) externLaunchAsyncJob(job workerJob) bool { 161 // Add the resource requirements to the worker loop state. Also add this 162 // thread to the number of jobs running. 163 uploadBandwidth, downloadBandwidth := job.callExpectedBandwidth() 164 atomic.AddUint64(&w.staticLoopState.atomicReadDataOutstanding, downloadBandwidth) 165 atomic.AddUint64(&w.staticLoopState.atomicWriteDataOutstanding, uploadBandwidth) 166 atomic.AddUint64(&w.staticLoopState.atomicAsyncJobsRunning, 1) 167 fn := func() { 168 err := job.callExecute() 169 if err != nil { 170 w.staticHandleError(err) 171 } 172 173 // Subtract the outstanding data now that the job is complete. Atomic 174 // subtraction works by adding and using some bit tricks. 175 atomic.AddUint64(&w.staticLoopState.atomicReadDataOutstanding, -downloadBandwidth) 176 atomic.AddUint64(&w.staticLoopState.atomicWriteDataOutstanding, -uploadBandwidth) 177 atomic.AddUint64(&w.staticLoopState.atomicAsyncJobsRunning, ^uint64(0)) // subtract 1 178 // Wake the worker to run any additional async jobs that may have been 179 // blocked / ignored because there was not enough bandwidth available. 180 w.staticWake() 181 } 182 err := w.staticTG.Launch(fn) 183 if err != nil { 184 // Renter has closed, but we want to represent that the work was 185 // processed anyway - returning true indicates that the worker should 186 // continue processing jobs. 187 atomic.AddUint64(&w.staticLoopState.atomicReadDataOutstanding, -downloadBandwidth) 188 atomic.AddUint64(&w.staticLoopState.atomicWriteDataOutstanding, -uploadBandwidth) 189 atomic.AddUint64(&w.staticLoopState.atomicAsyncJobsRunning, ^uint64(0)) // subtract 1 190 return true 191 } 192 return true 193 } 194 195 // managedAsyncReady will return 'false' if any of the key requirements for 196 // performing async work have not been met. 'true' will be returned if the 197 // worker is ready for async work. 198 func (w *worker) managedAsyncReady() bool { 199 // A valid price table is required to perform async tasks. 200 if wpt := w.staticPriceTable(); !wpt.staticValid() { 201 w.managedDiscardAsyncJobs(errInvalidPriceTable) 202 return false 203 } 204 205 // RHP3 must not be on cooldown to perform async tasks. 206 if w.managedOnMaintenanceCooldown() { 207 w.managedDiscardAsyncJobs(errOnMaintenanceCooldown) 208 return false 209 } 210 return true 211 } 212 213 // externTryLaunchAsyncJob will look at the async jobs which are in the worker 214 // queue and attempt to launch any that are ready. The job launcher will fail if 215 // the price table is out of date or if the worker account is empty. 216 // 217 // The job launcher will also fail if the worker has too much work in jobs 218 // already queued. Every time a job is launched, a bandwidth estimate is made. 219 // The worker will not allow more than a certain amount of bandwidth to be 220 // queued at once to prevent jobs from being spread too thin and sharing too 221 // much bandwidth. 222 func (w *worker) externTryLaunchAsyncJob() bool { 223 // Return if the renter's shutting down 224 if w.staticIsShuttingDown() { 225 return false 226 } 227 228 // Exit if the worker is not currently equipped to perform async tasks. 229 if !w.managedAsyncReady() { 230 return false 231 } 232 233 // Verify that the worker has not reached its limits for doing multiple 234 // jobs at once. 235 readLimit := atomic.LoadUint64(&w.staticLoopState.atomicReadDataLimit) 236 writeLimit := atomic.LoadUint64(&w.staticLoopState.atomicWriteDataLimit) 237 readOutstanding := atomic.LoadUint64(&w.staticLoopState.atomicReadDataOutstanding) 238 writeOutstanding := atomic.LoadUint64(&w.staticLoopState.atomicWriteDataOutstanding) 239 if readOutstanding > readLimit || writeOutstanding > writeLimit { 240 // Worker does not need to discard jobs, it is making progress, it's 241 // just not launching any new jobs until its current jobs finish up. 242 return false 243 } 244 245 // Perform a disrupt for testing. This is some code that ensures async job 246 // launches are controlled correctly. The disrupt operates on a mock worker, 247 // so it needs to happen after the ratelimit checks but before the cache, 248 // price table, and account checks. 249 if w.staticRenter.staticDeps.Disrupt("TestAsyncJobLaunches") { 250 return true 251 } 252 253 // Check every potential async job that can be launched. 254 job := w.staticJobHasSectorQueue.callNext() 255 if job != nil { 256 w.externLaunchAsyncJob(job) 257 return true 258 } 259 // Check if registry jobs are supported. 260 cache := w.staticCache() 261 if build.VersionCmp(cache.staticHostVersion, minRegistryVersion) >= 0 { 262 job = w.staticJobUpdateRegistryQueue.callNext() 263 if job != nil { 264 w.externLaunchAsyncJob(job) 265 return true 266 } 267 job = w.staticJobReadRegistryQueue.callNext() 268 if job != nil { 269 w.externLaunchAsyncJob(job) 270 return true 271 } 272 } 273 job = w.staticJobReadQueue.callNext() 274 if job != nil { 275 w.externLaunchAsyncJob(job) 276 return true 277 } 278 job = w.staticJobLowPrioReadQueue.callNext() 279 if job != nil { 280 w.externLaunchAsyncJob(job) 281 return true 282 } 283 return false 284 } 285 286 // managedBlockUntilReady will block until the worker has internet connectivity. 287 // 'false' will be returned if a kill signal is received or if the renter is 288 // shut down before internet connectivity is restored. 'true' will be returned 289 // if internet connectivity is successfully restored. 290 func (w *worker) managedBlockUntilReady() bool { 291 // Check internet connectivity. If the worker does not have internet 292 // connectivity, block until connectivity is restored. 293 for !w.staticRenter.staticGateway.Online() { 294 select { 295 case <-w.staticTG.StopChan(): 296 return false 297 case <-time.After(offlineCheckFrequency): 298 } 299 } 300 return true 301 } 302 303 // managedDiscardAsyncJobs will drop all of the worker's async jobs because the 304 // worker has not met sufficient conditions to retain async jobs. 305 func (w *worker) managedDiscardAsyncJobs(err error) { 306 w.staticJobHasSectorQueue.callDiscardAll(err) 307 w.staticJobUpdateRegistryQueue.callDiscardAll(err) 308 w.staticJobReadRegistryQueue.callDiscardAll(err) 309 w.staticJobReadQueue.callDiscardAll(err) 310 w.staticJobLowPrioReadQueue.callDiscardAll(err) 311 } 312 313 // threadedWorkLoop is a perpetual loop run by the worker that accepts new jobs 314 // and performs them. Work is divided into two types of work, serial work and 315 // async work. Serial work requires exclusive access to the worker's contract, 316 // meaning that only one of these tasks can be performed at a time. Async work 317 // can be performed with high parallelism. 318 func (w *worker) threadedWorkLoop() { 319 // Perform a disrupt for testing. 320 if w.staticRenter.staticDeps.Disrupt("DisableWorkerLoop") { 321 return 322 } 323 324 // Upon shutdown, release all jobs. 325 defer w.managedKillUploading() 326 defer w.staticJobLowPrioReadQueue.callKill() 327 defer w.staticJobHasSectorQueue.callKill() 328 defer w.staticJobUpdateRegistryQueue.callKill() 329 defer w.staticJobReadQueue.callKill() 330 defer w.staticJobDownloadSnapshotQueue.callKill() 331 defer w.staticJobUploadSnapshotQueue.callKill() 332 333 // Ensure the renter's revision number of the underlying file contract 334 // is in sync with the host's revision number. This check must happen at 335 // the top as consecutive checks make use of the file contract for 336 // payment. 337 w.externTryFixRevisionMismatch() 338 339 // The worker cannot execute any async tasks unless the price table of 340 // the host is known, the balance of the worker account is known, and 341 // the account has sufficient funds in it. This update is done as a 342 // blocking update to ensure nothing else runs until the price table is 343 // available. 344 w.staticUpdatePriceTable() 345 346 // Perform a balance check on the host and sync it to his version if 347 // necessary. This avoids running into MaxBalanceExceeded errors upon 348 // refill after an unclean shutdown. 349 if w.staticPriceTable().staticValid() { 350 w.staticHandleError(w.externSyncAccountBalanceToHost(false)) 351 } 352 353 // This update is done as a blocking update to ensure nothing else runs 354 // until the account has filled. 355 if w.managedNeedsToRefillAccount() { 356 w.staticHandleError(w.managedRefillAccount()) 357 } 358 359 // The worker will continuously perform jobs in a loop. 360 for { 361 // Ensure the threadgroup is not stopped at the start of the iteration. 362 select { 363 case <-w.staticTG.StopChan(): 364 return 365 default: 366 } 367 368 // There are certain conditions under which the worker should either 369 // block or exit. This function will block until those conditions are 370 // met, returning 'true' when the worker can proceed and 'false' if the 371 // worker should exit. 372 if !w.managedBlockUntilReady() { 373 return 374 } 375 376 // Try and fix a revision number mismatch if the flag is set. This will 377 // be the case if other processes errored out with an error indicating a 378 // mismatch. 379 if w.staticSuspectRevisionMismatch() { 380 w.externTryFixRevisionMismatch() 381 } 382 383 // Update the worker cache object, note that we do this after trying to 384 // sync the revision as that might influence the contract, which is used 385 // to build the cache object. 386 w.staticTryUpdateCache() 387 388 // If the worker needs to sync the account balance, perform a sync 389 // operation. This should be attempted before launching any jobs. 390 needsToSync, forced := w.managedNeedsToSyncAccountBalanceToHost() 391 if needsToSync { 392 w.staticHandleError(w.externSyncAccountBalanceToHost(forced)) 393 } 394 395 // Attempt to launch a serial job. If there is already a job running, 396 // this will no-op. If no job is running, a goroutine will be spun up 397 // to run a job, this call is non-blocking. 398 w.externTryLaunchSerialJob() 399 400 // Attempt to launch an async job. If the async job launches 401 // successfully, skip the blocking phase and attempt to launch another 402 // async job. 403 // 404 // The worker will only allow a handful of async jobs to be running at 405 // once, to protect the total usage of the network connection. The 406 // worker wants to avoid a situation where 1,000 jobs each requiring a 407 // large amount of bandwidth are all running simultaneously. If the 408 // jobs are tiny in terms of resource footprints, the worker will allow 409 // more of them to be running at once. 410 if w.externTryLaunchAsyncJob() { 411 continue 412 } 413 414 // Block until: 415 // + New work has been submitted 416 // + The renter is stopped 417 select { 418 case <-w.wakeChan: 419 continue 420 case <-w.staticTG.StopChan(): 421 return 422 } 423 } 424 }