github.com/nebulouslabs/sia@v1.3.7/modules/renter/workerdownload.go

package renter

// workerdownload.go is responsible for coordinating the actual fetching of
// pieces, determining when to add standby workers, when to perform repairs,
// and coordinating resource management between the workers operating on a
// chunk.

import (
	"sync/atomic"
	"time"
)

// managedDownload will perform some download work.
func (w *worker) managedDownload(udc *unfinishedDownloadChunk) {
	// Process this chunk. If the worker is not fit to do the download, or is
	// put on standby, 'nil' will be returned. After the chunk has been
	// processed, the worker will be registered with the chunk.
	//
	// If 'nil' is returned, it is either because the worker has been removed
	// from the chunk entirely, or because the worker has been put on standby.
	udc = w.ownedProcessDownloadChunk(udc)
	if udc == nil {
		return
	}
	// The worker is being given a chance to work. After the work is complete,
	// whether successful or failed, the worker needs to be removed.
	defer udc.managedRemoveWorker()

	// Fetch the sector. If fetching the sector fails, the worker needs to be
	// unregistered from the chunk.
	d, err := w.renter.hostContractor.Downloader(w.contract.HostPublicKey, w.renter.tg.StopChan())
	if err != nil {
		w.renter.log.Debugln("worker failed to create downloader:", err)
		udc.managedUnregisterWorker(w)
		return
	}
	defer d.Close()
	pieceData, err := d.Sector(udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].root)
	if err != nil {
		w.renter.log.Debugln("worker failed to download sector:", err)
		udc.managedUnregisterWorker(w)
		return
	}
	// TODO: Instead of adding the whole sector after the download completes,
	// have the 'd.Sector' call add to this value incrementally as the sector
	// comes in. Perhaps even include the data from creating the downloader
	// and other data sent to and received from the host (like signatures)
	// that aren't actually payload data.
	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)

	// Decrypt the piece. This might introduce some overhead for downloads with
	// a large overdrive. It shouldn't be a bottleneck though, since bandwidth
	// is usually a lot scarcer than CPU processing power.
	pieceIndex := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index
	key := deriveKey(udc.masterKey, udc.staticChunkIndex, pieceIndex)
	decryptedPiece, err := key.DecryptBytesInPlace(pieceData)
	if err != nil {
		w.renter.log.Debugln("worker failed to decrypt piece:", err)
		udc.managedUnregisterWorker(w)
		return
	}

	// Mark the piece as completed. Perform chunk recovery if we newly have
	// enough pieces to do so. Chunk recovery is an expensive operation that
	// should be performed in a separate thread so as not to block the worker.
	udc.mu.Lock()
	udc.piecesCompleted++
	udc.piecesRegistered--
	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
		udc.physicalChunkData[pieceIndex] = decryptedPiece
	}
	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
		go udc.threadedRecoverLogicalData()
	}
	udc.mu.Unlock()
}
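
// For illustration only: a worker's main loop would typically drain the
// download queue and call managedDownload on each queued chunk, roughly like
// the sketch below. This is a simplified approximation, not the actual loop
// in worker.go.
//
//	for {
//		udc := w.managedNextDownloadChunk()
//		if udc == nil {
//			break
//		}
//		w.managedDownload(udc)
//	}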

// managedKillDownloading will drop all of the download work given to the
// worker, and set a signal to prevent the worker from accepting more download
// work.
//
// The chunk cleanup needs to occur after the worker mutex is released so that
// the worker is not locked while chunk cleanup is happening.
func (w *worker) managedKillDownloading() {
	w.downloadMu.Lock()
	var removedChunks []*unfinishedDownloadChunk
	for i := 0; i < len(w.downloadChunks); i++ {
		removedChunks = append(removedChunks, w.downloadChunks[i])
	}
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadTerminated = true
	w.downloadMu.Unlock()
	for i := 0; i < len(removedChunks); i++ {
		removedChunks[i].managedRemoveWorker()
	}
}

// managedNextDownloadChunk will pull the next potential chunk out of the work
// queue for downloading.
func (w *worker) managedNextDownloadChunk() *unfinishedDownloadChunk {
	w.downloadMu.Lock()
	defer w.downloadMu.Unlock()

	if len(w.downloadChunks) == 0 {
		return nil
	}
	nextChunk := w.downloadChunks[0]
	w.downloadChunks = w.downloadChunks[1:]
	return nextChunk
}

// managedQueueDownloadChunk adds a chunk to the worker's queue.
func (w *worker) managedQueueDownloadChunk(udc *unfinishedDownloadChunk) {
	// Accept the chunk unless the worker has been terminated. Accepting the
	// chunk needs to happen under the same lock as fetching the termination
	// status.
	w.downloadMu.Lock()
	terminated := w.downloadTerminated
	if !terminated {
		// Accept the chunk and issue a notification to the master thread that
		// there is a new download.
		w.downloadChunks = append(w.downloadChunks, udc)
		select {
		case w.downloadChan <- struct{}{}:
		default:
		}
	}
	w.downloadMu.Unlock()

	// If the worker has terminated, remove it from the udc. This call needs to
	// happen without holding the worker lock.
	if terminated {
		udc.managedRemoveWorker()
	}
}

// managedUnregisterWorker will remove the worker from an unfinished download
// chunk, and then un-register the pieces that it grabbed. This function should
// only be called when a worker download fails.
func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
	udc.mu.Lock()
	udc.piecesRegistered--
	udc.pieceUsage[udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index] = false
	udc.mu.Unlock()
}

// ownedOnDownloadCooldown returns true if the worker is on cooldown from failed
// downloads. This function should only be called by the master worker thread,
// and does not require any mutexes.
func (w *worker) ownedOnDownloadCooldown() bool {
	requiredCooldown := downloadFailureCooldown
	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
		requiredCooldown *= 2
	}
	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
}
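
// For illustration: with hypothetical values of downloadFailureCooldown = 1
// minute and maxConsecutivePenalty = 3 (the real constants are defined
// elsewhere in the package and may differ), the required cooldown would grow
// as
//
//	failures: 0 -> 1m, 1 -> 2m, 2 -> 4m, 3+ -> 8m
//
// i.e. the cooldown doubles per consecutive failure and stops doubling after
// maxConsecutivePenalty failures.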

// ownedProcessDownloadChunk will take a potential download chunk, figure out
// if there is work to do, and then perform any registration or processing with
// the chunk before returning the chunk to the caller.
//
// If no immediate action is required, 'nil' will be returned.
func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
	// Determine whether the worker needs to drop the chunk. If so, remove the
	// worker and return nil. The worker only needs to be removed if the chunk
	// is being dropped.
	udc.mu.Lock()
	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces()
	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
	pieceData, workerHasPiece := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)]
	pieceTaken := udc.pieceUsage[pieceData.index]
	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceTaken {
		udc.mu.Unlock()
		udc.managedRemoveWorker()
		return nil
	}
	defer udc.mu.Unlock()

	// TODO: This is where we would put filters based on worker latency, worker
	// price, worker throughput, etc. There's a lot of fancy stuff we can do
	// with filtering to make sure that for any given chunk we always use the
	// optimal set of workers, and this is the spot where most of the filtering
	// will happen.
	//
	// One major thing that we will want to be careful about when we improve
	// this section is total memory vs. worker bandwidth. A renter that is
	// consistently memory bottlenecked can end up with slow hosts hogging all
	// of the memory and choking out the fast hosts, leaving network
	// connections underutilized even when there are enough fast hosts to
	// fully utilize the network. Part of this will be solved by adding
	// bandwidth stats to the hostdb, but part of it will need to be solved by
	// making sure that we automatically put low-bandwidth or high-latency
	// workers on standby if we know that memory is the bottleneck as opposed
	// to download bandwidth.
	//
	// Workers that do not meet the extra criteria are not discarded but rather
	// put on standby, so that they can step in if the workers that do meet the
	// extra criteria fail or otherwise prove insufficient.
	//
	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
	// metrics, so that we can avoid holding the worker lock and the udc lock
	// simultaneously (deadlock risk). The 'owned' variables of the worker are
	// variables that are only accessed by the master worker thread.
	meetsExtraCriteria := true

	// TODO: There's going to need to be some method for relaxing criteria
	// after the first wave of workers is sent off. If the first waves of
	// workers fail, the next wave needs to realize that it shouldn't
	// immediately go on standby, because for some reason there were failures
	// in the first wave and now the second/etc. wave of workers is needed.

	// Figure out if this chunk needs another worker actively downloading
	// pieces. The number of workers that should be active simultaneously on
	// this chunk is the minimum number of pieces required for recovery plus
	// the number of overdrive workers (typically zero). For our purposes,
	// completed pieces count as active workers, even though those workers
	// have actually finished.
	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
	workersDesired := piecesInProgress < desiredPiecesInProgress
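
	// For illustration: with a hypothetical 10-of-30 erasure code and an
	// overdrive of 2, desiredPiecesInProgress would be 12, so a thirteenth
	// worker arriving while 12 pieces are already registered or completed
	// would be placed on standby rather than started.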

	if workersDesired && meetsExtraCriteria {
		// The worker can be useful. Register the worker and return the chunk
		// for downloading.
		udc.piecesRegistered++
		udc.pieceUsage[pieceData.index] = true
		return udc
	}
	// The worker is not needed unless another worker fails, so put this
	// worker on standby for this chunk. The worker is still available to help
	// with the download, so the worker is not removed from the chunk in this
	// codepath.
	udc.workersStandby = append(udc.workersStandby, w)
	return nil
}
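
// For illustration only: when a registered worker later fails, the download
// code can pull workers off of workersStandby and hand them the chunk again,
// conceptually along the lines below. This is a hypothetical sketch of that
// wake step; the actual logic lives with the unfinishedDownloadChunk code,
// not in this file.
//
//	udc.mu.Lock()
//	standby := udc.workersStandby
//	udc.workersStandby = udc.workersStandby[:0]
//	udc.mu.Unlock()
//	for _, sw := range standby {
//		sw.managedQueueDownloadChunk(udc)
//	}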