github.com/fozzysec/SiaPrime@v0.0.0-20190612043147-66c8e8d11fe3/modules/renter/workerdownload.go (about) 1 package renter 2 3 // workerdownload.go is responsible for coordinating the actual fetching of 4 // pieces, determining when to add standby workers, when to perform repairs, and 5 // coordinating resource management between the workers operating on a chunk. 6 7 import ( 8 "sync/atomic" 9 "time" 10 ) 11 12 // managedDownload will perform some download work. 13 func (w *worker) managedDownload(udc *unfinishedDownloadChunk) { 14 // Process this chunk. If the worker is not fit to do the download, or is 15 // put on standby, 'nil' will be returned. After the chunk has been 16 // processed, the worker will be registered with the chunk. 17 // 18 // If 'nil' is returned, it is either because the worker has been removed 19 // from the chunk entirely, or because the worker has been put on standby. 20 udc = w.ownedProcessDownloadChunk(udc) 21 if udc == nil { 22 return 23 } 24 // Worker is being given a chance to work. After the work is complete, 25 // whether successful or failed, the worker needs to be removed. 26 defer udc.managedRemoveWorker() 27 28 // Fetch the sector. If fetching the sector fails, the worker needs to be 29 // unregistered with the chunk. 30 d, err := w.renter.hostContractor.Downloader(w.contract.HostPublicKey, w.renter.tg.StopChan()) 31 if err != nil { 32 w.renter.log.Debugln("worker failed to create downloader:", err) 33 udc.managedUnregisterWorker(w) 34 return 35 } 36 defer d.Close() 37 pieceData, err := d.Sector(udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].root) 38 if err != nil { 39 w.renter.log.Debugln("worker failed to download sector:", err) 40 udc.managedUnregisterWorker(w) 41 return 42 } 43 // TODO: Instead of adding the whole sector after the download completes, 44 // have the 'd.Sector' call add to this value ongoing as the sector comes 45 // in. Perhaps even include the data from creating the downloader and other 46 // data sent to and received from the host (like signatures) that aren't 47 // actually payload data. 48 atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize) 49 50 // Decrypt the piece. This might introduce some overhead for downloads with 51 // a large overdrive. It shouldn't be a bottleneck though since bandwidth 52 // is usually a lot more scarce than CPU processing power. 53 pieceIndex := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index 54 key := deriveKey(udc.masterKey, udc.staticChunkIndex, pieceIndex) 55 decryptedPiece, err := key.DecryptBytesInPlace(pieceData) 56 if err != nil { 57 w.renter.log.Debugln("worker failed to decrypt piece:", err) 58 udc.managedUnregisterWorker(w) 59 return 60 } 61 62 // Mark the piece as completed. Perform chunk recovery if we newly have 63 // enough pieces to do so. Chunk recovery is an expensive operation that 64 // should be performed in a separate thread as to not block the worker. 65 udc.mu.Lock() 66 udc.piecesCompleted++ 67 udc.piecesRegistered-- 68 if udc.piecesCompleted <= udc.erasureCode.MinPieces() { 69 atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength/uint64(udc.erasureCode.MinPieces())) 70 udc.physicalChunkData[pieceIndex] = decryptedPiece 71 } 72 if udc.piecesCompleted == udc.erasureCode.MinPieces() { 73 // Uint division might not always cause atomicDataReceived to cleanly 74 // add up to staticFetchLength so we need to figure out how much we 75 // already added to the download and how much is missing. 76 addedReceivedData := uint64(udc.erasureCode.MinPieces()) * (udc.staticFetchLength / uint64(udc.erasureCode.MinPieces())) 77 atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength-addedReceivedData) 78 // Recover the logical data. 79 go udc.threadedRecoverLogicalData() 80 } 81 udc.mu.Unlock() 82 } 83 84 // managedKillDownloading will drop all of the download work given to the 85 // worker, and set a signal to prevent the worker from accepting more download 86 // work. 87 // 88 // The chunk cleanup needs to occur after the worker mutex is released so that 89 // the worker is not locked while chunk cleanup is happening. 90 func (w *worker) managedKillDownloading() { 91 w.downloadMu.Lock() 92 var removedChunks []*unfinishedDownloadChunk 93 for i := 0; i < len(w.downloadChunks); i++ { 94 removedChunks = append(removedChunks, w.downloadChunks[i]) 95 } 96 w.downloadChunks = w.downloadChunks[:0] 97 w.downloadTerminated = true 98 w.downloadMu.Unlock() 99 for i := 0; i < len(removedChunks); i++ { 100 removedChunks[i].managedRemoveWorker() 101 } 102 } 103 104 // managedNextDownloadChunk will pull the next potential chunk out of the work 105 // queue for downloading. 106 func (w *worker) managedNextDownloadChunk() *unfinishedDownloadChunk { 107 w.downloadMu.Lock() 108 defer w.downloadMu.Unlock() 109 110 if len(w.downloadChunks) == 0 { 111 return nil 112 } 113 nextChunk := w.downloadChunks[0] 114 w.downloadChunks = w.downloadChunks[1:] 115 return nextChunk 116 } 117 118 // managedQueueDownloadChunk adds a chunk to the worker's queue. 119 func (w *worker) managedQueueDownloadChunk(udc *unfinishedDownloadChunk) { 120 // Accept the chunk unless the worker has been terminated. Accepting the 121 // chunk needs to happen under the same lock as fetching the termination 122 // status. 123 w.downloadMu.Lock() 124 terminated := w.downloadTerminated 125 if !terminated { 126 // Accept the chunk and issue a notification to the master thread that 127 // there is a new download. 128 w.downloadChunks = append(w.downloadChunks, udc) 129 select { 130 case w.downloadChan <- struct{}{}: 131 default: 132 } 133 } 134 w.downloadMu.Unlock() 135 136 // If the worker has terminated, remove it from the udc. This call needs to 137 // happen without holding the worker lock. 138 if terminated { 139 udc.managedRemoveWorker() 140 } 141 } 142 143 // managedUnregisterWorker will remove the worker from an unfinished download 144 // chunk, and then un-register the pieces that it grabbed. This function should 145 // only be called when a worker download fails. 146 func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) { 147 udc.mu.Lock() 148 udc.piecesRegistered-- 149 udc.pieceUsage[udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index] = false 150 udc.mu.Unlock() 151 } 152 153 // ownedOnDownloadCooldown returns true if the worker is on cooldown from failed 154 // downloads. This function should only be called by the master worker thread, 155 // and does not require any mutexes. 156 func (w *worker) ownedOnDownloadCooldown() bool { 157 requiredCooldown := downloadFailureCooldown 158 for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ { 159 requiredCooldown *= 2 160 } 161 return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown)) 162 } 163 164 // ownedProcessDownloadChunk will take a potential download chunk, figure out if 165 // there is work to do, and then perform any registration or processing with the 166 // chunk before returning the chunk to the caller. 167 // 168 // If no immediate action is required, 'nil' will be returned. 169 func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk { 170 // Determine whether the worker needs to drop the chunk. If so, remove the 171 // worker and return nil. Worker only needs to be removed if worker is being 172 // dropped. 173 udc.mu.Lock() 174 chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces() 175 chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces() 176 pieceData, workerHasPiece := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)] 177 pieceTaken := udc.pieceUsage[pieceData.index] 178 if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceTaken { 179 udc.mu.Unlock() 180 udc.managedRemoveWorker() 181 return nil 182 } 183 defer udc.mu.Unlock() 184 185 // TODO: This is where we would put filters based on worker latency, worker 186 // price, worker throughput, etc. There's a lot of fancy stuff we can do 187 // with filtering to make sure that for any given chunk we always use the 188 // optimal set of workers, and this is the spot where most of the filtering 189 // will happen. 190 // 191 // One major thing that we will want to be careful about when we improve 192 // this section is total memory vs. worker bandwidth. If the renter is 193 // consistently memory bottlenecked such that the slow hosts are hogging all 194 // of the memory and choking out the fasts hosts, leading to underutilized 195 // network connections where we actually have enough fast hosts to be fully 196 // utilizing the network. Part of this will be solved by adding bandwidth 197 // stats to the hostdb, but part of it will need to be solved by making sure 198 // that we automatically put low-bandwidth or high-latency workers on 199 // standby if we know that memory is the bottleneck as opposed to download 200 // bandwidth. 201 // 202 // Workers that do not meet the extra criteria are not discarded but rather 203 // put on standby, so that they can step in if the workers that do meet the 204 // extra criteria fail or otherwise prove insufficient. 205 // 206 // NOTE: Any metrics that we pull from the worker here need to be 'owned' 207 // metrics, so that we can avoid holding the worker lock and the udc lock 208 // simultaneously (deadlock risk). The 'owned' variables of the worker are 209 // variables that are only accessed by the master worker thread. 210 meetsExtraCriteria := true 211 212 // TODO: There's going to need to be some method for relaxing criteria after 213 // the first wave of workers are sent off. If the first waves of workers 214 // fail, the next wave need to realize that they shouldn't immediately go on 215 // standby because for some reason there were failures in the first wave and 216 // now the second/etc. wave of workers is needed. 217 218 // Figure out if this chunk needs another worker actively downloading 219 // pieces. The number of workers that should be active simultaneously on 220 // this chunk is the minimum number of pieces required for recovery plus the 221 // number of overdrive workers (typically zero). For our purposes, completed 222 // pieces count as active workers, though the workers have actually 223 // finished. 224 piecesInProgress := udc.piecesRegistered + udc.piecesCompleted 225 desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive 226 workersDesired := piecesInProgress < desiredPiecesInProgress 227 228 if workersDesired && meetsExtraCriteria { 229 // Worker can be useful. Register the worker and return the chunk for 230 // downloading. 231 udc.piecesRegistered++ 232 udc.pieceUsage[pieceData.index] = true 233 return udc 234 } 235 // Worker is not needed unless another worker fails, so put this worker on 236 // standby for this chunk. The worker is still available to help with the 237 // download, so the worker is not removed from the chunk in this codepath. 238 udc.workersStandby = append(udc.workersStandby, w) 239 return nil 240 }