gitlab.com/jokerrs1/Sia@v1.3.2/modules/renter/workerdownload.go

package renter

// workerdownload.go is responsible for coordinating the actual fetching of
// pieces, determining when to add standby workers, when to perform repairs, and
// coordinating resource management between the workers operating on a chunk.

import (
	"sync/atomic"
	"time"
)

// managedDownload will perform some download work.
func (w *worker) managedDownload(udc *unfinishedDownloadChunk) {
	// Process this chunk. If the worker is not fit to do the download, or is
	// put on standby, 'nil' will be returned. After the chunk has been
	// processed, the worker will be registered with the chunk.
	//
	// If 'nil' is returned, it is either because the worker has been removed
	// from the chunk entirely, or because the worker has been put on standby.
	udc = w.ownedProcessDownloadChunk(udc)
	if udc == nil {
		return
	}
	// Worker is being given a chance to work. After the work is complete,
	// whether successful or failed, the worker needs to be removed.
	defer udc.managedRemoveWorker()

	// Fetch the sector. If fetching the sector fails, the worker needs to be
	// unregistered from the chunk.
	d, err := w.renter.hostContractor.Downloader(w.contract.ID, w.renter.tg.StopChan())
	if err != nil {
		udc.managedUnregisterWorker(w)
		return
	}
	defer d.Close()
	data, err := d.Sector(udc.staticChunkMap[w.contract.ID].root)
	if err != nil {
		udc.managedUnregisterWorker(w)
		return
	}
	// TODO: Instead of adding the whole sector after the download completes,
	// have the 'd.Sector' call add to this value ongoing as the sector comes
	// in. Perhaps even include the data from creating the downloader and other
	// data sent to and received from the host (like signatures) that aren't
	// actually payload data.
	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)

	// Mark the piece as completed. Perform chunk recovery if we newly have
	// enough pieces to do so. Chunk recovery is an expensive operation that
	// should be performed in a separate thread so as not to block the worker.
	udc.mu.Lock()
	udc.piecesCompleted++
	udc.piecesRegistered--
	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
		udc.physicalChunkData[udc.staticChunkMap[w.contract.ID].index] = data
	}
	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
		go udc.threadedRecoverLogicalData()
	}
	udc.mu.Unlock()
}

// managedKillDownloading will drop all of the download work given to the
// worker, and set a signal to prevent the worker from accepting more download
// work.
//
// The chunk cleanup needs to occur after the worker mutex is released so that
// the worker is not locked while chunk cleanup is happening.
func (w *worker) managedKillDownloading() {
	w.downloadMu.Lock()
	var removedChunks []*unfinishedDownloadChunk
	for i := 0; i < len(w.downloadChunks); i++ {
		removedChunks = append(removedChunks, w.downloadChunks[i])
	}
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadTerminated = true
	w.downloadMu.Unlock()
	for i := 0; i < len(removedChunks); i++ {
		removedChunks[i].managedRemoveWorker()
	}
}
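// exampleRecoveryTrigger is an illustrative sketch (not part of the original
// file) of the counting logic at the end of managedDownload above. Recovery is
// launched exactly once per chunk, on the transition to MinPieces completed
// pieces, and data arriving after that point is dropped because MinPieces
// pieces already suffice to rebuild the chunk. The 10-of-30 scheme here is a
// hypothetical stand-in for the chunk's erasure code.
func exampleRecoveryTrigger() {
	const minPieces = 10 // stand-in for erasureCode.MinPieces()
	piecesCompleted := 0
	for piece := 0; piece < 30; piece++ {
		piecesCompleted++
		if piecesCompleted <= minPieces {
			// The piece is still needed; managedDownload stores its data in
			// physicalChunkData.
		}
		if piecesCompleted == minPieces {
			// threadedRecoverLogicalData is launched here, and only here, so
			// recovery runs exactly once per chunk.
		}
	}
}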
// managedNextDownloadChunk will pull the next potential chunk out of the work
// queue for downloading.
func (w *worker) managedNextDownloadChunk() *unfinishedDownloadChunk {
	w.downloadMu.Lock()
	defer w.downloadMu.Unlock()

	if len(w.downloadChunks) == 0 {
		return nil
	}
	nextChunk := w.downloadChunks[0]
	w.downloadChunks = w.downloadChunks[1:]
	return nextChunk
}

// managedQueueDownloadChunk adds a chunk to the worker's queue.
func (w *worker) managedQueueDownloadChunk(udc *unfinishedDownloadChunk) {
	// Accept the chunk unless the worker has been terminated. Accepting the
	// chunk needs to happen under the same lock as fetching the termination
	// status.
	w.downloadMu.Lock()
	terminated := w.downloadTerminated
	if !terminated {
		// Accept the chunk and issue a notification to the master thread that
		// there is a new download.
		w.downloadChunks = append(w.downloadChunks, udc)
		select {
		case w.downloadChan <- struct{}{}:
		default:
		}
	}
	w.downloadMu.Unlock()

	// If the worker has terminated, remove it from the udc. This call needs to
	// happen without holding the worker lock.
	if terminated {
		udc.managedRemoveWorker()
	}
}

// managedUnregisterWorker will un-register a worker from an unfinished
// download chunk, releasing the piece that the worker had registered. This
// function should only be called when a worker download fails.
func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
	udc.mu.Lock()
	udc.piecesRegistered--
	udc.pieceUsage[udc.staticChunkMap[w.contract.ID].index] = false
	udc.mu.Unlock()
}

// ownedOnDownloadCooldown returns true if the worker is on cooldown from failed
// downloads. This function should only be called by the master worker thread,
// and does not require any mutexes.
func (w *worker) ownedOnDownloadCooldown() bool {
	requiredCooldown := downloadFailureCooldown
	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
		requiredCooldown *= 2
	}
	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
}
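// exampleCooldownSchedule is an illustrative sketch (not part of the original
// file) of the schedule produced by ownedOnDownloadCooldown above. The loop of
// doublings is equivalent to the closed form
//
//	requiredCooldown = downloadFailureCooldown * 2^min(failures, maxConsecutivePenalty)
//
// The base duration and cap below are hypothetical stand-ins for the package
// constants downloadFailureCooldown and maxConsecutivePenalty.
func exampleCooldownSchedule(consecutiveFailures int) time.Duration {
	const (
		exampleBaseCooldown = time.Minute // stand-in for downloadFailureCooldown
		exampleMaxPenalty   = 10          // stand-in for maxConsecutivePenalty
	)
	cooldown := exampleBaseCooldown
	for i := 0; i < consecutiveFailures && i < exampleMaxPenalty; i++ {
		cooldown *= 2
	}
	// 0 failures -> 1m, 1 -> 2m, 2 -> 4m, 3 -> 8m, ... capped at
	// exampleBaseCooldown * 2^exampleMaxPenalty (1024m) for 10+ failures.
	return cooldown
}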
// ownedProcessDownloadChunk will take a potential download chunk, figure out if
// there is work to do, and then perform any registration or processing with the
// chunk before returning the chunk to the caller.
//
// If no immediate action is required, 'nil' will be returned.
func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
	// Determine whether the worker needs to drop the chunk. If so, remove the
	// worker and return nil. The worker only needs to be removed if it is
	// dropping the chunk.
	udc.mu.Lock()
	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces()
	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
	pieceData, workerHasPiece := udc.staticChunkMap[w.contract.ID]
	pieceTaken := udc.pieceUsage[pieceData.index]
	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceTaken {
		udc.mu.Unlock()
		udc.managedRemoveWorker()
		return nil
	}
	defer udc.mu.Unlock()

	// TODO: This is where we would put filters based on worker latency, worker
	// price, worker throughput, etc. There's a lot of fancy stuff we can do
	// with filtering to make sure that for any given chunk we always use the
	// optimal set of workers, and this is the spot where most of the filtering
	// will happen.
	//
	// One major thing that we will want to be careful about when we improve
	// this section is total memory vs. worker bandwidth. If the renter is
	// consistently memory bottlenecked, slow hosts can hog all of the memory
	// and choke out the fast hosts, leaving network connections underutilized
	// even though there are enough fast hosts to fully utilize the network.
	// Part of this will be solved by adding bandwidth stats to the hostdb, but
	// part of it will need to be solved by making sure that we automatically
	// put low-bandwidth or high-latency workers on standby if we know that
	// memory is the bottleneck as opposed to download bandwidth.
	//
	// Workers that do not meet the extra criteria are not discarded but rather
	// put on standby, so that they can step in if the workers that do meet the
	// extra criteria fail or otherwise prove insufficient.
	//
	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
	// metrics, so that we can avoid holding the worker lock and the udc lock
	// simultaneously (deadlock risk). The 'owned' variables of the worker are
	// variables that are only accessed by the master worker thread.
	meetsExtraCriteria := true

	// TODO: There's going to need to be some method for relaxing criteria
	// after the first wave of workers is sent off. If the first wave of
	// workers fails, the next wave needs to realize that it shouldn't
	// immediately go on standby, because for some reason there were failures
	// in the first wave and now the second (and subsequent) waves of workers
	// are needed.

	// Figure out if this chunk needs another worker actively downloading
	// pieces. The number of workers that should be active simultaneously on
	// this chunk is the minimum number of pieces required for recovery plus
	// the number of overdrive workers (typically zero). For our purposes,
	// completed pieces count as active workers, even though those workers
	// have actually finished.
	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
	workersDesired := piecesInProgress < desiredPiecesInProgress

	if workersDesired && meetsExtraCriteria {
		// Worker can be useful. Register the worker and return the chunk for
		// downloading.
		udc.piecesRegistered++
		udc.pieceUsage[pieceData.index] = true
		return udc
	}
	// Worker is not needed unless another worker fails, so put this worker on
	// standby for this chunk. The worker is still available to help with the
	// download, so the worker is not removed from the chunk in this codepath.
	udc.workersStandby = append(udc.workersStandby, w)
	return nil
}
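// exampleWorkerDemand is an illustrative sketch (not part of the original
// file) of the arithmetic ownedProcessDownloadChunk uses to decide whether to
// register another worker or put it on standby. The concrete numbers are
// hypothetical: a 10-of-30 erasure code with 2 overdrive workers.
func exampleWorkerDemand() bool {
	const (
		minPieces        = 10 // stand-in for erasureCode.MinPieces()
		overdrive        = 2  // stand-in for staticOverdrive: extra simultaneous fetches
		piecesRegistered = 7  // workers currently fetching a piece
		piecesCompleted  = 4  // pieces already downloaded successfully
	)
	// Completed pieces count toward the pieces "in progress" even though the
	// workers that fetched them have finished.
	piecesInProgress := piecesRegistered + piecesCompleted // 11
	desiredPiecesInProgress := minPieces + overdrive       // 12
	// 11 < 12, so one more worker registers and starts downloading; if this
	// were false the worker would be appended to workersStandby instead.
	return piecesInProgress < desiredPiecesInProgress
}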