gitlab.com/SiaPrime/SiaPrime@v1.4.1/modules/renter/workerdownload.go

package renter

// workerdownload.go is responsible for coordinating the actual fetching of
// pieces, determining when to add standby workers, when to perform repairs,
// and coordinating resource management between the workers operating on a
// chunk.

import (
	"sync/atomic"
	"time"

	"gitlab.com/SiaPrime/SiaPrime/crypto"
	"gitlab.com/SiaPrime/SiaPrime/modules"
)

// segmentsForRecovery calculates the first segment and how many segments we
// need in total to recover the requested data.
func segmentsForRecovery(chunkFetchOffset, chunkFetchLength uint64, rs modules.ErasureCoder) (uint64, uint64) {
	// If partial decoding is not available we need to download the whole
	// sector.
	if !rs.SupportsPartialEncoding() {
		return 0, uint64(modules.SectorSize) / crypto.SegmentSize
	}
	// Otherwise we need to figure out which segments of the piece we need to
	// download for the recovered data to contain the data we want.
	recoveredSegmentSize := uint64(rs.MinPieces() * crypto.SegmentSize)
	// Calculate the first segment of the download.
	startSegment := chunkFetchOffset / recoveredSegmentSize
	// Calculate the last segment of the download, rounding up if the fetch
	// range does not end on a segment boundary.
	endSegment := (chunkFetchOffset + chunkFetchLength) / recoveredSegmentSize
	if (chunkFetchOffset+chunkFetchLength)%recoveredSegmentSize != 0 {
		endSegment++
	}
	return startSegment, endSegment - startSegment
}

// sectorOffsetAndLength translates the fetch offset and length of the chunk
// into the offset and length of the sector we need to download for a
// successful recovery of the requested data.
func sectorOffsetAndLength(chunkFetchOffset, chunkFetchLength uint64, rs modules.ErasureCoder) (uint64, uint64) {
	segmentIndex, numSegments := segmentsForRecovery(chunkFetchOffset, chunkFetchLength, rs)
	return segmentIndex * crypto.SegmentSize, numSegments * crypto.SegmentSize
}
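// As a worked example (illustrative numbers, not taken from this file):
// assuming crypto.SegmentSize is 64 bytes and an erasure coder with
// MinPieces() == 10 that supports partial encoding, the recovered segment
// size is 10 * 64 = 640 bytes. A fetch at offset 1000 with length 500 spans
// recovered bytes [1000, 1500), so startSegment = 1000/640 = 1 and
// endSegment = ceil(1500/640) = 3, i.e. segments [1, 3) of each piece are
// needed. sectorOffsetAndLength then translates this to byte range [64, 192)
// of each sector: offset 1*64 = 64, length 2*64 = 128.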
// managedPerformDownloadChunkJob will perform some download work if any is
// available, returning false if no work is available.
func (w *worker) managedPerformDownloadChunkJob() bool {
	w.downloadMu.Lock()
	if len(w.downloadChunks) == 0 {
		w.downloadMu.Unlock()
		return false
	}
	udc := w.downloadChunks[0]
	w.downloadChunks = w.downloadChunks[1:]
	w.downloadMu.Unlock()

	// Process this chunk. If the worker is not fit to do the download, or is
	// put on standby, 'nil' will be returned. After the chunk has been
	// processed, the worker will be registered with the chunk.
	//
	// If 'nil' is returned, it is either because the worker has been removed
	// from the chunk entirely, or because the worker has been put on standby.
	udc = w.ownedProcessDownloadChunk(udc)
	if udc == nil {
		return true
	}
	// The worker is being given a chance to work. After the work is complete,
	// whether successful or failed, the worker needs to be removed.
	defer udc.managedRemoveWorker()

	// Fetch the sector. If fetching the sector fails, the worker needs to be
	// unregistered from the chunk.
	d, err := w.renter.hostContractor.Downloader(w.staticHostPubKey, w.renter.tg.StopChan())
	if err != nil {
		w.renter.log.Debugln("worker failed to create downloader:", err)
		udc.managedUnregisterWorker(w)
		return true
	}
	defer d.Close()
	fetchOffset, fetchLength := sectorOffsetAndLength(udc.staticFetchOffset, udc.staticFetchLength, udc.erasureCode)
	root := udc.staticChunkMap[w.staticHostPubKey.String()].root
	pieceData, err := d.Download(root, uint32(fetchOffset), uint32(fetchLength))
	if err != nil {
		w.renter.log.Debugln("worker failed to download sector:", err)
		udc.managedUnregisterWorker(w)
		return true
	}
	// TODO: Instead of adding the whole sector after the download completes,
	// have the 'd.Sector' call add to this value ongoing as the sector comes
	// in. Perhaps even include the data from creating the downloader and other
	// data sent to and received from the host (like signatures) that aren't
	// actually payload data.
	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)

	// Decrypt the piece. This might introduce some overhead for downloads with
	// a large overdrive. It shouldn't be a bottleneck though, since bandwidth
	// is usually a lot more scarce than CPU processing power.
	pieceIndex := udc.staticChunkMap[w.staticHostPubKey.String()].index
	key := udc.masterKey.Derive(udc.staticChunkIndex, pieceIndex)
	decryptedPiece, err := key.DecryptBytesInPlace(pieceData, fetchOffset/crypto.SegmentSize)
	if err != nil {
		w.renter.log.Debugln("worker failed to decrypt piece:", err)
		udc.managedUnregisterWorker(w)
		return true
	}

	// Mark the piece as completed. Perform chunk recovery if we newly have
	// enough pieces to do so. Chunk recovery is an expensive operation that
	// should be performed in a separate thread so as not to block the worker.
	udc.mu.Lock()
	udc.markPieceCompleted(pieceIndex)
	udc.piecesRegistered--
	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
		atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength/uint64(udc.erasureCode.MinPieces()))
		udc.physicalChunkData[pieceIndex] = decryptedPiece
	} else {
		// This worker's piece was not needed, another worker was faster. Nil
		// out the piece so the GC can reclaim it sooner.
		decryptedPiece = nil
	}
	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
		// Integer division might not cause atomicDataReceived to cleanly add
		// up to staticFetchLength, so figure out how much has already been
		// added to the download and add the remainder.
		addedReceivedData := uint64(udc.erasureCode.MinPieces()) * (udc.staticFetchLength / uint64(udc.erasureCode.MinPieces()))
		atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength-addedReceivedData)
		// Recover the logical data.
		if err := w.renter.tg.Add(); err != nil {
			w.renter.log.Debugln("worker failed to register recovery thread:", err)
			udc.mu.Unlock()
			return true
		}
		go func() {
			defer w.renter.tg.Done()
			udc.threadedRecoverLogicalData()
		}()
	}
	udc.mu.Unlock()
	return true
}
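// A worker's main loop would drain this job queue roughly as follows (an
// illustrative sketch, not the actual loop, which lives elsewhere in the
// worker code and also handles upload jobs and shutdown):
//
//	for {
//		for w.managedPerformDownloadChunkJob() {
//			// Keep working while download jobs remain queued.
//		}
//		// Block until callQueueDownloadChunk calls staticWake().
//	}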
// managedKillDownloading will drop all of the download work given to the
// worker, and set a signal to prevent the worker from accepting more download
// work.
//
// The chunk cleanup needs to occur after the worker mutex is released so that
// the worker is not locked while chunk cleanup is happening.
func (w *worker) managedKillDownloading() {
	w.downloadMu.Lock()
	var removedChunks []*unfinishedDownloadChunk
	for i := 0; i < len(w.downloadChunks); i++ {
		removedChunks = append(removedChunks, w.downloadChunks[i])
	}
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadTerminated = true
	w.downloadMu.Unlock()
	for i := 0; i < len(removedChunks); i++ {
		removedChunks[i].managedRemoveWorker()
	}
}

// callQueueDownloadChunk adds a chunk to the worker's queue.
func (w *worker) callQueueDownloadChunk(udc *unfinishedDownloadChunk) {
	// Accept the chunk unless the worker has been terminated. Accepting the
	// chunk needs to happen under the same lock as fetching the termination
	// status.
	w.downloadMu.Lock()
	terminated := w.downloadTerminated
	if !terminated {
		// Accept the chunk and issue a notification to the master thread that
		// there is a new download.
		w.downloadChunks = append(w.downloadChunks, udc)
		w.staticWake()
	}
	w.downloadMu.Unlock()

	// If the worker has terminated, remove it from the udc. This call needs to
	// happen without holding the worker lock.
	if terminated {
		udc.managedRemoveWorker()
	}
}

// managedUnregisterWorker will remove the worker from an unfinished download
// chunk, and then un-register the pieces that it grabbed. This function should
// only be called when a worker download fails.
func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
	udc.mu.Lock()
	udc.piecesRegistered--
	udc.pieceUsage[udc.staticChunkMap[w.staticHostPubKey.String()].index] = false
	udc.mu.Unlock()
}

// ownedOnDownloadCooldown returns true if the worker is on cooldown from
// failed downloads. This function should only be called by the master worker
// thread, and does not require any mutexes.
func (w *worker) ownedOnDownloadCooldown() bool {
	requiredCooldown := downloadFailureCooldown
	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
		requiredCooldown *= 2
	}
	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
}
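// The cooldown doubles once per consecutive failure, capped at
// maxConsecutivePenalty doublings. For example (illustrative values only): if
// downloadFailureCooldown were one minute and maxConsecutivePenalty at least
// three, a worker with three consecutive failures would stay on cooldown for
// 1min * 2^3 = 8 minutes after its most recent failure.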
// ownedProcessDownloadChunk will take a potential download chunk, figure out
// if there is work to do, and then perform any registration or processing
// with the chunk before returning the chunk to the caller.
//
// If no immediate action is required, 'nil' will be returned.
func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
	// Determine whether the worker needs to drop the chunk. If so, remove the
	// worker and return nil. The worker only needs to be removed if the worker
	// is being dropped.
	udc.mu.Lock()
	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces() || udc.download.staticComplete()
	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
	pieceData, workerHasPiece := udc.staticChunkMap[w.staticHostPubKey.String()]
	pieceCompleted := udc.completedPieces[pieceData.index]
	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceCompleted {
		udc.mu.Unlock()
		udc.managedRemoveWorker()
		return nil
	}
	defer udc.mu.Unlock()

	// TODO: This is where we would put filters based on worker latency, worker
	// price, worker throughput, etc. There's a lot of fancy stuff we can do
	// with filtering to make sure that for any given chunk we always use the
	// optimal set of workers, and this is the spot where most of the filtering
	// will happen.
	//
	// One major thing that we will want to be careful about when we improve
	// this section is total memory vs. worker bandwidth. If the renter is
	// consistently memory bottlenecked such that slow hosts hog all of the
	// memory and choke out the fast hosts, the network connections end up
	// underutilized even though there are enough fast hosts to fully utilize
	// the network. Part of this will be solved by adding bandwidth stats to
	// the hostdb, but part of it will need to be solved by making sure that we
	// automatically put low-bandwidth or high-latency workers on standby if we
	// know that memory is the bottleneck as opposed to download bandwidth.
	//
	// Workers that do not meet the extra criteria are not discarded but rather
	// put on standby, so that they can step in if the workers that do meet the
	// extra criteria fail or otherwise prove insufficient.
	//
	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
	// metrics, so that we can avoid holding the worker lock and the udc lock
	// simultaneously (deadlock risk). The 'owned' variables of the worker are
	// variables that are only accessed by the master worker thread.
	meetsExtraCriteria := true

	// TODO: There's going to need to be some method for relaxing criteria
	// after the first wave of workers is sent off. If the first wave of
	// workers fails, the next wave needs to realize that it shouldn't
	// immediately go on standby, because for some reason there were failures
	// in the first wave and now the second/etc. wave of workers is needed.

	// Figure out if this chunk needs another worker actively downloading
	// pieces. The number of workers that should be active simultaneously on
	// this chunk is the minimum number of pieces required for recovery plus
	// the number of overdrive workers (typically zero). For our purposes,
	// completed pieces count as active workers, even though those workers
	// have actually finished.
	pieceTaken := udc.pieceUsage[pieceData.index]
	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
	workersDesired := piecesInProgress < desiredPiecesInProgress && !pieceTaken

	if workersDesired && meetsExtraCriteria {
		// The worker can be useful. Register the worker and return the chunk
		// for downloading.
		udc.piecesRegistered++
		udc.pieceUsage[pieceData.index] = true
		return udc
	}
	// The worker is not needed unless another worker fails, so put this
	// worker on standby for this chunk. The worker is still available to help
	// with the download, so the worker is not removed from the chunk in this
	// codepath.
	udc.workersStandby = append(udc.workersStandby, w)
	return nil
}
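// As a worked example of the registration accounting above (illustrative
// numbers only): with MinPieces() == 10 and staticOverdrive == 2,
// desiredPiecesInProgress is 12. If 9 pieces are registered and 2 are
// completed, piecesInProgress is 11, so the next worker holding an untaken
// piece registers and begins downloading. A worker arriving after that sees
// piecesInProgress == 12 and is placed on standby, stepping in only if a
// registered worker fails and unregisters its piece.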