gitlab.com/SiaPrime/SiaPrime@v1.4.1/modules/renter/workerdownload.go

package renter

// workerdownload.go is responsible for coordinating the actual fetching of
// pieces, determining when to add standby workers, when to perform repairs,
// and coordinating resource management between the workers operating on a
// chunk.

import (
	"sync/atomic"
	"time"

	"gitlab.com/SiaPrime/SiaPrime/crypto"
	"gitlab.com/SiaPrime/SiaPrime/modules"
)

// segmentsForRecovery calculates the first segment and how many segments we
// need in total to recover the requested data.
func segmentsForRecovery(chunkFetchOffset, chunkFetchLength uint64, rs modules.ErasureCoder) (uint64, uint64) {
	// If partial decoding is not available we need to download the whole
	// sector.
	if !rs.SupportsPartialEncoding() {
		return 0, uint64(modules.SectorSize) / crypto.SegmentSize
	}
	// Otherwise we need to figure out which segments of the piece we need to
	// download for the recovered data to contain the data we want.
	recoveredSegmentSize := uint64(rs.MinPieces() * crypto.SegmentSize)
	// Calculate the first segment of the download.
	startSegment := chunkFetchOffset / recoveredSegmentSize
	// Calculate the last segment of the download, rounding up if the fetch
	// range does not end on a segment boundary.
	endSegment := (chunkFetchOffset + chunkFetchLength) / recoveredSegmentSize
	if (chunkFetchOffset+chunkFetchLength)%recoveredSegmentSize != 0 {
		endSegment++
	}
	return startSegment, endSegment - startSegment
}

// sectorOffsetAndLength translates the fetch offset and length of the chunk
// into the offset and length of the sector we need to download for a
// successful recovery of the requested data.
func sectorOffsetAndLength(chunkFetchOffset, chunkFetchLength uint64, rs modules.ErasureCoder) (uint64, uint64) {
	segmentIndex, numSegments := segmentsForRecovery(chunkFetchOffset, chunkFetchLength, rs)
	return segmentIndex * crypto.SegmentSize, numSegments * crypto.SegmentSize
}
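// As a worked example (illustrative numbers, not taken from this file):
// assuming crypto.SegmentSize is 64 bytes and an erasure coder with
// MinPieces() == 10 that supports partial encoding, the recovered segment
// size is 10 * 64 = 640 bytes. A fetch at offset 1000 with length 500 spans
// recovered bytes [1000, 1500), so startSegment = 1000/640 = 1 and
// endSegment = ceil(1500/640) = 3, i.e. segments [1, 3) of each piece are
// needed. sectorOffsetAndLength then translates this to byte range [64, 192)
// of each sector: offset 1*64 = 64, length 2*64 = 128.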
// managedPerformDownloadChunkJob will perform some download work if any is
// available, returning false if no work is available.
func (w *worker) managedPerformDownloadChunkJob() bool {
	w.downloadMu.Lock()
	if len(w.downloadChunks) == 0 {
		w.downloadMu.Unlock()
		return false
	}
	udc := w.downloadChunks[0]
	w.downloadChunks = w.downloadChunks[1:]
	w.downloadMu.Unlock()

	// Process this chunk. If the worker is not fit to do the download, or is
	// put on standby, 'nil' will be returned. After the chunk has been
	// processed, the worker will be registered with the chunk.
	//
	// If 'nil' is returned, it is either because the worker has been removed
	// from the chunk entirely, or because the worker has been put on standby.
	udc = w.ownedProcessDownloadChunk(udc)
	if udc == nil {
		return true
	}
	// The worker is being given a chance to work. After the work is complete,
	// whether successful or failed, the worker needs to be removed.
	defer udc.managedRemoveWorker()

	// Fetch the sector. If fetching the sector fails, the worker needs to be
	// unregistered from the chunk.
	d, err := w.renter.hostContractor.Downloader(w.staticHostPubKey, w.renter.tg.StopChan())
	if err != nil {
		w.renter.log.Debugln("worker failed to create downloader:", err)
		udc.managedUnregisterWorker(w)
		return true
	}
	defer d.Close()
	fetchOffset, fetchLength := sectorOffsetAndLength(udc.staticFetchOffset, udc.staticFetchLength, udc.erasureCode)
	root := udc.staticChunkMap[w.staticHostPubKey.String()].root
	pieceData, err := d.Download(root, uint32(fetchOffset), uint32(fetchLength))
	if err != nil {
		w.renter.log.Debugln("worker failed to download sector:", err)
		udc.managedUnregisterWorker(w)
		return true
	}
	// TODO: Instead of adding the whole sector after the download completes,
	// have the 'd.Sector' call add to this value ongoing as the sector comes
	// in. Perhaps even include the data from creating the downloader and other
	// data sent to and received from the host (like signatures) that aren't
	// actually payload data.
	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)

	// Decrypt the piece. This might introduce some overhead for downloads with
	// a large overdrive. It shouldn't be a bottleneck though, since bandwidth
	// is usually a lot more scarce than CPU processing power.
	pieceIndex := udc.staticChunkMap[w.staticHostPubKey.String()].index
	key := udc.masterKey.Derive(udc.staticChunkIndex, pieceIndex)
	decryptedPiece, err := key.DecryptBytesInPlace(pieceData, fetchOffset/crypto.SegmentSize)
	if err != nil {
		w.renter.log.Debugln("worker failed to decrypt piece:", err)
		udc.managedUnregisterWorker(w)
		return true
	}

	// Mark the piece as completed. Perform chunk recovery if we newly have
	// enough pieces to do so. Chunk recovery is an expensive operation that
	// should be performed in a separate thread so as not to block the worker.
	udc.mu.Lock()
	udc.markPieceCompleted(pieceIndex)
	udc.piecesRegistered--
	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
		atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength/uint64(udc.erasureCode.MinPieces()))
		udc.physicalChunkData[pieceIndex] = decryptedPiece
	} else {
		// This worker's piece was not needed, another worker was faster. Nil
		// out the piece so the GC can reclaim it sooner.
		decryptedPiece = nil
	}
	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
		// Integer division might not cause atomicDataReceived to cleanly add
		// up to staticFetchLength, so figure out how much has already been
		// added to the download and add the remainder.
		addedReceivedData := uint64(udc.erasureCode.MinPieces()) * (udc.staticFetchLength / uint64(udc.erasureCode.MinPieces()))
		atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength-addedReceivedData)
		// Recover the logical data.
		if err := w.renter.tg.Add(); err != nil {
			w.renter.log.Debugln("worker failed to register recovery thread:", err)
			udc.mu.Unlock()
			return true
		}
		go func() {
			defer w.renter.tg.Done()
			udc.threadedRecoverLogicalData()
		}()
	}
	udc.mu.Unlock()
	return true
}
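// A worker's main loop would drain this job queue roughly as follows (an
// illustrative sketch, not the actual loop, which lives elsewhere in the
// worker code and also handles upload jobs and shutdown):
//
//	for {
//		for w.managedPerformDownloadChunkJob() {
//			// Keep working while download jobs remain queued.
//		}
//		// Block until callQueueDownloadChunk calls staticWake().
//	}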
// managedKillDownloading will drop all of the download work given to the
// worker, and set a signal to prevent the worker from accepting more download
// work.
//
// The chunk cleanup needs to occur after the worker mutex is released so that
// the worker is not locked while chunk cleanup is happening.
func (w *worker) managedKillDownloading() {
	w.downloadMu.Lock()
	var removedChunks []*unfinishedDownloadChunk
	for i := 0; i < len(w.downloadChunks); i++ {
		removedChunks = append(removedChunks, w.downloadChunks[i])
	}
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadTerminated = true
	w.downloadMu.Unlock()
	for i := 0; i < len(removedChunks); i++ {
		removedChunks[i].managedRemoveWorker()
	}
}

// callQueueDownloadChunk adds a chunk to the worker's queue.
func (w *worker) callQueueDownloadChunk(udc *unfinishedDownloadChunk) {
	// Accept the chunk unless the worker has been terminated. Accepting the
	// chunk needs to happen under the same lock as fetching the termination
	// status.
	w.downloadMu.Lock()
	terminated := w.downloadTerminated
	if !terminated {
		// Accept the chunk and issue a notification to the master thread that
		// there is a new download.
		w.downloadChunks = append(w.downloadChunks, udc)
		w.staticWake()
	}
	w.downloadMu.Unlock()

	// If the worker has terminated, remove it from the udc. This call needs to
	// happen without holding the worker lock.
	if terminated {
		udc.managedRemoveWorker()
	}
}

// managedUnregisterWorker will remove the worker from an unfinished download
// chunk, and then un-register the pieces that it grabbed. This function should
// only be called when a worker download fails.
func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
	udc.mu.Lock()
	udc.piecesRegistered--
	udc.pieceUsage[udc.staticChunkMap[w.staticHostPubKey.String()].index] = false
	udc.mu.Unlock()
}

// ownedOnDownloadCooldown returns true if the worker is on cooldown from
// failed downloads. This function should only be called by the master worker
// thread, and does not require any mutexes.
func (w *worker) ownedOnDownloadCooldown() bool {
	requiredCooldown := downloadFailureCooldown
	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
		requiredCooldown *= 2
	}
	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
}
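// The cooldown doubles once per consecutive failure, capped at
// maxConsecutivePenalty doublings. For example (illustrative values only): if
// downloadFailureCooldown were one minute and maxConsecutivePenalty at least
// three, a worker with three consecutive failures would stay on cooldown for
// 1min * 2^3 = 8 minutes after its most recent failure.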
// ownedProcessDownloadChunk will take a potential download chunk, figure out
// if there is work to do, and then perform any registration or processing
// with the chunk before returning the chunk to the caller.
//
// If no immediate action is required, 'nil' will be returned.
func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
	// Determine whether the worker needs to drop the chunk. If so, remove the
	// worker and return nil. The worker only needs to be removed if the worker
	// is being dropped.
	udc.mu.Lock()
	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces() || udc.download.staticComplete()
	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
	pieceData, workerHasPiece := udc.staticChunkMap[w.staticHostPubKey.String()]
	pieceCompleted := udc.completedPieces[pieceData.index]
	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceCompleted {
		udc.mu.Unlock()
		udc.managedRemoveWorker()
		return nil
	}
	defer udc.mu.Unlock()

	// TODO: This is where we would put filters based on worker latency, worker
	// price, worker throughput, etc. There's a lot of fancy stuff we can do
	// with filtering to make sure that for any given chunk we always use the
	// optimal set of workers, and this is the spot where most of the filtering
	// will happen.
	//
	// One major thing that we will want to be careful about when we improve
	// this section is total memory vs. worker bandwidth. If the renter is
	// consistently memory bottlenecked such that slow hosts hog all of the
	// memory and choke out the fast hosts, the network connections end up
	// underutilized even though there are enough fast hosts to fully utilize
	// the network. Part of this will be solved by adding bandwidth stats to
	// the hostdb, but part of it will need to be solved by making sure that we
	// automatically put low-bandwidth or high-latency workers on standby if we
	// know that memory is the bottleneck as opposed to download bandwidth.
	//
	// Workers that do not meet the extra criteria are not discarded but rather
	// put on standby, so that they can step in if the workers that do meet the
	// extra criteria fail or otherwise prove insufficient.
	//
	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
	// metrics, so that we can avoid holding the worker lock and the udc lock
	// simultaneously (deadlock risk). The 'owned' variables of the worker are
	// variables that are only accessed by the master worker thread.
	meetsExtraCriteria := true

	// TODO: There's going to need to be some method for relaxing criteria
	// after the first wave of workers is sent off. If the first wave of
	// workers fails, the next wave needs to realize that it shouldn't
	// immediately go on standby, because for some reason there were failures
	// in the first wave and now the second/etc. wave of workers is needed.

	// Figure out if this chunk needs another worker actively downloading
	// pieces. The number of workers that should be active simultaneously on
	// this chunk is the minimum number of pieces required for recovery plus
	// the number of overdrive workers (typically zero). For our purposes,
	// completed pieces count as active workers, even though those workers
	// have actually finished.
	pieceTaken := udc.pieceUsage[pieceData.index]
	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
	workersDesired := piecesInProgress < desiredPiecesInProgress && !pieceTaken

	if workersDesired && meetsExtraCriteria {
		// The worker can be useful. Register the worker and return the chunk
		// for downloading.
		udc.piecesRegistered++
		udc.pieceUsage[pieceData.index] = true
		return udc
	}
	// The worker is not needed unless another worker fails, so put this
	// worker on standby for this chunk. The worker is still available to help
	// with the download, so the worker is not removed from the chunk in this
	// codepath.
	udc.workersStandby = append(udc.workersStandby, w)
	return nil
}
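// As a worked example of the registration accounting above (illustrative
// numbers only): with MinPieces() == 10 and staticOverdrive == 2,
// desiredPiecesInProgress is 12. If 9 pieces are registered and 2 are
// completed, piecesInProgress is 11, so the next worker holding an untaken
// piece registers and begins downloading. A worker arriving after that sees
// piecesInProgress == 12 and is placed on standby, stepping in only if a
// registered worker fails and unregisters its piece.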