github.com/nebulouslabs/sia@v1.3.7/modules/renter/workerdownload.go

package renter

// workerdownload.go is responsible for coordinating the actual fetching of
// pieces, determining when to add standby workers, when to perform repairs,
// and coordinating resource management between the workers operating on a
// chunk.

import (
	"sync/atomic"
	"time"
)

// managedDownload will perform some download work.
func (w *worker) managedDownload(udc *unfinishedDownloadChunk) {
	// Process this chunk. If the worker is not fit to do the download, or is
	// put on standby, 'nil' will be returned. After the chunk has been
	// processed, the worker will be registered with the chunk.
	//
	// If 'nil' is returned, it is either because the worker has been removed
	// from the chunk entirely, or because the worker has been put on standby.
	udc = w.ownedProcessDownloadChunk(udc)
	if udc == nil {
		return
	}
	// The worker is being given a chance to work. After the work is complete,
	// whether successful or failed, the worker needs to be removed.
	defer udc.managedRemoveWorker()

	// Fetch the sector. If fetching the sector fails, the worker needs to be
	// unregistered from the chunk.
	d, err := w.renter.hostContractor.Downloader(w.contract.HostPublicKey, w.renter.tg.StopChan())
	if err != nil {
		w.renter.log.Debugln("worker failed to create downloader:", err)
		udc.managedUnregisterWorker(w)
		return
	}
	defer d.Close()
	pieceData, err := d.Sector(udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].root)
	if err != nil {
		w.renter.log.Debugln("worker failed to download sector:", err)
		udc.managedUnregisterWorker(w)
		return
	}
	// TODO: Instead of adding the whole sector after the download completes,
	// have the 'd.Sector' call add to this value incrementally as the sector
	// comes in. Perhaps even include the data from creating the downloader
	// and other data sent to and received from the host (like signatures)
	// that aren't actually payload data.
	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)

	// Decrypt the piece. This might introduce some overhead for downloads with
	// a large overdrive. It shouldn't be a bottleneck though, since bandwidth
	// is usually a lot scarcer than CPU processing power.
	pieceIndex := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index
	key := deriveKey(udc.masterKey, udc.staticChunkIndex, pieceIndex)
	decryptedPiece, err := key.DecryptBytesInPlace(pieceData)
	if err != nil {
		w.renter.log.Debugln("worker failed to decrypt piece:", err)
		udc.managedUnregisterWorker(w)
		return
	}

	// Mark the piece as completed. Perform chunk recovery if we newly have
	// enough pieces to do so. Chunk recovery is an expensive operation that
	// should be performed in a separate thread so as not to block the worker.
	udc.mu.Lock()
	udc.piecesCompleted++
	udc.piecesRegistered--
	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
		udc.physicalChunkData[pieceIndex] = decryptedPiece
	}
	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
		go udc.threadedRecoverLogicalData()
	}
	udc.mu.Unlock()
}
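
// For illustration only: a worker's main loop would typically drain the
// download queue and call managedDownload on each queued chunk, roughly like
// the sketch below. This is a simplified approximation, not the actual loop
// in worker.go.
//
//	for {
//		udc := w.managedNextDownloadChunk()
//		if udc == nil {
//			break
//		}
//		w.managedDownload(udc)
//	}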

// managedKillDownloading will drop all of the download work given to the
// worker, and set a signal to prevent the worker from accepting more download
// work.
//
// The chunk cleanup needs to occur after the worker mutex is released so that
// the worker is not locked while chunk cleanup is happening.
func (w *worker) managedKillDownloading() {
	w.downloadMu.Lock()
	var removedChunks []*unfinishedDownloadChunk
	for i := 0; i < len(w.downloadChunks); i++ {
		removedChunks = append(removedChunks, w.downloadChunks[i])
	}
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadTerminated = true
	w.downloadMu.Unlock()
	for i := 0; i < len(removedChunks); i++ {
		removedChunks[i].managedRemoveWorker()
	}
}

// managedNextDownloadChunk will pull the next potential chunk out of the work
// queue for downloading.
func (w *worker) managedNextDownloadChunk() *unfinishedDownloadChunk {
	w.downloadMu.Lock()
	defer w.downloadMu.Unlock()

	if len(w.downloadChunks) == 0 {
		return nil
	}
	nextChunk := w.downloadChunks[0]
	w.downloadChunks = w.downloadChunks[1:]
	return nextChunk
}

// managedQueueDownloadChunk adds a chunk to the worker's queue.
func (w *worker) managedQueueDownloadChunk(udc *unfinishedDownloadChunk) {
	// Accept the chunk unless the worker has been terminated. Accepting the
	// chunk needs to happen under the same lock as fetching the termination
	// status.
	w.downloadMu.Lock()
	terminated := w.downloadTerminated
	if !terminated {
		// Accept the chunk and issue a notification to the master thread that
		// there is a new download.
		w.downloadChunks = append(w.downloadChunks, udc)
		select {
		case w.downloadChan <- struct{}{}:
		default:
		}
	}
	w.downloadMu.Unlock()

	// If the worker has terminated, remove it from the udc. This call needs to
	// happen without holding the worker lock.
	if terminated {
		udc.managedRemoveWorker()
	}
}

// managedUnregisterWorker will remove the worker from an unfinished download
// chunk, and then un-register the pieces that it grabbed. This function should
// only be called when a worker download fails.
func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
	udc.mu.Lock()
	udc.piecesRegistered--
	udc.pieceUsage[udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index] = false
	udc.mu.Unlock()
}

// ownedOnDownloadCooldown returns true if the worker is on cooldown from failed
// downloads. This function should only be called by the master worker thread,
// and does not require any mutexes.
func (w *worker) ownedOnDownloadCooldown() bool {
	requiredCooldown := downloadFailureCooldown
	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
		requiredCooldown *= 2
	}
	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
}
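
// For illustration: with hypothetical values of downloadFailureCooldown = 1
// minute and maxConsecutivePenalty = 3 (the real constants are defined
// elsewhere in the package and may differ), the required cooldown would grow
// as
//
//	failures: 0 -> 1m, 1 -> 2m, 2 -> 4m, 3+ -> 8m
//
// i.e. the cooldown doubles per consecutive failure and stops doubling after
// maxConsecutivePenalty failures.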

// ownedProcessDownloadChunk will take a potential download chunk, figure out
// if there is work to do, and then perform any registration or processing with
// the chunk before returning the chunk to the caller.
//
// If no immediate action is required, 'nil' will be returned.
func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
	// Determine whether the worker needs to drop the chunk. If so, remove the
	// worker and return nil. The worker only needs to be removed if the chunk
	// is being dropped.
	udc.mu.Lock()
	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces()
	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
	pieceData, workerHasPiece := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)]
	pieceTaken := udc.pieceUsage[pieceData.index]
	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceTaken {
		udc.mu.Unlock()
		udc.managedRemoveWorker()
		return nil
	}
	defer udc.mu.Unlock()

	// TODO: This is where we would put filters based on worker latency, worker
	// price, worker throughput, etc. There's a lot of fancy stuff we can do
	// with filtering to make sure that for any given chunk we always use the
	// optimal set of workers, and this is the spot where most of the filtering
	// will happen.
	//
	// One major thing that we will want to be careful about when we improve
	// this section is total memory vs. worker bandwidth. A renter that is
	// consistently memory bottlenecked can end up with slow hosts hogging all
	// of the memory and choking out the fast hosts, leaving network
	// connections underutilized even when there are enough fast hosts to
	// fully utilize the network. Part of this will be solved by adding
	// bandwidth stats to the hostdb, but part of it will need to be solved by
	// making sure that we automatically put low-bandwidth or high-latency
	// workers on standby if we know that memory is the bottleneck as opposed
	// to download bandwidth.
	//
	// Workers that do not meet the extra criteria are not discarded but rather
	// put on standby, so that they can step in if the workers that do meet the
	// extra criteria fail or otherwise prove insufficient.
	//
	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
	// metrics, so that we can avoid holding the worker lock and the udc lock
	// simultaneously (deadlock risk). The 'owned' variables of the worker are
	// variables that are only accessed by the master worker thread.
	meetsExtraCriteria := true

	// TODO: There's going to need to be some method for relaxing criteria
	// after the first wave of workers is sent off. If the first waves of
	// workers fail, the next wave needs to realize that it shouldn't
	// immediately go on standby, because for some reason there were failures
	// in the first wave and now the second/etc. wave of workers is needed.

	// Figure out if this chunk needs another worker actively downloading
	// pieces. The number of workers that should be active simultaneously on
	// this chunk is the minimum number of pieces required for recovery plus
	// the number of overdrive workers (typically zero). For our purposes,
	// completed pieces count as active workers, even though those workers
	// have actually finished.
	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
	workersDesired := piecesInProgress < desiredPiecesInProgress
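
	// For illustration: with a hypothetical 10-of-30 erasure code and an
	// overdrive of 2, desiredPiecesInProgress would be 12, so a thirteenth
	// worker arriving while 12 pieces are already registered or completed
	// would be placed on standby rather than started.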

	if workersDesired && meetsExtraCriteria {
		// The worker can be useful. Register the worker and return the chunk
		// for downloading.
		udc.piecesRegistered++
		udc.pieceUsage[pieceData.index] = true
		return udc
	}
	// The worker is not needed unless another worker fails, so put this
	// worker on standby for this chunk. The worker is still available to help
	// with the download, so the worker is not removed from the chunk in this
	// codepath.
	udc.workersStandby = append(udc.workersStandby, w)
	return nil
}
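
// For illustration only: when a registered worker later fails, the download
// code can pull workers off of workersStandby and hand them the chunk again,
// conceptually along the lines below. This is a hypothetical sketch of that
// wake step; the actual logic lives with the unfinishedDownloadChunk code,
// not in this file.
//
//	udc.mu.Lock()
//	standby := udc.workersStandby
//	udc.workersStandby = udc.workersStandby[:0]
//	udc.mu.Unlock()
//	for _, sw := range standby {
//		sw.managedQueueDownloadChunk(udc)
//	}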