gitlab.com/jokerrs1/Sia@v1.3.2/modules/renter/workerdownload.go

package renter

// workerdownload.go is responsible for coordinating the actual fetching of
// pieces, determining when to add standby workers, when to perform repairs, and
// coordinating resource management between the workers operating on a chunk.

import (
	"sync/atomic"
	"time"
)

// managedDownload will perform some download work.
func (w *worker) managedDownload(udc *unfinishedDownloadChunk) {
	// Process this chunk. If the worker is not fit to do the download, or is
	// put on standby, 'nil' will be returned. After the chunk has been
	// processed, the worker will be registered with the chunk.
	//
	// If 'nil' is returned, it is either because the worker has been removed
	// from the chunk entirely, or because the worker has been put on standby.
	udc = w.ownedProcessDownloadChunk(udc)
	if udc == nil {
		return
	}
	// Worker is being given a chance to work. After the work is complete,
	// whether successful or failed, the worker needs to be removed.
	defer udc.managedRemoveWorker()

	// Fetch the sector. If fetching the sector fails, the worker needs to be
	// unregistered from the chunk.
	d, err := w.renter.hostContractor.Downloader(w.contract.ID, w.renter.tg.StopChan())
	if err != nil {
		udc.managedUnregisterWorker(w)
		return
	}
	defer d.Close()
	data, err := d.Sector(udc.staticChunkMap[w.contract.ID].root)
	if err != nil {
		udc.managedUnregisterWorker(w)
		return
	}
	// TODO: Instead of adding the whole sector after the download completes,
	// have the 'd.Sector' call add to this value ongoing as the sector comes
	// in. Perhaps even include the data from creating the downloader and other
	// data sent to and received from the host (like signatures) that aren't
	// actually payload data.
	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)

	// Mark the piece as completed. Perform chunk recovery if we newly have
	// enough pieces to do so. Chunk recovery is an expensive operation that
	// should be performed in a separate thread so as not to block the worker.
	udc.mu.Lock()
	udc.piecesCompleted++
	udc.piecesRegistered--
	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
		udc.physicalChunkData[udc.staticChunkMap[w.contract.ID].index] = data
	}
	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
		go udc.threadedRecoverLogicalData()
	}
	udc.mu.Unlock()
}
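
// The transfer counter above is updated with sync/atomic so that overall
// download progress can be read without holding any locks. A minimal sketch of
// such a reader (a hypothetical helper, not part of the original file; the
// renter's real progress reporting may live elsewhere):
func downloadProgressBytes(udc *unfinishedDownloadChunk) uint64 {
	// Load the counter atomically to pair with the atomic.AddUint64 above.
	return atomic.LoadUint64(&udc.download.atomicTotalDataTransferred)
}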

// managedKillDownloading will drop all of the download work given to the
// worker, and set a signal to prevent the worker from accepting more download
// work.
//
// The chunk cleanup needs to occur after the worker mutex is released so that
// the worker is not locked while chunk cleanup is happening.
func (w *worker) managedKillDownloading() {
	w.downloadMu.Lock()
	var removedChunks []*unfinishedDownloadChunk
	for i := 0; i < len(w.downloadChunks); i++ {
		removedChunks = append(removedChunks, w.downloadChunks[i])
	}
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadTerminated = true
	w.downloadMu.Unlock()
	for i := 0; i < len(removedChunks); i++ {
		removedChunks[i].managedRemoveWorker()
	}
}
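
// The function above follows a snapshot-then-cleanup pattern: the queue is
// copied and cleared while holding w.downloadMu, and managedRemoveWorker is
// only called after the mutex is released, so that the worker is not locked
// while the per-chunk cleanup runs. A minimal sketch of the same shape as a
// generic helper (hypothetical, assuming a caller-supplied cleanup callback):
func (w *worker) sketchDrainDownloads(cleanup func(*unfinishedDownloadChunk)) {
	w.downloadMu.Lock()
	// Snapshot and clear the queue under the worker mutex.
	removed := append([]*unfinishedDownloadChunk(nil), w.downloadChunks...)
	w.downloadChunks = w.downloadChunks[:0]
	w.downloadMu.Unlock()
	// Run the cleanup with the worker mutex released.
	for _, udc := range removed {
		cleanup(udc)
	}
}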

// managedNextDownloadChunk will pull the next potential chunk out of the work
// queue for downloading.
func (w *worker) managedNextDownloadChunk() *unfinishedDownloadChunk {
	w.downloadMu.Lock()
	defer w.downloadMu.Unlock()

	if len(w.downloadChunks) == 0 {
		return nil
	}
	nextChunk := w.downloadChunks[0]
	w.downloadChunks = w.downloadChunks[1:]
	return nextChunk
}

// managedQueueDownloadChunk adds a chunk to the worker's queue.
func (w *worker) managedQueueDownloadChunk(udc *unfinishedDownloadChunk) {
	// Accept the chunk unless the worker has been terminated. Accepting the
	// chunk needs to happen under the same lock as fetching the termination
	// status.
	w.downloadMu.Lock()
	terminated := w.downloadTerminated
	if !terminated {
		// Accept the chunk and issue a notification to the master thread that
		// there is a new download.
		w.downloadChunks = append(w.downloadChunks, udc)
		select {
		case w.downloadChan <- struct{}{}:
		default:
		}
	}
	w.downloadMu.Unlock()

	// If the worker has terminated, remove it from the udc. This call needs to
	// happen without holding the worker lock.
	if terminated {
		udc.managedRemoveWorker()
	}
}
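
// The send on downloadChan above is non-blocking, so several queued chunks can
// be coalesced into a single notification. A consumer therefore has to drain
// the whole queue on every wakeup rather than assume one chunk per signal. A
// minimal sketch of such a loop (hypothetical; the real worker loop is assumed
// to live in worker.go and to also handle uploads and shutdown):
func (w *worker) sketchDownloadLoop() {
	for range w.downloadChan {
		// Drain every queued chunk before waiting for the next notification.
		for udc := w.managedNextDownloadChunk(); udc != nil; udc = w.managedNextDownloadChunk() {
			w.managedDownload(udc)
		}
	}
}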

// managedUnregisterWorker will un-register the piece that the worker grabbed
// from an unfinished download chunk, freeing the piece up to be attempted
// again. This function should only be called when a worker download fails.
func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
	udc.mu.Lock()
	udc.piecesRegistered--
	udc.pieceUsage[udc.staticChunkMap[w.contract.ID].index] = false
	udc.mu.Unlock()
}

// ownedOnDownloadCooldown returns true if the worker is on cooldown from failed
// downloads. This function should only be called by the master worker thread,
// and does not require any mutexes.
func (w *worker) ownedOnDownloadCooldown() bool {
	requiredCooldown := downloadFailureCooldown
	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
		requiredCooldown *= 2
	}
	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
}
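
// The loop above doubles the base cooldown once per consecutive failure, capped
// at maxConsecutivePenalty doublings, i.e. the cooldown works out to
// downloadFailureCooldown * 2^min(failures, maxConsecutivePenalty). A minimal
// sketch of that closed form (hypothetical helper, equivalent to the loop for
// non-negative inputs):
func cooldownDuration(base time.Duration, failures, maxPenalty int) time.Duration {
	if failures > maxPenalty {
		failures = maxPenalty
	}
	// Shifting 1 left by 'failures' computes 2^failures.
	return base * time.Duration(1<<uint(failures))
}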

// ownedProcessDownloadChunk will take a potential download chunk, figure out if
// there is work to do, and then perform any registration or processing with the
// chunk before returning the chunk to the caller.
//
// If no immediate action is required, 'nil' will be returned.
func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
	// Determine whether the worker needs to drop the chunk. If so, remove the
	// worker and return nil; removal is only needed when the worker is
	// dropping the chunk.
	udc.mu.Lock()
	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces()
	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
	pieceData, workerHasPiece := udc.staticChunkMap[w.contract.ID]
	pieceTaken := udc.pieceUsage[pieceData.index]
	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceTaken {
		udc.mu.Unlock()
		udc.managedRemoveWorker()
		return nil
	}
	defer udc.mu.Unlock()

	// TODO: This is where we would put filters based on worker latency, worker
	// price, worker throughput, etc. There's a lot of fancy stuff we can do
	// with filtering to make sure that for any given chunk we always use the
	// optimal set of workers, and this is the spot where most of the filtering
	// will happen.
	//
	// One major thing that we will want to be careful about when we improve
	// this section is total memory vs. worker bandwidth. If the renter is
	// consistently memory bottlenecked, slow hosts can hog all of the memory
	// and choke out the fast hosts, leaving network connections underutilized
	// even when there are enough fast hosts to fully utilize the network. Part
	// of this will be solved by adding bandwidth stats to the hostdb, but part
	// of it will need to be solved by making sure that we automatically put
	// low-bandwidth or high-latency workers on standby if we know that memory
	// is the bottleneck as opposed to download bandwidth.
	//
	// Workers that do not meet the extra criteria are not discarded but rather
	// put on standby, so that they can step in if the workers that do meet the
	// extra criteria fail or otherwise prove insufficient.
	//
	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
	// metrics, so that we can avoid holding the worker lock and the udc lock
	// simultaneously (deadlock risk). The 'owned' variables of the worker are
	// variables that are only accessed by the master worker thread.
	meetsExtraCriteria := true

	// TODO: There's going to need to be some method for relaxing criteria after
	// the first wave of workers is sent off. If the first wave of workers
	// fails, the next wave needs to realize that it shouldn't immediately go on
	// standby, because for some reason there were failures in the first wave
	// and now the second/etc. wave of workers is needed.

	// Figure out if this chunk needs another worker actively downloading
	// pieces. The number of workers that should be active simultaneously on
	// this chunk is the minimum number of pieces required for recovery plus the
	// number of overdrive workers (typically zero). For our purposes, completed
	// pieces count as active workers, even though those workers have actually
	// finished.
	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
	workersDesired := piecesInProgress < desiredPiecesInProgress

	if workersDesired && meetsExtraCriteria {
		// Worker can be useful. Register the worker and return the chunk for
		// downloading.
		udc.piecesRegistered++
		udc.pieceUsage[pieceData.index] = true
		return udc
	}
	// Worker is not needed unless another worker fails, so put this worker on
	// standby for this chunk. The worker is still available to help with the
	// download, so the worker is not removed from the chunk in this codepath.
	udc.workersStandby = append(udc.workersStandby, w)
	return nil
}
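
// The scheduling rule above boils down to: keep handing out work while the
// number of registered plus completed pieces is below MinPieces plus the
// overdrive count, and put every additional worker on standby. A minimal
// sketch of that predicate (hypothetical helper mirroring the
// piecesInProgress / desiredPiecesInProgress comparison):
func wantsAnotherWorker(piecesRegistered, piecesCompleted, minPieces, overdrive int) bool {
	return piecesRegistered+piecesCompleted < minPieces+overdrive
}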