github.com/fozzysec/SiaPrime@v0.0.0-20190612043147-66c8e8d11fe3/modules/renter/workerdownload.go

github.com/fozzysec/SiaPrime@v0.0.0-20190612043147-66c8e8d11fe3/modules/renter/workerdownload.go (about)

     1  package renter
     2  
     3  // workerdownload.go is responsible for coordinating the actual fetching of
     4  // pieces, determining when to add standby workers, when to perform repairs, and
     5  // coordinating resource management between the workers operating on a chunk.
     6  
     7  import (
     8  	"sync/atomic"
     9  	"time"
    10  )
    11  
    12  // managedDownload will perform some download work.
    13  func (w *worker) managedDownload(udc *unfinishedDownloadChunk) {
    14  	// Process this chunk. If the worker is not fit to do the download, or is
    15  	// put on standby, 'nil' will be returned. After the chunk has been
    16  	// processed, the worker will be registered with the chunk.
    17  	//
    18  	// If 'nil' is returned, it is either because the worker has been removed
    19  	// from the chunk entirely, or because the worker has been put on standby.
    20  	udc = w.ownedProcessDownloadChunk(udc)
    21  	if udc == nil {
    22  		return
    23  	}
    24  	// Worker is being given a chance to work. After the work is complete,
    25  	// whether successful or failed, the worker needs to be removed.
    26  	defer udc.managedRemoveWorker()
    27  
    28  	// Fetch the sector. If fetching the sector fails, the worker needs to be
    29  	// unregistered with the chunk.
    30  	d, err := w.renter.hostContractor.Downloader(w.contract.HostPublicKey, w.renter.tg.StopChan())
    31  	if err != nil {
    32  		w.renter.log.Debugln("worker failed to create downloader:", err)
    33  		udc.managedUnregisterWorker(w)
    34  		return
    35  	}
    36  	defer d.Close()
    37  	pieceData, err := d.Sector(udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].root)
    38  	if err != nil {
    39  		w.renter.log.Debugln("worker failed to download sector:", err)
    40  		udc.managedUnregisterWorker(w)
    41  		return
    42  	}
    43  	// TODO: Instead of adding the whole sector after the download completes,
    44  	// have the 'd.Sector' call add to this value ongoing as the sector comes
    45  	// in. Perhaps even include the data from creating the downloader and other
    46  	// data sent to and received from the host (like signatures) that aren't
    47  	// actually payload data.
    48  	atomic.AddUint64(&udc.download.atomicTotalDataTransferred, udc.staticPieceSize)
    49  
    50  	// Decrypt the piece. This might introduce some overhead for downloads with
    51  	// a large overdrive. It shouldn't be a bottleneck though since bandwidth
    52  	// is usually a lot more scarce than CPU processing power.
    53  	pieceIndex := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index
    54  	key := deriveKey(udc.masterKey, udc.staticChunkIndex, pieceIndex)
    55  	decryptedPiece, err := key.DecryptBytesInPlace(pieceData)
    56  	if err != nil {
    57  		w.renter.log.Debugln("worker failed to decrypt piece:", err)
    58  		udc.managedUnregisterWorker(w)
    59  		return
    60  	}
    61  
    62  	// Mark the piece as completed. Perform chunk recovery if we newly have
    63  	// enough pieces to do so. Chunk recovery is an expensive operation that
    64  	// should be performed in a separate thread as to not block the worker.
    65  	udc.mu.Lock()
    66  	udc.piecesCompleted++
    67  	udc.piecesRegistered--
    68  	if udc.piecesCompleted <= udc.erasureCode.MinPieces() {
    69  		atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength/uint64(udc.erasureCode.MinPieces()))
    70  		udc.physicalChunkData[pieceIndex] = decryptedPiece
    71  	}
    72  	if udc.piecesCompleted == udc.erasureCode.MinPieces() {
    73  		// Uint division might not always cause atomicDataReceived to cleanly
    74  		// add up to staticFetchLength so we need to figure out how much we
    75  		// already added to the download and how much is missing.
    76  		addedReceivedData := uint64(udc.erasureCode.MinPieces()) * (udc.staticFetchLength / uint64(udc.erasureCode.MinPieces()))
    77  		atomic.AddUint64(&udc.download.atomicDataReceived, udc.staticFetchLength-addedReceivedData)
    78  		// Recover the logical data.
    79  		go udc.threadedRecoverLogicalData()
    80  	}
    81  	udc.mu.Unlock()
    82  }
    83  
    84  // managedKillDownloading will drop all of the download work given to the
    85  // worker, and set a signal to prevent the worker from accepting more download
    86  // work.
    87  //
    88  // The chunk cleanup needs to occur after the worker mutex is released so that
    89  // the worker is not locked while chunk cleanup is happening.
    90  func (w *worker) managedKillDownloading() {
    91  	w.downloadMu.Lock()
    92  	var removedChunks []*unfinishedDownloadChunk
    93  	for i := 0; i < len(w.downloadChunks); i++ {
    94  		removedChunks = append(removedChunks, w.downloadChunks[i])
    95  	}
    96  	w.downloadChunks = w.downloadChunks[:0]
    97  	w.downloadTerminated = true
    98  	w.downloadMu.Unlock()
    99  	for i := 0; i < len(removedChunks); i++ {
   100  		removedChunks[i].managedRemoveWorker()
   101  	}
   102  }
   103  
   104  // managedNextDownloadChunk will pull the next potential chunk out of the work
   105  // queue for downloading.
   106  func (w *worker) managedNextDownloadChunk() *unfinishedDownloadChunk {
   107  	w.downloadMu.Lock()
   108  	defer w.downloadMu.Unlock()
   109  
   110  	if len(w.downloadChunks) == 0 {
   111  		return nil
   112  	}
   113  	nextChunk := w.downloadChunks[0]
   114  	w.downloadChunks = w.downloadChunks[1:]
   115  	return nextChunk
   116  }
   117  
   118  // managedQueueDownloadChunk adds a chunk to the worker's queue.
   119  func (w *worker) managedQueueDownloadChunk(udc *unfinishedDownloadChunk) {
   120  	// Accept the chunk unless the worker has been terminated. Accepting the
   121  	// chunk needs to happen under the same lock as fetching the termination
   122  	// status.
   123  	w.downloadMu.Lock()
   124  	terminated := w.downloadTerminated
   125  	if !terminated {
   126  		// Accept the chunk and issue a notification to the master thread that
   127  		// there is a new download.
   128  		w.downloadChunks = append(w.downloadChunks, udc)
   129  		select {
   130  		case w.downloadChan <- struct{}{}:
   131  		default:
   132  		}
   133  	}
   134  	w.downloadMu.Unlock()
   135  
   136  	// If the worker has terminated, remove it from the udc. This call needs to
   137  	// happen without holding the worker lock.
   138  	if terminated {
   139  		udc.managedRemoveWorker()
   140  	}
   141  }
   142  
   143  // managedUnregisterWorker will remove the worker from an unfinished download
   144  // chunk, and then un-register the pieces that it grabbed. This function should
   145  // only be called when a worker download fails.
   146  func (udc *unfinishedDownloadChunk) managedUnregisterWorker(w *worker) {
   147  	udc.mu.Lock()
   148  	udc.piecesRegistered--
   149  	udc.pieceUsage[udc.staticChunkMap[string(w.contract.HostPublicKey.Key)].index] = false
   150  	udc.mu.Unlock()
   151  }
   152  
   153  // ownedOnDownloadCooldown returns true if the worker is on cooldown from failed
   154  // downloads. This function should only be called by the master worker thread,
   155  // and does not require any mutexes.
   156  func (w *worker) ownedOnDownloadCooldown() bool {
   157  	requiredCooldown := downloadFailureCooldown
   158  	for i := 0; i < w.ownedDownloadConsecutiveFailures && i < maxConsecutivePenalty; i++ {
   159  		requiredCooldown *= 2
   160  	}
   161  	return time.Now().Before(w.ownedDownloadRecentFailure.Add(requiredCooldown))
   162  }
   163  
   164  // ownedProcessDownloadChunk will take a potential download chunk, figure out if
   165  // there is work to do, and then perform any registration or processing with the
   166  // chunk before returning the chunk to the caller.
   167  //
   168  // If no immediate action is required, 'nil' will be returned.
   169  func (w *worker) ownedProcessDownloadChunk(udc *unfinishedDownloadChunk) *unfinishedDownloadChunk {
   170  	// Determine whether the worker needs to drop the chunk. If so, remove the
   171  	// worker and return nil. Worker only needs to be removed if worker is being
   172  	// dropped.
   173  	udc.mu.Lock()
   174  	chunkComplete := udc.piecesCompleted >= udc.erasureCode.MinPieces()
   175  	chunkFailed := udc.piecesCompleted+udc.workersRemaining < udc.erasureCode.MinPieces()
   176  	pieceData, workerHasPiece := udc.staticChunkMap[string(w.contract.HostPublicKey.Key)]
   177  	pieceTaken := udc.pieceUsage[pieceData.index]
   178  	if chunkComplete || chunkFailed || w.ownedOnDownloadCooldown() || !workerHasPiece || pieceTaken {
   179  		udc.mu.Unlock()
   180  		udc.managedRemoveWorker()
   181  		return nil
   182  	}
   183  	defer udc.mu.Unlock()
   184  
   185  	// TODO: This is where we would put filters based on worker latency, worker
   186  	// price, worker throughput, etc. There's a lot of fancy stuff we can do
   187  	// with filtering to make sure that for any given chunk we always use the
   188  	// optimal set of workers, and this is the spot where most of the filtering
   189  	// will happen.
   190  	//
   191  	// One major thing that we will want to be careful about when we improve
   192  	// this section is total memory vs. worker bandwidth. If the renter is
   193  	// consistently memory bottlenecked such that the slow hosts are hogging all
   194  	// of the memory and choking out the fasts hosts, leading to underutilized
   195  	// network connections where we actually have enough fast hosts to be fully
   196  	// utilizing the network. Part of this will be solved by adding bandwidth
   197  	// stats to the hostdb, but part of it will need to be solved by making sure
   198  	// that we automatically put low-bandwidth or high-latency workers on
   199  	// standby if we know that memory is the bottleneck as opposed to download
   200  	// bandwidth.
   201  	//
   202  	// Workers that do not meet the extra criteria are not discarded but rather
   203  	// put on standby, so that they can step in if the workers that do meet the
   204  	// extra criteria fail or otherwise prove insufficient.
   205  	//
   206  	// NOTE: Any metrics that we pull from the worker here need to be 'owned'
   207  	// metrics, so that we can avoid holding the worker lock and the udc lock
   208  	// simultaneously (deadlock risk). The 'owned' variables of the worker are
   209  	// variables that are only accessed by the master worker thread.
   210  	meetsExtraCriteria := true
   211  
   212  	// TODO: There's going to need to be some method for relaxing criteria after
   213  	// the first wave of workers are sent off. If the first waves of workers
   214  	// fail, the next wave need to realize that they shouldn't immediately go on
   215  	// standby because for some reason there were failures in the first wave and
   216  	// now the second/etc. wave of workers is needed.
   217  
   218  	// Figure out if this chunk needs another worker actively downloading
   219  	// pieces. The number of workers that should be active simultaneously on
   220  	// this chunk is the minimum number of pieces required for recovery plus the
   221  	// number of overdrive workers (typically zero). For our purposes, completed
   222  	// pieces count as active workers, though the workers have actually
   223  	// finished.
   224  	piecesInProgress := udc.piecesRegistered + udc.piecesCompleted
   225  	desiredPiecesInProgress := udc.erasureCode.MinPieces() + udc.staticOverdrive
   226  	workersDesired := piecesInProgress < desiredPiecesInProgress
   227  
   228  	if workersDesired && meetsExtraCriteria {
   229  		// Worker can be useful. Register the worker and return the chunk for
   230  		// downloading.
   231  		udc.piecesRegistered++
   232  		udc.pieceUsage[pieceData.index] = true
   233  		return udc
   234  	}
   235  	// Worker is not needed unless another worker fails, so put this worker on
   236  	// standby for this chunk. The worker is still available to help with the
   237  	// download, so the worker is not removed from the chunk in this codepath.
   238  	udc.workersStandby = append(udc.workersStandby, w)
   239  	return nil
   240  }