gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerjobread.go (about)

     1  package renter
     2  
     3  import (
     4  	"sync"
     5  	"time"
     6  
     7  	"github.com/opentracing/opentracing-go"
     8  	"gitlab.com/SkynetLabs/skyd/build"
     9  	"gitlab.com/SkynetLabs/skyd/skymodules"
    10  	"go.sia.tech/siad/crypto"
    11  	"go.sia.tech/siad/modules"
    12  	"go.sia.tech/siad/types"
    13  
    14  	"gitlab.com/NebulousLabs/errors"
    15  )
    16  
    17  const (
    18  	// jobReadPerformanceDecay defines how much decay gets applied to the
    19  	// historic performance of jobRead each time new data comes back.
    20  	// Setting a low value makes the performance more volatile. If the worker
    21  	// tends to have inconsistent performance, having the decay be a low value
    22  	// (0.9 or lower) will be highly detrimental. A higher decay means that the
    23  	// predictor tends to be more accurate over time, but is less responsive to
    24  	// things like network load.
    25  	jobReadPerformanceDecay = 0.9
    26  
    27  	// jobLength64k is the threshold we use to label a download as 64kb
    28  	// jobLength1m is the threshold we use to label a download as 1m
    29  	// jobLength4m is the threshold we use to label a download as 4m
    30  	//
    31  	// usually the length is evaluated using an if-else structure, comparing the
    32  	// length to these threshold in ascending fashion, so we first check to see
    33  	// whether it's a 64kb, then a 1mb and so on
    34  	jobLength64k = uint64(1 << 16)
    35  	jobLength1m  = uint64(1 << 20)
    36  	jobLength4m  = uint64(1 << 24)
    37  )
    38  
    39  type (
    40  	// jobRead contains information about a Read query.
    41  	jobRead struct {
    42  		staticLength       uint64
    43  		staticResponseChan chan *jobReadResponse
    44  
    45  		// staticSpan is used for tracing. Note that this can be nil, and
    46  		// therefore should always be checked. Not all read jobs require
    47  		// tracing. By allowing it to be nil we avoid the extra overhead.
    48  		staticSpan opentracing.Span
    49  
    50  		jobGeneric
    51  	}
    52  
    53  	// jobReadQueue is a list of Read queries that have been assigned to the
    54  	// worker. The queue also tracks performance metrics, which can then be used
    55  	// by projects to optimize job scheduling between workers.
    56  	jobReadQueue struct {
    57  		staticStats *jobReadStats
    58  		*jobGenericQueue
    59  
    60  		// staticBaseCost is applied to download costs, defined in SC/TB
    61  		staticBaseCost types.Currency
    62  	}
    63  
    64  	// jobReadStats contains statistics about read jobs. This object is
    65  	// thread safe and can be shared between multiple queues.
    66  	jobReadStats struct {
    67  		// These float64s are converted time.Duration values. They are float64
    68  		// to get better precision on the exponential decay which gets applied
    69  		// with each new data point.
    70  		weightedJobTime64k float64
    71  		weightedJobTime1m  float64
    72  		weightedJobTime4m  float64
    73  
    74  		// These distribution trackers keep track of the read durations for
    75  		// every length category.
    76  		staticDT64k *skymodules.DistributionTracker
    77  		staticDT1m  *skymodules.DistributionTracker
    78  		staticDT4m  *skymodules.DistributionTracker
    79  
    80  		*jobGenericQueue
    81  		mu sync.Mutex
    82  	}
    83  
    84  	// jobReadResponse contains the result of a Read query.
    85  	jobReadResponse struct {
    86  		// The response data.
    87  		staticData  []byte
    88  		staticProof []crypto.Hash
    89  		staticErr   error
    90  
    91  		// Metadata related to the job.
    92  		staticMetadata jobReadMetadata
    93  
    94  		// The time it took for this job to complete.
    95  		staticJobTime time.Duration
    96  	}
    97  
    98  	// jobReadMetadata contains meta information about a read job.
    99  	jobReadMetadata struct {
   100  		staticSectorRoot          crypto.Hash
   101  		staticPieceRootIndex      uint64
   102  		staticLaunchedWorkerIndex uint64
   103  
   104  		// the category specifies what type of function the read job fulfils,
   105  		// this is necessary to pass along as the generic MDM executor needs to
   106  		// be update spending details and read jobs can be used for downloads
   107  		// but might also be used for snapshots for example
   108  		staticSpendingCategory spendingCategory
   109  
   110  		staticWorker           *worker
   111  		staticWorkerIdentifier uint32
   112  	}
   113  )
   114  
   115  // NewJobReadStats returns an initialized jobReadStats object.
   116  func NewJobReadStats() *jobReadStats {
   117  	return &jobReadStats{
   118  		staticDT64k: skymodules.NewDistributionTrackerStandard(),
   119  		staticDT1m:  skymodules.NewDistributionTrackerStandard(),
   120  		staticDT4m:  skymodules.NewDistributionTrackerStandard(),
   121  	}
   122  }
   123  
   124  // staticJobReadMetadata returns the read job's metadata.
   125  func (j *jobRead) staticJobReadMetadata() jobReadMetadata {
   126  	var metadata jobReadMetadata
   127  	md, ok := j.staticGetMetadata().(jobReadMetadata)
   128  	if ok {
   129  		metadata = md
   130  	}
   131  	return metadata
   132  }
   133  
   134  // callDiscard will discard a job, forwarding the error to the caller.
   135  func (j *jobRead) callDiscard(err error) {
   136  	// Log info and finish span.
   137  	if j.staticSpan != nil {
   138  		j.staticSpan.LogKV("callDiscard", err)
   139  		j.staticSpan.SetTag("success", false)
   140  		j.staticSpan.Finish()
   141  	}
   142  
   143  	w := j.staticQueue.staticWorker()
   144  	errLaunch := w.staticTG.Launch(func() {
   145  		response := &jobReadResponse{
   146  			staticErr:      err,
   147  			staticMetadata: j.staticJobReadMetadata(),
   148  		}
   149  		select {
   150  		case j.staticResponseChan <- response:
   151  		case <-w.staticTG.StopChan():
   152  		case <-j.staticCtx.Done():
   153  		}
   154  	})
   155  	if errLaunch != nil {
   156  		w.staticRenter.staticLog.Print("callDiscard: launch failed", errLaunch)
   157  	}
   158  }
   159  
   160  // managedFinishExecute will execute code that is shared by multiple read jobs
   161  // after execution. It updates the performance metrics, records whether the
   162  // execution was successful and returns the response.
   163  func (j *jobRead) managedFinishExecute(readData []byte, proof []crypto.Hash, readErr error, readJobTime time.Duration) {
   164  	// Log result and finish
   165  	if j.staticSpan != nil {
   166  		j.staticSpan.LogKV(
   167  			"err", readErr,
   168  			"duration", readJobTime,
   169  		)
   170  		j.staticSpan.SetTag("success", readErr == nil)
   171  		j.staticSpan.Finish()
   172  	}
   173  
   174  	// Send the response in a goroutine so that the worker resources can be
   175  	// released faster. Need to check if the job was canceled so that the
   176  	// goroutine will exit.
   177  	response := &jobReadResponse{
   178  		staticData:  readData,
   179  		staticProof: proof,
   180  		staticErr:   readErr,
   181  
   182  		staticMetadata: j.staticJobReadMetadata(),
   183  		staticJobTime:  readJobTime,
   184  	}
   185  	w := j.staticQueue.staticWorker()
   186  	err := w.staticTG.Launch(func() {
   187  		select {
   188  		case j.staticResponseChan <- response:
   189  		case <-j.staticCtx.Done():
   190  		case <-w.staticTG.StopChan():
   191  		}
   192  	})
   193  	if err != nil {
   194  		j.staticQueue.staticWorker().staticRenter.staticLog.Print("managedFinishExecute: launch failed", err)
   195  	}
   196  
   197  	// Report success or failure to the queue.
   198  	if readErr != nil {
   199  		j.staticQueue.callReportFailure(readErr, j.externExecuteTime, time.Now())
   200  		return
   201  	}
   202  	j.staticQueue.callReportSuccess()
   203  
   204  	// Job succeeded.
   205  	//
   206  	// Update the metrics in the read sector queue based on the amount of
   207  	// time the read took. Stats should only be added if the job did not
   208  	// result in an error. Because there was no failure, the consecutive
   209  	// failures stat can be reset.
   210  	jq := j.staticQueue.(*jobReadQueue)
   211  	jq.staticStats.callUpdateJobTimeMetrics(j.staticLength, readJobTime)
   212  }
   213  
   214  // callExpectedBandwidth returns the bandwidth that gets consumed by a
   215  // Read program.
   216  func (j *jobRead) callExpectedBandwidth() (ul, dl uint64) {
   217  	ul = 1 << 12                                      // 4 KiB
   218  	dl = uint64(float64(j.staticLength)*1.01) + 1<<12 // (readSize * 1.01 + 4 KiB)
   219  	return
   220  }
   221  
   222  // managedRead returns the sector data for the given read program and the merkle
   223  // proof.
   224  func (j *jobRead) managedRead(w *worker, program modules.Program, programData []byte, cost types.Currency, bandwidthRefund func(ul, dl uint64) types.Currency) ([]programResponse, error) {
   225  	// execute it
   226  	responses, _, err := w.managedExecuteProgram(program, programData, w.staticCache().staticContractID, j.staticJobReadMetadata().staticSpendingCategory, cost, bandwidthRefund)
   227  	if err != nil {
   228  		return []programResponse{}, err
   229  	}
   230  
   231  	// Sanity check number of responses.
   232  	if len(responses) > len(program) {
   233  		build.Critical("managedExecuteProgram should return at most len(program) instructions")
   234  	}
   235  	if len(responses) == 0 {
   236  		build.Critical("managedExecuteProgram should at least return one instruction when err == nil")
   237  	}
   238  	// If the number of responses doesn't match, the last response should
   239  	// contain an error message.
   240  	if len(responses) != len(program) {
   241  		err := responses[len(responses)-1].Error
   242  		return []programResponse{}, errors.AddContext(err, "managedRead: program execution was interrupted")
   243  	}
   244  
   245  	// The last instruction is the actual download.
   246  	response := responses[len(responses)-1]
   247  	if response.Error != nil {
   248  		return []programResponse{}, response.Error
   249  	}
   250  	sectorData := response.Output
   251  
   252  	// Check that we received the amount of data that we were expecting.
   253  	if uint64(len(sectorData)) != j.staticLength {
   254  		return []programResponse{}, errors.New("worker returned the wrong amount of data")
   255  	}
   256  	return responses, nil
   257  }
   258  
   259  // callAddWithEstimate will add a job to the job read queue while providing an
   260  // estimate for when the job is expected to return.
   261  func (jq *jobReadQueue) callAddWithEstimate(j *jobReadSector) (time.Time, bool) {
   262  	estimate := jq.staticStats.callExpectedJobTime(j.staticLength)
   263  
   264  	jq.mu.Lock()
   265  	defer jq.mu.Unlock()
   266  
   267  	if !jq.add(j) {
   268  		return time.Time{}, false
   269  	}
   270  	return time.Now().Add(estimate), true
   271  }
   272  
   273  // callExpectedJobTime will return the recent performance of the worker
   274  // attempting to complete read jobs. The call distinguishes based on the
   275  // size of the job, breaking the jobs into 3 categories: less than 64kb, less
   276  // than 1mb, and up to a full sector in size.
   277  //
   278  // The breakout is performed because low latency, low throughput workers are
   279  // common, and will have very different performance characteristics across the
   280  // three categories.
   281  //
   282  // TODO: Make this smarter.
   283  func (jrs *jobReadStats) callExpectedJobTime(length uint64) time.Duration {
   284  	jrs.mu.Lock()
   285  	defer jrs.mu.Unlock()
   286  	return jrs.expectedJobTime(length)
   287  }
   288  
   289  // expectedJobTime returns the expected job time, based on recent performance,
   290  // for the given read length.
   291  func (jrs *jobReadStats) expectedJobTime(length uint64) time.Duration {
   292  	if length <= jobLength64k {
   293  		return time.Duration(jrs.weightedJobTime64k)
   294  	} else if length <= jobLength1m {
   295  		return time.Duration(jrs.weightedJobTime1m)
   296  	} else {
   297  		return time.Duration(jrs.weightedJobTime4m)
   298  	}
   299  }
   300  
   301  // callExpectedJobCost returns an estimate for the price of performing a read
   302  // job with the given length.
   303  func (jq *jobReadQueue) callExpectedJobCost(length uint64) types.Currency {
   304  	pt := &jq.staticWorker().staticPriceTable().staticPriceTable
   305  
   306  	// Calculate init cost. The program we use has a 48 byte program data and 1
   307  	// instruction. 48 = 8 bytes length + 8 bytes offset + 32 bytes merkle root
   308  	cost := modules.MDMInitCost(pt, 48, 1)
   309  
   310  	// Add the execution cost.
   311  	cost = cost.Add(modules.MDMReadCost(pt, length))
   312  
   313  	// Add the memory cost.
   314  	memory := modules.MDMInitMemory() + modules.MDMReadMemory()
   315  	time := uint64(modules.MDMTimeReadSector)
   316  	cost = cost.Add(modules.MDMMemoryCost(pt, memory, time))
   317  
   318  	// Add the bandwidth cost.
   319  	ulBandwidth, dlBandwidth := new(jobReadSector).callExpectedBandwidth()
   320  	cost = cost.Add(modules.MDMBandwidthCost(*pt, ulBandwidth, dlBandwidth))
   321  
   322  	// Add the base cost.
   323  	cost = cost.Add(jq.staticBaseCost.Mul64(dlBandwidth))
   324  	return cost
   325  }
   326  
   327  // callUpdateJobTimeMetrics takes a length and the duration it took to fulfil
   328  // that job and uses it to update the job performance metrics on the queue.
   329  func (jrs *jobReadStats) callUpdateJobTimeMetrics(length uint64, jobTime time.Duration) {
   330  	jrs.mu.Lock()
   331  	defer jrs.mu.Unlock()
   332  	if length <= jobLength64k {
   333  		jrs.weightedJobTime64k = expMovingAvgHotStart(jrs.weightedJobTime64k, float64(jobTime), jobReadPerformanceDecay)
   334  	} else if length <= jobLength1m {
   335  		jrs.weightedJobTime1m = expMovingAvgHotStart(jrs.weightedJobTime1m, float64(jobTime), jobReadPerformanceDecay)
   336  	} else {
   337  		jrs.weightedJobTime4m = expMovingAvgHotStart(jrs.weightedJobTime4m, float64(jobTime), jobReadPerformanceDecay)
   338  	}
   339  
   340  	// update distribution tracker
   341  	dt := jrs.distributionTrackerForLength(length)
   342  	dt.AddDataPoint(jobTime)
   343  }
   344  
   345  // distributionTrackerForLength returns the distribution tracker that
   346  // corresponds to the given length.
   347  func (jrs *jobReadStats) distributionTrackerForLength(length uint64) *skymodules.DistributionTracker {
   348  	if length <= jobLength64k {
   349  		return jrs.staticDT64k
   350  	} else if length <= jobLength1m {
   351  		return jrs.staticDT1m
   352  	} else {
   353  		return jrs.staticDT4m
   354  	}
   355  }
   356  
   357  // initJobReadQueue will initialize a queue for downloading sectors by
   358  // their root for the worker. This is only meant to be run once at startup.
   359  func (w *worker) initJobReadQueue(jrs *jobReadStats) {
   360  	// Sanity check that there is no existing job queue.
   361  	if w.staticJobReadQueue != nil {
   362  		w.staticRenter.staticLog.Critical("incorrect call on initJobReadQueue")
   363  	}
   364  
   365  	w.staticJobReadQueue = &jobReadQueue{
   366  		jobGenericQueue: newJobGenericQueue(w),
   367  
   368  		staticBaseCost: skymodules.DefaultSkynetBaseCost,
   369  		staticStats:    jrs,
   370  	}
   371  }
   372  
   373  // initJobLowPrioReadQueue will initialize a queue for downloading sectors by
   374  // their root for the worker. This is only meant to be run once at startup.
   375  func (w *worker) initJobLowPrioReadQueue(jrs *jobReadStats) {
   376  	// Sanity check that there is no existing job queue.
   377  	if w.staticJobLowPrioReadQueue != nil {
   378  		w.staticRenter.staticLog.Critical("incorret call on initJobReadQueue")
   379  	}
   380  
   381  	w.staticJobLowPrioReadQueue = &jobReadQueue{
   382  		jobGenericQueue: newJobGenericQueue(w),
   383  
   384  		staticBaseCost: skymodules.DefaultSkynetBaseCost,
   385  		staticStats:    jrs,
   386  	}
   387  }