gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerjobhassector.go (about)

     1  package renter
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/opentracing/opentracing-go"
     9  	"gitlab.com/SkynetLabs/skyd/build"
    10  	"gitlab.com/SkynetLabs/skyd/skymodules"
    11  	"gitlab.com/SkynetLabs/skyd/skymodules/gouging"
    12  	"go.sia.tech/siad/crypto"
    13  	"go.sia.tech/siad/modules"
    14  	"go.sia.tech/siad/types"
    15  
    16  	"gitlab.com/NebulousLabs/errors"
    17  )
    18  
const (
	// availabilityMetricsBucketScale is the factor applied to the lower bound
	// of a bucket to find its upper bound. Every bucket scales up 25%, this
	// number was chosen because it provides sufficiently granular coverage.
	// Using this scale the buckets are:
	// 1, 2, 3, 4-5, 6-7, 8-10, 11-13, 14-17, 18-22, 23-28, 29-36, ..., 93-116
	availabilityMetricsBucketScale = 1.25

	// availabilityMetricsDefaultHalfLife is the default half life of the decay
	// applied to the availability buckets.
	availabilityMetricsDefaultHalfLife = 100 * time.Hour

	// availabilityMetricsNumBuckets is the total number of buckets we use to
	// track the sector availability metrics for a certain host. Every bucket
	// represents a range of total pieces uploaded to the network, the total
	// number of pieces is decided by the redundancy scheme used during the
	// upload.
	availabilityMetricsNumBuckets = 16

	// jobHasSectorPerformanceDecay defines how much the average performance is
	// decayed each time a new datapoint is added. The jobs use an exponential
	// weighted average.
	jobHasSectorPerformanceDecay = 0.9

	// jobHasSectorQueueMinAvailabilityRate is the minimum availability rate
	// (.1%) we return when the queue has not yet performed any jobs where the
	// sector was available.
	jobHasSectorQueueMinAvailabilityRate = 0.001

	// hasSectorBatchSize is the maximum number of has sector jobs that are
	// batched together upon calling callNext.
	// This number is the result of empirical testing which determined that 13
	// requests can be batched together without increasing the required
	// upload or download bandwidth.
	hasSectorBatchSize = 13
)
    54  
// errEstimateAboveMax is returned by callAddWithEstimate if a HasSector job
// wasn't added to the queue because the expected job time exceeds the maximum
// estimate the caller is willing to accept.
var errEstimateAboveMax = errors.New("can't add job since estimate is above max timeout")
    58  
type (
	// jobHasSector contains information about a hasSector query.
	jobHasSector struct {
		// staticSectors are the sector roots to look up on the host, the
		// response is delivered over staticResponseChan.
		staticSectors      []crypto.Hash
		staticResponseChan chan *jobHasSectorResponse

		// staticNumPieces represents the redundancy with which the sectors were
		// uploaded, it is the total number of pieces meaning the sum of the
		// data and parity pieces used by the erasure coder
		//
		// NOTE: we assume that all sectors corresponding to the roots listed
		// in this HS job were uploaded using the same redundancy scheme
		staticNumPieces int

		// staticPostExecutionHook is an optional hook that is handed the
		// response before it is sent over the response channel, 'once'
		// guarantees the hook runs at most one time per job.
		staticPostExecutionHook func(*jobHasSectorResponse)
		once                    sync.Once

		// staticSpan is the tracing span that covers the lifetime of the job.
		staticSpan opentracing.Span

		jobGeneric
	}

	// jobHasSectorBatch is a batch of has sector lookups that are executed
	// together in a single program.
	jobHasSectorBatch struct {
		staticJobs []*jobHasSector
	}

	// jobHasSectorQueue is a list of hasSector queries that have been assigned
	// to the worker.
	jobHasSectorQueue struct {
		// weightedJobTime contains an exponential weighted average of the
		// time it took the worker to complete its recent successful
		// hasSector jobs.
		weightedJobTime float64

		// availabilityMetrics keeps track of how often a sector was available
		// on this host, we keep track of this in a way that we take the
		// redundancy with which the sector was uploaded into account
		availabilityMetrics *availabilityMetrics

		// staticDT is a distribution tracker that keeps track of the HS job
		// duration
		staticDT *skymodules.DistributionTracker

		*jobGenericQueue
	}

	// jobHasSectorResponse contains the result of a hasSector query.
	jobHasSectorResponse struct {
		// staticAvailbleIndices holds the indices, relative to the sectors of
		// the job, of the sectors the host reported as available. It is only
		// set if the job succeeded.
		staticAvailbleIndices []uint64
		staticErr             error

		// The worker is included in the response so that the caller can listen
		// on one channel for a bunch of workers and still know which worker
		// successfully found the sector root.
		staticWorker *worker

		// The time it took for this job to complete is included for debugging
		// purposes. For a batched job this is the time the whole batch took.
		staticJobTime time.Duration
	}

	// availabilityMetrics is a helper struct that keeps track of sector
	// availability metrics, we keep track of these in several buckets that
	// correspond with sectors that were uploaded with a similar redundancy
	availabilityMetrics struct {
		// buckets holds the availability stats, piecesToBuckets maps a number
		// of pieces onto the index of the bucket that covers it. Index 0 of
		// piecesToBuckets is a placeholder since 0 pieces is illegal.
		buckets         []*availabilityBucket
		piecesToBuckets []int

		// NOTE(review): none of the methods visible in this file lock mu, the
		// callers appear to hold the queue's mutex instead - confirm whether
		// this field is still needed.
		mu sync.Mutex
	}

	// availabilityBucket is a helper struct that keeps track of how often a
	// sector was available, every bucket holds these stats for sectors that
	// were uploaded with a similar redundancy scheme
	availabilityBucket struct {
		skymodules.GenericDecay

		// Keeps track of the total amount of sectors that were available and
		// the total amount of lookups that were performed. Note that a decaying
		// factor is applied to these variables.
		totalAvailable float64
		totalLookups   float64
	}
)
   142  
   143  // newAvailabilityMetrics returns a new availabilityMetrics object
   144  func newAvailabilityMetrics(halfLife time.Duration) *availabilityMetrics {
   145  	metrics := &availabilityMetrics{
   146  		buckets:         make([]*availabilityBucket, availabilityMetricsNumBuckets),
   147  		piecesToBuckets: []int{-1}, // 0 num pieces is illegal
   148  	}
   149  
   150  	// initialize the buckets and a slice that maps piece indices to bucket
   151  	// indices that's used for constant time lookups.
   152  	curr := uint64(1)
   153  	for bucket := 0; bucket < availabilityMetricsNumBuckets; bucket++ {
   154  		metrics.buckets[bucket] = &availabilityBucket{GenericDecay: skymodules.NewDecay(halfLife)}
   155  
   156  		next := uint64(float64(curr) * availabilityMetricsBucketScale)
   157  		if next > curr {
   158  			for pieces := curr; pieces <= next; pieces++ {
   159  				metrics.piecesToBuckets = append(metrics.piecesToBuckets, bucket)
   160  			}
   161  			curr = next + 1
   162  			continue
   163  		}
   164  
   165  		metrics.piecesToBuckets = append(metrics.piecesToBuckets, bucket)
   166  		curr++
   167  	}
   168  
   169  	return metrics
   170  }
   171  
   172  // addDecay applies decay to the data in the availability bucket
   173  func (ab *availabilityBucket) addDecay() {
   174  	ab.Decay(func(decay float64) {
   175  		ab.totalAvailable *= decay
   176  		ab.totalLookups *= decay
   177  	})
   178  }
   179  
   180  // bucket will return the bucket corresponding with 'numPieces'
   181  func (am *availabilityMetrics) bucket(numPieces int) *availabilityBucket {
   182  	if numPieces < 1 {
   183  		build.Critical("num pieces can never be smaller than 1")
   184  		return nil
   185  	}
   186  
   187  	// return the last bucket if num pieces goes out of bounds
   188  	if numPieces >= len(am.piecesToBuckets) {
   189  		numPieces = len(am.piecesToBuckets) - 1
   190  	}
   191  	bucketIndex := am.piecesToBuckets[numPieces]
   192  	return am.buckets[bucketIndex]
   193  }
   194  
   195  // updateMetrics will update the availability metrics for the bucket
   196  // corresponding with 'numPieces'
   197  func (am *availabilityMetrics) updateMetrics(numPieces, numSectors, numAvailable int) {
   198  	bucket := am.bucket(numPieces)
   199  	if bucket == nil {
   200  		return
   201  	}
   202  
   203  	bucket.addDecay()
   204  
   205  	bucket.totalLookups += float64(numSectors)
   206  	bucket.totalAvailable += float64(numAvailable)
   207  }
   208  
   209  // callNext overwrites the generic call next and batches a certain number of has
   210  // sector jobs together.
   211  func (jq *jobHasSectorQueue) callNext() workerJob {
   212  	jobs := make([]*jobHasSector, 0, hasSectorBatchSize)
   213  	var next workerJob
   214  	for {
   215  		if len(jobs) >= hasSectorBatchSize {
   216  			break
   217  		}
   218  		next = jq.jobGenericQueue.callNext()
   219  		if next == nil {
   220  			break
   221  		}
   222  		j := next.(*jobHasSector)
   223  		jobs = append(jobs, j)
   224  	}
   225  	if len(jobs) == 0 {
   226  		return nil
   227  	}
   228  
   229  	return &jobHasSectorBatch{
   230  		staticJobs: jobs,
   231  	}
   232  }
   233  
   234  // newJobHasSector is a helper method to create a new HasSector job.
   235  func (w *worker) newJobHasSector(ctx context.Context, responseChan chan *jobHasSectorResponse, numPieces int, roots ...crypto.Hash) *jobHasSector {
   236  	return w.newJobHasSectorWithPostExecutionHook(ctx, responseChan, nil, numPieces, roots...)
   237  }
   238  
   239  // newJobHasSectorWithPostExecutionHook is a helper method to create a new
   240  // HasSector job with a post execution hook that is executed after the response
   241  // is available but before sending it over the channel.
   242  func (w *worker) newJobHasSectorWithPostExecutionHook(ctx context.Context, responseChan chan *jobHasSectorResponse, hook func(*jobHasSectorResponse), numPieces int, roots ...crypto.Hash) *jobHasSector {
   243  	t := opentracing.NoopTracer{} // NOTE: disabled for performance
   244  	span, _ := opentracing.StartSpanFromContextWithTracer(ctx, t, "HasSectorJob")
   245  	return &jobHasSector{
   246  		staticNumPieces:         numPieces,
   247  		staticSectors:           roots,
   248  		staticResponseChan:      responseChan,
   249  		staticPostExecutionHook: hook,
   250  		staticSpan:              span,
   251  		jobGeneric:              newJobGeneric(ctx, w.staticJobHasSectorQueue, nil),
   252  	}
   253  }
   254  
   255  // callDiscard will discard a job, sending the provided error.
   256  func (j *jobHasSector) callDiscard(err error) {
   257  	w := j.staticQueue.staticWorker()
   258  	errLaunch := w.staticTG.Launch(func() {
   259  		response := staticPoolJobHasSectorResponse.Get()
   260  		response.staticErr = err
   261  		response.staticWorker = w
   262  
   263  		j.managedCallPostExecutionHook(response)
   264  		select {
   265  		case j.staticResponseChan <- response:
   266  		case <-j.staticCtx.Done():
   267  		case <-w.staticTG.StopChan():
   268  		}
   269  	})
   270  	if errLaunch != nil {
   271  		w.staticRenter.staticLog.Print("callDiscard: launch failed", errLaunch)
   272  	}
   273  
   274  	j.staticSpan.LogKV("callDiscard", err)
   275  	j.staticSpan.SetTag("success", false)
   276  	j.staticSpan.Finish()
   277  }
   278  
   279  // callDiscard discards all jobs within the batch.
   280  func (j jobHasSectorBatch) callDiscard(err error) {
   281  	for _, hsj := range j.staticJobs {
   282  		hsj.callDiscard(err)
   283  	}
   284  }
   285  
   286  // staticCanceled always returns false. A batched job never resides in the
   287  // queue. It's constructed right before being executed.
   288  func (j jobHasSectorBatch) staticCanceled() bool {
   289  	return false
   290  }
   291  
   292  // staticGetMetadata return an empty struct. A batched has sector job doesn't
   293  // contain any metadata.
   294  func (j jobHasSectorBatch) staticGetMetadata() interface{} {
   295  	return struct{}{}
   296  }
   297  
   298  // callExecute will run the has sector job.
   299  func (j *jobHasSector) callExecute() error {
   300  	// Finish job span at the end.
   301  	defer j.staticSpan.Finish()
   302  
   303  	// Set the execute time
   304  	j.externExecuteTime = time.Now()
   305  
   306  	// Capture callExecute in new span.
   307  	span := opentracing.StartSpan("callExecute", opentracing.ChildOf(j.staticSpan.Context()))
   308  	defer span.Finish()
   309  
   310  	batch := jobHasSectorBatch{
   311  		staticJobs: []*jobHasSector{j},
   312  	}
   313  	return batch.callExecute()
   314  }
   315  
   316  // callExecute will run the has sector job.
   317  func (j jobHasSectorBatch) callExecute() error {
   318  	if len(j.staticJobs) == 0 {
   319  		build.Critical("empty hasSectorBatch")
   320  		return nil
   321  	}
   322  
   323  	start := time.Now()
   324  	w := j.staticJobs[0].staticQueue.staticWorker()
   325  	availables, err := j.managedHasSector()
   326  	jobTime := time.Since(start)
   327  
   328  	for i := range j.staticJobs {
   329  		hsj := j.staticJobs[i]
   330  		// Handle its span
   331  		if err != nil {
   332  			hsj.staticSpan.LogKV("error", err)
   333  		}
   334  		hsj.staticSpan.SetTag("success", err == nil)
   335  		hsj.staticSpan.Finish()
   336  
   337  		// Create the response.
   338  		response := staticPoolJobHasSectorResponse.Get()
   339  		response.staticErr = err
   340  		response.staticJobTime = jobTime
   341  		response.staticWorker = w
   342  
   343  		// If it was successful, attach the result.
   344  		if err == nil {
   345  			response.staticAvailbleIndices = availables[i]
   346  		}
   347  		// Send the response.
   348  		err2 := w.staticTG.Launch(func() {
   349  			hsj.managedCallPostExecutionHook(response)
   350  			select {
   351  			case hsj.staticResponseChan <- response:
   352  			case <-hsj.staticCtx.Done():
   353  			case <-w.staticTG.StopChan():
   354  			}
   355  		})
   356  		// Report success or failure to the queue.
   357  		if err != nil {
   358  			hsj.staticQueue.callReportFailure(err, start, time.Now())
   359  			continue
   360  		}
   361  		hsj.staticQueue.callReportSuccess()
   362  
   363  		// Job was a success, update the performance and availability stats on
   364  		// the queue.
   365  		jq := hsj.staticQueue.(*jobHasSectorQueue)
   366  		jq.callUpdateJobTimeMetrics(jobTime)
   367  		jq.callUpdateAvailabilityMetrics(hsj.staticNumPieces, len(hsj.staticSectors), len(availables[i]))
   368  		if err2 != nil {
   369  			w.staticRenter.staticLog.Println("callExecute: launch failed", err2)
   370  		}
   371  	}
   372  
   373  	return err
   374  }
   375  
   376  // callExpectedBandwidth returns the bandwidth that is expected to be consumed
   377  // by the job.
   378  func (j *jobHasSector) callExpectedBandwidth() (ul, dl uint64) {
   379  	// sanity check
   380  	if len(j.staticSectors) == 0 {
   381  		build.Critical("expected bandwidth requested for a job that has no staticSectors set")
   382  	}
   383  	return gouging.HasSectorJobExpectedBandwidth(len(j.staticSectors))
   384  }
   385  
   386  // callExpectedBandwidth returns the bandwidth that is expected to be consumed
   387  // by the job.
   388  func (j jobHasSectorBatch) callExpectedBandwidth() (ul, dl uint64) {
   389  	var totalSectors int
   390  	for _, hsj := range j.staticJobs {
   391  		// sanity check
   392  		if len(hsj.staticSectors) == 0 {
   393  			build.Critical("expected bandwidth requested for a job that has no staticSectors set")
   394  		}
   395  		totalSectors += len(hsj.staticSectors)
   396  	}
   397  	ul, dl = gouging.HasSectorJobExpectedBandwidth(totalSectors)
   398  	return
   399  }
   400  
   401  // managedHasSector returns whether or not the host has a sector with given root
   402  func (j *jobHasSectorBatch) managedHasSector() (results [][]uint64, err error) {
   403  	if len(j.staticJobs) == 0 {
   404  		return nil, nil
   405  	}
   406  
   407  	w := j.staticJobs[0].staticQueue.staticWorker()
   408  	// Create the program.
   409  	pt := w.staticPriceTable().staticPriceTable
   410  	pb := modules.NewProgramBuilder(&pt, 0) // 0 duration since HasSector doesn't depend on it.
   411  	for _, hsj := range j.staticJobs {
   412  		for _, sector := range hsj.staticSectors {
   413  			pb.AddHasSectorInstruction(sector)
   414  		}
   415  	}
   416  	program, programData := pb.Program()
   417  	cost, _, _ := pb.Cost(true)
   418  
   419  	// take into account bandwidth costs
   420  	ulBandwidth, dlBandwidth := j.callExpectedBandwidth()
   421  	bandwidthCost, bandwidthRefund := mdmBandwidthCost(pt, ulBandwidth, dlBandwidth)
   422  	cost = cost.Add(bandwidthCost)
   423  
   424  	// Execute the program and parse the responses.
   425  	hasSectors := make([]bool, 0, len(program))
   426  	var responses []programResponse
   427  	responses, _, err = w.managedExecuteProgram(program, programData, types.FileContractID{}, categoryDownload, cost, bandwidthRefund)
   428  	if err != nil {
   429  		return nil, errors.AddContext(err, "managedHasSector: unable to execute program for has sector job")
   430  	}
   431  	for _, resp := range responses {
   432  		if resp.Error != nil {
   433  			return nil, errors.AddContext(resp.Error, "Output error")
   434  		}
   435  		hasSectors = append(hasSectors, resp.Output[0] == 1)
   436  	}
   437  	if len(responses) != len(program) {
   438  		return nil, errors.New("received invalid number of responses but no error")
   439  	}
   440  
   441  	for _, hsj := range j.staticJobs {
   442  		var availables []uint64
   443  		for i := 0; i < len(hsj.staticSectors); i++ {
   444  			if hasSectors[i] {
   445  				availables = append(availables, uint64(i))
   446  			}
   447  		}
   448  		results = append(results, availables)
   449  		hasSectors = hasSectors[len(hsj.staticSectors):]
   450  	}
   451  	return results, nil
   452  }
   453  
   454  // callAddWithEstimate will add a job to the queue and return a timestamp for
   455  // when the job is estimated to complete. An error will be returned if the job
   456  // is not successfully queued.
   457  func (jq *jobHasSectorQueue) callAddWithEstimate(j *jobHasSector, maxEstimate time.Duration) (time.Time, error) {
   458  	jq.mu.Lock()
   459  	defer jq.mu.Unlock()
   460  	now := time.Now()
   461  	estimate := jq.expectedJobTime()
   462  	if estimate > maxEstimate {
   463  		return time.Time{}, errEstimateAboveMax
   464  	}
   465  
   466  	if !jq.add(j) {
   467  		return time.Time{}, errors.New("unable to add job to queue")
   468  	}
   469  	return now.Add(estimate), nil
   470  }
   471  
   472  // callExpectedJobTime returns the expected amount of time that this job will
   473  // take to complete.
   474  //
   475  // TODO: idealy we pass `numSectors` here and get the expected job time
   476  // depending on the amount of instructions in the program.
   477  func (jq *jobHasSectorQueue) callExpectedJobTime() time.Duration {
   478  	jq.mu.Lock()
   479  	defer jq.mu.Unlock()
   480  	return jq.expectedJobTime()
   481  }
   482  
   483  // callAvailabilityRate returns the percentage of jobs that came back having the
   484  // sector for this queue's worker.
   485  func (jq *jobHasSectorQueue) callAvailabilityRate(numPieces int) float64 {
   486  	jq.mu.Lock()
   487  	defer jq.mu.Unlock()
   488  
   489  	// assert the given value for num pieces makes sense, we throw a critical
   490  	// here as this can only be caused by developer error
   491  	if numPieces < 1 {
   492  		build.Critical("num pieces can never be smaller than 1")
   493  		return 0
   494  	}
   495  
   496  	// fetch the bucket that corresponds with the given redundancy
   497  	bucket := jq.availabilityMetrics.bucket(numPieces)
   498  
   499  	// if there haven't been any jobs yet where the sector was available on the
   500  	// host, we return a minimum rate of .1% to avoid multiplication by zero in
   501  	// our download code algorithms.
   502  	if bucket.totalAvailable == 0 || bucket.totalLookups == 0 {
   503  		return jobHasSectorQueueMinAvailabilityRate
   504  	}
   505  
   506  	return bucket.totalAvailable / bucket.totalLookups
   507  }
   508  
   509  // callUpdateAvailabilityMetrics updates the fields on the has sector queue that
   510  // keep track of how many jobs were executed successfully, and how many jobs had
   511  // the sector be available.
   512  func (jq *jobHasSectorQueue) callUpdateAvailabilityMetrics(numPieces, numSectors, numAvailable int) {
   513  	jq.mu.Lock()
   514  	defer jq.mu.Unlock()
   515  	jq.availabilityMetrics.updateMetrics(numPieces, numSectors, numAvailable)
   516  }
   517  
   518  // callUpdateJobTimeMetrics takes a duration it took to fulfil that job and uses
   519  // it to update the job performance metrics on the queue.
   520  func (jq *jobHasSectorQueue) callUpdateJobTimeMetrics(jobTime time.Duration) {
   521  	jq.mu.Lock()
   522  	defer jq.mu.Unlock()
   523  	jq.weightedJobTime = expMovingAvgHotStart(jq.weightedJobTime, float64(jobTime), jobHasSectorPerformanceDecay)
   524  	jq.staticDT.AddDataPoint(jobTime)
   525  }
   526  
   527  // expectedJobTime will return the amount of time that a job is expected to
   528  // take, given the current conditions of the queue.
   529  func (jq *jobHasSectorQueue) expectedJobTime() time.Duration {
   530  	return jq.staticDT.Distribution(0).ExpectedDuration()
   531  }
   532  
   533  // initJobHasSectorQueue will init the queue for the has sector jobs.
   534  func (w *worker) initJobHasSectorQueue() {
   535  	// Sanity check that there is no existing job queue.
   536  	if w.staticJobHasSectorQueue != nil {
   537  		w.staticRenter.staticLog.Critical("incorret call on initJobHasSectorQueue")
   538  		return
   539  	}
   540  
   541  	w.staticJobHasSectorQueue = &jobHasSectorQueue{
   542  		availabilityMetrics: newAvailabilityMetrics(availabilityMetricsDefaultHalfLife),
   543  		jobGenericQueue:     newJobGenericQueue(w),
   544  		staticDT:            skymodules.NewDistributionTrackerStandard(),
   545  	}
   546  }
   547  
   548  // managedCallPostExecutionHook calls a post execution hook if registered. The
   549  // hook will only be called the first time this method is executed. Subsequent
   550  // calls are no-ops.
   551  func (j *jobHasSector) managedCallPostExecutionHook(resp *jobHasSectorResponse) {
   552  	if j.staticPostExecutionHook == nil {
   553  		return // nothing to do
   554  	}
   555  	j.once.Do(func() {
   556  		j.staticPostExecutionHook(resp)
   557  	})
   558  }