gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerjobhassector.go

package renter

import (
	"context"
	"sync"
	"time"

	"github.com/opentracing/opentracing-go"
	"gitlab.com/SkynetLabs/skyd/build"
	"gitlab.com/SkynetLabs/skyd/skymodules"
	"gitlab.com/SkynetLabs/skyd/skymodules/gouging"
	"go.sia.tech/siad/crypto"
	"go.sia.tech/siad/modules"
	"go.sia.tech/siad/types"

	"gitlab.com/NebulousLabs/errors"
)

const (
	// availabilityMetricsBucketScale is the factor by which we scale each
	// bucket. Every bucket scales up by 25%; this number was chosen because it
	// provides sufficiently granular coverage. Using this scale the buckets are:
	// 1, 2, 3, 4-5, 6-7, 8-10, 11-13, 14-17, 18-22, 23-28, 29-36, ..., 93-116
	availabilityMetricsBucketScale = 1.25

	// availabilityMetricsDefaultHalfLife is the default half life of the decay
	// applied to the availability buckets.
	availabilityMetricsDefaultHalfLife = 100 * time.Hour

	// availabilityMetricsNumBuckets is the total number of buckets we use to
	// track the sector availability metrics for a certain host. Every bucket
	// represents a range of total pieces uploaded to the network; the total
	// number of pieces is determined by the redundancy scheme used during the
	// upload.
	availabilityMetricsNumBuckets = 16

	// jobHasSectorPerformanceDecay defines how much the average performance is
	// decayed each time a new datapoint is added. The jobs use an exponentially
	// weighted average.
	jobHasSectorPerformanceDecay = 0.9

	// jobHasSectorQueueMinAvailabilityRate is the minimum availability rate we
	// return when the queue has not yet performed any jobs for which the
	// sector was available.
	jobHasSectorQueueMinAvailabilityRate = 0.001

	// hasSectorBatchSize is the number of has sector jobs batched together upon
	// calling callNext.
	// This number is the result of empirical testing which determined that 13
	// requests can be batched together without increasing the required
	// upload or download bandwidth.
	hasSectorBatchSize = 13
)

// errEstimateAboveMax is returned if a HasSector job wasn't added due to the
// estimate exceeding the max.
var errEstimateAboveMax = errors.New("can't add job since estimate is above max timeout")

type (
	// jobHasSector contains information about a hasSector query.
	jobHasSector struct {
		staticSectors      []crypto.Hash
		staticResponseChan chan *jobHasSectorResponse

		// staticNumPieces represents the redundancy with which the sectors
		// were uploaded; it is the total number of pieces, meaning the sum of
		// the data and parity pieces used by the erasure coder.
		//
		// NOTE: we assume that all sectors corresponding to the roots listed
		// in this HS job were uploaded using the same redundancy scheme.
		staticNumPieces int

		staticPostExecutionHook func(*jobHasSectorResponse)
		once                    sync.Once

		staticSpan opentracing.Span

		jobGeneric
	}

	// jobHasSectorBatch is a batch of has sector lookups.
	jobHasSectorBatch struct {
		staticJobs []*jobHasSector
	}

	// jobHasSectorQueue is a list of hasSector queries that have been assigned
	// to the worker.
	jobHasSectorQueue struct {
		// These variables contain an exponentially weighted average of the
		// worker's recent performance for jobHasSectorQueue.
		weightedJobTime float64

		// availabilityMetrics keeps track of how often a sector was available
		// on this host, taking into account the redundancy with which the
		// sector was uploaded.
		availabilityMetrics *availabilityMetrics

		// staticDT is a distribution tracker that keeps track of the HS job
		// duration.
		staticDT *skymodules.DistributionTracker

		*jobGenericQueue
	}

	// jobHasSectorResponse contains the result of a hasSector query.
	jobHasSectorResponse struct {
		staticAvailbleIndices []uint64
		staticErr             error

		// The worker is included in the response so that the caller can listen
		// on one channel for a bunch of workers and still know which worker
		// successfully found the sector root.
		staticWorker *worker

		// The time it took for this job to complete is included for debugging
		// purposes.
		staticJobTime time.Duration
	}

	// availabilityMetrics is a helper struct that keeps track of sector
	// availability metrics; we track these in several buckets, each of which
	// covers sectors that were uploaded with a similar redundancy.
	availabilityMetrics struct {
		buckets         []*availabilityBucket
		piecesToBuckets []int
		mu              sync.Mutex
	}

	// availabilityBucket is a helper struct that keeps track of how often a
	// sector was available; every bucket holds these stats for sectors that
	// were uploaded with a similar redundancy scheme.
	availabilityBucket struct {
		skymodules.GenericDecay

		// Keeps track of the total number of sectors that were available and
		// the total number of lookups that were performed. Note that a decaying
		// factor is applied to these variables.
		totalAvailable float64
		totalLookups   float64
	}
)
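// The sketch below is not part of the original file; it is an illustrative
// example of the bucket scaling described by availabilityMetricsBucketScale.
// Starting at 1 and growing the upper bound of each bucket by 25% reproduces
// the ranges 1, 2, 3, 4-5, 6-7, 8-10, ..., 93-116 listed in the constant's
// comment, which is the same mapping newAvailabilityMetrics builds below. The
// function name is hypothetical.
func exampleAvailabilityBucketRanges() [][2]uint64 {
	ranges := make([][2]uint64, 0, availabilityMetricsNumBuckets)
	curr := uint64(1)
	for bucket := 0; bucket < availabilityMetricsNumBuckets; bucket++ {
		// A bucket covers [curr, next]; next grows by 25% every bucket but
		// always advances by at least one so no bucket is empty.
		next := uint64(float64(curr) * availabilityMetricsBucketScale)
		if next < curr {
			next = curr
		}
		ranges = append(ranges, [2]uint64{curr, next})
		curr = next + 1
	}
	return ranges // [[1 1] [2 2] [3 3] [4 5] [6 7] [8 10] ... [93 116]]
}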
// newAvailabilityMetrics returns a new availabilityMetrics object.
func newAvailabilityMetrics(halfLife time.Duration) *availabilityMetrics {
	metrics := &availabilityMetrics{
		buckets:         make([]*availabilityBucket, availabilityMetricsNumBuckets),
		piecesToBuckets: []int{-1}, // 0 num pieces is illegal
	}

	// initialize the buckets and a slice that maps piece indices to bucket
	// indices, which is used for constant time lookups.
	curr := uint64(1)
	for bucket := 0; bucket < availabilityMetricsNumBuckets; bucket++ {
		metrics.buckets[bucket] = &availabilityBucket{GenericDecay: skymodules.NewDecay(halfLife)}

		next := uint64(float64(curr) * availabilityMetricsBucketScale)
		if next > curr {
			for pieces := curr; pieces <= next; pieces++ {
				metrics.piecesToBuckets = append(metrics.piecesToBuckets, bucket)
			}
			curr = next + 1
			continue
		}

		metrics.piecesToBuckets = append(metrics.piecesToBuckets, bucket)
		curr++
	}

	return metrics
}

// addDecay applies decay to the data in the availability bucket.
func (ab *availabilityBucket) addDecay() {
	ab.Decay(func(decay float64) {
		ab.totalAvailable *= decay
		ab.totalLookups *= decay
	})
}

// bucket will return the bucket corresponding to 'numPieces'.
func (am *availabilityMetrics) bucket(numPieces int) *availabilityBucket {
	if numPieces < 1 {
		build.Critical("num pieces can never be smaller than 1")
		return nil
	}

	// return the last bucket if num pieces goes out of bounds
	if numPieces >= len(am.piecesToBuckets) {
		numPieces = len(am.piecesToBuckets) - 1
	}
	bucketIndex := am.piecesToBuckets[numPieces]
	return am.buckets[bucketIndex]
}

// updateMetrics will update the availability metrics for the bucket
// corresponding to 'numPieces'.
func (am *availabilityMetrics) updateMetrics(numPieces, numSectors, numAvailable int) {
	bucket := am.bucket(numPieces)
	if bucket == nil {
		return
	}

	bucket.addDecay()

	bucket.totalLookups += float64(numSectors)
	bucket.totalAvailable += float64(numAvailable)
}

// callNext overrides the generic callNext and batches a certain number of has
// sector jobs together.
func (jq *jobHasSectorQueue) callNext() workerJob {
	jobs := make([]*jobHasSector, 0, hasSectorBatchSize)
	var next workerJob
	for {
		if len(jobs) >= hasSectorBatchSize {
			break
		}
		next = jq.jobGenericQueue.callNext()
		if next == nil {
			break
		}
		j := next.(*jobHasSector)
		jobs = append(jobs, j)
	}
	if len(jobs) == 0 {
		return nil
	}

	return &jobHasSectorBatch{
		staticJobs: jobs,
	}
}

// newJobHasSector is a helper method to create a new HasSector job.
func (w *worker) newJobHasSector(ctx context.Context, responseChan chan *jobHasSectorResponse, numPieces int, roots ...crypto.Hash) *jobHasSector {
	return w.newJobHasSectorWithPostExecutionHook(ctx, responseChan, nil, numPieces, roots...)
}

// newJobHasSectorWithPostExecutionHook is a helper method to create a new
// HasSector job with a post execution hook that is executed after the response
// is available but before sending it over the channel.
func (w *worker) newJobHasSectorWithPostExecutionHook(ctx context.Context, responseChan chan *jobHasSectorResponse, hook func(*jobHasSectorResponse), numPieces int, roots ...crypto.Hash) *jobHasSector {
	t := opentracing.NoopTracer{} // NOTE: disabled for performance
	span, _ := opentracing.StartSpanFromContextWithTracer(ctx, t, "HasSectorJob")
	return &jobHasSector{
		staticNumPieces:         numPieces,
		staticSectors:           roots,
		staticResponseChan:      responseChan,
		staticPostExecutionHook: hook,
		staticSpan:              span,
		jobGeneric:              newJobGeneric(ctx, w.staticJobHasSectorQueue, nil),
	}
}
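// The sketch below is not part of the original file; it is a hypothetical
// illustration of the post execution hook. The hook runs exactly once, before
// the response is pushed onto the response channel (and also when the job is
// discarded), so it can be used to observe completion without reading the
// channel. The function name and the 'done' channel are assumptions made for
// the example; the returned job still needs to be added to the worker's has
// sector queue.
func exampleHasSectorWithHook(ctx context.Context, w *worker, numPieces int, root crypto.Hash) (*jobHasSector, <-chan struct{}) {
	responseChan := make(chan *jobHasSectorResponse, 1)
	done := make(chan struct{})
	hook := func(_ *jobHasSectorResponse) {
		// Signal completion; the response itself is still delivered on
		// responseChan afterwards, if anyone is listening.
		close(done)
	}
	return w.newJobHasSectorWithPostExecutionHook(ctx, responseChan, hook, numPieces, root), done
}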
// callDiscard will discard a job, sending the provided error.
func (j *jobHasSector) callDiscard(err error) {
	w := j.staticQueue.staticWorker()
	errLaunch := w.staticTG.Launch(func() {
		response := staticPoolJobHasSectorResponse.Get()
		response.staticErr = err
		response.staticWorker = w

		j.managedCallPostExecutionHook(response)
		select {
		case j.staticResponseChan <- response:
		case <-j.staticCtx.Done():
		case <-w.staticTG.StopChan():
		}
	})
	if errLaunch != nil {
		w.staticRenter.staticLog.Print("callDiscard: launch failed", errLaunch)
	}

	j.staticSpan.LogKV("callDiscard", err)
	j.staticSpan.SetTag("success", false)
	j.staticSpan.Finish()
}

// callDiscard discards all jobs within the batch.
func (j jobHasSectorBatch) callDiscard(err error) {
	for _, hsj := range j.staticJobs {
		hsj.callDiscard(err)
	}
}

// staticCanceled always returns false. A batched job never resides in the
// queue. It's constructed right before being executed.
func (j jobHasSectorBatch) staticCanceled() bool {
	return false
}

// staticGetMetadata returns an empty struct. A batched has sector job doesn't
// contain any metadata.
func (j jobHasSectorBatch) staticGetMetadata() interface{} {
	return struct{}{}
}

// callExecute will run the has sector job.
func (j *jobHasSector) callExecute() error {
	// Finish job span at the end.
	defer j.staticSpan.Finish()

	// Set the execute time.
	j.externExecuteTime = time.Now()

	// Capture callExecute in new span.
	span := opentracing.StartSpan("callExecute", opentracing.ChildOf(j.staticSpan.Context()))
	defer span.Finish()

	batch := jobHasSectorBatch{
		staticJobs: []*jobHasSector{j},
	}
	return batch.callExecute()
}

// callExecute will run the has sector jobs in the batch.
func (j jobHasSectorBatch) callExecute() error {
	if len(j.staticJobs) == 0 {
		build.Critical("empty hasSectorBatch")
		return nil
	}

	start := time.Now()
	w := j.staticJobs[0].staticQueue.staticWorker()
	availables, err := j.managedHasSector()
	jobTime := time.Since(start)

	for i := range j.staticJobs {
		hsj := j.staticJobs[i]
		// Handle its span.
		if err != nil {
			hsj.staticSpan.LogKV("error", err)
		}
		hsj.staticSpan.SetTag("success", err == nil)
		hsj.staticSpan.Finish()

		// Create the response.
		response := staticPoolJobHasSectorResponse.Get()
		response.staticErr = err
		response.staticJobTime = jobTime
		response.staticWorker = w

		// If it was successful, attach the result.
		if err == nil {
			response.staticAvailbleIndices = availables[i]
		}
		// Send the response.
		err2 := w.staticTG.Launch(func() {
			hsj.managedCallPostExecutionHook(response)
			select {
			case hsj.staticResponseChan <- response:
			case <-hsj.staticCtx.Done():
			case <-w.staticTG.StopChan():
			}
		})
		// Report success or failure to the queue.
		if err != nil {
			hsj.staticQueue.callReportFailure(err, start, time.Now())
			continue
		}
		hsj.staticQueue.callReportSuccess()

		// Job was a success, update the performance and availability stats on
		// the queue.
		jq := hsj.staticQueue.(*jobHasSectorQueue)
		jq.callUpdateJobTimeMetrics(jobTime)
		jq.callUpdateAvailabilityMetrics(hsj.staticNumPieces, len(hsj.staticSectors), len(availables[i]))
		if err2 != nil {
			w.staticRenter.staticLog.Println("callExecute: launch failed", err2)
		}
	}

	return err
}

// callExpectedBandwidth returns the bandwidth that is expected to be consumed
// by the job.
func (j *jobHasSector) callExpectedBandwidth() (ul, dl uint64) {
	// sanity check
	if len(j.staticSectors) == 0 {
		build.Critical("expected bandwidth requested for a job that has no staticSectors set")
	}
	return gouging.HasSectorJobExpectedBandwidth(len(j.staticSectors))
}

// callExpectedBandwidth returns the bandwidth that is expected to be consumed
// by the batch.
func (j jobHasSectorBatch) callExpectedBandwidth() (ul, dl uint64) {
	var totalSectors int
	for _, hsj := range j.staticJobs {
		// sanity check
		if len(hsj.staticSectors) == 0 {
			build.Critical("expected bandwidth requested for a job that has no staticSectors set")
		}
		totalSectors += len(hsj.staticSectors)
	}
	ul, dl = gouging.HasSectorJobExpectedBandwidth(totalSectors)
	return
}

// managedHasSector returns, for every job in the batch, the indices of that
// job's sector roots that the host reported having.
func (j *jobHasSectorBatch) managedHasSector() (results [][]uint64, err error) {
	if len(j.staticJobs) == 0 {
		return nil, nil
	}

	w := j.staticJobs[0].staticQueue.staticWorker()
	// Create the program.
	pt := w.staticPriceTable().staticPriceTable
	pb := modules.NewProgramBuilder(&pt, 0) // 0 duration since HasSector doesn't depend on it.
	for _, hsj := range j.staticJobs {
		for _, sector := range hsj.staticSectors {
			pb.AddHasSectorInstruction(sector)
		}
	}
	program, programData := pb.Program()
	cost, _, _ := pb.Cost(true)

	// take into account bandwidth costs
	ulBandwidth, dlBandwidth := j.callExpectedBandwidth()
	bandwidthCost, bandwidthRefund := mdmBandwidthCost(pt, ulBandwidth, dlBandwidth)
	cost = cost.Add(bandwidthCost)

	// Execute the program and parse the responses.
	hasSectors := make([]bool, 0, len(program))
	var responses []programResponse
	responses, _, err = w.managedExecuteProgram(program, programData, types.FileContractID{}, categoryDownload, cost, bandwidthRefund)
	if err != nil {
		return nil, errors.AddContext(err, "managedHasSector: unable to execute program for has sector job")
	}
	for _, resp := range responses {
		if resp.Error != nil {
			return nil, errors.AddContext(resp.Error, "Output error")
		}
		hasSectors = append(hasSectors, resp.Output[0] == 1)
	}
	if len(responses) != len(program) {
		return nil, errors.New("received invalid number of responses but no error")
	}

	for _, hsj := range j.staticJobs {
		var availables []uint64
		for i := 0; i < len(hsj.staticSectors); i++ {
			if hasSectors[i] {
				availables = append(availables, uint64(i))
			}
		}
		results = append(results, availables)
		hasSectors = hasSectors[len(hsj.staticSectors):]
	}
	return results, nil
}
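// The sketch below is not part of the original file; it is a hypothetical
// example of how a caller might queue a HasSector job and wait for its result,
// assuming a *worker 'w' whose queues have been initialized. It relies only on
// helpers defined in this file (callAddWithEstimate is defined just below);
// the function name and the one minute timeout are assumptions made for the
// example.
func exampleQueueHasSector(ctx context.Context, w *worker, numPieces int, roots []crypto.Hash) ([]uint64, error) {
	responseChan := make(chan *jobHasSectorResponse, 1)
	jhs := w.newJobHasSector(ctx, responseChan, numPieces, roots...)

	// Only add the job if the queue expects to complete it within a minute.
	_, err := w.staticJobHasSectorQueue.callAddWithEstimate(jhs, time.Minute)
	if err != nil {
		return nil, err
	}

	// Wait for the response or for the caller's context to be cancelled.
	select {
	case resp := <-responseChan:
		// staticAvailbleIndices holds the indices of 'roots' the host has.
		return resp.staticAvailbleIndices, resp.staticErr
	case <-ctx.Done():
		return nil, ctx.Err()
	}
}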
// callAddWithEstimate will add a job to the queue and return a timestamp for
// when the job is estimated to complete. An error will be returned if the job
// is not successfully queued.
func (jq *jobHasSectorQueue) callAddWithEstimate(j *jobHasSector, maxEstimate time.Duration) (time.Time, error) {
	jq.mu.Lock()
	defer jq.mu.Unlock()
	now := time.Now()
	estimate := jq.expectedJobTime()
	if estimate > maxEstimate {
		return time.Time{}, errEstimateAboveMax
	}

	if !jq.add(j) {
		return time.Time{}, errors.New("unable to add job to queue")
	}
	return now.Add(estimate), nil
}

// callExpectedJobTime returns the expected amount of time that this job will
// take to complete.
//
// TODO: ideally we pass `numSectors` here and get the expected job time
// depending on the number of instructions in the program.
func (jq *jobHasSectorQueue) callExpectedJobTime() time.Duration {
	jq.mu.Lock()
	defer jq.mu.Unlock()
	return jq.expectedJobTime()
}

// callAvailabilityRate returns the fraction of lookups performed by this
// queue's worker for which the sector was available.
func (jq *jobHasSectorQueue) callAvailabilityRate(numPieces int) float64 {
	jq.mu.Lock()
	defer jq.mu.Unlock()

	// assert the given value for num pieces makes sense; we throw a critical
	// here as this can only be caused by developer error
	if numPieces < 1 {
		build.Critical("num pieces can never be smaller than 1")
		return 0
	}

	// fetch the bucket that corresponds with the given redundancy
	bucket := jq.availabilityMetrics.bucket(numPieces)

	// if there haven't been any jobs yet where the sector was available on the
	// host, we return a minimum rate of 0.1% to avoid multiplication by zero
	// in our download code algorithms.
	if bucket.totalAvailable == 0 || bucket.totalLookups == 0 {
		return jobHasSectorQueueMinAvailabilityRate
	}

	return bucket.totalAvailable / bucket.totalLookups
}

// callUpdateAvailabilityMetrics updates the fields on the has sector queue
// that keep track of how many lookups were performed and how many of those
// found the sector to be available.
func (jq *jobHasSectorQueue) callUpdateAvailabilityMetrics(numPieces, numSectors, numAvailable int) {
	jq.mu.Lock()
	defer jq.mu.Unlock()
	jq.availabilityMetrics.updateMetrics(numPieces, numSectors, numAvailable)
}

// callUpdateJobTimeMetrics takes the duration it took to fulfil a job and uses
// it to update the job performance metrics on the queue.
func (jq *jobHasSectorQueue) callUpdateJobTimeMetrics(jobTime time.Duration) {
	jq.mu.Lock()
	defer jq.mu.Unlock()
	jq.weightedJobTime = expMovingAvgHotStart(jq.weightedJobTime, float64(jobTime), jobHasSectorPerformanceDecay)
	jq.staticDT.AddDataPoint(jobTime)
}

// expectedJobTime will return the amount of time that a job is expected to
// take, given the current conditions of the queue.
func (jq *jobHasSectorQueue) expectedJobTime() time.Duration {
	return jq.staticDT.Distribution(0).ExpectedDuration()
}

// initJobHasSectorQueue will init the queue for the has sector jobs.
func (w *worker) initJobHasSectorQueue() {
	// Sanity check that there is no existing job queue.
	if w.staticJobHasSectorQueue != nil {
		w.staticRenter.staticLog.Critical("incorrect call on initJobHasSectorQueue")
		return
	}

	w.staticJobHasSectorQueue = &jobHasSectorQueue{
		availabilityMetrics: newAvailabilityMetrics(availabilityMetricsDefaultHalfLife),
		jobGenericQueue:     newJobGenericQueue(w),
		staticDT:            skymodules.NewDistributionTrackerStandard(),
	}
}

// managedCallPostExecutionHook calls a post execution hook if registered. The
// hook will only be called the first time this method is executed. Subsequent
// calls are no-ops.
func (j *jobHasSector) managedCallPostExecutionHook(resp *jobHasSectorResponse) {
	if j.staticPostExecutionHook == nil {
		return // nothing to do
	}
	j.once.Do(func() {
		j.staticPostExecutionHook(resp)
	})
}
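// The sketch below is not part of the original file; it is a hypothetical
// illustration of how the availability metrics feed into the availability
// rate. Ten lookups against sectors uploaded with a redundancy of 10 pieces,
// seven of which found the sector, land in the 8-10 bucket and yield a rate
// of roughly 0.7 (the decay is negligible over such a short window). The
// function name is an assumption made for the example.
func exampleAvailabilityRateAfterLookups() float64 {
	metrics := newAvailabilityMetrics(availabilityMetricsDefaultHalfLife)

	// Record 10 sector lookups, 7 of which found the sector available, for
	// sectors uploaded with 10 total pieces.
	metrics.updateMetrics(10, 10, 7)

	// The bucket covering 10 pieces now reports roughly 0.7 availability.
	bucket := metrics.bucket(10)
	return bucket.totalAvailable / bucket.totalLookups
}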