gitlab.com/SkynetLabs/skyd@v1.6.9/skymodules/renter/workerjobread.go (about) 1 package renter 2 3 import ( 4 "sync" 5 "time" 6 7 "github.com/opentracing/opentracing-go" 8 "gitlab.com/SkynetLabs/skyd/build" 9 "gitlab.com/SkynetLabs/skyd/skymodules" 10 "go.sia.tech/siad/crypto" 11 "go.sia.tech/siad/modules" 12 "go.sia.tech/siad/types" 13 14 "gitlab.com/NebulousLabs/errors" 15 ) 16 17 const ( 18 // jobReadPerformanceDecay defines how much decay gets applied to the 19 // historic performance of jobRead each time new data comes back. 20 // Setting a low value makes the performance more volatile. If the worker 21 // tends to have inconsistent performance, having the decay be a low value 22 // (0.9 or lower) will be highly detrimental. A higher decay means that the 23 // predictor tends to be more accurate over time, but is less responsive to 24 // things like network load. 25 jobReadPerformanceDecay = 0.9 26 27 // jobLength64k is the threshold we use to label a download as 64kb 28 // jobLength1m is the threshold we use to label a download as 1m 29 // jobLength4m is the threshold we use to label a download as 4m 30 // 31 // usually the length is evaluated using an if-else structure, comparing the 32 // length to these threshold in ascending fashion, so we first check to see 33 // whether it's a 64kb, then a 1mb and so on 34 jobLength64k = uint64(1 << 16) 35 jobLength1m = uint64(1 << 20) 36 jobLength4m = uint64(1 << 24) 37 ) 38 39 type ( 40 // jobRead contains information about a Read query. 41 jobRead struct { 42 staticLength uint64 43 staticResponseChan chan *jobReadResponse 44 45 // staticSpan is used for tracing. Note that this can be nil, and 46 // therefore should always be checked. Not all read jobs require 47 // tracing. By allowing it to be nil we avoid the extra overhead. 48 staticSpan opentracing.Span 49 50 jobGeneric 51 } 52 53 // jobReadQueue is a list of Read queries that have been assigned to the 54 // worker. The queue also tracks performance metrics, which can then be used 55 // by projects to optimize job scheduling between workers. 56 jobReadQueue struct { 57 staticStats *jobReadStats 58 *jobGenericQueue 59 60 // staticBaseCost is applied to download costs, defined in SC/TB 61 staticBaseCost types.Currency 62 } 63 64 // jobReadStats contains statistics about read jobs. This object is 65 // thread safe and can be shared between multiple queues. 66 jobReadStats struct { 67 // These float64s are converted time.Duration values. They are float64 68 // to get better precision on the exponential decay which gets applied 69 // with each new data point. 70 weightedJobTime64k float64 71 weightedJobTime1m float64 72 weightedJobTime4m float64 73 74 // These distribution trackers keep track of the read durations for 75 // every length category. 76 staticDT64k *skymodules.DistributionTracker 77 staticDT1m *skymodules.DistributionTracker 78 staticDT4m *skymodules.DistributionTracker 79 80 *jobGenericQueue 81 mu sync.Mutex 82 } 83 84 // jobReadResponse contains the result of a Read query. 85 jobReadResponse struct { 86 // The response data. 87 staticData []byte 88 staticProof []crypto.Hash 89 staticErr error 90 91 // Metadata related to the job. 92 staticMetadata jobReadMetadata 93 94 // The time it took for this job to complete. 95 staticJobTime time.Duration 96 } 97 98 // jobReadMetadata contains meta information about a read job. 99 jobReadMetadata struct { 100 staticSectorRoot crypto.Hash 101 staticPieceRootIndex uint64 102 staticLaunchedWorkerIndex uint64 103 104 // the category specifies what type of function the read job fulfils, 105 // this is necessary to pass along as the generic MDM executor needs to 106 // be update spending details and read jobs can be used for downloads 107 // but might also be used for snapshots for example 108 staticSpendingCategory spendingCategory 109 110 staticWorker *worker 111 staticWorkerIdentifier uint32 112 } 113 ) 114 115 // NewJobReadStats returns an initialized jobReadStats object. 116 func NewJobReadStats() *jobReadStats { 117 return &jobReadStats{ 118 staticDT64k: skymodules.NewDistributionTrackerStandard(), 119 staticDT1m: skymodules.NewDistributionTrackerStandard(), 120 staticDT4m: skymodules.NewDistributionTrackerStandard(), 121 } 122 } 123 124 // staticJobReadMetadata returns the read job's metadata. 125 func (j *jobRead) staticJobReadMetadata() jobReadMetadata { 126 var metadata jobReadMetadata 127 md, ok := j.staticGetMetadata().(jobReadMetadata) 128 if ok { 129 metadata = md 130 } 131 return metadata 132 } 133 134 // callDiscard will discard a job, forwarding the error to the caller. 135 func (j *jobRead) callDiscard(err error) { 136 // Log info and finish span. 137 if j.staticSpan != nil { 138 j.staticSpan.LogKV("callDiscard", err) 139 j.staticSpan.SetTag("success", false) 140 j.staticSpan.Finish() 141 } 142 143 w := j.staticQueue.staticWorker() 144 errLaunch := w.staticTG.Launch(func() { 145 response := &jobReadResponse{ 146 staticErr: err, 147 staticMetadata: j.staticJobReadMetadata(), 148 } 149 select { 150 case j.staticResponseChan <- response: 151 case <-w.staticTG.StopChan(): 152 case <-j.staticCtx.Done(): 153 } 154 }) 155 if errLaunch != nil { 156 w.staticRenter.staticLog.Print("callDiscard: launch failed", errLaunch) 157 } 158 } 159 160 // managedFinishExecute will execute code that is shared by multiple read jobs 161 // after execution. It updates the performance metrics, records whether the 162 // execution was successful and returns the response. 163 func (j *jobRead) managedFinishExecute(readData []byte, proof []crypto.Hash, readErr error, readJobTime time.Duration) { 164 // Log result and finish 165 if j.staticSpan != nil { 166 j.staticSpan.LogKV( 167 "err", readErr, 168 "duration", readJobTime, 169 ) 170 j.staticSpan.SetTag("success", readErr == nil) 171 j.staticSpan.Finish() 172 } 173 174 // Send the response in a goroutine so that the worker resources can be 175 // released faster. Need to check if the job was canceled so that the 176 // goroutine will exit. 177 response := &jobReadResponse{ 178 staticData: readData, 179 staticProof: proof, 180 staticErr: readErr, 181 182 staticMetadata: j.staticJobReadMetadata(), 183 staticJobTime: readJobTime, 184 } 185 w := j.staticQueue.staticWorker() 186 err := w.staticTG.Launch(func() { 187 select { 188 case j.staticResponseChan <- response: 189 case <-j.staticCtx.Done(): 190 case <-w.staticTG.StopChan(): 191 } 192 }) 193 if err != nil { 194 j.staticQueue.staticWorker().staticRenter.staticLog.Print("managedFinishExecute: launch failed", err) 195 } 196 197 // Report success or failure to the queue. 198 if readErr != nil { 199 j.staticQueue.callReportFailure(readErr, j.externExecuteTime, time.Now()) 200 return 201 } 202 j.staticQueue.callReportSuccess() 203 204 // Job succeeded. 205 // 206 // Update the metrics in the read sector queue based on the amount of 207 // time the read took. Stats should only be added if the job did not 208 // result in an error. Because there was no failure, the consecutive 209 // failures stat can be reset. 210 jq := j.staticQueue.(*jobReadQueue) 211 jq.staticStats.callUpdateJobTimeMetrics(j.staticLength, readJobTime) 212 } 213 214 // callExpectedBandwidth returns the bandwidth that gets consumed by a 215 // Read program. 216 func (j *jobRead) callExpectedBandwidth() (ul, dl uint64) { 217 ul = 1 << 12 // 4 KiB 218 dl = uint64(float64(j.staticLength)*1.01) + 1<<12 // (readSize * 1.01 + 4 KiB) 219 return 220 } 221 222 // managedRead returns the sector data for the given read program and the merkle 223 // proof. 224 func (j *jobRead) managedRead(w *worker, program modules.Program, programData []byte, cost types.Currency, bandwidthRefund func(ul, dl uint64) types.Currency) ([]programResponse, error) { 225 // execute it 226 responses, _, err := w.managedExecuteProgram(program, programData, w.staticCache().staticContractID, j.staticJobReadMetadata().staticSpendingCategory, cost, bandwidthRefund) 227 if err != nil { 228 return []programResponse{}, err 229 } 230 231 // Sanity check number of responses. 232 if len(responses) > len(program) { 233 build.Critical("managedExecuteProgram should return at most len(program) instructions") 234 } 235 if len(responses) == 0 { 236 build.Critical("managedExecuteProgram should at least return one instruction when err == nil") 237 } 238 // If the number of responses doesn't match, the last response should 239 // contain an error message. 240 if len(responses) != len(program) { 241 err := responses[len(responses)-1].Error 242 return []programResponse{}, errors.AddContext(err, "managedRead: program execution was interrupted") 243 } 244 245 // The last instruction is the actual download. 246 response := responses[len(responses)-1] 247 if response.Error != nil { 248 return []programResponse{}, response.Error 249 } 250 sectorData := response.Output 251 252 // Check that we received the amount of data that we were expecting. 253 if uint64(len(sectorData)) != j.staticLength { 254 return []programResponse{}, errors.New("worker returned the wrong amount of data") 255 } 256 return responses, nil 257 } 258 259 // callAddWithEstimate will add a job to the job read queue while providing an 260 // estimate for when the job is expected to return. 261 func (jq *jobReadQueue) callAddWithEstimate(j *jobReadSector) (time.Time, bool) { 262 estimate := jq.staticStats.callExpectedJobTime(j.staticLength) 263 264 jq.mu.Lock() 265 defer jq.mu.Unlock() 266 267 if !jq.add(j) { 268 return time.Time{}, false 269 } 270 return time.Now().Add(estimate), true 271 } 272 273 // callExpectedJobTime will return the recent performance of the worker 274 // attempting to complete read jobs. The call distinguishes based on the 275 // size of the job, breaking the jobs into 3 categories: less than 64kb, less 276 // than 1mb, and up to a full sector in size. 277 // 278 // The breakout is performed because low latency, low throughput workers are 279 // common, and will have very different performance characteristics across the 280 // three categories. 281 // 282 // TODO: Make this smarter. 283 func (jrs *jobReadStats) callExpectedJobTime(length uint64) time.Duration { 284 jrs.mu.Lock() 285 defer jrs.mu.Unlock() 286 return jrs.expectedJobTime(length) 287 } 288 289 // expectedJobTime returns the expected job time, based on recent performance, 290 // for the given read length. 291 func (jrs *jobReadStats) expectedJobTime(length uint64) time.Duration { 292 if length <= jobLength64k { 293 return time.Duration(jrs.weightedJobTime64k) 294 } else if length <= jobLength1m { 295 return time.Duration(jrs.weightedJobTime1m) 296 } else { 297 return time.Duration(jrs.weightedJobTime4m) 298 } 299 } 300 301 // callExpectedJobCost returns an estimate for the price of performing a read 302 // job with the given length. 303 func (jq *jobReadQueue) callExpectedJobCost(length uint64) types.Currency { 304 pt := &jq.staticWorker().staticPriceTable().staticPriceTable 305 306 // Calculate init cost. The program we use has a 48 byte program data and 1 307 // instruction. 48 = 8 bytes length + 8 bytes offset + 32 bytes merkle root 308 cost := modules.MDMInitCost(pt, 48, 1) 309 310 // Add the execution cost. 311 cost = cost.Add(modules.MDMReadCost(pt, length)) 312 313 // Add the memory cost. 314 memory := modules.MDMInitMemory() + modules.MDMReadMemory() 315 time := uint64(modules.MDMTimeReadSector) 316 cost = cost.Add(modules.MDMMemoryCost(pt, memory, time)) 317 318 // Add the bandwidth cost. 319 ulBandwidth, dlBandwidth := new(jobReadSector).callExpectedBandwidth() 320 cost = cost.Add(modules.MDMBandwidthCost(*pt, ulBandwidth, dlBandwidth)) 321 322 // Add the base cost. 323 cost = cost.Add(jq.staticBaseCost.Mul64(dlBandwidth)) 324 return cost 325 } 326 327 // callUpdateJobTimeMetrics takes a length and the duration it took to fulfil 328 // that job and uses it to update the job performance metrics on the queue. 329 func (jrs *jobReadStats) callUpdateJobTimeMetrics(length uint64, jobTime time.Duration) { 330 jrs.mu.Lock() 331 defer jrs.mu.Unlock() 332 if length <= jobLength64k { 333 jrs.weightedJobTime64k = expMovingAvgHotStart(jrs.weightedJobTime64k, float64(jobTime), jobReadPerformanceDecay) 334 } else if length <= jobLength1m { 335 jrs.weightedJobTime1m = expMovingAvgHotStart(jrs.weightedJobTime1m, float64(jobTime), jobReadPerformanceDecay) 336 } else { 337 jrs.weightedJobTime4m = expMovingAvgHotStart(jrs.weightedJobTime4m, float64(jobTime), jobReadPerformanceDecay) 338 } 339 340 // update distribution tracker 341 dt := jrs.distributionTrackerForLength(length) 342 dt.AddDataPoint(jobTime) 343 } 344 345 // distributionTrackerForLength returns the distribution tracker that 346 // corresponds to the given length. 347 func (jrs *jobReadStats) distributionTrackerForLength(length uint64) *skymodules.DistributionTracker { 348 if length <= jobLength64k { 349 return jrs.staticDT64k 350 } else if length <= jobLength1m { 351 return jrs.staticDT1m 352 } else { 353 return jrs.staticDT4m 354 } 355 } 356 357 // initJobReadQueue will initialize a queue for downloading sectors by 358 // their root for the worker. This is only meant to be run once at startup. 359 func (w *worker) initJobReadQueue(jrs *jobReadStats) { 360 // Sanity check that there is no existing job queue. 361 if w.staticJobReadQueue != nil { 362 w.staticRenter.staticLog.Critical("incorrect call on initJobReadQueue") 363 } 364 365 w.staticJobReadQueue = &jobReadQueue{ 366 jobGenericQueue: newJobGenericQueue(w), 367 368 staticBaseCost: skymodules.DefaultSkynetBaseCost, 369 staticStats: jrs, 370 } 371 } 372 373 // initJobLowPrioReadQueue will initialize a queue for downloading sectors by 374 // their root for the worker. This is only meant to be run once at startup. 375 func (w *worker) initJobLowPrioReadQueue(jrs *jobReadStats) { 376 // Sanity check that there is no existing job queue. 377 if w.staticJobLowPrioReadQueue != nil { 378 w.staticRenter.staticLog.Critical("incorret call on initJobReadQueue") 379 } 380 381 w.staticJobLowPrioReadQueue = &jobReadQueue{ 382 jobGenericQueue: newJobGenericQueue(w), 383 384 staticBaseCost: skymodules.DefaultSkynetBaseCost, 385 staticStats: jrs, 386 } 387 }