package skymodules

// distributiontracker.go creates a generic tool for tracking the performance
// of some function over time. It keeps a distribution of data points that has
// a time based exponential decay. At any point, the pXX can be requested. For
// example, requesting the p99 will tell you the smallest duration that is
// greater than 99% of all data points. Any percentile between 0 and 1 can be
// requested.
//
// The distribution tracker has been designed to be high performance, low
// memory overhead, and able to cover a range of values starting at 4ms and
// going up to over an hour. The distribution tracker is bucketed rather than
// continuous, which means the return values won't always be exact, however
// they will always be accurate to within 4% as long as the result is under 1
// hour total in duration.
//
// NOTE: There are a handful of hardcoded numbers in this file. This is because
// they have some tricky mathematical relationships to make the bucket ordering
// really nice, and I couldn't find a good way to make them generic. These
// relationships may also enable some performance optimizations in the future
// that wouldn't be possible with generic values.

import (
	"fmt"
	"reflect"
	"sync"
	"time"

	"gitlab.com/SkynetLabs/skyd/build"
)

const (
	// distributionTrackerInitialStepSize defines the step size that we use for
	// the first distributionTrackerInitialBuckets buckets of a distribution.
	// This means that this is the smallest timing that is supported by the
	// distribution. The highest value supported by the distribution is about 1
	// million times larger.
	//
	// Decreasing this will give better granularity on things that take very
	// little time, but will also proportionally reduce the maximum amount of
	// time that can be measured. For every time you decrease this value by a
	// factor of 4, you should increase the distributionTrackerNumIncrements by
	// '1' to maintain the same upper bound on the time.
	distributionTrackerInitialStepSize = 4 * time.Millisecond

	// distributionTrackerNumIncrements defines the number of step changes
	// that are applied after the initial buckets, i.e. the number of times
	// the step size increases.
	distributionTrackerNumIncrements = 7
)
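// The note above ties distributionTrackerInitialStepSize to
// distributionTrackerNumIncrements: shrinking the step size by a factor of 4
// should be paired with one extra increment to preserve the upper bound. A
// rough sanity sketch of that bound (hypothetical helper, not part of the
// original API):
func approxDistributionUpperBound() time.Duration {
	// The initial buckets cover initialStepSize * initialBuckets of range, and
	// every step change multiplies the covered range by the step change
	// multiple.
	upper := distributionTrackerInitialStepSize * distributionTrackerInitialBuckets
	for i := 0; i < distributionTrackerNumIncrements; i++ {
		upper *= distributionTrackerStepChangeMultiple
	}
	return upper // 4ms * 64 * 4^7 ≈ 1.16 hours
}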
// NOTE: These consts are interconnected, do not change them.
// distributionTrackerInitialBuckets needs to be a power of 2,
// distributionTrackerStepChangeMultiple needs to be a power of 2, and
// distributionTrackerBucketsPerStepChange needs to be:
//
//	distributionTrackerInitialBuckets - (distributionTrackerInitialBuckets/distributionTrackerStepChangeMultiple)
const (
	// DistributionTrackerTotalBuckets is a shortcut defining one of the
	// commonly used relationships between the other consts.
	DistributionTrackerTotalBuckets = distributionTrackerInitialBuckets + distributionTrackerBucketsPerStepChange*distributionTrackerNumIncrements

	// distributionTrackerInitialBuckets defines the number of buckets that
	// are used in the first step. It has a mathematical relationship to
	// distributionTrackerBucketsPerStepChange, because we want every step
	// range to cover the same amount of new ground, but the first step range
	// has no history. By adding an extra few buckets at the beginning, we can
	// give it that history and bootstrap the data structure.
	distributionTrackerInitialBuckets = 64

	// distributionTrackerBucketsPerStepChange defines the number of buckets
	// per step change. Increasing this number will give better granularity at
	// each step range at the cost of more memory and more computation.
	distributionTrackerBucketsPerStepChange = 48

	// distributionTrackerStepChangeMultiple defines the multiple that is used
	// when a step change is applied. A larger multiple means the data
	// structure can cover a greater range of data in fewer buckets at the
	// cost of granularity. This saves computation and memory.
	distributionTrackerStepChangeMultiple = 4

	// numBuckets is the number of buckets a distribution consists of. It is
	// always equal to DistributionTrackerTotalBuckets.
	numBuckets = distributionTrackerInitialBuckets + distributionTrackerBucketsPerStepChange*distributionTrackerNumIncrements
)

type (
	// Distribution tracks the distribution of durations for a particular half
	// life.
	//
	// NOTE: This struct is not thread safe, thread safety is derived from the
	// parent object.
	//
	// NOTE: If you extend this struct, take the changes into account in the
	// 'Clone' method.
	Distribution struct {
		// Decay is applied to the distribution.
		GenericDecay

		// Buckets that represent the distribution. The first
		// distributionTrackerInitialBuckets buckets are spaced 4ms apart. The
		// next 48 buckets are spaced 16ms apart, then the next 48 are spaced
		// 64ms apart, the spacings multiplying by 4 every 48 buckets. The
		// final bucket is just over an hour, anything over will be put into
		// that bucket as well.
		timings [numBuckets]float64

		// expectedDurationNumerator tracks the current numerator for
		// computing the expected duration of the Distribution.
		//
		// The full equation is:
		//
		//	                   timings[0]*bucketDuration[0] + ... + timings[n]*bucketDuration[n]   expectedDurationNumerator
		//	ExpectedDuration = ___________________________________________________________________ = _________________________
		//	                                    timings[0] + ... + timings[n]                                  total
		//
		// This allows us to optimize the otherwise expensive computation from
		// O(n) -> O(1).
		expectedDurationNumerator float64

		// total tracks the current total of all timings.
		total float64
	}

	// DistributionTracker will track the performance distribution of a series
	// of operations over a set of time ranges. Each time range corresponds to
	// a different half life. A common choice is to track the half lives for
	// {15 minutes, 24 hours, Lifetime}.
	DistributionTracker struct {
		distributions []*Distribution

		mu sync.Mutex
	}

	// DistributionTrackerStats houses a set of fields that get returned by
	// the DistributionTracker which display the values of the underlying
	// distributions.
	DistributionTrackerStats struct {
		Percentiles [][]time.Duration
		DataPoints  []float64
	}

	// PersistedDistribution contains the information about a distribution
	// that is persisted to disk.
	PersistedDistribution struct {
		Timings [numBuckets]float64 `json:"timings"`
	}

	// PersistedDistributionTracker contains the information about a
	// distribution tracker that is persisted to disk.
	PersistedDistributionTracker struct {
		Distributions []PersistedDistribution `json:"distributions"`
	}

	// Chances is a helper type that represents a distribution's chance array.
	Chances [DistributionTrackerTotalBuckets]float64
)
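// numBuckets and DistributionTrackerTotalBuckets are defined separately but
// must stay equal, since Chances entries correspond one-to-one with timings
// buckets. A compile-time assertion guarding that relationship (an addition,
// not in the original file):
var _ [numBuckets]float64 = Chances{}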
// setTiming updates a timing for the distribution while also making sure the
// total and numerator remain up-to-date.
func (d *Distribution) setTiming(i int, t float64) {
	// Subtract the timing from the total and numerator first.
	d.total -= d.timings[i]
	d.expectedDurationNumerator -= d.timings[i] * float64(DistributionDurationForBucketIndex(i))

	// Set the new timing.
	d.timings[i] = t

	// Add the timing and numerator back using the new timing.
	d.total += d.timings[i]
	d.expectedDurationNumerator += d.timings[i] * float64(DistributionDurationForBucketIndex(i))

	// Due to the fact that floats are not perfectly accurate, the total and
	// numerator could be slightly negative instead of 0, e.g. -0.0000001.
	// If that happens, we round to 0.
	if d.total < 0 {
		d.total = 0
	}
	if d.expectedDurationNumerator < 0 {
		d.expectedDurationNumerator = 0
	}
}

// Persist returns a PersistedDistributionTracker for the DistributionTracker
// by copying all of its buckets.
func (dt *DistributionTracker) Persist() PersistedDistributionTracker {
	dt.mu.Lock()
	defer dt.mu.Unlock()
	distributions := make([]PersistedDistribution, 0, len(dt.distributions))
	for _, d := range dt.distributions {
		pd := PersistedDistribution{}
		for i := range pd.Timings {
			pd.Timings[i] = d.timings[i]
		}
		distributions = append(distributions, pd)
	}
	return PersistedDistributionTracker{
		Distributions: distributions,
	}
}

// DistributionBucketIndexForDuration converts the given duration to a bucket
// index.
func DistributionBucketIndexForDuration(dur time.Duration) int {
	index, _ := indexForDuration(dur)
	return index
}

// staticDistributionDurationsForBucketIndices is a slice used for translating
// durations to bucket indices. As an optimization we only compute it on
// startup.
var staticDistributionDurationsForBucketIndices = func() []time.Duration {
	durations := make([]time.Duration, DistributionTrackerTotalBuckets)
LOOP:
	for index := 0; index < DistributionTrackerTotalBuckets; index++ {
		stepSize := distributionTrackerInitialStepSize
		if index <= distributionTrackerInitialBuckets {
			durations[index] = stepSize * time.Duration(index)
			continue LOOP
		}
		prevMax := stepSize * distributionTrackerInitialBuckets
		for i := distributionTrackerInitialBuckets; i <= DistributionTrackerTotalBuckets; i += distributionTrackerBucketsPerStepChange {
			stepSize *= distributionTrackerStepChangeMultiple
			if index < i+distributionTrackerBucketsPerStepChange {
				durations[index] = stepSize*time.Duration(index-i) + prevMax
				continue LOOP
			}
			prevMax *= distributionTrackerStepChangeMultiple
		}
	}
	return durations
}()
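// A quick sketch of the layout the function above produces (hypothetical
// helper, not part of the original API): the first 64 buckets are 4ms apart,
// and after a step change the spacing grows to 16ms.
func exampleBucketLayout() {
	fmt.Println(staticDistributionDurationsForBucketIndices[1])  // 4ms
	fmt.Println(staticDistributionDurationsForBucketIndices[64]) // 256ms
	fmt.Println(staticDistributionDurationsForBucketIndices[65]) // 272ms
}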
// DistributionDurationForBucketIndex converts the index of a timing bucket
// into a timing.
func DistributionDurationForBucketIndex(index int) time.Duration {
	if index < 0 || index > DistributionTrackerTotalBuckets-1 {
		build.Critical("distribution duration index out of bounds:", index)
	}
	return staticDistributionDurationsForBucketIndices[index]
}

// indexForDuration converts the given duration to a bucket index. Alongside
// the index it also returns a float that represents the fraction of the
// bucket that is included by the given duration.
//
// e.g. if we are dealing with 4ms buckets and a duration of 1ms is passed, the
// return values would be 0 and 0.25, indicating the duration corresponds with
// the bucket at index 0, and the given duration included 25% of that bucket.
func indexForDuration(duration time.Duration) (int, float64) {
	if duration < 0 {
		build.Critical(fmt.Sprintf("negative duration %v", duration))
		return -1, 0
	}

	// Check if it falls in the initial buckets.
	stepSize := distributionTrackerInitialStepSize
	max := stepSize * distributionTrackerInitialBuckets
	if duration < max {
		index := duration / stepSize
		fraction := float64(duration%stepSize) / float64(stepSize)
		return int(index), fraction
	}

	// Range over the remaining buckets and see whether the given duration
	// falls into them.
	for i := distributionTrackerInitialBuckets; i < DistributionTrackerTotalBuckets; i += distributionTrackerBucketsPerStepChange {
		stepSize *= distributionTrackerStepChangeMultiple
		max *= distributionTrackerStepChangeMultiple
		if duration < max {
			index := int(duration/stepSize) + i - distributionTrackerInitialBuckets/distributionTrackerStepChangeMultiple
			fraction := float64(duration%stepSize) / float64(stepSize)
			return index, fraction
		}
	}

	// If we haven't found the index, return the last one.
	return DistributionTrackerTotalBuckets - 1, 1
}

// addDecay will decay the data in the distribution.
func (d *Distribution) addDecay() {
	d.Decay(func(decay float64) {
		d.total = 0
		d.expectedDurationNumerator = 0
		for i := 0; i < len(d.timings); i++ {
			d.timings[i] = d.timings[i] * decay
			d.total += d.timings[i]
			d.expectedDurationNumerator += d.timings[i] * float64(DistributionDurationForBucketIndex(i))
		}
	})
}

// AddDataPoint will add a sampled time to the distribution, performing a
// decay operation if needed.
func (d *Distribution) AddDataPoint(dur time.Duration) {
	// Check for negative inputs.
	if dur < 0 {
		build.Critical("cannot call AddDataPoint with negative duration")
		return
	}
	d.addDecay()

	// Determine which bucket to add this datapoint to.
	index, _ := indexForDuration(dur)

	// Add the datapoint.
	d.setTiming(index, d.timings[index]+1)
}
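// A sketch of the index/fraction semantics described above (hypothetical
// helper, not part of the original API): 6ms falls in bucket 1, which covers
// [4ms, 8ms), and sits halfway through it.
func exampleIndexForDuration() {
	index, fraction := indexForDuration(6 * time.Millisecond)
	fmt.Println(index, fraction) // 1 0.5
}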
// ChanceAfter returns the chance we find a data point after the given
// duration.
func (d *Distribution) ChanceAfter(dur time.Duration) float64 {
	// Check for negative inputs.
	if dur < 0 {
		build.Critical("cannot call ChanceAfter with negative duration")
		return 0
	}

	// Get the total data points. If no data was collected we return 0.
	total := d.DataPoints()
	if total == 0 {
		return 0
	}

	// Get the amount of data points up until the bucket index.
	count := float64(0)
	index, fraction := indexForDuration(dur)
	for i := 0; i < index; i++ {
		count += d.timings[i]
	}

	// Add the fraction of the data points in the bucket at index.
	count += fraction * d.timings[index]

	// Calculate the chance.
	chance := count / total
	return chance
}

// ChancesAfter returns an array of chances, every entry represents the chance
// we find a data point after the duration that corresponds with the bucket at
// the index of the entry.
func (d *Distribution) ChancesAfter() Chances {
	var chances Chances

	// Get the total data points. If no data was collected we return the
	// zero-valued chances.
	total := d.DataPoints()
	if total == 0 {
		return chances
	}

	// Loop over every bucket once and calculate the chance at that bucket.
	count := float64(0)
	for i := 0; i < DistributionTrackerTotalBuckets; i++ {
		chances[i] = count / total
		count += d.timings[i]
	}

	return chances
}

// Clone returns a deep copy of the distribution.
func (d *Distribution) Clone() Distribution {
	c := Distribution{
		GenericDecay:              d.GenericDecay.Clone(),
		total:                     d.total,
		expectedDurationNumerator: d.expectedDurationNumerator,
		timings:                   d.timings,
	}

	// Sanity check using the reflect package, only executed in testing.
	if build.Release == "testing" {
		if !reflect.DeepEqual(*d, c) {
			build.Critical("cloned distribution not equal")
		}
	}

	return c
}

// DataPoints returns the total number of data points contained within the
// distribution.
func (d *Distribution) DataPoints() float64 {
	// Decay is not applied automatically. If it has been a while since the
	// last datapoint was added, decay should be applied so that the rates are
	// correct.
	d.addDecay()
	return d.total
}

// DurationForIndex converts the index of a bucket into a duration.
func (d *Distribution) DurationForIndex(index int) time.Duration {
	return DistributionDurationForBucketIndex(index)
}

// ExpectedDuration returns the estimated duration based upon the current
// distribution.
func (d Distribution) ExpectedDuration() time.Duration {
	// Get the total data points.
	total := d.DataPoints()
	if total == 0 {
		// No data collected, just return the worst case.
		return DistributionDurationForBucketIndex(len(d.timings) - 1)
	}

	// Across all buckets, multiply the pct chance times the bucket's duration.
	// The sum is the expected duration.
	return time.Duration(d.expectedDurationNumerator / total)
}
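// A usage sketch (hypothetical, not part of the original API): a single 10ms
// data point lands in bucket 2, whose duration is 8ms, so the expected
// duration collapses to that bucket's duration.
func exampleExpectedDuration() {
	d := NewDistribution(time.Hour)
	d.AddDataPoint(10 * time.Millisecond)
	fmt.Println(d.ExpectedDuration()) // 8ms
}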
// MergeWith merges the given distribution according to a certain weight.
func (d *Distribution) MergeWith(other *Distribution, weight float64) {
	// Validate the given distribution.
	if d.staticHalfLife != other.staticHalfLife {
		build.Critical(fmt.Sprintf("only distributions with equal half lives should be merged, %v != %v", d.staticHalfLife, other.staticHalfLife))
		return
	}

	// Validate the weight.
	if weight <= 0 || weight > 1 {
		build.Critical(fmt.Sprintf("unexpected weight %v", weight))
		return
	}

	// Loop over all of the other distribution's timings and add them, taking
	// into account the given weight.
	d.total = 0
	d.expectedDurationNumerator = 0
	for bi, b := range other.timings {
		d.timings[bi] += b * weight
		d.total += d.timings[bi]
		d.expectedDurationNumerator += d.timings[bi] * float64(DistributionDurationForBucketIndex(bi))
	}
}

// NumBuckets returns the total number of buckets in the distribution.
func (d *Distribution) NumBuckets() int {
	return len(d.timings)
}

// PStat returns the smallest bucket duration for which at least a fraction p
// of all data points fall at or below it. P must be greater than 0 and less
// than 1.
//
// A bad input will return 0.
func (d *Distribution) PStat(p float64) time.Duration {
	// Check for an error value.
	if p <= 0 || p >= 1 {
		build.Critical("PStat needs to be called with a value inside of the range 0 to 1, used:", p)
		return 0
	}

	// Get the total.
	total := d.DataPoints()
	if total == 0 {
		// No data collected, just return the worst case.
		return DistributionDurationForBucketIndex(DistributionTrackerTotalBuckets - 1)
	}

	// Count up until we reach p.
	var run float64
	var index int
	for run/total < p && index < DistributionTrackerTotalBuckets-1 {
		run += d.timings[index]
		index++
	}

	// Convert the index into a duration.
	return DistributionDurationForBucketIndex(index)
}

// shift returns the information required for a shifting of the distribution
// without actually shifting. Note that when the returned index is 0 there are
// no preceding buckets to smear over, so callers must not use the returned
// smear in that case.
func (d *Distribution) shift(dur time.Duration) (index int, keep float64, smear float64) {
	// Check for negative inputs.
	if dur < 0 {
		build.Critical("cannot call Shift with negative duration")
		return
	}

	// Get the value at the index.
	index, fraction := indexForDuration(dur)
	value := d.timings[index]

	// Calculate the fraction of the value we want to keep.
	keep = (1 - fraction) * value

	// Calculate the remainder and smear it over all buckets up until we reach
	// the index.
	remainder := fraction * value
	smear = remainder / float64(index)
	return
}
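// A usage sketch (hypothetical, not part of the original API): with nine 10ms
// data points and one 100ms outlier, the p50 sits just past the 10ms bucket
// while the p99 reflects the outlier's bucket.
func examplePStat() {
	d := NewDistribution(time.Hour)
	for i := 0; i < 9; i++ {
		d.AddDataPoint(10 * time.Millisecond)
	}
	d.AddDataPoint(100 * time.Millisecond)
	fmt.Println(d.PStat(0.5))  // 12ms
	fmt.Println(d.PStat(0.99)) // 104ms
}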
// ExpectedDurationWithShift is similar to ExpectedDuration but it assumes a
// shift before computing the expected value. The shift is not applied though.
// That makes it faster than cloning the distribution, shifting it and then
// calling ExpectedDuration on the copy.
func (d *Distribution) ExpectedDurationWithShift(dur time.Duration) time.Duration {
	index, keep, smear := d.shift(dur)

	var total float64
	var durationNumerator float64

	// Everything before the index would be equal to the smear.
	for i := 0; i < index && smear > 0; i++ {
		total += smear
		durationNumerator += smear * float64(DistributionDurationForBucketIndex(i))
	}

	// At the index, the value is 'keep'.
	total += keep
	durationNumerator += keep * float64(DistributionDurationForBucketIndex(index))

	// After the index we got the same values as before.
	for i := index + 1; i < len(d.timings); i++ {
		total += d.timings[i]
		durationNumerator += d.timings[i] * float64(DistributionDurationForBucketIndex(i))
	}
	// No data collected, just return the worst case.
	if total == 0 {
		return DistributionDurationForBucketIndex(DistributionTrackerTotalBuckets - 1)
	}
	return time.Duration(durationNumerator / total)
}

// Shift shifts the distribution by a certain duration. The shift operation
// will essentially ignore all data points up until the duration with which
// we're shifting. If that duration does not perfectly align with the
// distribution's buckets, we smear the fractional value over the buckets
// preceding the bucket that corresponds with the given duration.
func (d *Distribution) Shift(dur time.Duration) {
	index, keep, smear := d.shift(dur)

	// Set the timing at 'index' to keep.
	d.setTiming(index, keep)
	// If we're at index 0 we are done because there are no buckets preceding
	// it.
	if index == 0 {
		return
	}
	// Otherwise set the smear.
	for i := 0; i < index; i++ {
		d.setTiming(i, smear)
	}
}

// HalfLife returns this distribution's half life.
func (d *Distribution) HalfLife() time.Duration {
	return d.staticHalfLife
}

// AddDataPoint will add a data point to each of the distributions in the
// tracker.
func (dt *DistributionTracker) AddDataPoint(dur time.Duration) {
	dt.mu.Lock()
	defer dt.mu.Unlock()

	for _, tr := range dt.distributions {
		tr.AddDataPoint(dur)
	}
}

// Load loads the buckets of a PersistedDistributionTracker into the tracker
// that this method is called on, overwriting the buckets in the process.
func (dt *DistributionTracker) Load(tracker PersistedDistributionTracker) error {
	dt.mu.Lock()
	defer dt.mu.Unlock()
	if len(dt.distributions) != len(tracker.Distributions) {
		return fmt.Errorf("failed to load distribution tracker - number of persisted distributions doesn't match the expectations: %v != %v", len(dt.distributions), len(tracker.Distributions))
	}
	for i := range tracker.Distributions {
		for j := range dt.distributions[i].timings {
			dt.distributions[i].setTiming(j, tracker.Distributions[i].Timings[j])
		}
	}
	return nil
}

// Percentiles returns the percentiles for 5 timings for each distribution in
// the tracker:
//   - the p50
//   - the p90
//   - the p99
//   - the p999
//   - the p9999
func (dt *DistributionTracker) Percentiles() [][]time.Duration {
	dt.mu.Lock()
	defer dt.mu.Unlock()

	timings := make([][]time.Duration, len(dt.distributions))
	for i := 0; i < len(timings); i++ {
		timings[i] = make([]time.Duration, 5)
		timings[i][0] = dt.distributions[i].PStat(.5)
		timings[i][1] = dt.distributions[i].PStat(.9)
		timings[i][2] = dt.distributions[i].PStat(.99)
		timings[i][3] = dt.distributions[i].PStat(.999)
		timings[i][4] = dt.distributions[i].PStat(.9999)
	}
	return timings
}
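// A sketch of the smearing behavior described above (hypothetical helper, not
// part of the original API): shifting by a duration that reaches halfway into
// bucket 2 keeps half of that bucket's value and smears the other half evenly
// over buckets 0 and 1.
func exampleShift() {
	d := NewDistribution(time.Hour)
	d.AddDataPoint(10 * time.Millisecond) // lands in bucket 2
	d.Shift(10 * time.Millisecond)        // fraction 0.5 of bucket 2
	fmt.Println(d.timings[0], d.timings[1], d.timings[2]) // 0.25 0.25 0.5
}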
// DataPoints returns the total number of items represented in each
// distribution.
func (dt *DistributionTracker) DataPoints() []float64 {
	dt.mu.Lock()
	defer dt.mu.Unlock()

	var totals []float64
	for _, d := range dt.distributions {
		totals = append(totals, d.DataPoints())
	}
	return totals
}

// Distribution returns a clone of the distribution at the requested index. If
// the given index is out of bounds, build.Critical is called and the
// distribution at index 0 is returned instead.
func (dt *DistributionTracker) Distribution(index int) Distribution {
	dt.mu.Lock()
	defer dt.mu.Unlock()

	if index < 0 || index >= len(dt.distributions) {
		build.Critical("unexpected distribution index")
		index = 0
	}
	return dt.distributions[index].Clone()
}

// Stats returns a full suite of statistics about the distributions in the
// tracker.
func (dt *DistributionTracker) Stats() *DistributionTrackerStats {
	return &DistributionTrackerStats{
		Percentiles: dt.Percentiles(),
		DataPoints:  dt.DataPoints(),
	}
}

// NewDistribution will create a distribution with the provided half life.
func NewDistribution(halfLife time.Duration) *Distribution {
	return &Distribution{
		GenericDecay: NewDecay(halfLife),
	}
}

// NewDistributionTrackerStandard returns a standard distribution tracker,
// which tracks data points over distributions with half lives of 15 minutes,
// 24 hours, and 30 days.
func NewDistributionTrackerStandard() *DistributionTracker {
	return &DistributionTracker{
		distributions: []*Distribution{
			NewDistribution(15 * time.Minute),
			NewDistribution(24 * time.Hour),
			NewDistribution(30 * 24 * time.Hour),
		},
	}
}
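// An end-to-end usage sketch (hypothetical, not part of the original API):
// record a few timings against the standard tracker and read back the
// aggregate stats. Every data point feeds all three half-life distributions.
func exampleDistributionTrackerUsage() {
	dt := NewDistributionTrackerStandard()
	dt.AddDataPoint(25 * time.Millisecond)
	dt.AddDataPoint(50 * time.Millisecond)
	dt.AddDataPoint(2 * time.Second)

	stats := dt.Stats()
	fmt.Println(stats.DataPoints)  // one total per half life, each ≈ 3
	fmt.Println(stats.Percentiles) // 5 percentiles per distribution
}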