github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/randvar/zipf.go (about) 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 // implied. See the License for the specific language governing 13 // permissions and limitations under the License. See the AUTHORS file 14 // for names of contributors. 15 // 16 // ZipfGenerator implements the Incrementing Zipfian Random Number Generator from 17 // [1]: "Quickly Generating Billion-Record Synthetic Databases" 18 // by Gray, Sundaresan, Englert, Baclawski, and Weinberger, SIGMOD 1994. 19 20 package randvar 21 22 import ( 23 "math" 24 "sync" 25 26 "github.com/cockroachdb/errors" 27 "golang.org/x/exp/rand" 28 ) 29 30 const ( 31 // See https://github.com/brianfrankcooper/YCSB/blob/f886c1e7988f8f4965cb88a1fe2f6bad2c61b56d/core/src/main/java/com/yahoo/ycsb/generator/ScrambledZipfianGenerator.java#L33-L35 32 defaultMax = 10000000000 33 defaultTheta = 0.99 34 defaultZetaN = 26.46902820178302 35 ) 36 37 // Zipf is a random number generator that generates random numbers from a Zipf 38 // distribution. Unlike rand.Zipf, this generator supports incrementing the max 39 // parameter without performing an expensive recomputation of the underlying 40 // hidden parameters, which is a pattern used in [1] for efficiently generating 41 // large volumes of Zipf-distributed records for synthetic data. Second, 42 // rand.Zipf only supports theta <= 1, we suppose all values of theta. 43 type Zipf struct { 44 // Supplied constants. 45 theta float64 46 min uint64 47 // Internally computed constants. 48 alpha, zeta2 float64 49 halfPowTheta float64 50 // Mutable state. 51 mu struct { 52 sync.RWMutex 53 max uint64 54 eta float64 55 zetaN float64 56 } 57 } 58 59 // NewDefaultZipf constructs a new Zipf generator with the default parameters. 60 func NewDefaultZipf() (*Zipf, error) { 61 return NewZipf(1, defaultMax, defaultTheta) 62 } 63 64 // NewZipf constructs a new Zipf generator with the given parameters. Returns 65 // an error if the parameters are outside the accepted range. 66 func NewZipf(min, max uint64, theta float64) (*Zipf, error) { 67 if min > max { 68 return nil, errors.Errorf("min %d > max %d", errors.Safe(min), errors.Safe(max)) 69 } 70 if theta < 0.0 || theta == 1.0 { 71 return nil, errors.New("0 < theta, and theta != 1") 72 } 73 74 z := &Zipf{ 75 min: min, 76 theta: theta, 77 } 78 z.mu.max = max 79 80 // Compute hidden parameters. 81 z.zeta2 = computeZetaFromScratch(2, theta) 82 z.halfPowTheta = 1.0 + math.Pow(0.5, z.theta) 83 z.mu.zetaN = computeZetaFromScratch(max+1-min, theta) 84 z.alpha = 1.0 / (1.0 - theta) 85 z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-theta)) / (1.0 - z.zeta2/z.mu.zetaN) 86 return z, nil 87 } 88 89 // computeZetaIncrementally recomputes zeta(max, theta), assuming that sum = 90 // zeta(oldMax, theta). Returns zeta(max, theta), computed incrementally. 91 func computeZetaIncrementally(oldMax, max uint64, theta float64, sum float64) float64 { 92 if max < oldMax { 93 panic("unable to decrement max!") 94 } 95 for i := oldMax + 1; i <= max; i++ { 96 sum += 1.0 / math.Pow(float64(i), theta) 97 } 98 return sum 99 } 100 101 // The function zeta computes the value 102 // zeta(n, theta) = (1/1)^theta + (1/2)^theta + (1/3)^theta + ... + (1/n)^theta 103 func computeZetaFromScratch(n uint64, theta float64) float64 { 104 if n == defaultMax && theta == defaultTheta { 105 // Precomputed value, borrowed from ScrambledZipfianGenerator.java. This is 106 // quite slow to calculate from scratch due to the large n value. 107 return defaultZetaN 108 } 109 return computeZetaIncrementally(0, n, theta, 0.0) 110 } 111 112 // IncMax increments max and recomputes the internal values that depend on 113 // it. Returns an error if the recomputation failed. 114 func (z *Zipf) IncMax(delta int) { 115 z.mu.Lock() 116 oldMax := z.mu.max 117 z.mu.max += uint64(delta) 118 z.mu.zetaN = computeZetaIncrementally(oldMax+1-z.min, z.mu.max+1-z.min, z.theta, z.mu.zetaN) 119 z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-z.theta)) / (1.0 - z.zeta2/z.mu.zetaN) 120 z.mu.Unlock() 121 } 122 123 // Max returns the max. 124 func (z *Zipf) Max() uint64 { 125 z.mu.Lock() 126 defer z.mu.Unlock() 127 return z.mu.max 128 } 129 130 // Uint64 draws a new value between min and max, with probabilities according 131 // to the Zipf distribution. 132 func (z *Zipf) Uint64(rng *rand.Rand) uint64 { 133 u := rng.Float64() 134 z.mu.RLock() 135 uz := u * z.mu.zetaN 136 var result uint64 137 if uz < 1.0 { 138 result = z.min 139 } else if uz < z.halfPowTheta { 140 result = z.min + 1 141 } else { 142 spread := float64(z.mu.max + 1 - z.min) 143 result = z.min + uint64(spread*math.Pow(z.mu.eta*u-z.mu.eta+1.0, z.alpha)) 144 } 145 z.mu.RUnlock() 146 return result 147 }