github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/randvar/zipf.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
    12  // implied. See the License for the specific language governing
    13  // permissions and limitations under the License. See the AUTHORS file
    14  // for names of contributors.
    15  //
    16  // ZipfGenerator implements the Incrementing Zipfian Random Number Generator from
    17  // [1]: "Quickly Generating Billion-Record Synthetic Databases"
    18  // by Gray, Sundaresan, Englert, Baclawski, and Weinberger, SIGMOD 1994.
    19  
    20  package randvar
    21  
    22  import (
    23  	"fmt"
    24  	"math"
    25  	"sync"
    26  
    27  	"golang.org/x/exp/rand"
    28  )
    29  
const (
	// Default parameters passed to NewZipf by NewDefaultZipf.
	// See https://github.com/brianfrankcooper/YCSB/blob/f886c1e7988f8f4965cb88a1fe2f6bad2c61b56d/core/src/main/java/com/yahoo/ycsb/generator/ScrambledZipfianGenerator.java#L33-L35
	defaultMax   = 10000000000
	defaultTheta = 0.99
	// defaultZetaN is the precomputed zeta value that computeZetaFromScratch
	// returns when called with n == defaultMax and theta == defaultTheta,
	// instead of summing defaultMax terms.
	defaultZetaN = 26.46902820178302
)
    36  
// Zipf is a random number generator that generates random numbers from a Zipf
// distribution. Unlike rand.Zipf, this generator supports incrementing the max
// parameter without performing an expensive recomputation of the underlying
// hidden parameters, which is a pattern used in [1] for efficiently generating
// large volumes of Zipf-distributed records for synthetic data. Second, this
// generator accepts any theta >= 0 other than 1 (see NewZipf).
//
// NOTE(review): the previous wording said "rand.Zipf only supports
// theta <= 1"; the standard library's NewZipf documents s > 1 as its
// requirement — confirm which restriction was meant.
type Zipf struct {
	// Supplied constants.
	// theta is the skew parameter passed to NewZipf.
	theta float64
	// min is the smallest value Uint64 returns.
	min uint64
	// Internally computed constants.
	// alpha is 1/(1-theta); zeta2 is zeta(2, theta).
	alpha, zeta2 float64
	// halfPowTheta is 1 + 0.5^theta, the threshold below which Uint64
	// returns min+1.
	halfPowTheta float64
	// Mutable state. The embedded mutex is held by IncMax and Uint64 while
	// they access the fields below.
	mu struct {
		sync.Mutex
		// rng is the source of the uniform float64 draws used by Uint64.
		rng *rand.Rand
		// max is the current upper bound; IncMax raises it.
		max uint64
		// eta and zetaN depend on max and are recomputed by IncMax.
		eta   float64
		zetaN float64
	}
}
    59  
    60  // NewDefaultZipf constructs a new Zipf generator with the default parameters.
    61  func NewDefaultZipf(rng *rand.Rand) (*Zipf, error) {
    62  	return NewZipf(rng, 1, defaultMax, defaultTheta)
    63  }
    64  
    65  // NewZipf constructs a new Zipf generator with the given parameters.  Returns
    66  // an error if the parameters are outside the accepted range.
    67  func NewZipf(rng *rand.Rand, min, max uint64, theta float64) (*Zipf, error) {
    68  	if min > max {
    69  		return nil, fmt.Errorf("min %d > max %d", min, max)
    70  	}
    71  	if theta < 0.0 || theta == 1.0 {
    72  		return nil, fmt.Errorf("0 < theta, and theta != 1")
    73  	}
    74  
    75  	z := &Zipf{
    76  		min:   min,
    77  		theta: theta,
    78  	}
    79  	z.mu.rng = ensureRand(rng)
    80  	z.mu.max = max
    81  
    82  	// Compute hidden parameters.
    83  	z.zeta2 = computeZetaFromScratch(2, theta)
    84  	z.halfPowTheta = 1.0 + math.Pow(0.5, z.theta)
    85  	z.mu.zetaN = computeZetaFromScratch(max+1-min, theta)
    86  	z.alpha = 1.0 / (1.0 - theta)
    87  	z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-theta)) / (1.0 - z.zeta2/z.mu.zetaN)
    88  	return z, nil
    89  }
    90  
    91  // computeZetaIncrementally recomputes zeta(max, theta), assuming that sum =
    92  // zeta(oldMax, theta). Returns zeta(max, theta), computed incrementally.
    93  func computeZetaIncrementally(oldMax, max uint64, theta float64, sum float64) float64 {
    94  	if max < oldMax {
    95  		panic("unable to decrement max!")
    96  	}
    97  	for i := oldMax + 1; i <= max; i++ {
    98  		sum += 1.0 / math.Pow(float64(i), theta)
    99  	}
   100  	return sum
   101  }
   102  
   103  // The function zeta computes the value
   104  // zeta(n, theta) = (1/1)^theta + (1/2)^theta + (1/3)^theta + ... + (1/n)^theta
   105  func computeZetaFromScratch(n uint64, theta float64) float64 {
   106  	if n == defaultMax && theta == defaultTheta {
   107  		// Precomputed value, borrowed from ScrambledZipfianGenerator.java. This is
   108  		// quite slow to calculate from scratch due to the large n value.
   109  		return defaultZetaN
   110  	}
   111  	return computeZetaIncrementally(0, n, theta, 0.0)
   112  }
   113  
   114  // IncMax increments max and recomputes the internal values that depend on
   115  // it. Returns an error if the recomputation failed.
   116  func (z *Zipf) IncMax(delta int) {
   117  	z.mu.Lock()
   118  	oldMax := z.mu.max
   119  	z.mu.max += uint64(delta)
   120  	z.mu.zetaN = computeZetaIncrementally(oldMax+1-z.min, z.mu.max+1-z.min, z.theta, z.mu.zetaN)
   121  	z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-z.theta)) / (1.0 - z.zeta2/z.mu.zetaN)
   122  	z.mu.Unlock()
   123  }
   124  
   125  // Uint64 draws a new value between min and max, with probabilities according
   126  // to the Zipf distribution.
   127  func (z *Zipf) Uint64() uint64 {
   128  	z.mu.Lock()
   129  	u := z.mu.rng.Float64()
   130  	uz := u * z.mu.zetaN
   131  	var result uint64
   132  	if uz < 1.0 {
   133  		result = z.min
   134  	} else if uz < z.halfPowTheta {
   135  		result = z.min + 1
   136  	} else {
   137  		spread := float64(z.mu.max + 1 - z.min)
   138  		result = z.min + uint64(spread*math.Pow(z.mu.eta*u-z.mu.eta+1.0, z.alpha))
   139  	}
   140  	z.mu.Unlock()
   141  	return result
   142  }