github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/workload/ycsb/zipfgenerator.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  //
    11  // ZipfGenerator implements the Incrementing Zipfian Random Number Generator from
    12  // [1]: "Quickly Generating Billion-Record Synthetic Databases"
    13  // by Gray, Sundaresan, Englert, Baclawski, and Weinberger, SIGMOD 1994.
    14  
    15  package ycsb
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"math/rand"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    23  	"github.com/cockroachdb/errors"
    24  )
    25  
const (
	// Default generator parameters. The values are the ones referenced at
	// https://github.com/brianfrankcooper/YCSB/blob/f886c1e7988f8f4965cb88a1fe2f6bad2c61b56d/core/src/main/java/com/yahoo/ycsb/generator/ScrambledZipfianGenerator.java#L33-L35
	//
	// defaultZetaN is the precomputed zeta value that computeZetaFromScratch
	// returns when it is asked for n == defaultIMax and theta == defaultTheta,
	// instead of summing defaultIMax terms.
	defaultIMax  = 10000000000
	defaultTheta = 0.99
	defaultZetaN = 26.46902820178302
)
    32  
// ZipfGenerator is a random number generator that generates draws from a Zipf
// distribution. It differs from rand.Zipf in two ways. First, this generator
// supports incrementing the imax parameter without performing an expensive
// recomputation of the underlying hidden parameters, which is a pattern used
// in [1] for efficiently generating large volumes of Zipf-distributed records
// for synthetic data. Second, rand.Zipf requires its exponent to be greater
// than 1, whereas this generator also accepts theta in [0, 1) (see
// NewZipfGenerator for the exact validation).
type ZipfGenerator struct {
	// The underlying RNG, together with the other state that changes after
	// construction. Methods lock zipfGenMu.mu before touching it.
	zipfGenMu ZipfGeneratorMu
	// supplied values
	theta float64
	iMin  uint64
	// internally computed values: alpha = 1/(1-theta), zeta2 = zeta(2, theta)
	// and halfPowTheta = 1 + 0.5^theta. verbose is supplied by the caller; when
	// set, Uint64 prints every draw.
	alpha, zeta2, halfPowTheta float64
	verbose                    bool
}
    49  
// ZipfGeneratorMu holds variables which must be globally synced.
// ZipfGenerator's Uint64 and IncrementIMax methods lock mu before reading or
// writing the remaining fields.
type ZipfGeneratorMu struct {
	mu    syncutil.Mutex // guards the fields below
	r     *rand.Rand     // source of the uniform draws consumed by Uint64
	iMax  uint64         // current inclusive upper bound; grown by IncrementIMax
	eta   float64        // derived from iMax, iMin, theta, zeta2 and zetaN
	zetaN float64        // zeta(iMax+1-iMin, theta)
}
    58  
    59  // NewZipfGenerator constructs a new ZipfGenerator with the given parameters.
    60  // It returns an error if the parameters are outside the accepted range.
    61  func NewZipfGenerator(
    62  	rng *rand.Rand, iMin, iMax uint64, theta float64, verbose bool,
    63  ) (*ZipfGenerator, error) {
    64  	if iMin > iMax {
    65  		return nil, errors.Errorf("iMin %d > iMax %d", iMin, iMax)
    66  	}
    67  	if theta < 0.0 || theta == 1.0 {
    68  		return nil, errors.Errorf("0 < theta, and theta != 1")
    69  	}
    70  
    71  	z := ZipfGenerator{
    72  		iMin: iMin,
    73  		zipfGenMu: ZipfGeneratorMu{
    74  			r:    rng,
    75  			iMax: iMax,
    76  		},
    77  		theta:   theta,
    78  		verbose: verbose,
    79  	}
    80  	z.zipfGenMu.mu.Lock()
    81  	defer z.zipfGenMu.mu.Unlock()
    82  
    83  	// Compute hidden parameters
    84  	zeta2, err := computeZetaFromScratch(2, theta)
    85  	if err != nil {
    86  		return nil, errors.Errorf("Could not compute zeta(2,theta): %s", err)
    87  	}
    88  	var zetaN float64
    89  	zetaN, err = computeZetaFromScratch(iMax+1-iMin, theta)
    90  	if err != nil {
    91  		return nil, errors.Errorf("Could not compute zeta(%d,theta): %s", iMax, err)
    92  	}
    93  	z.alpha = 1.0 / (1.0 - theta)
    94  	z.zipfGenMu.eta = (1 - math.Pow(2.0/float64(z.zipfGenMu.iMax+1-z.iMin), 1.0-theta)) / (1.0 - zeta2/zetaN)
    95  	z.zipfGenMu.zetaN = zetaN
    96  	z.zeta2 = zeta2
    97  	z.halfPowTheta = 1.0 + math.Pow(0.5, z.theta)
    98  	return &z, nil
    99  }
   100  
   101  // computeZetaIncrementally recomputes zeta(iMax, theta), assuming that
   102  // sum = zeta(oldIMax, theta). It returns zeta(iMax, theta), computed incrementally.
   103  func computeZetaIncrementally(oldIMax, iMax uint64, theta float64, sum float64) (float64, error) {
   104  	if iMax < oldIMax {
   105  		return 0, errors.Errorf("Can't increment iMax backwards!")
   106  	}
   107  	for i := oldIMax + 1; i <= iMax; i++ {
   108  		sum += 1.0 / math.Pow(float64(i), theta)
   109  	}
   110  	return sum, nil
   111  }
   112  
   113  // The function zeta computes the value
   114  // zeta(n, theta) = (1/1)^theta + (1/2)^theta + (1/3)^theta + ... + (1/n)^theta
   115  func computeZetaFromScratch(n uint64, theta float64) (float64, error) {
   116  	if n == defaultIMax && theta == defaultTheta {
   117  		// Precomputed value, borrowed from ScrambledZipfianGenerator.java. (This is
   118  		// quite slow to calculate from scratch due to the large n value.)
   119  		return defaultZetaN, nil
   120  	}
   121  	zeta, err := computeZetaIncrementally(0, n, theta, 0.0)
   122  	if err != nil {
   123  		return zeta, errors.Errorf("could not compute zeta: %s", err)
   124  	}
   125  	return zeta, nil
   126  }
   127  
   128  // Uint64 draws a new value between iMin and iMax, with probabilities
   129  // according to the Zipf distribution.
   130  func (z *ZipfGenerator) Uint64() uint64 {
   131  	z.zipfGenMu.mu.Lock()
   132  	u := z.zipfGenMu.r.Float64()
   133  	uz := u * z.zipfGenMu.zetaN
   134  	var result uint64
   135  	if uz < 1.0 {
   136  		result = z.iMin
   137  	} else if uz < z.halfPowTheta {
   138  		result = z.iMin + 1
   139  	} else {
   140  		spread := float64(z.zipfGenMu.iMax + 1 - z.iMin)
   141  		result = z.iMin + uint64(int64(spread*math.Pow(z.zipfGenMu.eta*u-z.zipfGenMu.eta+1.0, z.alpha)))
   142  	}
   143  	if z.verbose {
   144  		fmt.Printf("Uint64[%d, %d] -> %d\n", z.iMin, z.zipfGenMu.iMax, result)
   145  	}
   146  	z.zipfGenMu.mu.Unlock()
   147  	return result
   148  }
   149  
   150  // IncrementIMax increments iMax by count and recomputes the internal values
   151  // that depend on it. It throws an error if the recomputation failed.
   152  func (z *ZipfGenerator) IncrementIMax(count uint64) error {
   153  	z.zipfGenMu.mu.Lock()
   154  	zetaN, err := computeZetaIncrementally(
   155  		z.zipfGenMu.iMax+1-z.iMin, z.zipfGenMu.iMax+count+1-z.iMin, z.theta, z.zipfGenMu.zetaN)
   156  	if err != nil {
   157  		z.zipfGenMu.mu.Unlock()
   158  		return errors.Errorf("Could not incrementally compute zeta: %s", err)
   159  	}
   160  	z.zipfGenMu.iMax += count
   161  	eta := (1 - math.Pow(2.0/float64(z.zipfGenMu.iMax+1-z.iMin), 1.0-z.theta)) / (1.0 - z.zeta2/zetaN)
   162  	z.zipfGenMu.eta = eta
   163  	z.zipfGenMu.zetaN = zetaN
   164  	z.zipfGenMu.mu.Unlock()
   165  	return nil
   166  }