github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/randvar/zipf.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
    12  // implied. See the License for the specific language governing
    13  // permissions and limitations under the License. See the AUTHORS file
    14  // for names of contributors.
    15  //
    16  // ZipfGenerator implements the Incrementing Zipfian Random Number Generator from
    17  // [1]: "Quickly Generating Billion-Record Synthetic Databases"
    18  // by Gray, Sundaresan, Englert, Baclawski, and Weinberger, SIGMOD 1994.
    19  
    20  package randvar
    21  
    22  import (
    23  	"math"
    24  	"sync"
    25  
    26  	"github.com/cockroachdb/errors"
    27  	"golang.org/x/exp/rand"
    28  )
    29  
    30  const (
    31  	// See https://github.com/brianfrankcooper/YCSB/blob/f886c1e7988f8f4965cb88a1fe2f6bad2c61b56d/core/src/main/java/com/yahoo/ycsb/generator/ScrambledZipfianGenerator.java#L33-L35
    32  	defaultMax   = 10000000000
    33  	defaultTheta = 0.99
    34  	defaultZetaN = 26.46902820178302
    35  )
    36  
    37  // Zipf is a random number generator that generates random numbers from a Zipf
    38  // distribution. Unlike rand.Zipf, this generator supports incrementing the max
    39  // parameter without performing an expensive recomputation of the underlying
    40  // hidden parameters, which is a pattern used in [1] for efficiently generating
    41  // large volumes of Zipf-distributed records for synthetic data. Second,
    42  // rand.Zipf only supports theta <= 1, we suppose all values of theta.
    43  type Zipf struct {
    44  	// Supplied constants.
    45  	theta float64
    46  	min   uint64
    47  	// Internally computed constants.
    48  	alpha, zeta2 float64
    49  	halfPowTheta float64
    50  	// Mutable state.
    51  	mu struct {
    52  		sync.RWMutex
    53  		max   uint64
    54  		eta   float64
    55  		zetaN float64
    56  	}
    57  }
    58  
    59  // NewDefaultZipf constructs a new Zipf generator with the default parameters.
    60  func NewDefaultZipf() (*Zipf, error) {
    61  	return NewZipf(1, defaultMax, defaultTheta)
    62  }
    63  
    64  // NewZipf constructs a new Zipf generator with the given parameters.  Returns
    65  // an error if the parameters are outside the accepted range.
    66  func NewZipf(min, max uint64, theta float64) (*Zipf, error) {
    67  	if min > max {
    68  		return nil, errors.Errorf("min %d > max %d", errors.Safe(min), errors.Safe(max))
    69  	}
    70  	if theta < 0.0 || theta == 1.0 {
    71  		return nil, errors.New("0 < theta, and theta != 1")
    72  	}
    73  
    74  	z := &Zipf{
    75  		min:   min,
    76  		theta: theta,
    77  	}
    78  	z.mu.max = max
    79  
    80  	// Compute hidden parameters.
    81  	z.zeta2 = computeZetaFromScratch(2, theta)
    82  	z.halfPowTheta = 1.0 + math.Pow(0.5, z.theta)
    83  	z.mu.zetaN = computeZetaFromScratch(max+1-min, theta)
    84  	z.alpha = 1.0 / (1.0 - theta)
    85  	z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-theta)) / (1.0 - z.zeta2/z.mu.zetaN)
    86  	return z, nil
    87  }
    88  
    89  // computeZetaIncrementally recomputes zeta(max, theta), assuming that sum =
    90  // zeta(oldMax, theta). Returns zeta(max, theta), computed incrementally.
    91  func computeZetaIncrementally(oldMax, max uint64, theta float64, sum float64) float64 {
    92  	if max < oldMax {
    93  		panic("unable to decrement max!")
    94  	}
    95  	for i := oldMax + 1; i <= max; i++ {
    96  		sum += 1.0 / math.Pow(float64(i), theta)
    97  	}
    98  	return sum
    99  }
   100  
   101  // The function zeta computes the value
   102  // zeta(n, theta) = (1/1)^theta + (1/2)^theta + (1/3)^theta + ... + (1/n)^theta
   103  func computeZetaFromScratch(n uint64, theta float64) float64 {
   104  	if n == defaultMax && theta == defaultTheta {
   105  		// Precomputed value, borrowed from ScrambledZipfianGenerator.java. This is
   106  		// quite slow to calculate from scratch due to the large n value.
   107  		return defaultZetaN
   108  	}
   109  	return computeZetaIncrementally(0, n, theta, 0.0)
   110  }
   111  
   112  // IncMax increments max and recomputes the internal values that depend on
   113  // it. Returns an error if the recomputation failed.
   114  func (z *Zipf) IncMax(delta int) {
   115  	z.mu.Lock()
   116  	oldMax := z.mu.max
   117  	z.mu.max += uint64(delta)
   118  	z.mu.zetaN = computeZetaIncrementally(oldMax+1-z.min, z.mu.max+1-z.min, z.theta, z.mu.zetaN)
   119  	z.mu.eta = (1 - math.Pow(2.0/float64(z.mu.max+1-z.min), 1.0-z.theta)) / (1.0 - z.zeta2/z.mu.zetaN)
   120  	z.mu.Unlock()
   121  }
   122  
   123  // Max returns the max.
   124  func (z *Zipf) Max() uint64 {
   125  	z.mu.Lock()
   126  	defer z.mu.Unlock()
   127  	return z.mu.max
   128  }
   129  
   130  // Uint64 draws a new value between min and max, with probabilities according
   131  // to the Zipf distribution.
   132  func (z *Zipf) Uint64(rng *rand.Rand) uint64 {
   133  	u := rng.Float64()
   134  	z.mu.RLock()
   135  	uz := u * z.mu.zetaN
   136  	var result uint64
   137  	if uz < 1.0 {
   138  		result = z.min
   139  	} else if uz < z.halfPowTheta {
   140  		result = z.min + 1
   141  	} else {
   142  		spread := float64(z.mu.max + 1 - z.min)
   143  		result = z.min + uint64(spread*math.Pow(z.mu.eta*u-z.mu.eta+1.0, z.alpha))
   144  	}
   145  	z.mu.RUnlock()
   146  	return result
   147  }