github.com/andy2046/gopie@v0.7.0/pkg/hyperloglog/hyperloglog.go (about)

     1  // Package hyperloglog implements HyperLogLog cardinality estimation.
     2  package hyperloglog
     3  
     4  import (
     5  	"errors"
     6  	"hash"
     7  	"hash/fnv"
     8  	"math"
     9  )
    10  
// HyperLogLog is a probabilistic data structure for cardinality estimation.
// Values are created with New or NewGuess.
type HyperLogLog struct {
	registers []uint8     // registers bucket; Add and Merge only ever raise a register value
	m         uint        // number of registers
	b         uint32      // number of bits to find registers bucket number (top bits of the hash)
	alpha     float64     // bias-correction constant, derived from m by calculateAlpha
	hash      hash.Hash32 // hash function; New installs fnv.New32, SetHash replaces it
}
    19  
    20  const (
    21  	exp32    float64 = 4294967296
    22  	negexp32 float64 = -4294967296
    23  	alpha16  float64 = 0.673
    24  	alpha32  float64 = 0.697
    25  	alpha64  float64 = 0.709
    26  )
    27  
    28  // New creates a new HyperLogLog with `m` registers bucket.
    29  // `m` should be a power of two.
    30  func New(m uint) (*HyperLogLog, error) {
    31  	if (m & (m - 1)) != 0 {
    32  		m = adjustM(m)
    33  	}
    34  
    35  	return &HyperLogLog{
    36  		registers: make([]uint8, m),
    37  		m:         m,
    38  		b:         uint32(math.Ceil(math.Log2(float64(m)))),
    39  		alpha:     calculateAlpha(m),
    40  		hash:      fnv.New32(),
    41  	}, nil
    42  }
    43  
    44  // NewGuess creates a new HyperLogLog within the given standard error.
    45  func NewGuess(stdErr float64) (*HyperLogLog, error) {
    46  	m := math.Pow(1.04/stdErr, 2)
    47  	return New(uint(math.Pow(2, math.Ceil(math.Log2(m)))))
    48  }
    49  
    50  // Add adds the data to the set.
    51  func (h *HyperLogLog) Add(data []byte) {
    52  	var (
    53  		hash = h.calculateHash(data)
    54  		k    = 32 - h.b
    55  		r    = calculateConsecutiveZeros(hash, k)
    56  		j    = hash >> uint(k)
    57  	)
    58  
    59  	if r > h.registers[j] {
    60  		h.registers[j] = r
    61  	}
    62  }
    63  
    64  // Count returns the estimated cardinality of the set.
    65  func (h *HyperLogLog) Count() uint64 {
    66  	sum, m := 0.0, float64(h.m)
    67  	for _, rv := range h.registers {
    68  		sum += 1.0 / math.Pow(2.0, float64(rv))
    69  	}
    70  	estimate := h.alpha * m * m / sum
    71  	if estimate <= 5.0/2.0*m {
    72  		// Small range correction
    73  		v := 0
    74  		for _, r := range h.registers {
    75  			if r == 0 {
    76  				v++
    77  			}
    78  		}
    79  		if v > 0 {
    80  			estimate = m * math.Log(m/float64(v))
    81  		}
    82  	} else if estimate > 1.0/30.0*exp32 {
    83  		// Large range correction
    84  		estimate = negexp32 * math.Log(1-estimate/exp32)
    85  	}
    86  	return uint64(estimate)
    87  }
    88  
    89  // Merge combines the HyperLogLog with the other.
    90  func (h *HyperLogLog) Merge(other *HyperLogLog) error {
    91  	if h.m != other.m {
    92  		return errors.New("registers bucket number must match")
    93  	}
    94  
    95  	for j, r := range other.registers {
    96  		if r > h.registers[j] {
    97  			h.registers[j] = r
    98  		}
    99  	}
   100  
   101  	return nil
   102  }
   103  
   104  // Reset restores the HyperLogLog to its original state.
   105  func (h *HyperLogLog) Reset() {
   106  	h.registers = make([]uint8, h.m)
   107  }
   108  
   109  // SetHash sets the hashing function.
   110  func (h *HyperLogLog) SetHash(hasher hash.Hash32) {
   111  	h.hash = hasher
   112  }
   113  
   114  func (h *HyperLogLog) calculateHash(data []byte) uint32 {
   115  	h.hash.Reset()
   116  	h.hash.Write(data)
   117  	sum := h.hash.Sum32()
   118  	return sum
   119  }
   120  
   121  func calculateAlpha(m uint) float64 {
   122  	var a float64
   123  	switch m {
   124  	case 16:
   125  		a = alpha16
   126  	case 32:
   127  		a = alpha32
   128  	case 64:
   129  		a = alpha64
   130  	default:
   131  		a = 0.7213 / (1.0 + 1.079/float64(m))
   132  	}
   133  	return a
   134  }
   135  
   136  // calculateConsecutiveZeros calculates the position of the rightmost 1-bit.
   137  func calculateConsecutiveZeros(val, max uint32) uint8 {
   138  	r := uint32(1)
   139  	for val&1 == 0 && r <= max {
   140  		r++
   141  		val >>= 1
   142  	}
   143  	return uint8(r)
   144  }
   145  
   146  func adjustM(x uint) uint {
   147  	m := uint(1)
   148  	for m < x {
   149  		m <<= 1
   150  	}
   151  	return m
   152  }