github.com/andy2046/gopie@v0.7.0/pkg/hyperloglog/hyperloglog.go (about) 1 // Package hyperloglog implements HyperLogLog cardinality estimation. 2 package hyperloglog 3 4 import ( 5 "errors" 6 "hash" 7 "hash/fnv" 8 "math" 9 ) 10 11 // HyperLogLog probabilistic data struct for cardinality estimation. 12 type HyperLogLog struct { 13 registers []uint8 // registers bucket 14 m uint // number of registers 15 b uint32 // number of bits to find registers bucket number 16 alpha float64 // bias-correction constant 17 hash hash.Hash32 // hash function 18 } 19 20 const ( 21 exp32 float64 = 4294967296 22 negexp32 float64 = -4294967296 23 alpha16 float64 = 0.673 24 alpha32 float64 = 0.697 25 alpha64 float64 = 0.709 26 ) 27 28 // New creates a new HyperLogLog with `m` registers bucket. 29 // `m` should be a power of two. 30 func New(m uint) (*HyperLogLog, error) { 31 if (m & (m - 1)) != 0 { 32 m = adjustM(m) 33 } 34 35 return &HyperLogLog{ 36 registers: make([]uint8, m), 37 m: m, 38 b: uint32(math.Ceil(math.Log2(float64(m)))), 39 alpha: calculateAlpha(m), 40 hash: fnv.New32(), 41 }, nil 42 } 43 44 // NewGuess creates a new HyperLogLog within the given standard error. 45 func NewGuess(stdErr float64) (*HyperLogLog, error) { 46 m := math.Pow(1.04/stdErr, 2) 47 return New(uint(math.Pow(2, math.Ceil(math.Log2(m))))) 48 } 49 50 // Add adds the data to the set. 51 func (h *HyperLogLog) Add(data []byte) { 52 var ( 53 hash = h.calculateHash(data) 54 k = 32 - h.b 55 r = calculateConsecutiveZeros(hash, k) 56 j = hash >> uint(k) 57 ) 58 59 if r > h.registers[j] { 60 h.registers[j] = r 61 } 62 } 63 64 // Count returns the estimated cardinality of the set. 65 func (h *HyperLogLog) Count() uint64 { 66 sum, m := 0.0, float64(h.m) 67 for _, rv := range h.registers { 68 sum += 1.0 / math.Pow(2.0, float64(rv)) 69 } 70 estimate := h.alpha * m * m / sum 71 if estimate <= 5.0/2.0*m { 72 // Small range correction 73 v := 0 74 for _, r := range h.registers { 75 if r == 0 { 76 v++ 77 } 78 } 79 if v > 0 { 80 estimate = m * math.Log(m/float64(v)) 81 } 82 } else if estimate > 1.0/30.0*exp32 { 83 // Large range correction 84 estimate = negexp32 * math.Log(1-estimate/exp32) 85 } 86 return uint64(estimate) 87 } 88 89 // Merge combines the HyperLogLog with the other. 90 func (h *HyperLogLog) Merge(other *HyperLogLog) error { 91 if h.m != other.m { 92 return errors.New("registers bucket number must match") 93 } 94 95 for j, r := range other.registers { 96 if r > h.registers[j] { 97 h.registers[j] = r 98 } 99 } 100 101 return nil 102 } 103 104 // Reset restores the HyperLogLog to its original state. 105 func (h *HyperLogLog) Reset() { 106 h.registers = make([]uint8, h.m) 107 } 108 109 // SetHash sets the hashing function. 110 func (h *HyperLogLog) SetHash(hasher hash.Hash32) { 111 h.hash = hasher 112 } 113 114 func (h *HyperLogLog) calculateHash(data []byte) uint32 { 115 h.hash.Reset() 116 h.hash.Write(data) 117 sum := h.hash.Sum32() 118 return sum 119 } 120 121 func calculateAlpha(m uint) float64 { 122 var a float64 123 switch m { 124 case 16: 125 a = alpha16 126 case 32: 127 a = alpha32 128 case 64: 129 a = alpha64 130 default: 131 a = 0.7213 / (1.0 + 1.079/float64(m)) 132 } 133 return a 134 } 135 136 // calculateConsecutiveZeros calculates the position of the rightmost 1-bit. 137 func calculateConsecutiveZeros(val, max uint32) uint8 { 138 r := uint32(1) 139 for val&1 == 0 && r <= max { 140 r++ 141 val >>= 1 142 } 143 return uint8(r) 144 } 145 146 func adjustM(x uint) uint { 147 m := uint(1) 148 for m < x { 149 m <<= 1 150 } 151 return m 152 }