github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/hll/v2/hll.go (about) 1 // Package hll provides an implementation of the HyperLogLog algorithm. 2 // 3 // A HyperLogLog counter can approximate the cardinality of a set with high 4 // accuracy and little memory. 5 // 6 // # Accuracy 7 // 8 // Average error for 1,000,000,000 elements for different values of logSize: 9 // 10 // logSize average error % 11 // 4 21 12 // 5 12 13 // 6 10 14 // 7 8.1 15 // 8 4.8 16 // 9 3.6 17 // 10 1.9 18 // 11 1.2 19 // 12 1.0 20 // 13 0.7 21 // 14 0.5 22 // 15 0.33 23 // 16 0.25 24 // 25 // # Citation 26 // 27 // Flajolet, Philippe; Fusy, Éric; Gandouet, Olivier; Meunier, Frédéric (2007). 28 // "Hyperloglog: The analysis of a near-optimal cardinality estimation 29 // algorithm". Discrete Mathematics and Theoretical Computer Science 30 // Proceedings. 31 package hll 32 33 import ( 34 "fmt" 35 "math" 36 ) 37 38 // An HLL is a HyperLogLog counter for arbitrary values. 39 type HLL[T any] struct { 40 counters []byte 41 h func(T) uint64 42 nbits int 43 m int 44 mask uint64 45 } 46 47 // New creates a new HyperLogLog counter. 48 // The counter will use 2^logSize bytes. 49 // h is the hash function to use for added values. 50 func New[T any](logSize int, h func(T) uint64) *HLL[T] { 51 if logSize < 4 { 52 panic(fmt.Sprintf("logSize=%v, should be at least 4", logSize)) 53 } 54 m := 1 << logSize 55 return &HLL[T]{ 56 counters: make([]byte, m), 57 h: h, 58 nbits: logSize, 59 m: m, 60 mask: uint64(m - 1), 61 } 62 } 63 64 // Add adds v to the counter. Calls hash once. 65 func (h *HLL[T]) Add(t T) { 66 hash := h.h(t) 67 idx := hash & h.mask 68 fp := hash >> h.nbits 69 z := byte(h.nzeros(fp)) + 1 70 if z > h.counters[idx] { 71 h.counters[idx] = z 72 } 73 } 74 75 // ApproxCount returns the current approximate count. 76 // Does not alter the state of the counter. 77 func (h *HLL[T]) ApproxCount() int { 78 z := 0.0 79 for _, v := range h.counters { 80 z += math.Pow(2, -float64(v)) 81 } 82 z = 1.0 / z 83 fm := float64(h.m) 84 result := int(h.alpha() * fm * fm * z) 85 86 if result < h.m*5/2 { 87 zeros := 0 88 for _, v := range h.counters { 89 if v == 0 { 90 zeros++ 91 } 92 } 93 // If some registers are zero, use linear counting. 94 if zeros > 0 { 95 result = int(fm * math.Log(fm/float64(zeros))) 96 } 97 } 98 99 return result 100 } 101 102 // Returns the alpha value to use depending on m. 103 func (h *HLL[T]) alpha() float64 { 104 switch h.m { 105 case 16: 106 return 0.673 107 case 32: 108 return 0.697 109 case 64: 110 return 0.709 111 } 112 return 0.7213 / (1 + 1.079/float64(h.m)) 113 } 114 115 // nzeros counts the number of zeros on the right side of a binary number. 116 func (h *HLL[T]) nzeros(a uint64) int { 117 if a == 0 { 118 return 64 - h.nbits // Number of bits after using the first nbits. 119 } 120 n := 0 121 for a&1 == 0 { 122 n++ 123 a /= 2 124 } 125 return n 126 } 127 128 // AddHLL adds the state of another counter to h, 129 // assuming they use the same hash function. 130 // The result is equivalent to adding all the values of other to h. 131 func (h *HLL[T]) AddHLL(other *HLL[T]) { 132 if len(h.counters) != len(other.counters) { 133 panic("merging HLLs with different sizes") 134 } 135 for i, b := range other.counters { 136 if h.counters[i] < b { 137 h.counters[i] = b 138 } 139 } 140 }