github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/hll/hll.go (about) 1 // Package hll provides an implementation of the HyperLogLog algorithm. 2 // 3 // A HyperLogLog counter can approximate the cardinality of a set with high 4 // accuracy and little memory. 5 // 6 // # Accuracy 7 // 8 // Average error for 1,000,000,000 elements for different values of logSize: 9 // 10 // logSize average error % 11 // 4 21 12 // 5 12 13 // 6 10 14 // 7 8.1 15 // 8 4.8 16 // 9 3.6 17 // 10 1.9 18 // 11 1.2 19 // 12 1.0 20 // 13 0.7 21 // 14 0.5 22 // 15 0.33 23 // 16 0.25 24 // 25 // # Citation 26 // 27 // Flajolet, Philippe; Fusy, Éric; Gandouet, Olivier; Meunier, Frédéric (2007). 28 // "Hyperloglog: The analysis of a near-optimal cardinality estimation 29 // algorithm". Discrete Mathematics and Theoretical Computer Science 30 // Proceedings. 31 package hll 32 33 import ( 34 "encoding/json" 35 "fmt" 36 "hash" 37 "math" 38 _ "unsafe" 39 40 "github.com/spaolacci/murmur3" 41 ) 42 43 const ( 44 nbits = 16 45 m = 1 << nbits 46 mask = m - 1 47 alpha = 0.7213 / (1.0 + 1.079/m) 48 ) 49 50 //go:linkname fastrand runtime.fastrand 51 func fastrand() uint32 52 53 // An HLL is a HyperLogLog counter for arbitrary values. 54 // 55 // Deprecated: use HLL2. 56 type HLL struct { 57 counters []byte 58 h hash.Hash64 59 seed uint32 60 } 61 62 // New creates a new HyperLogLog counter with a random hash seed. 63 func New() *HLL { 64 return NewSeed(fastrand()) 65 } 66 67 // NewSeed creates a new HyperLogLog counter with the given hash seed. 68 func NewSeed(seed uint32) *HLL { 69 return &HLL{ 70 counters: make([]byte, m), 71 h: murmur3.New64WithSeed(seed), 72 seed: seed, 73 } 74 } 75 76 // Add adds v to the counter. Calls hash once. 77 func (h *HLL) Add(v []byte) { 78 h.h.Reset() 79 h.h.Write(v) 80 hash := h.h.Sum64() 81 82 idx := hash & mask 83 fp := hash >> nbits 84 z := byte(nzeros(fp)) + 1 85 if z > h.counters[idx] { 86 h.counters[idx] = z 87 } 88 } 89 90 // ApproxCount returns the current approximate count. 91 // Does not alter the state of the counter. 92 func (h *HLL) ApproxCount() int { 93 z := 0.0 94 for _, v := range h.counters { 95 z += math.Pow(2, -float64(v)) 96 } 97 z = 1.0 / z 98 result := int(alpha * m * m * z) 99 100 if result < m*5/2 { 101 zeros := 0 102 for _, v := range h.counters { 103 if v == 0 { 104 zeros++ 105 } 106 } 107 // If some registers are zero, use linear counting. 108 if zeros > 0 { 109 result = int(m * math.Log(m/float64(zeros))) 110 } 111 } 112 113 return result 114 } 115 116 // nzeros counts the number of zeros on the right side of a binary number. 117 func nzeros(a uint64) int { 118 if a == 0 { 119 return 64 - nbits // Number of bits after using the first nbits. 120 } 121 n := 0 122 for a&1 == 0 { 123 n++ 124 a /= 2 125 } 126 return n 127 } 128 129 // AddHLL adds the state of another counter to h. 130 // The result is equivalent to adding all the values of other to h. 131 func (h *HLL) AddHLL(other *HLL) { 132 if h.seed != other.seed { 133 panic(fmt.Sprintf("seeds don't match: %v, %v", h.seed, other.seed)) 134 } 135 for i, b := range other.counters { 136 if h.counters[i] < b { 137 h.counters[i] = b 138 } 139 } 140 } 141 142 // Used for JSON marshaling/unmarshaling. 143 type jsonHLL struct { 144 Counters []byte 145 Seed uint32 146 } 147 148 // MarshalJSON implements the json.Marshaler interface. 149 func (h *HLL) MarshalJSON() ([]byte, error) { 150 return json.Marshal(&jsonHLL{Counters: h.counters, Seed: h.seed}) 151 } 152 153 // UnmarshalJSON implements the json.Unmarshaler interface. 154 func (h *HLL) UnmarshalJSON(b []byte) error { 155 jh := &jsonHLL{} 156 if err := json.Unmarshal(b, jh); err != nil { 157 return err 158 } 159 h.counters = jh.Counters 160 h.h = murmur3.New64WithSeed(jh.Seed) 161 h.seed = jh.Seed 162 return nil 163 }