github.com/gopherd/gonum@v0.0.4/stat/card/hll32.go (about) 1 // Copyright ©2019 The Gonum Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package card 6 7 import ( 8 "bytes" 9 "encoding/gob" 10 "errors" 11 "fmt" 12 "hash" 13 "math" 14 "math/bits" 15 "reflect" 16 ) 17 18 // HyperLogLog32 is implements cardinality estimation according to the 19 // HyperLogLog algorithm described in Analysis of Algorithms, pp127–146. 20 type HyperLogLog32 struct { 21 p uint8 22 m uint32 23 24 hash hash.Hash32 25 26 register []uint8 27 } 28 29 // NewHyperLogLog32 returns a new HyperLogLog32 sketch. The value of prec 30 // must be in the range [4, 32]. NewHyperLogLog32 will allocate a byte slice 31 // that is 2^prec long. 32 func NewHyperLogLog32(prec int, h hash.Hash32) (*HyperLogLog32, error) { 33 // The implementation here is based on the pseudo-code in 34 // "HyperLogLog: the analysis of a near-optimal cardinality 35 // estimation algorithm", figure 3. 36 37 if prec < 4 || w32 < prec { 38 return nil, errors.New("card: precision out of range") 39 } 40 p := uint8(prec) 41 m := uint32(1) << p 42 return &HyperLogLog32{ 43 p: p, m: m, 44 hash: h, 45 register: make([]byte, m), 46 }, nil 47 } 48 49 // Write notes the data in b as a single observation into the sketch held by 50 // the receiver. 51 // 52 // Write satisfies the io.Writer interface. If the hash.Hash32 type passed to 53 // NewHyperLogLog32 or SetHash satisfies the hash.Hash contract, Write will always 54 // return a nil error. 55 func (h *HyperLogLog32) Write(b []byte) (int, error) { 56 n, err := h.hash.Write(b) 57 x := h.hash.Sum32() 58 h.hash.Reset() 59 q := w32 - h.p 60 idx := x >> q 61 r := rho32q(x, q) 62 if r > h.register[idx] { 63 h.register[idx] = r 64 } 65 return n, err 66 } 67 68 // Union places the union of the sketches in a and b into the receiver. 69 // Union will return an error if the precisions or hash functions of a 70 // and b do not match or if the receiver has a hash function that is set 71 // and does not match those of a and b. Hash functions provided by hash.Hash32 72 // implementations x and y match when reflect.TypeOf(x) == reflect.TypeOf(y). 73 // 74 // If the receiver does not have a set hash function, it can be set after 75 // a call to Union with the SetHash method. 76 func (h *HyperLogLog32) Union(a, b *HyperLogLog32) error { 77 if a.p != b.p { 78 return errors.New("card: mismatched precision") 79 } 80 ta := reflect.TypeOf(b.hash) 81 if reflect.TypeOf(b.hash) != ta { 82 return errors.New("card: mismatched hash function") 83 } 84 if h.hash != nil && reflect.TypeOf(h.hash) != ta { 85 return errors.New("card: mismatched hash function") 86 } 87 88 if h != a && h != b { 89 *h = HyperLogLog32{p: a.p, m: a.m, hash: h.hash, register: make([]uint8, a.m)} 90 } 91 for i, r := range a.register { 92 h.register[i] = max(r, b.register[i]) 93 } 94 return nil 95 } 96 97 // SetHash sets the hash function of the receiver if it is nil. SetHash 98 // will return an error if it is called on a receiver with a non-nil 99 // hash function. 100 func (h *HyperLogLog32) SetHash(fn hash.Hash32) error { 101 if h.hash == nil { 102 return errors.New("card: hash function already set") 103 } 104 h.hash = fn 105 return nil 106 } 107 108 // Count returns an estimate of the cardinality of the set of items written 109 // the receiver. 110 func (h *HyperLogLog32) Count() float64 { 111 var s float64 112 for _, v := range h.register { 113 s += 1 / float64(uint64(1)<<v) 114 } 115 m := float64(h.m) 116 e := alpha(uint64(h.m)) * m * m / s 117 if e <= 5*m/2 { 118 var v int 119 for _, r := range h.register { 120 if r == 0 { 121 v++ 122 } 123 } 124 if v != 0 { 125 return linearCounting(m, float64(v)) 126 } 127 return e 128 } 129 if e <= (1<<w32)/30.0 { 130 return e 131 } 132 return -(1 << w32) * math.Log1p(-e/(1<<w32)) 133 } 134 135 // rho32q (ϱ) is the number of leading zeros in q-wide low bits of x, plus 1. 136 func rho32q(x uint32, q uint8) uint8 { 137 return min(uint8(bits.LeadingZeros32(x<<(w32-q))), q) + 1 138 } 139 140 // Reset clears the receiver's registers allowing it to be reused. 141 // Reset does not alter the precision of the receiver or the hash 142 // function that is used. 143 func (h *HyperLogLog32) Reset() { 144 for i := range h.register { 145 h.register[i] = 0 146 } 147 } 148 149 // MarshalBinary marshals the sketch in the receiver. It encodes the 150 // name of the hash function, the precision of the sketch and the 151 // sketch data. The receiver must have a non-nil hash function. 152 func (h *HyperLogLog32) MarshalBinary() ([]byte, error) { 153 if h.hash == nil { 154 return nil, errors.New("card: hash function not set") 155 } 156 var buf bytes.Buffer 157 enc := gob.NewEncoder(&buf) 158 err := enc.Encode(uint8(w32)) 159 if err != nil { 160 return nil, err 161 } 162 err = enc.Encode(typeNameOf(h.hash)) 163 if err != nil { 164 return nil, err 165 } 166 err = enc.Encode(h.p) 167 if err != nil { 168 return nil, err 169 } 170 err = enc.Encode(h.register) 171 if err != nil { 172 return nil, err 173 } 174 return buf.Bytes(), nil 175 } 176 177 // UnmarshalBinary unmarshals the binary representation of a sketch 178 // into the receiver. The precision of the receiver will be set after 179 // return. The receiver must have a non-nil hash function value that is 180 // the same type as the one that was stored in the binary data. 181 func (h *HyperLogLog32) UnmarshalBinary(b []byte) error { 182 dec := gob.NewDecoder(bytes.NewReader(b)) 183 var size uint8 184 err := dec.Decode(&size) 185 if err != nil { 186 return err 187 } 188 if size != w32 { 189 return fmt.Errorf("card: mismatched hash function size: dst=%d src=%d", w32, size) 190 } 191 var srcHash string 192 err = dec.Decode(&srcHash) 193 if err != nil { 194 return err 195 } 196 if h.hash == nil { 197 h.hash = hash32For(srcHash) 198 if h.hash == nil { 199 return fmt.Errorf("card: hash function not set and no hash registered for %q", srcHash) 200 } 201 } else { 202 dstHash := typeNameOf(h.hash) 203 if dstHash != srcHash { 204 return fmt.Errorf("card: mismatched hash function: dst=%s src=%s", dstHash, srcHash) 205 } 206 } 207 err = dec.Decode(&h.p) 208 if err != nil { 209 return err 210 } 211 h.m = uint32(1) << h.p 212 h.register = h.register[:0] 213 err = dec.Decode(&h.register) 214 if err != nil { 215 return err 216 } 217 return nil 218 } 219 220 func hash32For(name string) hash.Hash32 { 221 fn, ok := hashes.Load(name) 222 if !ok { 223 return nil 224 } 225 h, _ := fn.(userType).fn.Call(nil)[0].Interface().(hash.Hash32) 226 return h 227 }