github.com/whatap/golib@v0.0.22/util/hll/HyperLogLog.go (about) 1 /* 2 * This file from 3 * https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/cardinality/HyperLogLog.java 4 * 5 * This class modified by Scouter-Project * - original package : com.clearspring.analytics.stream.cardinality 6 * - remove implements : ICardinality, Serializable 7 * - add method : public boolean offer(long o) 8 * - remove classes : Builder, enum Format, HyperLogLogPlusMergeException, SerializationHolder 9 * 10 * ==================================== 11 * 12 * Copyright (C) 2012 Clearspring Technologies, Inc. 13 * 14 * Licensed under the Apache License, Version 2.0 (the "License"); 15 * you may not use this file except in compliance with the License. 16 * You may obtain a copy of the License at 17 * 18 * http://www.apache.org/licenses/LICENSE-2.0 19 * 20 * Unless required by applicable law or agreed to in writing, software 21 * distributed under the License is distributed on an "AS IS" BASIS, 22 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 * See the License for the specific language governing permissions and 24 * limitations under the License. 25 */ 26 27 /** 28 * Java implementation of HyperLogLog (HLL) algorithm from this paper: 29 * <p/> 30 * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf 31 * <p/> 32 * HLL is an improved version of LogLog that is capable of estimating the 33 * cardinality of a set with accuracy = 1.04/sqrt(m) where m = 2^b. So we can 34 * control accuracy vs space usage by increasing or decreasing b. 35 * <p/> 36 * The main benefit of using HLL over LL is that it only requires 64% of the 37 * space that LL does to get the same accuracy. 38 * <p/> 39 * This implementation implements a single counter. If a large (millions) number 40 * of counters are required you may want to refer to: 41 * <p/> 42 * http://dsiutils.dsi.unimi.it/ 43 * <p/> 44 * It has a more complex implementation of HLL that supports multiple counters 45 * in a single object, drastically reducing the java overhead from creating a 46 * large number of objects. 47 * <p/> 48 * This implementation leveraged a javascript implementation that Yammer has 49 * been working on: 50 * <p/> 51 * https://github.com/yammer/probablyjs 52 * <p> 53 * Note that this implementation does not include the long range correction 54 * function defined in the original paper. Empirical evidence shows that the 55 * correction function causes more harm than good. 56 * </p> 57 * <p/> 58 * <p> 59 * Users have different motivations to use different types of hashing functions. 60 * Rather than try to keep up with all available hash functions and to remove 61 * the concern of causing future binary incompatibilities this class allows 62 * clients to offer the value in hashed int or long form. This way clients are 63 * free to change their hash function on their own time line. We recommend using 64 * Google's Guava Murmur3_128 implementation as it provides good performance and 65 * speed when high precision is required. In our tests the 32bit MurmurHash 66 * function included in this pcode is faster and produces better results than 67 * the 32 bit murmur3 implementation google provides. 68 * </p> 69 * 70 */ 71 72 package hll 73 74 import ( 75 //"log" 76 "math" 77 78 "github.com/whatap/golib/io" 79 ) 80 81 type HyperLogLog struct { 82 dirty bool 83 registerSet *RegisterSet 84 log2m uint32 85 alphaMM float64 86 } 87 88 /** 89 * Creates a new HyperLogLog instance using the given registers. Used for 90 * unmarshalling a serialized instance and for merging multiple counters 91 * together. 92 * 93 * @param registerSet 94 * - the initial values for the register set 95 */ 96 // @Deprecated 97 func NewHyperLogLog(log2m uint32, registerSet *RegisterSet) *HyperLogLog { 98 p := new(HyperLogLog) 99 if !validateLog2m(log2m) { 100 return nil 101 } 102 p.registerSet = registerSet 103 p.log2m = log2m 104 m := uint32(1) << p.log2m 105 p.alphaMM = p.getAlphaMM(log2m, m) 106 107 return p 108 } 109 110 /** 111 * Create a new HyperLogLog instance. The log2m parameter defines the 112 * accuracy of the counter. The larger the log2m the better the accuracy. 113 * <p/> 114 * accuracy = 1.04/sqrt(2^log2m) 115 * 116 * @param log2m 117 * - the number of bits to use as the basis for the HLL instance 118 */ 119 func NewHyperLogLogInt(log2m uint32) *HyperLogLog { 120 p := NewHyperLogLog(log2m, NewRegisterSet(1<<log2m)) 121 return p 122 } 123 124 /** 125 * Create a new HyperLogLog instance using the specified standard deviation. 126 * 127 * @param rsd 128 * - the relative standard deviation for the counter. smaller 129 * values create counters that require more space. 130 */ 131 func NewHyperLogLogFloat(Rsd float64) *HyperLogLog { 132 //this(log2m(rsd)); 133 p := NewHyperLogLogInt(_log2m(Rsd)) 134 135 return p 136 } 137 138 func NewHyperLogLogDefault() *HyperLogLog { 139 p := NewHyperLogLogInt(10) 140 141 return p 142 } 143 144 func (this *HyperLogLog) offerHashedLong(hashedValue uint64) bool { 145 // j becomes the binary address determined by the first b log2m of x 146 // j will be between 0 and 2^log2m 147 //j := int(uint64(hashedValue) >> (strconv. Long.SIZE - log2m)) 148 j := uint32(hashedValue >> (64 - this.log2m)) 149 150 // TODO numberOfLeadingZeros 151 //r := int32(1) //Long.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1; 152 r := uint32(clz64((hashedValue<<this.log2m)|(1<<(this.log2m-1))+1)) + 1 153 154 return this.registerSet.UpdateIfGreater(j, r) 155 } 156 func (this *HyperLogLog) offerHashed(hashedValue uint32) bool { 157 // j becomes the binary address determined by the first b log2m of x 158 // j will be between 0 and 2^log2m 159 //final int j = hashedValue >>> (Integer.SIZE - log2m); 160 j := hashedValue >> (32 - this.log2m) 161 162 // TODO numberOfLeadingZeros 163 //final int r = Integer.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1; 164 //r := int32(1) //Integer.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1 165 r := uint32(clz32((hashedValue<<this.log2m)|(1<<(this.log2m-1))+1) + 1) 166 167 //fmt.Println("HyperLogLog offerHashed hash=", hashedValue , ",j=", j, ",r=", r) 168 169 return this.registerSet.UpdateIfGreater(j, r) 170 } 171 172 // public boolean offer(Object o) { 173 // final int x = MurmurHash.hash(o); 174 // return offerHashed(x); 175 // } 176 func (this *HyperLogLog) Offer(o uint32) bool { 177 x := MurmurHash(o) 178 return this.offerHashed(x) 179 } 180 181 func (this *HyperLogLog) OfferLong(o uint64) bool { 182 x := MurmurHashLong(o) 183 return this.offerHashed(x) 184 } 185 186 func (this *HyperLogLog) Cardinality() uint64 { 187 registerSum := float64(0) 188 count := this.registerSet.Count 189 zeros := float64(0.0) 190 for j := 0; j < this.registerSet.Count; j++ { 191 val := this.registerSet.Get(j) 192 registerSum += float64(1.0) / float64(int(1)<<uint(val)) 193 if val == 0 { 194 zeros++ 195 } 196 } 197 estimate := this.alphaMM * (float64(1) / registerSum) 198 if estimate <= (float64(5.0)/float64(2.0))*float64(count) { 199 // Small Range Estimate 200 return uint64(Round(linearCounting(count, zeros))) 201 } else { 202 return uint64(Round(estimate)) 203 } 204 } 205 func (this *HyperLogLog) Sizeof() int { 206 return this.registerSet.Size * 4 207 } 208 209 /* 210 * This method is modified by Souter-pcode 211 * 212 */ 213 func (this *HyperLogLog) GetBytes() []byte { 214 out := io.NewDataOutputX() 215 out.WriteInt(int32(this.log2m)) 216 out.WriteInt(int32(this.registerSet.Size)) 217 for _, x := range this.registerSet.ReadOnlyBits() { 218 out.WriteInt(int32(x)) 219 } 220 return out.ToByteArray() 221 } 222 223 /** 224 * Add all the elements of the other set to this set. 225 * <p/> 226 * This operation does not imply a loss of precision. 227 * 228 * @param other 229 * A compatible Hyperloglog instance (same log2m) 230 * @throws CardinalityMergeException 231 * if other is not compatible 232 */ 233 func (this *HyperLogLog) AddAll(other *HyperLogLog) { 234 235 if this.Sizeof() != other.Sizeof() { 236 //throw new RuntimeException("Cannot merge estimators of different sizes"); 237 panic("AddAll Cannot merge estimators of different sizes") 238 } 239 240 this.registerSet.Merge(other.registerSet) 241 } 242 243 func (this *HyperLogLog) Merge(estimators ...*HyperLogLog) *HyperLogLog { 244 merged := NewHyperLogLog(this.log2m, NewRegisterSet(this.registerSet.Count)) 245 merged.AddAll(this) 246 247 if estimators == nil { 248 return merged 249 } 250 for _, estimator := range estimators { 251 hll := estimator 252 merged.AddAll(hll) 253 } 254 return merged 255 } 256 257 /* 258 * Initial code from HyperLogLog.Builder.build() 259 * by Scouter-Project */ 260 func BuildHyperLogLog(bytes []byte) *HyperLogLog { 261 in := io.NewDataInputX(bytes) 262 log2m := uint32(in.ReadInt()) 263 n := in.ReadInt() 264 ints := make([]uint32, n) 265 for i := 0; i < int(n); i++ { 266 ints[i] = uint32(in.ReadInt()) 267 } 268 return NewHyperLogLog(log2m, NewRegisterSetInit(int(1<<log2m), ints)) 269 } 270 271 func (this *HyperLogLog) getAlphaMM(p uint32, m uint32) float64 { 272 // See the paper. 273 switch p { 274 case 4: 275 return 0.673 * float64(m) * float64(m) 276 case 5: 277 return 0.697 * float64(m) * float64(m) 278 case 6: 279 return 0.709 * float64(m) * float64(m) 280 default: 281 return (0.7213 / (1 + 1.079/float64(m))) * float64(m) * float64(m) 282 } 283 } 284 285 func linearCounting(m int, V float64) float64 { 286 return float64(m) * math.Log(float64(m)/V) 287 } 288 289 func _log2m(rsd float64) uint32 { 290 return uint32(math.Log((1.106/rsd)*(1.106/rsd)) / math.Log(2)) 291 } 292 293 func rsd(log2m uint32) float64 { 294 return 1.106 / math.Sqrt(math.Exp(float64(log2m)*math.Log(2))) 295 } 296 297 func validateLog2m(log2m uint32) bool { 298 if log2m < 0 || log2m > 30 { 299 //throw new IllegalArgumentException("log2m argument is " + log2m + " and is outpcodee the range [0, 30]"); 300 return false 301 } 302 return true 303 304 } 305 306 func Round(val float64) int64 { 307 if val < 0 { 308 return int64(val - 0.5) 309 } 310 return int64(val + 0.5) 311 } 312 313 var clzLookup = []uint8{ 314 32, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 315 } 316 317 // This optimized clz32 algorithm is from: 318 // 319 // http://embeddedgurus.com/state-space/2014/09/ 320 // fast-deterministic-and-portable-counting-leading-zeros/ 321 func clz32(x uint32) uint8 { 322 var n uint8 323 324 if x >= (1 << 16) { 325 if x >= (1 << 24) { 326 if x >= (1 << 28) { 327 n = 28 328 } else { 329 n = 24 330 } 331 } else { 332 if x >= (1 << 20) { 333 n = 20 334 } else { 335 n = 16 336 } 337 } 338 } else { 339 if x >= (1 << 8) { 340 if x >= (1 << 12) { 341 n = 12 342 } else { 343 n = 8 344 } 345 } else { 346 if x >= (1 << 4) { 347 n = 4 348 } else { 349 n = 0 350 } 351 } 352 } 353 return clzLookup[x>>n] - n 354 } 355 356 func clz64(x uint64) uint8 { 357 var c uint8 358 for m := uint64(1 << 63); m&x == 0 && m != 0; m >>= 1 { 359 c++ 360 } 361 return c 362 }