github.com/whatap/golib@v0.0.22/util/hll/HyperLogLog.go (about)

     1  /*
     2   * This file from
     3   *   https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/cardinality/HyperLogLog.java
     4   *
 *   This class modified by Scouter-Project
 *   - original package :  com.clearspring.analytics.stream.cardinality
     6   *   - remove implements : ICardinality, Serializable
     7   *   - add method : public boolean offer(long o)
     8   *   - remove classes : Builder,  enum Format, HyperLogLogPlusMergeException, SerializationHolder
     9   *
    10   *  ====================================
    11   *
    12   * Copyright (C) 2012 Clearspring Technologies, Inc.
    13   *
    14   * Licensed under the Apache License, Version 2.0 (the "License");
    15   * you may not use this file except in compliance with the License.
    16   * You may obtain a copy of the License at
    17   *
    18   * http://www.apache.org/licenses/LICENSE-2.0
    19   *
    20   * Unless required by applicable law or agreed to in writing, software
    21   * distributed under the License is distributed on an "AS IS" BASIS,
    22   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    23   * See the License for the specific language governing permissions and
    24   * limitations under the License.
    25   */
    26  
    27  /**
    28   * Java implementation of HyperLogLog (HLL) algorithm from this paper:
    29   * <p/>
    30   * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
    31   * <p/>
    32   * HLL is an improved version of LogLog that is capable of estimating the
    33   * cardinality of a set with accuracy = 1.04/sqrt(m) where m = 2^b. So we can
    34   * control accuracy vs space usage by increasing or decreasing b.
    35   * <p/>
    36   * The main benefit of using HLL over LL is that it only requires 64% of the
    37   * space that LL does to get the same accuracy.
    38   * <p/>
    39   * This implementation implements a single counter. If a large (millions) number
    40   * of counters are required you may want to refer to:
    41   * <p/>
    42   * http://dsiutils.dsi.unimi.it/
    43   * <p/>
    44   * It has a more complex implementation of HLL that supports multiple counters
    45   * in a single object, drastically reducing the java overhead from creating a
    46   * large number of objects.
    47   * <p/>
    48   * This implementation leveraged a javascript implementation that Yammer has
    49   * been working on:
    50   * <p/>
    51   * https://github.com/yammer/probablyjs
    52   * <p>
    53   * Note that this implementation does not include the long range correction
    54   * function defined in the original paper. Empirical evidence shows that the
    55   * correction function causes more harm than good.
    56   * </p>
    57   * <p/>
    58   * <p>
    59   * Users have different motivations to use different types of hashing functions.
    60   * Rather than try to keep up with all available hash functions and to remove
    61   * the concern of causing future binary incompatibilities this class allows
    62   * clients to offer the value in hashed int or long form. This way clients are
    63   * free to change their hash function on their own time line. We recommend using
    64   * Google's Guava Murmur3_128 implementation as it provides good performance and
    65   * speed when high precision is required. In our tests the 32bit MurmurHash
 * function included in this project is faster and produces better results than
    67   * the 32 bit murmur3 implementation google provides.
    68   * </p>
    69   *
    70   */
    71  
    72  package hll
    73  
    74  import (
    75  	//"log"
    76  	"math"
    77  
    78  	"github.com/whatap/golib/io"
    79  )
    80  
// HyperLogLog is a cardinality estimator whose state lives in a RegisterSet.
// Values are offered as hashes; Cardinality turns the register contents into
// an estimate of the number of distinct values offered.
type HyperLogLog struct {
	// dirty is not referenced by any code visible in this file.
	dirty       bool
	// registerSet holds the registers updated by the offer methods and read
	// back by Cardinality, GetBytes and the merge operations.
	registerSet *RegisterSet
	// log2m is the number of leading hash bits used to select a register;
	// the constructors only accept values accepted by validateLog2m.
	log2m       uint32
	// alphaMM is the constant computed once by getAlphaMM from log2m and
	// m = 1<<log2m, and used as the numerator of the raw estimate.
	alphaMM     float64
}
    87  
    88  /**
    89   * Creates a new HyperLogLog instance using the given registers. Used for
    90   * unmarshalling a serialized instance and for merging multiple counters
    91   * together.
    92   *
    93   * @param registerSet
    94   *            - the initial values for the register set
    95   */
    96  //	@Deprecated
    97  func NewHyperLogLog(log2m uint32, registerSet *RegisterSet) *HyperLogLog {
    98  	p := new(HyperLogLog)
    99  	if !validateLog2m(log2m) {
   100  		return nil
   101  	}
   102  	p.registerSet = registerSet
   103  	p.log2m = log2m
   104  	m := uint32(1) << p.log2m
   105  	p.alphaMM = p.getAlphaMM(log2m, m)
   106  
   107  	return p
   108  }
   109  
   110  /**
   111   * Create a new HyperLogLog instance. The log2m parameter defines the
   112   * accuracy of the counter. The larger the log2m the better the accuracy.
   113   * <p/>
   114   * accuracy = 1.04/sqrt(2^log2m)
   115   *
   116   * @param log2m
   117   *            - the number of bits to use as the basis for the HLL instance
   118   */
   119  func NewHyperLogLogInt(log2m uint32) *HyperLogLog {
   120  	p := NewHyperLogLog(log2m, NewRegisterSet(1<<log2m))
   121  	return p
   122  }
   123  
   124  /**
   125   * Create a new HyperLogLog instance using the specified standard deviation.
   126   *
   127   * @param rsd
   128   *            - the relative standard deviation for the counter. smaller
   129   *            values create counters that require more space.
   130   */
   131  func NewHyperLogLogFloat(Rsd float64) *HyperLogLog {
   132  	//this(log2m(rsd));
   133  	p := NewHyperLogLogInt(_log2m(Rsd))
   134  
   135  	return p
   136  }
   137  
   138  func NewHyperLogLogDefault() *HyperLogLog {
   139  	p := NewHyperLogLogInt(10)
   140  
   141  	return p
   142  }
   143  
   144  func (this *HyperLogLog) offerHashedLong(hashedValue uint64) bool {
   145  	// j becomes the binary address determined by the first b log2m of x
   146  	// j will be between 0 and 2^log2m
   147  	//j := int(uint64(hashedValue) >> (strconv. Long.SIZE - log2m))
   148  	j := uint32(hashedValue >> (64 - this.log2m))
   149  
   150  	// TODO numberOfLeadingZeros
   151  	//r := int32(1) //Long.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1;
   152  	r := uint32(clz64((hashedValue<<this.log2m)|(1<<(this.log2m-1))+1)) + 1
   153  
   154  	return this.registerSet.UpdateIfGreater(j, r)
   155  }
   156  func (this *HyperLogLog) offerHashed(hashedValue uint32) bool {
   157  	// j becomes the binary address determined by the first b log2m of x
   158  	// j will be between 0 and 2^log2m
   159  	//final int j = hashedValue >>> (Integer.SIZE - log2m);
   160  	j := hashedValue >> (32 - this.log2m)
   161  
   162  	// TODO numberOfLeadingZeros
   163  	//final int r = Integer.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1;
   164  	//r := int32(1) //Integer.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1
   165  	r := uint32(clz32((hashedValue<<this.log2m)|(1<<(this.log2m-1))+1) + 1)
   166  
   167  	//fmt.Println("HyperLogLog offerHashed hash=", hashedValue , ",j=", j, ",r=", r)
   168  
   169  	return this.registerSet.UpdateIfGreater(j, r)
   170  }
   171  
   172  //	public boolean offer(Object o) {
   173  //			final int x = MurmurHash.hash(o);
   174  //			return offerHashed(x);
   175  //		}
   176  func (this *HyperLogLog) Offer(o uint32) bool {
   177  	x := MurmurHash(o)
   178  	return this.offerHashed(x)
   179  }
   180  
   181  func (this *HyperLogLog) OfferLong(o uint64) bool {
   182  	x := MurmurHashLong(o)
   183  	return this.offerHashed(x)
   184  }
   185  
   186  func (this *HyperLogLog) Cardinality() uint64 {
   187  	registerSum := float64(0)
   188  	count := this.registerSet.Count
   189  	zeros := float64(0.0)
   190  	for j := 0; j < this.registerSet.Count; j++ {
   191  		val := this.registerSet.Get(j)
   192  		registerSum += float64(1.0) / float64(int(1)<<uint(val))
   193  		if val == 0 {
   194  			zeros++
   195  		}
   196  	}
   197  	estimate := this.alphaMM * (float64(1) / registerSum)
   198  	if estimate <= (float64(5.0)/float64(2.0))*float64(count) {
   199  		// Small Range Estimate
   200  		return uint64(Round(linearCounting(count, zeros)))
   201  	} else {
   202  		return uint64(Round(estimate))
   203  	}
   204  }
   205  func (this *HyperLogLog) Sizeof() int {
   206  	return this.registerSet.Size * 4
   207  }
   208  
   209  /*
   210   * This method is modified by Souter-pcode
   211   *
   212   */
   213  func (this *HyperLogLog) GetBytes() []byte {
   214  	out := io.NewDataOutputX()
   215  	out.WriteInt(int32(this.log2m))
   216  	out.WriteInt(int32(this.registerSet.Size))
   217  	for _, x := range this.registerSet.ReadOnlyBits() {
   218  		out.WriteInt(int32(x))
   219  	}
   220  	return out.ToByteArray()
   221  }
   222  
   223  /**
   224   * Add all the elements of the other set to this set.
   225   * <p/>
   226   * This operation does not imply a loss of precision.
   227   *
   228   * @param other
   229   *            A compatible Hyperloglog instance (same log2m)
   230   * @throws CardinalityMergeException
   231   *             if other is not compatible
   232   */
   233  func (this *HyperLogLog) AddAll(other *HyperLogLog) {
   234  
   235  	if this.Sizeof() != other.Sizeof() {
   236  		//throw new RuntimeException("Cannot merge estimators of different sizes");
   237  		panic("AddAll Cannot merge estimators of different sizes")
   238  	}
   239  
   240  	this.registerSet.Merge(other.registerSet)
   241  }
   242  
   243  func (this *HyperLogLog) Merge(estimators ...*HyperLogLog) *HyperLogLog {
   244  	merged := NewHyperLogLog(this.log2m, NewRegisterSet(this.registerSet.Count))
   245  	merged.AddAll(this)
   246  
   247  	if estimators == nil {
   248  		return merged
   249  	}
   250  	for _, estimator := range estimators {
   251  		hll := estimator
   252  		merged.AddAll(hll)
   253  	}
   254  	return merged
   255  }
   256  
   257  /*
   258   * Initial code from HyperLogLog.Builder.build()
   259   * by Scouter-Project	 */
   260  func BuildHyperLogLog(bytes []byte) *HyperLogLog {
   261  	in := io.NewDataInputX(bytes)
   262  	log2m := uint32(in.ReadInt())
   263  	n := in.ReadInt()
   264  	ints := make([]uint32, n)
   265  	for i := 0; i < int(n); i++ {
   266  		ints[i] = uint32(in.ReadInt())
   267  	}
   268  	return NewHyperLogLog(log2m, NewRegisterSetInit(int(1<<log2m), ints))
   269  }
   270  
   271  func (this *HyperLogLog) getAlphaMM(p uint32, m uint32) float64 {
   272  	// See the paper.
   273  	switch p {
   274  	case 4:
   275  		return 0.673 * float64(m) * float64(m)
   276  	case 5:
   277  		return 0.697 * float64(m) * float64(m)
   278  	case 6:
   279  		return 0.709 * float64(m) * float64(m)
   280  	default:
   281  		return (0.7213 / (1 + 1.079/float64(m))) * float64(m) * float64(m)
   282  	}
   283  }
   284  
   285  func linearCounting(m int, V float64) float64 {
   286  	return float64(m) * math.Log(float64(m)/V)
   287  }
   288  
   289  func _log2m(rsd float64) uint32 {
   290  	return uint32(math.Log((1.106/rsd)*(1.106/rsd)) / math.Log(2))
   291  }
   292  
   293  func rsd(log2m uint32) float64 {
   294  	return 1.106 / math.Sqrt(math.Exp(float64(log2m)*math.Log(2)))
   295  }
   296  
   297  func validateLog2m(log2m uint32) bool {
   298  	if log2m < 0 || log2m > 30 {
   299  		//throw new IllegalArgumentException("log2m argument is " + log2m + " and is outpcodee the range [0, 30]");
   300  		return false
   301  	}
   302  	return true
   303  
   304  }
   305  
   306  func Round(val float64) int64 {
   307  	if val < 0 {
   308  		return int64(val - 0.5)
   309  	}
   310  	return int64(val + 0.5)
   311  }
   312  
   313  var clzLookup = []uint8{
   314  	32, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
   315  }
   316  
   317  // This optimized clz32 algorithm is from:
   318  //
   319  //	http://embeddedgurus.com/state-space/2014/09/
   320  //			fast-deterministic-and-portable-counting-leading-zeros/
   321  func clz32(x uint32) uint8 {
   322  	var n uint8
   323  
   324  	if x >= (1 << 16) {
   325  		if x >= (1 << 24) {
   326  			if x >= (1 << 28) {
   327  				n = 28
   328  			} else {
   329  				n = 24
   330  			}
   331  		} else {
   332  			if x >= (1 << 20) {
   333  				n = 20
   334  			} else {
   335  				n = 16
   336  			}
   337  		}
   338  	} else {
   339  		if x >= (1 << 8) {
   340  			if x >= (1 << 12) {
   341  				n = 12
   342  			} else {
   343  				n = 8
   344  			}
   345  		} else {
   346  			if x >= (1 << 4) {
   347  				n = 4
   348  			} else {
   349  				n = 0
   350  			}
   351  		}
   352  	}
   353  	return clzLookup[x>>n] - n
   354  }
   355  
   356  func clz64(x uint64) uint8 {
   357  	var c uint8
   358  	for m := uint64(1 << 63); m&x == 0 && m != 0; m >>= 1 {
   359  		c++
   360  	}
   361  	return c
   362  }