github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/hll/v2/hll.go (about)

     1  // Package hll provides an implementation of the HyperLogLog algorithm.
     2  //
     3  // A HyperLogLog counter can approximate the cardinality of a set with high
     4  // accuracy and little memory.
     5  //
     6  // # Accuracy
     7  //
     8  // Average error for 1,000,000,000 elements for different values of logSize:
     9  //
    10  //	logSize    average error %
    11  //	4          21
    12  //	5          12
    13  //	6          10
    14  //	7          8.1
    15  //	8          4.8
    16  //	9          3.6
    17  //	10         1.9
    18  //	11         1.2
    19  //	12         1.0
    20  //	13         0.7
    21  //	14         0.5
    22  //	15         0.33
    23  //	16         0.25
    24  //
    25  // # Citation
    26  //
    27  // Flajolet, Philippe; Fusy, Éric; Gandouet, Olivier; Meunier, Frédéric (2007).
    28  // "Hyperloglog: The analysis of a near-optimal cardinality estimation
    29  // algorithm". Discrete Mathematics and Theoretical Computer Science
    30  // Proceedings.
    31  package hll
    32  
    33  import (
    34  	"fmt"
    35  	"math"
    36  )
    37  
    38  // An HLL is a HyperLogLog counter for arbitrary values.
    39  type HLL[T any] struct {
    40  	counters []byte
    41  	h        func(T) uint64
    42  	nbits    int
    43  	m        int
    44  	mask     uint64
    45  }
    46  
    47  // New creates a new HyperLogLog counter.
    48  // The counter will use 2^logSize bytes.
    49  // h is the hash function to use for added values.
    50  func New[T any](logSize int, h func(T) uint64) *HLL[T] {
    51  	if logSize < 4 {
    52  		panic(fmt.Sprintf("logSize=%v, should be at least 4", logSize))
    53  	}
    54  	m := 1 << logSize
    55  	return &HLL[T]{
    56  		counters: make([]byte, m),
    57  		h:        h,
    58  		nbits:    logSize,
    59  		m:        m,
    60  		mask:     uint64(m - 1),
    61  	}
    62  }
    63  
    64  // Add adds v to the counter. Calls hash once.
    65  func (h *HLL[T]) Add(t T) {
    66  	hash := h.h(t)
    67  	idx := hash & h.mask
    68  	fp := hash >> h.nbits
    69  	z := byte(h.nzeros(fp)) + 1
    70  	if z > h.counters[idx] {
    71  		h.counters[idx] = z
    72  	}
    73  }
    74  
    75  // ApproxCount returns the current approximate count.
    76  // Does not alter the state of the counter.
    77  func (h *HLL[T]) ApproxCount() int {
    78  	z := 0.0
    79  	for _, v := range h.counters {
    80  		z += math.Pow(2, -float64(v))
    81  	}
    82  	z = 1.0 / z
    83  	fm := float64(h.m)
    84  	result := int(h.alpha() * fm * fm * z)
    85  
    86  	if result < h.m*5/2 {
    87  		zeros := 0
    88  		for _, v := range h.counters {
    89  			if v == 0 {
    90  				zeros++
    91  			}
    92  		}
    93  		// If some registers are zero, use linear counting.
    94  		if zeros > 0 {
    95  			result = int(fm * math.Log(fm/float64(zeros)))
    96  		}
    97  	}
    98  
    99  	return result
   100  }
   101  
   102  // Returns the alpha value to use depending on m.
   103  func (h *HLL[T]) alpha() float64 {
   104  	switch h.m {
   105  	case 16:
   106  		return 0.673
   107  	case 32:
   108  		return 0.697
   109  	case 64:
   110  		return 0.709
   111  	}
   112  	return 0.7213 / (1 + 1.079/float64(h.m))
   113  }
   114  
   115  // nzeros counts the number of zeros on the right side of a binary number.
   116  func (h *HLL[T]) nzeros(a uint64) int {
   117  	if a == 0 {
   118  		return 64 - h.nbits // Number of bits after using the first nbits.
   119  	}
   120  	n := 0
   121  	for a&1 == 0 {
   122  		n++
   123  		a /= 2
   124  	}
   125  	return n
   126  }
   127  
   128  // AddHLL adds the state of another counter to h,
   129  // assuming they use the same hash function.
   130  // The result is equivalent to adding all the values of other to h.
   131  func (h *HLL[T]) AddHLL(other *HLL[T]) {
   132  	if len(h.counters) != len(other.counters) {
   133  		panic("merging HLLs with different sizes")
   134  	}
   135  	for i, b := range other.counters {
   136  		if h.counters[i] < b {
   137  			h.counters[i] = b
   138  		}
   139  	}
   140  }