github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/hll/hll.go (about)

     1  // Package hll provides an implementation of the HyperLogLog algorithm.
     2  //
     3  // A HyperLogLog counter can approximate the cardinality of a set with high
     4  // accuracy and little memory.
     5  //
     6  // # Accuracy
     7  //
     8  // Average error for 1,000,000,000 elements for different values of logSize:
     9  //
    10  //	logSize    average error %
    11  //	4          21
    12  //	5          12
    13  //	6          10
    14  //	7          8.1
    15  //	8          4.8
    16  //	9          3.6
    17  //	10         1.9
    18  //	11         1.2
    19  //	12         1.0
    20  //	13         0.7
    21  //	14         0.5
    22  //	15         0.33
    23  //	16         0.25
    24  //
    25  // # Citation
    26  //
    27  // Flajolet, Philippe; Fusy, Éric; Gandouet, Olivier; Meunier, Frédéric (2007).
    28  // "Hyperloglog: The analysis of a near-optimal cardinality estimation
    29  // algorithm". Discrete Mathematics and Theoretical Computer Science
    30  // Proceedings.
    31  package hll
    32  
    33  import (
    34  	"encoding/json"
    35  	"fmt"
    36  	"hash"
    37  	"math"
    38  	_ "unsafe"
    39  
    40  	"github.com/spaolacci/murmur3"
    41  )
    42  
const (
	// nbits is the number of low hash bits Add uses to pick a register.
	nbits = 16
	// m is the number of registers NewSeed allocates (2^nbits).
	m     = 1 << nbits
	// mask extracts the low nbits of a hash, i.e. the register index.
	mask  = m - 1
	// alpha is the constant that scales the raw estimate in ApproxCount.
	alpha = 0.7213 / (1.0 + 1.079/m)
)
    49  
// fastrand returns a uint32 that New uses as a random hash seed. It has no Go
// body in this package: the go:linkname directive below binds it to the
// runtime's unexported fastrand function, which is why the file blank-imports
// "unsafe".
//
//go:linkname fastrand runtime.fastrand
func fastrand() uint32
    52  
// An HLL is a HyperLogLog counter for arbitrary values.
//
// It consists of a slice of one-byte registers and a seeded 64-bit hasher.
// AddHLL only merges counters that carry the same seed.
//
// Deprecated: use HLL2.
type HLL struct {
	// counters holds the registers; NewSeed allocates m of them. Add raises
	// a register when a value hashed into it has a larger rank.
	counters []byte
	// h is the hasher that Add resets and reuses for every value.
	h        hash.Hash64
	// seed is the hash seed. It is kept so AddHLL can check compatibility
	// and so the counter can be written to and rebuilt from JSON.
	seed     uint32
}
    61  
    62  // New creates a new HyperLogLog counter with a random hash seed.
    63  func New() *HLL {
    64  	return NewSeed(fastrand())
    65  }
    66  
    67  // NewSeed creates a new HyperLogLog counter with the given hash seed.
    68  func NewSeed(seed uint32) *HLL {
    69  	return &HLL{
    70  		counters: make([]byte, m),
    71  		h:        murmur3.New64WithSeed(seed),
    72  		seed:     seed,
    73  	}
    74  }
    75  
    76  // Add adds v to the counter. Calls hash once.
    77  func (h *HLL) Add(v []byte) {
    78  	h.h.Reset()
    79  	h.h.Write(v)
    80  	hash := h.h.Sum64()
    81  
    82  	idx := hash & mask
    83  	fp := hash >> nbits
    84  	z := byte(nzeros(fp)) + 1
    85  	if z > h.counters[idx] {
    86  		h.counters[idx] = z
    87  	}
    88  }
    89  
    90  // ApproxCount returns the current approximate count.
    91  // Does not alter the state of the counter.
    92  func (h *HLL) ApproxCount() int {
    93  	z := 0.0
    94  	for _, v := range h.counters {
    95  		z += math.Pow(2, -float64(v))
    96  	}
    97  	z = 1.0 / z
    98  	result := int(alpha * m * m * z)
    99  
   100  	if result < m*5/2 {
   101  		zeros := 0
   102  		for _, v := range h.counters {
   103  			if v == 0 {
   104  				zeros++
   105  			}
   106  		}
   107  		// If some registers are zero, use linear counting.
   108  		if zeros > 0 {
   109  			result = int(m * math.Log(m/float64(zeros)))
   110  		}
   111  	}
   112  
   113  	return result
   114  }
   115  
   116  // nzeros counts the number of zeros on the right side of a binary number.
   117  func nzeros(a uint64) int {
   118  	if a == 0 {
   119  		return 64 - nbits // Number of bits after using the first nbits.
   120  	}
   121  	n := 0
   122  	for a&1 == 0 {
   123  		n++
   124  		a /= 2
   125  	}
   126  	return n
   127  }
   128  
   129  // AddHLL adds the state of another counter to h.
   130  // The result is equivalent to adding all the values of other to h.
   131  func (h *HLL) AddHLL(other *HLL) {
   132  	if h.seed != other.seed {
   133  		panic(fmt.Sprintf("seeds don't match: %v, %v", h.seed, other.seed))
   134  	}
   135  	for i, b := range other.counters {
   136  		if h.counters[i] < b {
   137  			h.counters[i] = b
   138  		}
   139  	}
   140  }
   141  
// jsonHLL is the serialized form of an HLL, used for JSON
// marshaling/unmarshaling. The hasher itself is not stored; UnmarshalJSON
// rebuilds it from Seed.
type jsonHLL struct {
	// Counters holds the HLL's registers. encoding/json writes a []byte as
	// a base64-encoded string.
	Counters []byte
	// Seed is the hash seed of the counter.
	Seed     uint32
}
   147  
   148  // MarshalJSON implements the json.Marshaler interface.
   149  func (h *HLL) MarshalJSON() ([]byte, error) {
   150  	return json.Marshal(&jsonHLL{Counters: h.counters, Seed: h.seed})
   151  }
   152  
   153  // UnmarshalJSON implements the json.Unmarshaler interface.
   154  func (h *HLL) UnmarshalJSON(b []byte) error {
   155  	jh := &jsonHLL{}
   156  	if err := json.Unmarshal(b, jh); err != nil {
   157  		return err
   158  	}
   159  	h.counters = jh.Counters
   160  	h.h = murmur3.New64WithSeed(jh.Seed)
   161  	h.seed = jh.Seed
   162  	return nil
   163  }