github.com/jingcheng-WU/gonum@v0.9.1-0.20210323123734-f1a2a11a8f7b/stat/card/hll32.go (about)

     1  // Copyright ©2019 The Gonum Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package card
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/gob"
    10  	"errors"
    11  	"fmt"
    12  	"hash"
    13  	"math"
    14  	"math/bits"
    15  	"reflect"
    16  )
    17  
// HyperLogLog32 implements cardinality estimation according to the
// HyperLogLog algorithm described in Analysis of Algorithms, pp127–146.
type HyperLogLog32 struct {
	// p is the precision of the sketch: the number of high hash bits
	// that Write uses to select a register.
	p uint8
	// m is the number of registers; it is kept equal to 1<<p.
	m uint32

	// hash is the 32-bit hash function applied to each observation.
	// It may be nil on a sketch that has not had a hash set yet.
	hash hash.Hash32

	// register holds, for each bucket, the largest rank (as computed by
	// rho32q) seen among the observations that fell into that bucket.
	register []uint8
}
    28  
    29  // NewHyperLogLog32 returns a new HyperLogLog32 sketch. The value of prec
    30  // must be in the range [4, 32]. NewHyperLogLog32 will allocate a byte slice
    31  // that is 2^prec long.
    32  func NewHyperLogLog32(prec int, h hash.Hash32) (*HyperLogLog32, error) {
    33  	// The implementation here is based on the pseudo-code in
    34  	// "HyperLogLog: the analysis of a near-optimal cardinality
    35  	// estimation algorithm", figure 3.
    36  
    37  	if prec < 4 || w32 < prec {
    38  		return nil, errors.New("card: precision out of range")
    39  	}
    40  	p := uint8(prec)
    41  	m := uint32(1) << p
    42  	return &HyperLogLog32{
    43  		p: p, m: m,
    44  		hash:     h,
    45  		register: make([]byte, m),
    46  	}, nil
    47  }
    48  
    49  // Write notes the data in b as a single observation into the sketch held by
    50  // the receiver.
    51  //
    52  // Write satisfies the io.Writer interface. If the hash.Hash32 type passed to
    53  // NewHyperLogLog32 or SetHash satisfies the hash.Hash contract, Write will always
    54  // return a nil error.
    55  func (h *HyperLogLog32) Write(b []byte) (int, error) {
    56  	n, err := h.hash.Write(b)
    57  	x := h.hash.Sum32()
    58  	h.hash.Reset()
    59  	q := w32 - h.p
    60  	idx := x >> q
    61  	r := rho32q(x, q)
    62  	if r > h.register[idx] {
    63  		h.register[idx] = r
    64  	}
    65  	return n, err
    66  }
    67  
    68  // Union places the union of the sketches in a and b into the receiver.
    69  // Union will return an error if the precisions or hash functions of a
    70  // and b do not match or if the receiver has a hash function that is set
    71  // and does not match those of a and b. Hash functions provided by hash.Hash32
    72  // implementations x and y match when reflect.TypeOf(x) == reflect.TypeOf(y).
    73  //
    74  // If the receiver does not have a set hash function, it can be set after
    75  // a call to Union with the SetHash method.
    76  func (h *HyperLogLog32) Union(a, b *HyperLogLog32) error {
    77  	if a.p != b.p {
    78  		return errors.New("card: mismatched precision")
    79  	}
    80  	ta := reflect.TypeOf(b.hash)
    81  	if reflect.TypeOf(b.hash) != ta {
    82  		return errors.New("card: mismatched hash function")
    83  	}
    84  	if h.hash != nil && reflect.TypeOf(h.hash) != ta {
    85  		return errors.New("card: mismatched hash function")
    86  	}
    87  
    88  	if h != a && h != b {
    89  		*h = HyperLogLog32{p: a.p, m: a.m, hash: h.hash, register: make([]uint8, a.m)}
    90  	}
    91  	for i, r := range a.register {
    92  		h.register[i] = max(r, b.register[i])
    93  	}
    94  	return nil
    95  }
    96  
    97  // SetHash sets the hash function of the receiver if it is nil. SetHash
    98  // will return an error if it is called on a receiver with a non-nil
    99  // hash function.
   100  func (h *HyperLogLog32) SetHash(fn hash.Hash32) error {
   101  	if h.hash == nil {
   102  		return errors.New("card: hash function already set")
   103  	}
   104  	h.hash = fn
   105  	return nil
   106  }
   107  
   108  // Count returns an estimate of the cardinality of the set of items written
   109  // the receiver.
   110  func (h *HyperLogLog32) Count() float64 {
   111  	var s float64
   112  	for _, v := range h.register {
   113  		s += 1 / float64(uint64(1)<<v)
   114  	}
   115  	m := float64(h.m)
   116  	e := alpha(uint64(h.m)) * m * m / s
   117  	if e <= 5*m/2 {
   118  		var v int
   119  		for _, r := range h.register {
   120  			if r == 0 {
   121  				v++
   122  			}
   123  		}
   124  		if v != 0 {
   125  			return linearCounting(m, float64(v))
   126  		}
   127  		return e
   128  	}
   129  	if e <= (1<<w32)/30.0 {
   130  		return e
   131  	}
   132  	return -(1 << w32) * math.Log1p(-e/(1<<w32))
   133  }
   134  
   135  // rho32q (ϱ) is the number of leading zeros in q-wide low bits of x, plus 1.
   136  func rho32q(x uint32, q uint8) uint8 {
   137  	return min(uint8(bits.LeadingZeros32(x<<(w32-q))), q) + 1
   138  }
   139  
   140  // Reset clears the receiver's registers allowing it to be reused.
   141  // Reset does not alter the precision of the receiver or the hash
   142  // function that is used.
   143  func (h *HyperLogLog32) Reset() {
   144  	for i := range h.register {
   145  		h.register[i] = 0
   146  	}
   147  }
   148  
   149  // MarshalBinary marshals the sketch in the receiver. It encodes the
   150  // name of the hash function, the precision of the sketch and the
   151  // sketch data. The receiver must have a non-nil hash function.
   152  func (h *HyperLogLog32) MarshalBinary() ([]byte, error) {
   153  	if h.hash == nil {
   154  		return nil, errors.New("card: hash function not set")
   155  	}
   156  	var buf bytes.Buffer
   157  	enc := gob.NewEncoder(&buf)
   158  	err := enc.Encode(uint8(w32))
   159  	if err != nil {
   160  		return nil, err
   161  	}
   162  	err = enc.Encode(typeNameOf(h.hash))
   163  	if err != nil {
   164  		return nil, err
   165  	}
   166  	err = enc.Encode(h.p)
   167  	if err != nil {
   168  		return nil, err
   169  	}
   170  	err = enc.Encode(h.register)
   171  	if err != nil {
   172  		return nil, err
   173  	}
   174  	return buf.Bytes(), nil
   175  }
   176  
   177  // UnmarshalBinary unmarshals the binary representation of a sketch
   178  // into the receiver. The precision of the receiver will be set after
   179  // return. The receiver must have a non-nil hash function value that is
   180  // the same type as the one that was stored in the binary data.
   181  func (h *HyperLogLog32) UnmarshalBinary(b []byte) error {
   182  	dec := gob.NewDecoder(bytes.NewReader(b))
   183  	var size uint8
   184  	err := dec.Decode(&size)
   185  	if err != nil {
   186  		return err
   187  	}
   188  	if size != w32 {
   189  		return fmt.Errorf("card: mismatched hash function size: dst=%d src=%d", w32, size)
   190  	}
   191  	var srcHash string
   192  	err = dec.Decode(&srcHash)
   193  	if err != nil {
   194  		return err
   195  	}
   196  	if h.hash == nil {
   197  		h.hash = hash32For(srcHash)
   198  		if h.hash == nil {
   199  			return fmt.Errorf("card: hash function not set and no hash registered for %q", srcHash)
   200  		}
   201  	} else {
   202  		dstHash := typeNameOf(h.hash)
   203  		if dstHash != srcHash {
   204  			return fmt.Errorf("card: mismatched hash function: dst=%s src=%s", dstHash, srcHash)
   205  		}
   206  	}
   207  	err = dec.Decode(&h.p)
   208  	if err != nil {
   209  		return err
   210  	}
   211  	h.m = uint32(1) << h.p
   212  	h.register = h.register[:0]
   213  	err = dec.Decode(&h.register)
   214  	if err != nil {
   215  		return err
   216  	}
   217  	return nil
   218  }
   219  
   220  func hash32For(name string) hash.Hash32 {
   221  	fn, ok := hashes.Load(name)
   222  	if !ok {
   223  		return nil
   224  	}
   225  	h, _ := fn.(userType).fn.Call(nil)[0].Interface().(hash.Hash32)
   226  	return h
   227  }