github.com/andy2046/gopie@v0.7.0/pkg/countminsketch/countmin.go (about)

     1  // Package countminsketch implements Count-Min Sketch.
     2  package countminsketch
     3  
     4  import (
     5  	"encoding/binary"
     6  	"errors"
     7  	"hash"
     8  	"hash/fnv"
     9  	"math"
    10  )
    11  
    12  // CountMinSketch struct.
    13  type CountMinSketch struct {
    14  	matrix [][]uint64  // count matrix
    15  	width  uint        // matrix width
    16  	depth  uint        // matrix depth
    17  	count  uint64      // total number of items added
    18  	hash   hash.Hash64 // hash function
    19  }
    20  
    21  // For a sketch matrix w x d with total sum of all counts N,
    22  // the estimate has error at most 2N/w, with probability at least 1-(1/2)^d.
    23  
    24  // New returns new Count-Min Sketch with the given `width` and `depth`.
    25  func New(width, depth uint) (*CountMinSketch, error) {
    26  	if width < 1 || depth < 1 {
    27  		return nil, errors.New("Dimensions must be positive")
    28  	}
    29  
    30  	matrix := make([][]uint64, depth)
    31  	for i := uint(0); i < depth; i++ {
    32  		matrix[i] = make([]uint64, width)
    33  	}
    34  
    35  	return &CountMinSketch{
    36  		matrix: matrix,
    37  		width:  width,
    38  		depth:  depth,
    39  		hash:   fnv.New64(),
    40  	}, nil
    41  }
    42  
    43  // NewGuess returns new Count-Min Sketch with the given error rate `epsilon` and confidence `delta`.
    44  func NewGuess(epsilon, delta float64) (*CountMinSketch, error) {
    45  	if epsilon <= 0 || epsilon >= 1 {
    46  		return nil, errors.New("epsilon must be in range (0, 1)")
    47  	}
    48  	if delta <= 0 || delta >= 1 {
    49  		return nil, errors.New("delta must be in range (0, 1)")
    50  	}
    51  
    52  	width, depth := uint(math.Ceil(math.E/epsilon)),
    53  		uint(math.Ceil(math.Log(1-delta)/math.Log(0.5)))
    54  
    55  	return New(width, depth)
    56  }
    57  
    58  // Count returns the number of items added to the sketch.
    59  func (c *CountMinSketch) Count() uint64 {
    60  	return c.count
    61  }
    62  
    63  // Add add the `data` to the sketch. `count` default to 1.
    64  func (c *CountMinSketch) Add(data []byte, count ...uint64) {
    65  	cnt := uint64(1)
    66  	if len(count) > 0 {
    67  		cnt = count[0]
    68  	}
    69  
    70  	lower, upper := hashn(data, c.hash)
    71  
    72  	for i := uint(0); i < c.depth; i++ {
    73  		c.matrix[i][(uint(lower)+uint(upper)*i)%c.width] += cnt
    74  	}
    75  
    76  	c.count += cnt
    77  }
    78  
    79  // AddString add the `data` string to the sketch. `count` default to 1.
    80  func (c *CountMinSketch) AddString(data string, count ...uint64) {
    81  	c.Add([]byte(data), count...)
    82  }
    83  
    84  // Estimate estimate the frequency of the `data`.
    85  func (c *CountMinSketch) Estimate(data []byte) uint64 {
    86  	var (
    87  		lower, upper = hashn(data, c.hash)
    88  		count        uint64
    89  	)
    90  
    91  	for i := uint(0); i < c.depth; i++ {
    92  		j := (uint(lower) + uint(upper)*i) % c.width
    93  		if i == 0 || c.matrix[i][j] < count {
    94  			count = c.matrix[i][j]
    95  		}
    96  	}
    97  
    98  	return count
    99  }
   100  
   101  // EstimateString estimate the frequency of the `data` string.
   102  func (c *CountMinSketch) EstimateString(data string) uint64 {
   103  	return c.Estimate([]byte(data))
   104  }
   105  
   106  // Reset reset the sketch to its original state.
   107  func (c *CountMinSketch) Reset() {
   108  	matrix := make([][]uint64, c.depth)
   109  	for i := uint(0); i < c.depth; i++ {
   110  		matrix[i] = make([]uint64, c.width)
   111  	}
   112  
   113  	c.matrix = matrix
   114  	c.count = 0
   115  }
   116  
   117  // Merge combines the sketch with another.
   118  func (c *CountMinSketch) Merge(other *CountMinSketch) error {
   119  	if c.depth != other.depth {
   120  		return errors.New("matrix depth must match")
   121  	}
   122  
   123  	if c.width != other.width {
   124  		return errors.New("matrix width must match")
   125  	}
   126  
   127  	for i := uint(0); i < c.depth; i++ {
   128  		for j := uint(0); j < c.width; j++ {
   129  			c.matrix[i][j] += other.matrix[i][j]
   130  		}
   131  	}
   132  
   133  	c.count += other.count
   134  	return nil
   135  }
   136  
   137  // Depth returns the matrix depth.
   138  func (c *CountMinSketch) Depth() uint {
   139  	return c.depth
   140  }
   141  
   142  // Width returns the matrix width.
   143  func (c *CountMinSketch) Width() uint {
   144  	return c.width
   145  }
   146  
   147  func hashn(data []byte, hasher hash.Hash64) (uint32, uint32) {
   148  	hasher.Reset()
   149  	hasher.Write(data)
   150  	sum := hasher.Sum(nil)
   151  	return binary.BigEndian.Uint32(sum[4:8]), binary.BigEndian.Uint32(sum[0:4])
   152  }