github.com/andy2046/gopie@v0.7.0/pkg/countminsketch/countmin.go (about) 1 // Package countminsketch implements Count-Min Sketch. 2 package countminsketch 3 4 import ( 5 "encoding/binary" 6 "errors" 7 "hash" 8 "hash/fnv" 9 "math" 10 ) 11 12 // CountMinSketch struct. 13 type CountMinSketch struct { 14 matrix [][]uint64 // count matrix 15 width uint // matrix width 16 depth uint // matrix depth 17 count uint64 // total number of items added 18 hash hash.Hash64 // hash function 19 } 20 21 // For a sketch matrix w x d with total sum of all counts N, 22 // the estimate has error at most 2N/w, with probability at least 1-(1/2)^d. 23 24 // New returns new Count-Min Sketch with the given `width` and `depth`. 25 func New(width, depth uint) (*CountMinSketch, error) { 26 if width < 1 || depth < 1 { 27 return nil, errors.New("Dimensions must be positive") 28 } 29 30 matrix := make([][]uint64, depth) 31 for i := uint(0); i < depth; i++ { 32 matrix[i] = make([]uint64, width) 33 } 34 35 return &CountMinSketch{ 36 matrix: matrix, 37 width: width, 38 depth: depth, 39 hash: fnv.New64(), 40 }, nil 41 } 42 43 // NewGuess returns new Count-Min Sketch with the given error rate `epsilon` and confidence `delta`. 44 func NewGuess(epsilon, delta float64) (*CountMinSketch, error) { 45 if epsilon <= 0 || epsilon >= 1 { 46 return nil, errors.New("epsilon must be in range (0, 1)") 47 } 48 if delta <= 0 || delta >= 1 { 49 return nil, errors.New("delta must be in range (0, 1)") 50 } 51 52 width, depth := uint(math.Ceil(math.E/epsilon)), 53 uint(math.Ceil(math.Log(1-delta)/math.Log(0.5))) 54 55 return New(width, depth) 56 } 57 58 // Count returns the number of items added to the sketch. 59 func (c *CountMinSketch) Count() uint64 { 60 return c.count 61 } 62 63 // Add add the `data` to the sketch. `count` default to 1. 64 func (c *CountMinSketch) Add(data []byte, count ...uint64) { 65 cnt := uint64(1) 66 if len(count) > 0 { 67 cnt = count[0] 68 } 69 70 lower, upper := hashn(data, c.hash) 71 72 for i := uint(0); i < c.depth; i++ { 73 c.matrix[i][(uint(lower)+uint(upper)*i)%c.width] += cnt 74 } 75 76 c.count += cnt 77 } 78 79 // AddString add the `data` string to the sketch. `count` default to 1. 80 func (c *CountMinSketch) AddString(data string, count ...uint64) { 81 c.Add([]byte(data), count...) 82 } 83 84 // Estimate estimate the frequency of the `data`. 85 func (c *CountMinSketch) Estimate(data []byte) uint64 { 86 var ( 87 lower, upper = hashn(data, c.hash) 88 count uint64 89 ) 90 91 for i := uint(0); i < c.depth; i++ { 92 j := (uint(lower) + uint(upper)*i) % c.width 93 if i == 0 || c.matrix[i][j] < count { 94 count = c.matrix[i][j] 95 } 96 } 97 98 return count 99 } 100 101 // EstimateString estimate the frequency of the `data` string. 102 func (c *CountMinSketch) EstimateString(data string) uint64 { 103 return c.Estimate([]byte(data)) 104 } 105 106 // Reset reset the sketch to its original state. 107 func (c *CountMinSketch) Reset() { 108 matrix := make([][]uint64, c.depth) 109 for i := uint(0); i < c.depth; i++ { 110 matrix[i] = make([]uint64, c.width) 111 } 112 113 c.matrix = matrix 114 c.count = 0 115 } 116 117 // Merge combines the sketch with another. 118 func (c *CountMinSketch) Merge(other *CountMinSketch) error { 119 if c.depth != other.depth { 120 return errors.New("matrix depth must match") 121 } 122 123 if c.width != other.width { 124 return errors.New("matrix width must match") 125 } 126 127 for i := uint(0); i < c.depth; i++ { 128 for j := uint(0); j < c.width; j++ { 129 c.matrix[i][j] += other.matrix[i][j] 130 } 131 } 132 133 c.count += other.count 134 return nil 135 } 136 137 // Depth returns the matrix depth. 138 func (c *CountMinSketch) Depth() uint { 139 return c.depth 140 } 141 142 // Width returns the matrix width. 143 func (c *CountMinSketch) Width() uint { 144 return c.width 145 } 146 147 func hashn(data []byte, hasher hash.Hash64) (uint32, uint32) { 148 hasher.Reset() 149 hasher.Write(data) 150 sum := hasher.Sum(nil) 151 return binary.BigEndian.Uint32(sum[4:8]), binary.BigEndian.Uint32(sum[0:4]) 152 }