github.com/scottcagno/storage@v1.8.0/pkg/bloom/bloom.go (about) 1 /* 2 * 3 * * // Copyright (c) 2021 Scott Cagno. All rights reserved. 4 * * // The license can be found in the root of this project; see LICENSE. 5 * 6 */ 7 8 package bloom 9 10 import ( 11 "fmt" 12 "github.com/scottcagno/storage/pkg/bits" 13 "github.com/scottcagno/storage/pkg/hash/cityhash" 14 "math" 15 ) 16 17 const ( 18 kp00 = uint64(1610612741) 19 kp01 = uint64(402653189) 20 kp02 = uint64(805306457) 21 kp03 = uint64(201326611) 22 kp04 = uint64(1728949133) 23 kp05 = uint64(8543917829) 24 kp06 = uint64(648679351) 25 kp07 = uint64(9196230203) 26 ) 27 28 // n|N = number of items in the filter 29 // p|P = probability of false positives 30 // m|M = total number of bits in the filter, ie. size 31 // k|K = number of hash functions 32 33 // BloomFilter is a basic bloom filter implementation 34 type BloomFilter struct { 35 m uint // m is the number of bits allocated for the filter 36 k uint // k is the number of hash functions for the filter 37 n uint // n is the number of items "in" the filter 38 b *bits.BitSet 39 count int 40 mask uint64 41 } 42 43 // minimum item count, aka default 44 const minItemCount = math.MaxUint8 45 46 // NewBloomFilter returns a new filter with m number of bits available and hints to use k hash functions 47 func NewBloomFilter(n uint) *BloomFilter { 48 if n < minItemCount { 49 n = minItemCount 50 } 51 // using k=8 and maintaining a bitset m=n*24 provides a fairly 52 // constant p=0.00004 (1 in 25,000) false positive ratio which 53 // is probably acceptable in almost all cases I can think of 54 return &BloomFilter{ 55 m: n * 24, 56 k: 8, 57 b: bits.NewBitSet(n), 58 mask: uint64(n - 1), 59 } 60 } 61 62 // -> n = ceil(m / (-k / log(1 - exp(log(p) / k)))) 63 // -> p = pow(1 - exp(-k / (m / n)), k) 64 // -> m = ceil((n * log(p)) / log(1 / pow(2, log(2)))) 65 // -> k = round((m / n) * log(2)) 66 67 func hashes(data []byte) [8]uint64 { 68 h1, h2 := cityhash.Hash128WithSeed(data, kp00, kp01) 69 h3, h4 := cityhash.Hash128WithSeed(data, kp02, kp03) 70 h5, h6 := cityhash.Hash128WithSeed(data, kp04, kp05) 71 h7, h8 := cityhash.Hash128WithSeed(data, kp06, kp07) 72 return [8]uint64{h1, h2, h3, h4, h5, h6, h7, h8} 73 } 74 75 // mask returns the ith hashed location using the eight base hash values 76 func hashAndMask(h [8]uint64, i uint) uint64 { 77 ii := uint64(i) 78 return h[ii&1] + ii*h[2+(((ii+(ii&1))&7)>>1)] 79 } 80 81 // location returns the ith hashed location using the four base hash values 82 func (f *BloomFilter) hashAndMask(h [8]uint64, i uint) uint { 83 return uint(hashAndMask(h, i) % uint64(f.m)) 84 } 85 86 func info(data []byte, hashes [8]uint64, hashAndMask [8]uint) { 87 fmt.Printf("data: %q\n", data) 88 fmt.Printf("hashes:\n\t%v\n", hashes) 89 fmt.Printf("set locations:\n") 90 for i := 0; i < 8; i++ { 91 fmt.Printf("\ti=%d, f.k=%d, l1=%d\n", i, 8, hashAndMask[i]) 92 } 93 } 94 95 // location returns the ith hashed location using the four base hash values 96 func location(h [8]uint64, i uint) uint64 { 97 ii := uint64(i) 98 return h[ii%2] + ii*h[2+(((ii+(ii%2))%8)/2)] 99 } 100 101 // location returns the ith hashed location using the four base hash values 102 func (f *BloomFilter) location(h [8]uint64, i uint) uint { 103 return uint(location(h, i) % uint64(f.m)) 104 } 105 106 func (f *BloomFilter) Set(data []byte) { 107 h := hashes(data) 108 for i := uint(0); i < f.k; i++ { 109 f.b.Set(f.hashAndMask(h, i)) 110 } 111 f.count++ 112 } 113 114 func (f *BloomFilter) Unset(data []byte) { 115 h := hashes(data) 116 for i := uint(0); i < f.k; i++ { 117 f.b.Unset(f.hashAndMask(h, i)) 118 } 119 f.count-- 120 } 121 122 func (f *BloomFilter) Count() int { 123 return f.count 124 } 125 126 func (f *BloomFilter) Size() int { 127 return int(f.m) 128 } 129 130 func (f *BloomFilter) MayHave(data []byte) bool { 131 return f.Has(data) 132 } 133 134 // Has returns true if the data is in the BloomFilter, false otherwise. 135 // If true, the result might be a false positive. If false, the data 136 // is definitely not in the set. 137 func (f *BloomFilter) Has(data []byte) bool { 138 h := hashes(data) 139 for i := uint(0); i < f.k; i++ { 140 if !f.b.IsSet(f.hashAndMask(h, i)) { 141 return false 142 } 143 } 144 return true 145 } 146 147 func split2(x uint64) []uint32 { 148 return []uint32{ 149 uint32(x >> 0), 150 uint32(x >> 32), 151 } 152 } 153 154 func join2(a, b uint32) uint64 { 155 return uint64(a) | uint64(b)<<32 156 } 157 158 func split4(x uint64) []uint16 { 159 return []uint16{ 160 uint16(x >> 0), 161 uint16(x >> 16), 162 uint16(x >> 32), 163 uint16(x >> 48), 164 } 165 } 166 167 func join4(a, b, c, d int64) uint64 { 168 return uint64(a) | uint64(b)<<16 | uint64(c)<<32 | uint64(d)<<48 169 }