// Source: github.com/symblcrowd/bloom@v2.0.5+incompatible/bloom.go

/*
Package bloom provides data structures and methods for creating Bloom filters.

A Bloom filter is a representation of a set of _n_ items, where the main
requirement is to make membership queries; _i.e._, whether an item is a
member of a set.

A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
multiple of the cardinality of the set to represent) and _k_, the number of hashing
functions on elements of the set. (The actual hashing functions are important, too,
but this is not a parameter for this implementation). A Bloom filter is backed by
a BitSet; a key is represented in the filter by setting the bits at each value of the
hashing functions (modulo _m_). Set membership is done by _testing_ whether the
bits at each value of the hashing functions (again, modulo _m_) are set. If so,
the item is considered to be in the set. If the item is actually in the set, a
Bloom filter will never fail (the true positive rate is 1.0); but it is susceptible
to false positives. The art is to choose _k_ and _m_ correctly.

In this implementation, the hashing function used is murmurhash,
a non-cryptographic hashing function.

This implementation accepts keys for setting and testing as []byte. Thus, to
add a string item, "Love":

	n := uint(1000)
	filter := bloom.New(20*n, 5) // load of 20, 5 keys
	filter.Add([]byte("Love"))

Similarly, to test if "Love" is in bloom:

	if filter.Test([]byte("Love"))

For numeric data, I recommend that you look into the encoding/binary package.
But, 34 for example, to add a uint32 to the filter: 35 36 i := uint32(100) 37 n1 := make([]byte,4) 38 binary.BigEndian.PutUint32(n1,i) 39 f.Add(n1) 40 41 Finally, there is a method to estimate the false positive rate of a particular 42 Bloom filter for a set of size _n_: 43 44 if filter.EstimateFalsePositiveRate(1000) > 0.001 45 46 Given the particular hashing scheme, it's best to be empirical about this. Note 47 that estimating the FP rate will clear the Bloom filter. 48 */ 49 package bloom 50 51 import ( 52 "bytes" 53 "encoding/binary" 54 "encoding/json" 55 "fmt" 56 "io" 57 "math" 58 59 "github.com/bits-and-blooms/bitset" 60 "github.com/spaolacci/murmur3" 61 "log" 62 "strings" 63 ) 64 65 // A BloomFilter is a representation of a set of _n_ items, where the main 66 // requirement is to make membership queries; _i.e._, whether an item is a 67 // member of a set. 68 type BloomFilter struct { 69 m uint 70 k uint 71 b *bitset.BitSet 72 } 73 74 func max(x, y uint) uint { 75 if x > y { 76 return x 77 } 78 return y 79 } 80 81 // New creates a new Bloom filter with _m_ bits and _k_ hashing functions 82 // We force _m_ and _k_ to be at least one to avoid panics. 83 func New(m uint, k uint) *BloomFilter { 84 return &BloomFilter{max(1, m), max(1, k), bitset.New(m)} 85 } 86 87 // From creates a new Bloom filter with len(_data_) * 64 bits and _k_ hashing 88 // functions. The data slice is not going to be reset. 
89 func From(data []uint64, k uint) *BloomFilter { 90 m := uint(len(data) * 64) 91 return &BloomFilter{m, k, bitset.From(data)} 92 } 93 94 // baseHashes returns the four hash values of data that are used to create k 95 // hashes 96 func baseHashes(data []byte) [4]uint64 { 97 a1 := []byte{1} // to grab another bit of data 98 hasher := murmur3.New128() 99 hasher.Write(data) // #nosec 100 v1, v2 := hasher.Sum128() 101 hasher.Write(a1) // #nosec 102 v3, v4 := hasher.Sum128() 103 return [4]uint64{ 104 v1, v2, v3, v4, 105 } 106 } 107 108 // location returns the ith hashed location using the four base hash values 109 func location(h [4]uint64, i uint) uint64 { 110 ii := uint64(i) 111 return h[ii%2] + ii*h[2+(((ii+(ii%2))%4)/2)] 112 } 113 114 // location returns the ith hashed location using the four base hash values 115 func (f *BloomFilter) location(h [4]uint64, i uint) uint { 116 return uint(location(h, i) % uint64(f.m)) 117 } 118 119 func (f *BloomFilter) GetB() *bitset.BitSet { 120 return f.b 121 } 122 123 // EstimateParameters estimates requirements for m and k. 124 // Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go 125 // used with permission. 126 func EstimateParameters(n uint, p float64) (m uint, k uint) { 127 m = uint(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2))) 128 k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n))) 129 return 130 } 131 132 // NewWithEstimates creates a new Bloom filter for about n items with fp 133 // false positive rate 134 func NewWithEstimates(n uint, fp float64) *BloomFilter { 135 m, k := EstimateParameters(n, fp) 136 return New(m, k) 137 } 138 139 // Cap returns the capacity, _m_, of a Bloom filter 140 func (f *BloomFilter) Cap() uint { 141 return f.m 142 } 143 144 // K returns the number of hash functions used in the BloomFilter 145 func (f *BloomFilter) K() uint { 146 return f.k 147 } 148 149 // Add data to the Bloom Filter. 
Returns the filter (allows chaining) 150 func (f *BloomFilter) Add(data []byte) *BloomFilter { 151 h := baseHashes(data) 152 for i := uint(0); i < f.k; i++ { 153 f.b.Set(f.location(h, i)) 154 } 155 return f 156 } 157 158 // Merge the data from two Bloom Filters. 159 func (f *BloomFilter) Merge(g *BloomFilter) error { 160 // Make sure the m's and k's are the same, otherwise merging has no real use. 161 if f.m != g.m { 162 return fmt.Errorf("m's don't match: %d != %d", f.m, g.m) 163 } 164 165 if f.k != g.k { 166 return fmt.Errorf("k's don't match: %d != %d", f.m, g.m) 167 } 168 169 f.b.InPlaceUnion(g.b) 170 return nil 171 } 172 173 // Copy creates a copy of a Bloom filter. 174 func (f *BloomFilter) Copy() *BloomFilter { 175 fc := New(f.m, f.k) 176 fc.Merge(f) // #nosec 177 return fc 178 } 179 180 // AddString to the Bloom Filter. Returns the filter (allows chaining) 181 func (f *BloomFilter) AddString(data string) *BloomFilter { 182 return f.Add([]byte(data)) 183 } 184 185 // Test returns true if the data is in the BloomFilter, false otherwise. 186 // If true, the result might be a false positive. If false, the data 187 // is definitely not in the set. 188 func (f *BloomFilter) Test(data []byte) bool { 189 h := baseHashes(data) 190 for i := uint(0); i < f.k; i++ { 191 if !f.b.Test(f.location(h, i)) { 192 return false 193 } 194 } 195 return true 196 } 197 198 // TestString returns true if the string is in the BloomFilter, false otherwise. 199 // If true, the result might be a false positive. If false, the data 200 // is definitely not in the set. 201 func (f *BloomFilter) TestString(data string) bool { 202 return f.Test([]byte(data)) 203 } 204 205 // TestLocations returns true if all locations are set in the BloomFilter, false 206 // otherwise. 
207 func (f *BloomFilter) TestLocations(locs []uint64) bool { 208 for i := 0; i < len(locs); i++ { 209 if !f.b.Test(uint(locs[i] % uint64(f.m))) { 210 return false 211 } 212 } 213 return true 214 } 215 216 // TestAndAdd is the equivalent to calling Test(data) then Add(data). 217 // Returns the result of Test. 218 func (f *BloomFilter) TestAndAdd(data []byte) bool { 219 present := true 220 h := baseHashes(data) 221 for i := uint(0); i < f.k; i++ { 222 l := f.location(h, i) 223 if !f.b.Test(l) { 224 present = false 225 } 226 f.b.Set(l) 227 } 228 return present 229 } 230 231 // TestAndAddString is the equivalent to calling Test(string) then Add(string). 232 // Returns the result of Test. 233 func (f *BloomFilter) TestAndAddString(data string) bool { 234 return f.TestAndAdd([]byte(data)) 235 } 236 237 func (f *BloomFilter) Compare(filter2 BloomFilter) float64 { 238 s1 := f.GetB().String() 239 s1 = strings.Replace(s1, "{", "", 1) 240 s1 = strings.Replace(s1, "}", "", 1) 241 s2 := filter2.GetB().String() 242 s2 = strings.Replace(s2, "{", "", 1) 243 s2 = strings.Replace(s2, "}", "", 1) 244 245 positions1 := strings.Split(s1, ",") 246 positions2 := strings.Split(s2, ",") 247 248 mapKeys1 := make(map[string]bool) 249 mapKeys2 := make(map[string]bool) 250 allKeys := make(map[string]bool) 251 252 for _, pos1 := range positions1 { 253 mapKeys1[pos1] = true 254 allKeys[pos1] = true 255 } 256 for _, pos2 := range positions2 { 257 mapKeys2[pos2] = true 258 allKeys[pos2] = true 259 } 260 log.Println(len(allKeys)) 261 gleich := 0 262 gesamt := 0 263 for key, _ := range allKeys { 264 v1, _ := mapKeys1[key] 265 v2, _ := mapKeys2[key] 266 if v1 && v2 { 267 gleich++ 268 gesamt++ 269 } else if v1 || v2 { 270 gesamt++ 271 } 272 } 273 274 log.Println(gleich) 275 log.Println(gesamt) 276 val := float64(gleich) / float64(gesamt) 277 log.Println(val) 278 return val 279 280 //return f.b.DifferenceCardinality(filter2.b) 281 } 282 283 // ClearAll clears all the data in a Bloom filter, removing 
all keys 284 func (f *BloomFilter) ClearAll() *BloomFilter { 285 f.b.ClearAll() 286 return f 287 } 288 289 // EstimateFalsePositiveRate returns, for a BloomFilter with a estimate of m bits 290 // and k hash functions, what the false positive rate will be 291 // while storing n entries; runs 100,000 tests. This is an empirical 292 // test using integers as keys. As a side-effect, it clears the BloomFilter. 293 func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fpRate float64) { 294 rounds := uint32(100000) 295 f.ClearAll() 296 n1 := make([]byte, 4) 297 for i := uint32(0); i < uint32(n); i++ { 298 binary.BigEndian.PutUint32(n1, i) 299 f.Add(n1) 300 } 301 fp := 0 302 // test for number of rounds 303 for i := uint32(0); i < rounds; i++ { 304 binary.BigEndian.PutUint32(n1, i+uint32(n)+1) 305 if f.Test(n1) { 306 //fmt.Printf("%v failed.\n", i+uint32(n)+1) 307 fp++ 308 } 309 } 310 fpRate = float64(fp) / (float64(rounds)) 311 f.ClearAll() 312 return 313 } 314 315 // bloomFilterJSON is an unexported type for marshaling/unmarshaling BloomFilter struct. 316 type bloomFilterJSON struct { 317 M uint `json:"m"` 318 K uint `json:"k"` 319 B *bitset.BitSet `json:"b"` 320 } 321 322 // MarshalJSON implements json.Marshaler interface. 323 func (f *BloomFilter) MarshalJSON() ([]byte, error) { 324 return json.Marshal(bloomFilterJSON{f.m, f.k, f.b}) 325 } 326 327 // UnmarshalJSON implements json.Unmarshaler interface. 328 func (f *BloomFilter) UnmarshalJSON(data []byte) error { 329 var j bloomFilterJSON 330 err := json.Unmarshal(data, &j) 331 if err != nil { 332 return err 333 } 334 f.m = j.M 335 f.k = j.K 336 f.b = j.B 337 return nil 338 } 339 340 // WriteTo writes a binary representation of the BloomFilter to an i/o stream. 341 // It returns the number of bytes written. 
342 func (f *BloomFilter) WriteTo(stream io.Writer) (int64, error) { 343 err := binary.Write(stream, binary.BigEndian, uint64(f.m)) 344 if err != nil { 345 return 0, err 346 } 347 err = binary.Write(stream, binary.BigEndian, uint64(f.k)) 348 if err != nil { 349 return 0, err 350 } 351 numBytes, err := f.b.WriteTo(stream) 352 return numBytes + int64(2*binary.Size(uint64(0))), err 353 } 354 355 // ReadFrom reads a binary representation of the BloomFilter (such as might 356 // have been written by WriteTo()) from an i/o stream. It returns the number 357 // of bytes read. 358 func (f *BloomFilter) ReadFrom(stream io.Reader) (int64, error) { 359 var m, k uint64 360 err := binary.Read(stream, binary.BigEndian, &m) 361 if err != nil { 362 return 0, err 363 } 364 err = binary.Read(stream, binary.BigEndian, &k) 365 if err != nil { 366 return 0, err 367 } 368 b := &bitset.BitSet{} 369 numBytes, err := b.ReadFrom(stream) 370 if err != nil { 371 return 0, err 372 } 373 f.m = uint(m) 374 f.k = uint(k) 375 f.b = b 376 return numBytes + int64(2*binary.Size(uint64(0))), nil 377 } 378 379 // GobEncode implements gob.GobEncoder interface. 380 func (f *BloomFilter) GobEncode() ([]byte, error) { 381 var buf bytes.Buffer 382 _, err := f.WriteTo(&buf) 383 if err != nil { 384 return nil, err 385 } 386 387 return buf.Bytes(), nil 388 } 389 390 // GobDecode implements gob.GobDecoder interface. 391 func (f *BloomFilter) GobDecode(data []byte) error { 392 buf := bytes.NewBuffer(data) 393 _, err := f.ReadFrom(buf) 394 395 return err 396 } 397 398 // Equal tests for the equality of two Bloom filters 399 func (f *BloomFilter) Equal(g *BloomFilter) bool { 400 return f.m == g.m && f.k == g.k && f.b.Equal(g.b) 401 } 402 403 // Locations returns a list of hash locations representing a data item. 
404 func Locations(data []byte, k uint) []uint64 { 405 locs := make([]uint64, k) 406 407 // calculate locations 408 h := baseHashes(data) 409 for i := uint(0); i < k; i++ { 410 locs[i] = location(h, i) 411 } 412 413 return locs 414 }