github.com/symblcrowd/bloom@v2.0.5+incompatible/bloom.go (about)

     1  /*
     2  Package bloom provides data structures and methods for creating Bloom filters.
     3  
     4  A Bloom filter is a representation of a set of _n_ items, where the main
     5  requirement is to make membership queries; _i.e._, whether an item is a
     6  member of a set.
     7  
     8  A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
     9  multiple of the cardinality of the set to represent) and _k_, the number of hashing
    10  functions on elements of the set. (The actual hashing functions are important, too,
    11  but this is not a parameter for this implementation). A Bloom filter is backed by
    12  a BitSet; a key is represented in the filter by setting the bits at each value of the
    13  hashing functions (modulo _m_). Set membership is done by _testing_ whether the
    14  bits at each value of the hashing functions (again, modulo _m_) are set. If so,
    15  the item is in the set. If the item is actually in the set, a Bloom filter will
    16  never fail (the true positive rate is 1.0); but it is susceptible to false
    17  positives. The art is to choose _k_ and _m_ correctly.
    18  
    19  In this implementation, the hashing function used is murmurhash,
    20  a non-cryptographic hashing function.
    21  
    22  This implementation accepts keys for setting and testing as []byte. Thus, to
    23  add a string item, "Love":
    24  
    25      n := uint(1000)
    26      filter := bloom.New(20*n, 5) // load of 20, 5 hash functions
    27      filter.Add([]byte("Love"))
    28  
    29  Similarly, to test if "Love" is in bloom:
    30  
    31      if filter.Test([]byte("Love"))
    32  
    33  For numeric data, I recommend that you look into the encoding/binary package. But,
    34  for example, to add a uint32 to the filter:
    35  
    36      i := uint32(100)
    37      n1 := make([]byte,4)
    38      binary.BigEndian.PutUint32(n1,i)
    39      filter.Add(n1)
    40  
    41  Finally, there is a method to estimate the false positive rate of a particular
    42  Bloom filter for a set of size _n_:
    43  
    44      if filter.EstimateFalsePositiveRate(1000) > 0.001
    45  
    46  Given the particular hashing scheme, it's best to be empirical about this. Note
    47  that estimating the FP rate will clear the Bloom filter.
    48  */
    49  package bloom
    50  
    51  import (
    52  	"bytes"
    53  	"encoding/binary"
    54  	"encoding/json"
    55  	"fmt"
    56  	"io"
    57  	"math"
    58  
    59  	"github.com/bits-and-blooms/bitset"
    60  	"github.com/spaolacci/murmur3"
    61  	"log"
    62  	"strings"
    63  )
    64  
    65  // A BloomFilter is a representation of a set of _n_ items, where the main
    66  // requirement is to make membership queries; _i.e._, whether an item is a
    67  // member of a set.
    68  type BloomFilter struct {
    69  	m uint
    70  	k uint
    71  	b *bitset.BitSet
    72  }
    73  
    74  func max(x, y uint) uint {
    75  	if x > y {
    76  		return x
    77  	}
    78  	return y
    79  }
    80  
    81  // New creates a new Bloom filter with _m_ bits and _k_ hashing functions
    82  // We force _m_ and _k_ to be at least one to avoid panics.
    83  func New(m uint, k uint) *BloomFilter {
    84  	return &BloomFilter{max(1, m), max(1, k), bitset.New(m)}
    85  }
    86  
    87  // From creates a new Bloom filter with len(_data_) * 64 bits and _k_ hashing
    88  // functions. The data slice is not going to be reset.
    89  func From(data []uint64, k uint) *BloomFilter {
    90  	m := uint(len(data) * 64)
    91  	return &BloomFilter{m, k, bitset.From(data)}
    92  }
    93  
    94  // baseHashes returns the four hash values of data that are used to create k
    95  // hashes
    96  func baseHashes(data []byte) [4]uint64 {
    97  	a1 := []byte{1} // to grab another bit of data
    98  	hasher := murmur3.New128()
    99  	hasher.Write(data) // #nosec
   100  	v1, v2 := hasher.Sum128()
   101  	hasher.Write(a1) // #nosec
   102  	v3, v4 := hasher.Sum128()
   103  	return [4]uint64{
   104  		v1, v2, v3, v4,
   105  	}
   106  }
   107  
   108  // location returns the ith hashed location using the four base hash values
   109  func location(h [4]uint64, i uint) uint64 {
   110  	ii := uint64(i)
   111  	return h[ii%2] + ii*h[2+(((ii+(ii%2))%4)/2)]
   112  }
   113  
   114  // location returns the ith hashed location using the four base hash values
   115  func (f *BloomFilter) location(h [4]uint64, i uint) uint {
   116  	return uint(location(h, i) % uint64(f.m))
   117  }
   118  
// GetB returns the bit set backing the filter. The pointer held by the filter
// is returned as is, not a copy, so changes made through it affect the filter.
func (f *BloomFilter) GetB() *bitset.BitSet {
	return f.b
}
   122  
   123  // EstimateParameters estimates requirements for m and k.
   124  // Based on https://bitbucket.org/ww/bloom/src/829aa19d01d9/bloom.go
   125  // used with permission.
   126  func EstimateParameters(n uint, p float64) (m uint, k uint) {
   127  	m = uint(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
   128  	k = uint(math.Ceil(math.Log(2) * float64(m) / float64(n)))
   129  	return
   130  }
   131  
   132  // NewWithEstimates creates a new Bloom filter for about n items with fp
   133  // false positive rate
   134  func NewWithEstimates(n uint, fp float64) *BloomFilter {
   135  	m, k := EstimateParameters(n, fp)
   136  	return New(m, k)
   137  }
   138  
// Cap returns the capacity, _m_, of a Bloom filter. This is the value used as
// the modulus when hash locations are computed.
func (f *BloomFilter) Cap() uint {
	return f.m
}
   143  
// K returns the number of hash functions used in the BloomFilter, i.e. the
// number of locations set or tested for each item.
func (f *BloomFilter) K() uint {
	return f.k
}
   148  
   149  // Add data to the Bloom Filter. Returns the filter (allows chaining)
   150  func (f *BloomFilter) Add(data []byte) *BloomFilter {
   151  	h := baseHashes(data)
   152  	for i := uint(0); i < f.k; i++ {
   153  		f.b.Set(f.location(h, i))
   154  	}
   155  	return f
   156  }
   157  
   158  // Merge the data from two Bloom Filters.
   159  func (f *BloomFilter) Merge(g *BloomFilter) error {
   160  	// Make sure the m's and k's are the same, otherwise merging has no real use.
   161  	if f.m != g.m {
   162  		return fmt.Errorf("m's don't match: %d != %d", f.m, g.m)
   163  	}
   164  
   165  	if f.k != g.k {
   166  		return fmt.Errorf("k's don't match: %d != %d", f.m, g.m)
   167  	}
   168  
   169  	f.b.InPlaceUnion(g.b)
   170  	return nil
   171  }
   172  
   173  // Copy creates a copy of a Bloom filter.
   174  func (f *BloomFilter) Copy() *BloomFilter {
   175  	fc := New(f.m, f.k)
   176  	fc.Merge(f) // #nosec
   177  	return fc
   178  }
   179  
// AddString adds the bytes of the string data to the Bloom filter by calling
// Add. Returns the filter (allows chaining).
func (f *BloomFilter) AddString(data string) *BloomFilter {
	return f.Add([]byte(data))
}
   184  
   185  // Test returns true if the data is in the BloomFilter, false otherwise.
   186  // If true, the result might be a false positive. If false, the data
   187  // is definitely not in the set.
   188  func (f *BloomFilter) Test(data []byte) bool {
   189  	h := baseHashes(data)
   190  	for i := uint(0); i < f.k; i++ {
   191  		if !f.b.Test(f.location(h, i)) {
   192  			return false
   193  		}
   194  	}
   195  	return true
   196  }
   197  
// TestString returns true if the string is in the BloomFilter, false otherwise.
// If true, the result might be a false positive. If false, the data
// is definitely not in the set. It calls Test on the bytes of the string.
func (f *BloomFilter) TestString(data string) bool {
	return f.Test([]byte(data))
}
   204  
   205  // TestLocations returns true if all locations are set in the BloomFilter, false
   206  // otherwise.
   207  func (f *BloomFilter) TestLocations(locs []uint64) bool {
   208  	for i := 0; i < len(locs); i++ {
   209  		if !f.b.Test(uint(locs[i] % uint64(f.m))) {
   210  			return false
   211  		}
   212  	}
   213  	return true
   214  }
   215  
   216  // TestAndAdd is the equivalent to calling Test(data) then Add(data).
   217  // Returns the result of Test.
   218  func (f *BloomFilter) TestAndAdd(data []byte) bool {
   219  	present := true
   220  	h := baseHashes(data)
   221  	for i := uint(0); i < f.k; i++ {
   222  		l := f.location(h, i)
   223  		if !f.b.Test(l) {
   224  			present = false
   225  		}
   226  		f.b.Set(l)
   227  	}
   228  	return present
   229  }
   230  
// TestAndAddString is the equivalent to calling Test(string) then Add(string).
// Returns the result of Test. It calls TestAndAdd on the bytes of the string.
func (f *BloomFilter) TestAndAddString(data string) bool {
	return f.TestAndAdd([]byte(data))
}
   236  
   237  func (f *BloomFilter) Compare(filter2 BloomFilter) float64 {
   238  	s1 := f.GetB().String()
   239  	s1 = strings.Replace(s1, "{", "", 1)
   240  	s1 = strings.Replace(s1, "}", "", 1)
   241  	s2 := filter2.GetB().String()
   242  	s2 = strings.Replace(s2, "{", "", 1)
   243  	s2 = strings.Replace(s2, "}", "", 1)
   244  
   245  	positions1 := strings.Split(s1, ",")
   246  	positions2 := strings.Split(s2, ",")
   247  
   248  	mapKeys1 := make(map[string]bool)
   249  	mapKeys2 := make(map[string]bool)
   250  	allKeys := make(map[string]bool)
   251  
   252  	for _, pos1 := range positions1 {
   253  		mapKeys1[pos1] = true
   254  		allKeys[pos1] = true
   255  	}
   256  	for _, pos2 := range positions2 {
   257  		mapKeys2[pos2] = true
   258  		allKeys[pos2] = true
   259  	}
   260  	log.Println(len(allKeys))
   261  	gleich := 0
   262  	gesamt := 0
   263  	for key, _ := range allKeys {
   264  		v1, _ := mapKeys1[key]
   265  		v2, _ := mapKeys2[key]
   266  		if v1 && v2 {
   267  			gleich++
   268  			gesamt++
   269  		} else if v1 || v2 {
   270  			gesamt++
   271  		}
   272  	}
   273  
   274  	log.Println(gleich)
   275  	log.Println(gesamt)
   276  	val := float64(gleich) / float64(gesamt)
   277  	log.Println(val)
   278  	return val
   279  
   280  	//return f.b.DifferenceCardinality(filter2.b)
   281  }
   282  
// ClearAll clears all the data in a Bloom filter, removing all keys. It
// returns the filter itself so that calls can be chained.
func (f *BloomFilter) ClearAll() *BloomFilter {
	f.b.ClearAll()
	return f
}
   288  
   289  // EstimateFalsePositiveRate returns, for a BloomFilter with a estimate of m bits
   290  // and k hash functions, what the false positive rate will be
   291  // while storing n entries; runs 100,000 tests. This is an empirical
   292  // test using integers as keys. As a side-effect, it clears the BloomFilter.
   293  func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fpRate float64) {
   294  	rounds := uint32(100000)
   295  	f.ClearAll()
   296  	n1 := make([]byte, 4)
   297  	for i := uint32(0); i < uint32(n); i++ {
   298  		binary.BigEndian.PutUint32(n1, i)
   299  		f.Add(n1)
   300  	}
   301  	fp := 0
   302  	// test for number of rounds
   303  	for i := uint32(0); i < rounds; i++ {
   304  		binary.BigEndian.PutUint32(n1, i+uint32(n)+1)
   305  		if f.Test(n1) {
   306  			//fmt.Printf("%v failed.\n", i+uint32(n)+1)
   307  			fp++
   308  		}
   309  	}
   310  	fpRate = float64(fp) / (float64(rounds))
   311  	f.ClearAll()
   312  	return
   313  }
   314  
// bloomFilterJSON is an unexported type for marshaling/unmarshaling BloomFilter struct.
// Its exported fields mirror the unexported fields of BloomFilter so that
// encoding/json can access them.
type bloomFilterJSON struct {
	// M carries BloomFilter.m under the JSON key "m".
	M uint           `json:"m"`
	// K carries BloomFilter.k under the JSON key "k".
	K uint           `json:"k"`
	// B carries BloomFilter.b under the JSON key "b".
	B *bitset.BitSet `json:"b"`
}
   321  
   322  // MarshalJSON implements json.Marshaler interface.
   323  func (f *BloomFilter) MarshalJSON() ([]byte, error) {
   324  	return json.Marshal(bloomFilterJSON{f.m, f.k, f.b})
   325  }
   326  
   327  // UnmarshalJSON implements json.Unmarshaler interface.
   328  func (f *BloomFilter) UnmarshalJSON(data []byte) error {
   329  	var j bloomFilterJSON
   330  	err := json.Unmarshal(data, &j)
   331  	if err != nil {
   332  		return err
   333  	}
   334  	f.m = j.M
   335  	f.k = j.K
   336  	f.b = j.B
   337  	return nil
   338  }
   339  
   340  // WriteTo writes a binary representation of the BloomFilter to an i/o stream.
   341  // It returns the number of bytes written.
   342  func (f *BloomFilter) WriteTo(stream io.Writer) (int64, error) {
   343  	err := binary.Write(stream, binary.BigEndian, uint64(f.m))
   344  	if err != nil {
   345  		return 0, err
   346  	}
   347  	err = binary.Write(stream, binary.BigEndian, uint64(f.k))
   348  	if err != nil {
   349  		return 0, err
   350  	}
   351  	numBytes, err := f.b.WriteTo(stream)
   352  	return numBytes + int64(2*binary.Size(uint64(0))), err
   353  }
   354  
   355  // ReadFrom reads a binary representation of the BloomFilter (such as might
   356  // have been written by WriteTo()) from an i/o stream. It returns the number
   357  // of bytes read.
   358  func (f *BloomFilter) ReadFrom(stream io.Reader) (int64, error) {
   359  	var m, k uint64
   360  	err := binary.Read(stream, binary.BigEndian, &m)
   361  	if err != nil {
   362  		return 0, err
   363  	}
   364  	err = binary.Read(stream, binary.BigEndian, &k)
   365  	if err != nil {
   366  		return 0, err
   367  	}
   368  	b := &bitset.BitSet{}
   369  	numBytes, err := b.ReadFrom(stream)
   370  	if err != nil {
   371  		return 0, err
   372  	}
   373  	f.m = uint(m)
   374  	f.k = uint(k)
   375  	f.b = b
   376  	return numBytes + int64(2*binary.Size(uint64(0))), nil
   377  }
   378  
   379  // GobEncode implements gob.GobEncoder interface.
   380  func (f *BloomFilter) GobEncode() ([]byte, error) {
   381  	var buf bytes.Buffer
   382  	_, err := f.WriteTo(&buf)
   383  	if err != nil {
   384  		return nil, err
   385  	}
   386  
   387  	return buf.Bytes(), nil
   388  }
   389  
   390  // GobDecode implements gob.GobDecoder interface.
   391  func (f *BloomFilter) GobDecode(data []byte) error {
   392  	buf := bytes.NewBuffer(data)
   393  	_, err := f.ReadFrom(buf)
   394  
   395  	return err
   396  }
   397  
// Equal tests for the equality of two Bloom filters: they must have the same
// m, the same k, and bit sets that compare equal.
func (f *BloomFilter) Equal(g *BloomFilter) bool {
	return f.m == g.m && f.k == g.k && f.b.Equal(g.b)
}
   402  
   403  // Locations returns a list of hash locations representing a data item.
   404  func Locations(data []byte, k uint) []uint64 {
   405  	locs := make([]uint64, k)
   406  
   407  	// calculate locations
   408  	h := baseHashes(data)
   409  	for i := uint(0); i < k; i++ {
   410  		locs[i] = location(h, i)
   411  	}
   412  
   413  	return locs
   414  }