github.com/scottcagno/storage@v1.8.0/pkg/bloom/bloom.go (about)

     1  /*
     2   *
     3   *  * // Copyright (c) 2021 Scott Cagno. All rights reserved.
     4   *  * // The license can be found in the root of this project; see LICENSE.
     5   *
     6   */
     7  
     8  package bloom
     9  
    10  import (
    11  	"fmt"
    12  	"github.com/scottcagno/storage/pkg/bits"
    13  	"github.com/scottcagno/storage/pkg/hash/cityhash"
    14  	"math"
    15  )
    16  
    17  const (
    18  	kp00 = uint64(1610612741)
    19  	kp01 = uint64(402653189)
    20  	kp02 = uint64(805306457)
    21  	kp03 = uint64(201326611)
    22  	kp04 = uint64(1728949133)
    23  	kp05 = uint64(8543917829)
    24  	kp06 = uint64(648679351)
    25  	kp07 = uint64(9196230203)
    26  )
    27  
    28  // n|N = number of items in the filter
    29  // p|P = probability of false positives
    30  // m|M = total number of bits in the filter, ie. size
    31  // k|K = number of hash functions
    32  
    33  // BloomFilter is a basic bloom filter implementation
    34  type BloomFilter struct {
    35  	m     uint // m is the number of bits allocated for the filter
    36  	k     uint // k is the number of hash functions for the filter
    37  	n     uint // n is the number of items "in" the filter
    38  	b     *bits.BitSet
    39  	count int
    40  	mask  uint64
    41  }
    42  
    43  // minimum item count, aka default
    44  const minItemCount = math.MaxUint8
    45  
    46  // NewBloomFilter returns a new filter with m number of bits available and hints to use k hash functions
    47  func NewBloomFilter(n uint) *BloomFilter {
    48  	if n < minItemCount {
    49  		n = minItemCount
    50  	}
    51  	// using k=8 and maintaining a bitset m=n*24 provides a fairly
    52  	// constant p=0.00004 (1 in 25,000) false positive ratio which
    53  	// is probably acceptable in almost all cases I can think of
    54  	return &BloomFilter{
    55  		m:    n * 24,
    56  		k:    8,
    57  		b:    bits.NewBitSet(n),
    58  		mask: uint64(n - 1),
    59  	}
    60  }
    61  
    62  // -> n = ceil(m / (-k / log(1 - exp(log(p) / k))))
    63  // -> p = pow(1 - exp(-k / (m / n)), k)
    64  // -> m = ceil((n * log(p)) / log(1 / pow(2, log(2))))
    65  // -> k = round((m / n) * log(2))
    66  
    67  func hashes(data []byte) [8]uint64 {
    68  	h1, h2 := cityhash.Hash128WithSeed(data, kp00, kp01)
    69  	h3, h4 := cityhash.Hash128WithSeed(data, kp02, kp03)
    70  	h5, h6 := cityhash.Hash128WithSeed(data, kp04, kp05)
    71  	h7, h8 := cityhash.Hash128WithSeed(data, kp06, kp07)
    72  	return [8]uint64{h1, h2, h3, h4, h5, h6, h7, h8}
    73  }
    74  
    75  // mask returns the ith hashed location using the eight base hash values
    76  func hashAndMask(h [8]uint64, i uint) uint64 {
    77  	ii := uint64(i)
    78  	return h[ii&1] + ii*h[2+(((ii+(ii&1))&7)>>1)]
    79  }
    80  
    81  // location returns the ith hashed location using the four base hash values
    82  func (f *BloomFilter) hashAndMask(h [8]uint64, i uint) uint {
    83  	return uint(hashAndMask(h, i) % uint64(f.m))
    84  }
    85  
    86  func info(data []byte, hashes [8]uint64, hashAndMask [8]uint) {
    87  	fmt.Printf("data: %q\n", data)
    88  	fmt.Printf("hashes:\n\t%v\n", hashes)
    89  	fmt.Printf("set locations:\n")
    90  	for i := 0; i < 8; i++ {
    91  		fmt.Printf("\ti=%d, f.k=%d, l1=%d\n", i, 8, hashAndMask[i])
    92  	}
    93  }
    94  
    95  // location returns the ith hashed location using the four base hash values
    96  func location(h [8]uint64, i uint) uint64 {
    97  	ii := uint64(i)
    98  	return h[ii%2] + ii*h[2+(((ii+(ii%2))%8)/2)]
    99  }
   100  
   101  // location returns the ith hashed location using the four base hash values
   102  func (f *BloomFilter) location(h [8]uint64, i uint) uint {
   103  	return uint(location(h, i) % uint64(f.m))
   104  }
   105  
   106  func (f *BloomFilter) Set(data []byte) {
   107  	h := hashes(data)
   108  	for i := uint(0); i < f.k; i++ {
   109  		f.b.Set(f.hashAndMask(h, i))
   110  	}
   111  	f.count++
   112  }
   113  
   114  func (f *BloomFilter) Unset(data []byte) {
   115  	h := hashes(data)
   116  	for i := uint(0); i < f.k; i++ {
   117  		f.b.Unset(f.hashAndMask(h, i))
   118  	}
   119  	f.count--
   120  }
   121  
   122  func (f *BloomFilter) Count() int {
   123  	return f.count
   124  }
   125  
   126  func (f *BloomFilter) Size() int {
   127  	return int(f.m)
   128  }
   129  
   130  func (f *BloomFilter) MayHave(data []byte) bool {
   131  	return f.Has(data)
   132  }
   133  
   134  // Has returns true if the data is in the BloomFilter, false otherwise.
   135  // If true, the result might be a false positive. If false, the data
   136  // is definitely not in the set.
   137  func (f *BloomFilter) Has(data []byte) bool {
   138  	h := hashes(data)
   139  	for i := uint(0); i < f.k; i++ {
   140  		if !f.b.IsSet(f.hashAndMask(h, i)) {
   141  			return false
   142  		}
   143  	}
   144  	return true
   145  }
   146  
   147  func split2(x uint64) []uint32 {
   148  	return []uint32{
   149  		uint32(x >> 0),
   150  		uint32(x >> 32),
   151  	}
   152  }
   153  
   154  func join2(a, b uint32) uint64 {
   155  	return uint64(a) | uint64(b)<<32
   156  }
   157  
   158  func split4(x uint64) []uint16 {
   159  	return []uint16{
   160  		uint16(x >> 0),
   161  		uint16(x >> 16),
   162  		uint16(x >> 32),
   163  		uint16(x >> 48),
   164  	}
   165  }
   166  
   167  func join4(a, b, c, d int64) uint64 {
   168  	return uint64(a) | uint64(b)<<16 | uint64(c)<<32 | uint64(d)<<48
   169  }