github.com/creachadair/ffs@v0.17.3/index/index.go (about)

     1  // Copyright 2021 Michael J. Fromberger. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package index constructs a Bloom filter index for a set of string keys.
    16  package index
    17  
    18  import (
    19  	"math"
    20  	"math/rand/v2"
    21  
    22  	"github.com/cespare/xxhash/v2"
    23  )
    24  
    25  // An Index holds a Bloom filter index for a set of keys.
    26  type Index struct {
    27  	numKeys int       // number of keys stored
    28  	bits    bitVector // a multiple of 64 bits
    29  	nbits   uint64    // the number of bits in the vector (≥ m)
    30  	seeds   []uint64  // hash seeds (length = k)
    31  	hash    func(s string) uint64
    32  }
    33  
    34  // New constructs an empty index with capacity for the specified number of
    35  // keys. A nil opts value is ready for use and provides default values as
    36  // described on Options. New will panic if numKeys ≤ 0.
    37  func New(numKeys int, opts *Options) *Index {
    38  	idx := &Index{hash: opts.hashFunc()}
    39  	idx.init(numKeys, opts.falsePositiveRate())
    40  	return idx
    41  }
    42  
    43  // Add adds the specified key to the index.
    44  func (idx *Index) Add(key string) {
    45  	hash := idx.hash(key)
    46  	for _, seed := range idx.seeds {
    47  		pos := int((hash ^ seed) % idx.nbits)
    48  		idx.bits.Set(pos)
    49  	}
    50  	idx.numKeys++
    51  }
    52  
    53  // Has reports whether key is one of the indexed keys. False positives are
    54  // possible for keys that were not added to the index, but no false negatives.
    55  func (idx *Index) Has(key string) bool {
    56  	hash := idx.hash(key)
    57  	for _, seed := range idx.seeds {
    58  		pos := int((hash ^ seed) % idx.nbits)
    59  		if !idx.bits.IsSet(pos) {
    60  			return false
    61  		}
    62  	}
    63  	return true
    64  }
    65  
    66  // Stats returns size and capacity statistics for the index.
    67  func (idx *Index) Stats() Stats {
    68  	return Stats{
    69  		NumKeys:    idx.numKeys,
    70  		FilterBits: int(idx.nbits),
    71  		NumHashes:  len(idx.seeds),
    72  	}
    73  }
    74  
    75  // Len reports the number of keys added to the index. This is shorthand for
    76  // idx.Stats().NumKeys.
    77  func (idx *Index) Len() int { return idx.numKeys }
    78  
    79  // init initializes the internal data structures for the index Bloom filter,
    80  // where n is the expected capacity in number of keys and p is the desired
    81  // false positive rate.
    82  func (idx *Index) init(n int, p float64) {
    83  	// The optimal width for a Bloom filter with n elements and false-positive
    84  	// rate p:
    85  	//
    86  	//             -n * ln(p)
    87  	//  m = ceil( ------------ )
    88  	//              ln(2)**2
    89  	//
    90  	m := math.Ceil(-float64(n) * math.Log(p) / (math.Ln2 * math.Ln2))
    91  
    92  	// The optimal number of hashes for an m-bit filter holding n elements:
    93  	//
    94  	//             m * ln(2)
    95  	//  k = ceil( ----------- )
    96  	//                 n
    97  	//
    98  	k := math.Ceil((m * math.Ln2) / float64(n))
    99  
   100  	idx.bits = newBitVector(int(m))
   101  	idx.nbits = 64 * uint64(len(idx.bits))
   102  	idx.seeds = make([]uint64, int(k))
   103  
   104  	for i := range idx.seeds {
   105  		idx.seeds[i] = rand.Uint64()
   106  	}
   107  }
   108  
   109  // Options provide optional settings for an index. A nil *Options is ready for
   110  // use and provides default values as described.
   111  type Options struct {
   112  	// Compute a 64-bit hash of s. If nil, uses xxhash.Sum64String.
   113  	Hash func(s string) uint64
   114  
   115  	// The maximum false positive rate to permit. A value ≤ 0 defaults to 0.03.
   116  	// Decreasing this value increases the memory required for the index.
   117  	FalsePositiveRate float64
   118  }
   119  
   120  func (o *Options) hashFunc() func(string) uint64 {
   121  	if o == nil || o.Hash == nil {
   122  		return xxhash.Sum64String
   123  	}
   124  	return o.Hash
   125  }
   126  
   127  func (o *Options) falsePositiveRate() float64 {
   128  	if o == nil || o.FalsePositiveRate <= 0 {
   129  		return 0.03
   130  	}
   131  	return o.FalsePositiveRate
   132  }
   133  
   134  // Stats record size and capacity statistics for an Index.
   135  type Stats struct {
   136  	NumKeys    int // the number of keys added to the index
   137  	FilterBits int // the number of bits allocated to the Bloom filter (m)
   138  	NumHashes  int // the number of hash seeds allocated (k)
   139  }
   140  
   141  type bitVector []uint64
   142  
   143  func newBitVector(size int) bitVector  { return make(bitVector, (size+63)/64) }
   144  func (b bitVector) IsSet(pos int) bool { return b[(pos>>6)%len(b)]&(uint64(1)<<(pos&0x3f)) != 0 }
   145  func (b bitVector) Set(pos int)        { b[(pos>>6)%len(b)] |= uint64(1) << (pos & 0x3f) }