github.com/creachadair/ffs@v0.17.3/index/index.go (about) 1 // Copyright 2021 Michael J. Fromberger. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package index constructs a Bloom filter index for a set of string keys. 16 package index 17 18 import ( 19 "math" 20 "math/rand/v2" 21 22 "github.com/cespare/xxhash/v2" 23 ) 24 25 // An Index holds a Bloom filter index for a set of keys. 26 type Index struct { 27 numKeys int // number of keys stored 28 bits bitVector // a multiple of 64 bits 29 nbits uint64 // the number of bits in the vector (≥ m) 30 seeds []uint64 // hash seeds (length = k) 31 hash func(s string) uint64 32 } 33 34 // New constructs an empty index with capacity for the specified number of 35 // keys. A nil opts value is ready for use and provides default values as 36 // described on Options. New will panic if numKeys ≤ 0. 37 func New(numKeys int, opts *Options) *Index { 38 idx := &Index{hash: opts.hashFunc()} 39 idx.init(numKeys, opts.falsePositiveRate()) 40 return idx 41 } 42 43 // Add adds the specified key to the index. 44 func (idx *Index) Add(key string) { 45 hash := idx.hash(key) 46 for _, seed := range idx.seeds { 47 pos := int((hash ^ seed) % idx.nbits) 48 idx.bits.Set(pos) 49 } 50 idx.numKeys++ 51 } 52 53 // Has reports whether key is one of the indexed keys. False positives are 54 // possible for keys that were not added to the index, but no false negatives. 55 func (idx *Index) Has(key string) bool { 56 hash := idx.hash(key) 57 for _, seed := range idx.seeds { 58 pos := int((hash ^ seed) % idx.nbits) 59 if !idx.bits.IsSet(pos) { 60 return false 61 } 62 } 63 return true 64 } 65 66 // Stats returns size and capacity statistics for the index. 67 func (idx *Index) Stats() Stats { 68 return Stats{ 69 NumKeys: idx.numKeys, 70 FilterBits: int(idx.nbits), 71 NumHashes: len(idx.seeds), 72 } 73 } 74 75 // Len reports the number of keys added to the index. This is shorthand for 76 // idx.Stats().NumKeys. 77 func (idx *Index) Len() int { return idx.numKeys } 78 79 // init initializes the internal data structures for the index Bloom filter, 80 // where n is the expected capacity in number of keys and p is the desired 81 // false positive rate. 82 func (idx *Index) init(n int, p float64) { 83 // The optimal width for a Bloom filter with n elements and false-positive 84 // rate p: 85 // 86 // -n * ln(p) 87 // m = ceil( ------------ ) 88 // ln(2)**2 89 // 90 m := math.Ceil(-float64(n) * math.Log(p) / (math.Ln2 * math.Ln2)) 91 92 // The optimal number of hashes for an m-bit filter holding n elements: 93 // 94 // m * ln(2) 95 // k = ceil( ----------- ) 96 // n 97 // 98 k := math.Ceil((m * math.Ln2) / float64(n)) 99 100 idx.bits = newBitVector(int(m)) 101 idx.nbits = 64 * uint64(len(idx.bits)) 102 idx.seeds = make([]uint64, int(k)) 103 104 for i := range idx.seeds { 105 idx.seeds[i] = rand.Uint64() 106 } 107 } 108 109 // Options provide optional settings for an index. A nil *Options is ready for 110 // use and provides default values as described. 111 type Options struct { 112 // Compute a 64-bit hash of s. If nil, uses xxhash.Sum64String. 113 Hash func(s string) uint64 114 115 // The maximum false positive rate to permit. A value ≤ 0 defaults to 0.03. 116 // Decreasing this value increases the memory required for the index. 117 FalsePositiveRate float64 118 } 119 120 func (o *Options) hashFunc() func(string) uint64 { 121 if o == nil || o.Hash == nil { 122 return xxhash.Sum64String 123 } 124 return o.Hash 125 } 126 127 func (o *Options) falsePositiveRate() float64 { 128 if o == nil || o.FalsePositiveRate <= 0 { 129 return 0.03 130 } 131 return o.FalsePositiveRate 132 } 133 134 // Stats record size and capacity statistics for an Index. 135 type Stats struct { 136 NumKeys int // the number of keys added to the index 137 FilterBits int // the number of bits allocated to the Bloom filter (m) 138 NumHashes int // the number of hash seeds allocated (k) 139 } 140 141 type bitVector []uint64 142 143 func newBitVector(size int) bitVector { return make(bitVector, (size+63)/64) } 144 func (b bitVector) IsSet(pos int) bool { return b[(pos>>6)%len(b)]&(uint64(1)<<(pos&0x3f)) != 0 } 145 func (b bitVector) Set(pos int) { b[(pos>>6)%len(b)] |= uint64(1) << (pos & 0x3f) }