github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/bloom/bloom.go (about) 1 // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 // Package bloom implements Bloom filters. 6 package bloom // import "github.com/zuoyebang/bitalostable/bloom" 7 8 import ( 9 "encoding/binary" 10 "fmt" 11 12 "github.com/zuoyebang/bitalostable/internal/base" 13 ) 14 15 const ( 16 cacheLineSize = 64 17 cacheLineBits = cacheLineSize * 8 18 ) 19 20 type tableFilter []byte 21 22 func (f tableFilter) MayContain(key []byte) bool { 23 if len(f) <= 5 { 24 return false 25 } 26 n := len(f) - 5 27 nProbes := f[n] 28 nLines := binary.LittleEndian.Uint32(f[n+1:]) 29 cacheLineBits := 8 * (uint32(n) / nLines) 30 31 h := hash(key) 32 delta := h>>17 | h<<15 33 b := (h % nLines) * cacheLineBits 34 35 for j := uint8(0); j < nProbes; j++ { 36 bitPos := b + (h % cacheLineBits) 37 if f[bitPos/8]&(1<<(bitPos%8)) == 0 { 38 return false 39 } 40 h += delta 41 } 42 return true 43 } 44 45 func calculateProbes(bitsPerKey int) uint32 { 46 // We intentionally round down to reduce probing cost a little bit 47 n := uint32(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2) 48 if n < 1 { 49 n = 1 50 } 51 if n > 30 { 52 n = 30 53 } 54 return n 55 } 56 57 // extend appends n zero bytes to b. It returns the overall slice (of length 58 // n+len(originalB)) and the slice of n trailing zeroes. 59 func extend(b []byte, n int) (overall, trailer []byte) { 60 want := n + len(b) 61 if want <= cap(b) { 62 overall = b[:want] 63 trailer = overall[len(b):] 64 for i := range trailer { 65 trailer[i] = 0 66 } 67 } else { 68 // Grow the capacity exponentially, with a 1KiB minimum. 69 c := 1024 70 for c < want { 71 c += c / 4 72 } 73 overall = make([]byte, want, c) 74 trailer = overall[len(b):] 75 copy(overall, b) 76 } 77 return overall, trailer 78 } 79 80 // hash implements a hashing algorithm similar to the Murmur hash. 81 func hash(b []byte) uint32 { 82 const ( 83 seed = 0xbc9f1d34 84 m = 0xc6a4a793 85 ) 86 h := uint32(seed) ^ uint32(uint64(uint32(len(b))*m)) 87 for ; len(b) >= 4; b = b[4:] { 88 h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 89 h *= m 90 h ^= h >> 16 91 } 92 93 // The code below first casts each byte to a signed 8-bit integer. This is 94 // necessary to match RocksDB's behavior. Note that the `byte` type in Go is 95 // unsigned. What is the difference between casting a signed 8-bit value vs 96 // unsigned 8-bit value into an unsigned 32-bit value? 97 // Sign-extension. Consider the value 250 which has the bit pattern 11111010: 98 // 99 // uint32(250) = 00000000000000000000000011111010 100 // uint32(int8(250)) = 11111111111111111111111111111010 101 // 102 // Note that the original LevelDB code did not explicitly cast to a signed 103 // 8-bit value which left the behavior dependent on whether C characters were 104 // signed or unsigned which is a compiler flag for gcc (-funsigned-char). 105 switch len(b) { 106 case 3: 107 h += uint32(int8(b[2])) << 16 108 fallthrough 109 case 2: 110 h += uint32(int8(b[1])) << 8 111 fallthrough 112 case 1: 113 h += uint32(int8(b[0])) 114 h *= m 115 h ^= h >> 24 116 } 117 return h 118 } 119 120 type tableFilterWriter struct { 121 bitsPerKey int 122 hashes []uint32 123 } 124 125 // AddKey implements the base.FilterWriter interface. 126 func (w *tableFilterWriter) AddKey(key []byte) { 127 h := hash(key) 128 if n := len(w.hashes); n == 0 || h != w.hashes[n-1] { 129 w.hashes = append(w.hashes, h) 130 } 131 } 132 133 // Finish implements the base.FilterWriter interface. 134 func (w *tableFilterWriter) Finish(buf []byte) []byte { 135 // The table filter format matches the RocksDB full-file filter format. 136 var nBits, nLines int 137 if len(w.hashes) != 0 { 138 nBits = len(w.hashes) * w.bitsPerKey 139 nLines = (nBits + cacheLineBits - 1) / (cacheLineBits) 140 // Make nLines an odd number to make sure more bits are involved when 141 // determining which block. 142 if nLines%2 == 0 { 143 nLines++ 144 } 145 nBits = nLines * cacheLineBits 146 nLines = nBits / (cacheLineBits) 147 } 148 149 nBytes := nBits / 8 150 // +5: 4 bytes for num-lines, 1 byte for num-probes 151 buf, filter := extend(buf, nBytes+5) 152 153 if nBits != 0 && nLines != 0 { 154 nProbes := calculateProbes(w.bitsPerKey) 155 for _, h := range w.hashes { 156 delta := h>>17 | h<<15 // rotate right 17 bits 157 b := (h % uint32(nLines)) * (cacheLineBits) 158 for i := uint32(0); i < nProbes; i++ { 159 bitPos := b + (h % cacheLineBits) 160 filter[bitPos/8] |= (1 << (bitPos % 8)) 161 h += delta 162 } 163 } 164 filter[nBytes] = byte(nProbes) 165 binary.LittleEndian.PutUint32(filter[nBytes+1:], uint32(nLines)) 166 } 167 168 w.hashes = w.hashes[:0] 169 return buf 170 } 171 172 // FilterPolicy implements the FilterPolicy interface from the bitalostable package. 173 // 174 // The integer value is the approximate number of bits used per key. A good 175 // value is 10, which yields a filter with ~ 1% false positive rate. 176 // 177 // It is valid to use the other API in this package (bitalostable/bloom) without 178 // using this type or the bitalostable package. 179 type FilterPolicy int 180 181 // Name implements the bitalostable.FilterPolicy interface. 182 func (p FilterPolicy) Name() string { 183 // This string looks arbitrary, but its value is written to LevelDB .sst 184 // files, and should be this exact value to be compatible with those files 185 // and with the C++ LevelDB code. 186 return "rocksdb.BuiltinBloomFilter" 187 } 188 189 // MayContain implements the bitalostable.FilterPolicy interface. 190 func (p FilterPolicy) MayContain(ftype base.FilterType, f, key []byte) bool { 191 switch ftype { 192 case base.TableFilter: 193 return tableFilter(f).MayContain(key) 194 default: 195 panic(fmt.Sprintf("unknown filter type: %v", ftype)) 196 } 197 } 198 199 // NewWriter implements the bitalostable.FilterPolicy interface. 200 func (p FilterPolicy) NewWriter(ftype base.FilterType) base.FilterWriter { 201 switch ftype { 202 case base.TableFilter: 203 return &tableFilterWriter{ 204 bitsPerKey: int(p), 205 } 206 default: 207 panic(fmt.Sprintf("unknown filter type: %v", ftype)) 208 } 209 }