github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/bloom/bloom.go (about)

     1  // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package bloom implements Bloom filters.
     6  package bloom // import "github.com/zuoyebang/bitalostable/bloom"
     7  
     8  import (
     9  	"encoding/binary"
    10  	"fmt"
    11  
    12  	"github.com/zuoyebang/bitalostable/internal/base"
    13  )
    14  
const (
	// cacheLineSize is the size, in bytes, of one block ("cache line") of the
	// filter's bit array.
	cacheLineSize = 64
	// cacheLineBits is the number of bits in one such block. The filter writer
	// sizes the bit array as a whole number of these blocks and confines all
	// probes for a given key to a single block.
	cacheLineBits = cacheLineSize * 8
)
    19  
    20  type tableFilter []byte
    21  
    22  func (f tableFilter) MayContain(key []byte) bool {
    23  	if len(f) <= 5 {
    24  		return false
    25  	}
    26  	n := len(f) - 5
    27  	nProbes := f[n]
    28  	nLines := binary.LittleEndian.Uint32(f[n+1:])
    29  	cacheLineBits := 8 * (uint32(n) / nLines)
    30  
    31  	h := hash(key)
    32  	delta := h>>17 | h<<15
    33  	b := (h % nLines) * cacheLineBits
    34  
    35  	for j := uint8(0); j < nProbes; j++ {
    36  		bitPos := b + (h % cacheLineBits)
    37  		if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
    38  			return false
    39  		}
    40  		h += delta
    41  	}
    42  	return true
    43  }
    44  
    45  func calculateProbes(bitsPerKey int) uint32 {
    46  	// We intentionally round down to reduce probing cost a little bit
    47  	n := uint32(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2)
    48  	if n < 1 {
    49  		n = 1
    50  	}
    51  	if n > 30 {
    52  		n = 30
    53  	}
    54  	return n
    55  }
    56  
    57  // extend appends n zero bytes to b. It returns the overall slice (of length
    58  // n+len(originalB)) and the slice of n trailing zeroes.
    59  func extend(b []byte, n int) (overall, trailer []byte) {
    60  	want := n + len(b)
    61  	if want <= cap(b) {
    62  		overall = b[:want]
    63  		trailer = overall[len(b):]
    64  		for i := range trailer {
    65  			trailer[i] = 0
    66  		}
    67  	} else {
    68  		// Grow the capacity exponentially, with a 1KiB minimum.
    69  		c := 1024
    70  		for c < want {
    71  			c += c / 4
    72  		}
    73  		overall = make([]byte, want, c)
    74  		trailer = overall[len(b):]
    75  		copy(overall, b)
    76  	}
    77  	return overall, trailer
    78  }
    79  
    80  // hash implements a hashing algorithm similar to the Murmur hash.
    81  func hash(b []byte) uint32 {
    82  	const (
    83  		seed = 0xbc9f1d34
    84  		m    = 0xc6a4a793
    85  	)
    86  	h := uint32(seed) ^ uint32(uint64(uint32(len(b))*m))
    87  	for ; len(b) >= 4; b = b[4:] {
    88  		h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
    89  		h *= m
    90  		h ^= h >> 16
    91  	}
    92  
    93  	// The code below first casts each byte to a signed 8-bit integer. This is
    94  	// necessary to match RocksDB's behavior. Note that the `byte` type in Go is
    95  	// unsigned. What is the difference between casting a signed 8-bit value vs
    96  	// unsigned 8-bit value into an unsigned 32-bit value?
    97  	// Sign-extension. Consider the value 250 which has the bit pattern 11111010:
    98  	//
    99  	//   uint32(250)        = 00000000000000000000000011111010
   100  	//   uint32(int8(250))  = 11111111111111111111111111111010
   101  	//
   102  	// Note that the original LevelDB code did not explicitly cast to a signed
   103  	// 8-bit value which left the behavior dependent on whether C characters were
   104  	// signed or unsigned which is a compiler flag for gcc (-funsigned-char).
   105  	switch len(b) {
   106  	case 3:
   107  		h += uint32(int8(b[2])) << 16
   108  		fallthrough
   109  	case 2:
   110  		h += uint32(int8(b[1])) << 8
   111  		fallthrough
   112  	case 1:
   113  		h += uint32(int8(b[0]))
   114  		h *= m
   115  		h ^= h >> 24
   116  	}
   117  	return h
   118  }
   119  
// tableFilterWriter accumulates key hashes and encodes them into a table-level
// Bloom filter when Finish is called.
type tableFilterWriter struct {
	// bitsPerKey is the number of filter bits budgeted per added key; it
	// determines both the size of the bit array and the number of probes.
	bitsPerKey int
	// hashes holds the hash of every key added since the last Finish, with
	// consecutive duplicates dropped.
	hashes []uint32
}
   124  
   125  // AddKey implements the base.FilterWriter interface.
   126  func (w *tableFilterWriter) AddKey(key []byte) {
   127  	h := hash(key)
   128  	if n := len(w.hashes); n == 0 || h != w.hashes[n-1] {
   129  		w.hashes = append(w.hashes, h)
   130  	}
   131  }
   132  
   133  // Finish implements the base.FilterWriter interface.
   134  func (w *tableFilterWriter) Finish(buf []byte) []byte {
   135  	// The table filter format matches the RocksDB full-file filter format.
   136  	var nBits, nLines int
   137  	if len(w.hashes) != 0 {
   138  		nBits = len(w.hashes) * w.bitsPerKey
   139  		nLines = (nBits + cacheLineBits - 1) / (cacheLineBits)
   140  		// Make nLines an odd number to make sure more bits are involved when
   141  		// determining which block.
   142  		if nLines%2 == 0 {
   143  			nLines++
   144  		}
   145  		nBits = nLines * cacheLineBits
   146  		nLines = nBits / (cacheLineBits)
   147  	}
   148  
   149  	nBytes := nBits / 8
   150  	// +5: 4 bytes for num-lines, 1 byte for num-probes
   151  	buf, filter := extend(buf, nBytes+5)
   152  
   153  	if nBits != 0 && nLines != 0 {
   154  		nProbes := calculateProbes(w.bitsPerKey)
   155  		for _, h := range w.hashes {
   156  			delta := h>>17 | h<<15 // rotate right 17 bits
   157  			b := (h % uint32(nLines)) * (cacheLineBits)
   158  			for i := uint32(0); i < nProbes; i++ {
   159  				bitPos := b + (h % cacheLineBits)
   160  				filter[bitPos/8] |= (1 << (bitPos % 8))
   161  				h += delta
   162  			}
   163  		}
   164  		filter[nBytes] = byte(nProbes)
   165  		binary.LittleEndian.PutUint32(filter[nBytes+1:], uint32(nLines))
   166  	}
   167  
   168  	w.hashes = w.hashes[:0]
   169  	return buf
   170  }
   171  
// FilterPolicy implements the FilterPolicy interface from the bitalostable package.
//
// The integer value is the approximate number of bits used per key; it is
// passed to the filter writers created by NewWriter. A good value is 10,
// which yields a filter with ~ 1% false positive rate.
//
// It is valid to use the other API in this package (bitalostable/bloom) without
// using this type or the bitalostable package.
type FilterPolicy int
   180  
   181  // Name implements the bitalostable.FilterPolicy interface.
   182  func (p FilterPolicy) Name() string {
   183  	// This string looks arbitrary, but its value is written to LevelDB .sst
   184  	// files, and should be this exact value to be compatible with those files
   185  	// and with the C++ LevelDB code.
   186  	return "rocksdb.BuiltinBloomFilter"
   187  }
   188  
   189  // MayContain implements the bitalostable.FilterPolicy interface.
   190  func (p FilterPolicy) MayContain(ftype base.FilterType, f, key []byte) bool {
   191  	switch ftype {
   192  	case base.TableFilter:
   193  		return tableFilter(f).MayContain(key)
   194  	default:
   195  		panic(fmt.Sprintf("unknown filter type: %v", ftype))
   196  	}
   197  }
   198  
   199  // NewWriter implements the bitalostable.FilterPolicy interface.
   200  func (p FilterPolicy) NewWriter(ftype base.FilterType) base.FilterWriter {
   201  	switch ftype {
   202  	case base.TableFilter:
   203  		return &tableFilterWriter{
   204  			bitsPerKey: int(p),
   205  		}
   206  	default:
   207  		panic(fmt.Sprintf("unknown filter type: %v", ftype))
   208  	}
   209  }