github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/bloom/bloom.go (about)

     1  // Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package bloom implements Bloom filters.
     6  package bloom // import "github.com/petermattis/pebble/bloom"
     7  
     8  import (
     9  	"encoding/binary"
    10  	"fmt"
    11  
    12  	"github.com/petermattis/pebble/internal/base"
    13  )
    14  
    15  const (
    16  	cacheLineSize = 64
    17  	cacheLineBits = cacheLineSize * 8
    18  )
    19  
    20  // blockFilter is an encoded set of []byte keys.
    21  type blockFilter []byte
    22  
    23  // MayContain returns whether the filter may contain given key. False positives
    24  // are possible, where it returns true for keys not in the original set.
    25  func (f blockFilter) MayContain(key []byte) bool {
    26  	if len(f) <= 1 {
    27  		return false
    28  	}
    29  	nProbes := f[len(f)-1]
    30  	if nProbes > 30 {
    31  		// This is reserved for potentially new encodings for short Bloom filters.
    32  		// Consider it a match.
    33  		return true
    34  	}
    35  	nBits := uint32(8 * (len(f) - 1))
    36  	h := hash(key)
    37  	delta := h>>17 | h<<15
    38  	for j := uint8(0); j < nProbes; j++ {
    39  		bitPos := h % nBits
    40  		if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
    41  			return false
    42  		}
    43  		h += delta
    44  	}
    45  	return true
    46  }
    47  
    48  type tableFilter []byte
    49  
    50  func (f tableFilter) MayContain(key []byte) bool {
    51  	if len(f) <= 5 {
    52  		return false
    53  	}
    54  	n := len(f) - 5
    55  	nProbes := f[n]
    56  	nLines := binary.LittleEndian.Uint32(f[n+1:])
    57  	cacheLineBits := 8 * (uint32(n) / nLines)
    58  
    59  	h := hash(key)
    60  	delta := h>>17 | h<<15
    61  	b := (h % nLines) * cacheLineBits
    62  
    63  	for j := uint8(0); j < nProbes; j++ {
    64  		bitPos := b + (h % cacheLineBits)
    65  		if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
    66  			return false
    67  		}
    68  		h += delta
    69  	}
    70  	return true
    71  }
    72  
    73  func calculateProbes(bitsPerKey int) uint32 {
    74  	// We intentionally round down to reduce probing cost a little bit
    75  	n := uint32(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2)
    76  	if n < 1 {
    77  		n = 1
    78  	}
    79  	if n > 30 {
    80  		n = 30
    81  	}
    82  	return n
    83  }
    84  
    85  // extend appends n zero bytes to b. It returns the overall slice (of length
    86  // n+len(originalB)) and the slice of n trailing zeroes.
    87  func extend(b []byte, n int) (overall, trailer []byte) {
    88  	want := n + len(b)
    89  	if want <= cap(b) {
    90  		overall = b[:want]
    91  		trailer = overall[len(b):]
    92  		for i := range trailer {
    93  			trailer[i] = 0
    94  		}
    95  	} else {
    96  		// Grow the capacity exponentially, with a 1KiB minimum.
    97  		c := 1024
    98  		for c < want {
    99  			c += c / 4
   100  		}
   101  		overall = make([]byte, want, c)
   102  		trailer = overall[len(b):]
   103  		copy(overall, b)
   104  	}
   105  	return overall, trailer
   106  }
   107  
   108  // hash implements a hashing algorithm similar to the Murmur hash.
   109  func hash(b []byte) uint32 {
   110  	const (
   111  		seed = 0xbc9f1d34
   112  		m    = 0xc6a4a793
   113  	)
   114  	h := uint32(seed) ^ uint32(len(b)*m)
   115  	for ; len(b) >= 4; b = b[4:] {
   116  		h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
   117  		h *= m
   118  		h ^= h >> 16
   119  	}
   120  	switch len(b) {
   121  	case 3:
   122  		h += uint32(b[2]) << 16
   123  		fallthrough
   124  	case 2:
   125  		h += uint32(b[1]) << 8
   126  		fallthrough
   127  	case 1:
   128  		h += uint32(b[0])
   129  		h *= m
   130  		h ^= h >> 24
   131  	}
   132  	return h
   133  }
   134  
   135  type tableFilterWriter struct {
   136  	bitsPerKey int
   137  	hashes     []uint32
   138  }
   139  
   140  // AddKey implements the base.FilterWriter interface.
   141  func (w *tableFilterWriter) AddKey(key []byte) {
   142  	h := hash(key)
   143  	if n := len(w.hashes); n == 0 || h != w.hashes[n-1] {
   144  		w.hashes = append(w.hashes, h)
   145  	}
   146  }
   147  
   148  // Finish implements the base.FilterWriter interface.
   149  func (w *tableFilterWriter) Finish(buf []byte) []byte {
   150  	// The table filter format matches the RocksDB full-file filter format.
   151  	var nBits, nLines int
   152  	if len(w.hashes) != 0 {
   153  		nBits = len(w.hashes) * w.bitsPerKey
   154  		nLines = (nBits + cacheLineBits - 1) / (cacheLineBits)
   155  		// Make nLines an odd number to make sure more bits are involved when
   156  		// determining which block.
   157  		if nLines%2 == 0 {
   158  			nLines++
   159  		}
   160  		nBits = nLines * cacheLineBits
   161  		nLines = nBits / (cacheLineBits)
   162  	}
   163  
   164  	nBytes := nBits / 8
   165  	// +5: 4 bytes for num-lines, 1 byte for num-probes
   166  	buf, filter := extend(buf, nBytes+5)
   167  
   168  	if nBits != 0 && nLines != 0 {
   169  		nProbes := calculateProbes(w.bitsPerKey)
   170  		for _, h := range w.hashes {
   171  			delta := h>>17 | h<<15 // rotate right 17 bits
   172  			b := (h % uint32(nLines)) * (cacheLineBits)
   173  			for i := uint32(0); i < nProbes; i++ {
   174  				bitPos := b + (h % cacheLineBits)
   175  				filter[bitPos/8] |= (1 << (bitPos % 8))
   176  				h += delta
   177  			}
   178  		}
   179  		filter[nBytes] = byte(nProbes)
   180  		binary.LittleEndian.PutUint32(filter[nBytes+1:], uint32(nLines))
   181  	}
   182  
   183  	w.hashes = w.hashes[:0]
   184  	return buf
   185  }
   186  
   187  // FilterPolicy implements the FilterPolicy interface from the pebble package.
   188  //
   189  // The integer value is the approximate number of bits used per key. A good
   190  // value is 10, which yields a filter with ~ 1% false positive rate.
   191  //
   192  // It is valid to use the other API in this package (pebble/bloom) without
   193  // using this type or the pebble package.
   194  type FilterPolicy int
   195  
   196  // Name implements the pebble.FilterPolicy interface.
   197  func (p FilterPolicy) Name() string {
   198  	// This string looks arbitrary, but its value is written to LevelDB .sst
   199  	// files, and should be this exact value to be compatible with those files
   200  	// and with the C++ LevelDB code.
   201  	return "rocksdb.BuiltinBloomFilter"
   202  }
   203  
   204  // MayContain implements the pebble.FilterPolicy interface.
   205  func (p FilterPolicy) MayContain(ftype base.FilterType, f, key []byte) bool {
   206  	switch ftype {
   207  	case base.TableFilter:
   208  		return tableFilter(f).MayContain(key)
   209  	default:
   210  		panic(fmt.Sprintf("unknown filter type: %v", ftype))
   211  	}
   212  }
   213  
   214  // NewWriter implements the pebble.FilterPolicy interface.
   215  func (p FilterPolicy) NewWriter(ftype base.FilterType) base.FilterWriter {
   216  	switch ftype {
   217  	case base.TableFilter:
   218  		return &tableFilterWriter{
   219  			bitsPerKey: int(p),
   220  		}
   221  	default:
   222  		panic(fmt.Sprintf("unknown filter type: %v", ftype))
   223  	}
   224  }