github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom/filter.go (about)

     1  package bloom
     2  
     3  import (
     4  	"io"
     5  	"sync"
     6  	"unsafe"
     7  )
     8  
     9  // Filter is an interface representing read-only bloom filters where programs
    10  // can probe for the possible presence of a hash key.
    11  type Filter interface {
    12  	Check(uint64) bool
    13  }
    14  
// SplitBlockFilter is an in-memory implementation of the parquet bloom filters,
// represented as a slice of Block values.
//
// This type is useful to construct bloom filters that are later serialized
// to a storage medium (see the Bytes method).
type SplitBlockFilter []Block
    20  
    21  // MakeSplitBlockFilter constructs a SplitBlockFilter value from the data byte
    22  // slice.
    23  func MakeSplitBlockFilter(data []byte) SplitBlockFilter {
    24  	p := *(*unsafe.Pointer)(unsafe.Pointer(&data))
    25  	n := len(data) / BlockSize
    26  	return unsafe.Slice((*Block)(p), n)
    27  }
    28  
    29  // NumSplitBlocksOf returns the number of blocks in a filter intended to hold
    30  // the given number of values and bits of filter per value.
    31  //
    32  // This function is useful to determine the number of blocks when creating bloom
    33  // filters in memory, for example:
    34  //
    35  //	f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(n, 10))
    36  func NumSplitBlocksOf(numValues int64, bitsPerValue uint) int {
    37  	numBytes := ((uint(numValues) * bitsPerValue) + 7) / 8
    38  	numBlocks := (numBytes + (BlockSize - 1)) / BlockSize
    39  	return int(numBlocks)
    40  }
    41  
// Reset clears the content of the filter f by overwriting every block with
// the zero Block. The length of f is unchanged.
func (f SplitBlockFilter) Reset() {
	// NOTE(review): this loop shape (range + zero-value assignment) is the
	// one the Go compiler is documented to lower to a bulk memory clear;
	// keep it as is unless that has been re-verified.
	for i := range f {
		f[i] = Block{}
	}
}
    48  
    49  // Block returns a pointer to the block that the given value hashes to in the
    50  // bloom filter.
    51  func (f SplitBlockFilter) Block(x uint64) *Block { return &f[fasthash1x64(x, int32(len(f)))] }
    52  
    53  // InsertBulk adds all values from x into f.
    54  func (f SplitBlockFilter) InsertBulk(x []uint64) { filterInsertBulk(f, x) }
    55  
    56  // Insert adds x to f.
    57  func (f SplitBlockFilter) Insert(x uint64) { filterInsert(f, x) }
    58  
    59  // Check tests whether x is in f.
    60  func (f SplitBlockFilter) Check(x uint64) bool { return filterCheck(f, x) }
    61  
    62  // Bytes converts f to a byte slice.
    63  //
    64  // The returned slice shares the memory of f. The method is intended to be used
    65  // to serialize the bloom filter to a storage medium.
    66  func (f SplitBlockFilter) Bytes() []byte {
    67  	return unsafe.Slice(*(**byte)(unsafe.Pointer(&f)), len(f)*BlockSize)
    68  }
    69  
    70  // CheckSplitBlock is similar to bloom.SplitBlockFilter.Check but reads the
    71  // bloom filter of n bytes from r.
    72  //
    73  // The size n of the bloom filter is assumed to be a multiple of the block size.
    74  func CheckSplitBlock(r io.ReaderAt, n int64, x uint64) (bool, error) {
    75  	block := acquireBlock()
    76  	defer releaseBlock(block)
    77  	offset := BlockSize * fasthash1x64(x, int32(n/BlockSize))
    78  	_, err := r.ReadAt(block.Bytes(), int64(offset))
    79  	return block.Check(uint32(x)), err
    80  }
    81  
    82  var (
    83  	blockPool sync.Pool
    84  )
    85  
    86  func acquireBlock() *Block {
    87  	b, _ := blockPool.Get().(*Block)
    88  	if b == nil {
    89  		b = new(Block)
    90  	}
    91  	return b
    92  }
    93  
    94  func releaseBlock(b *Block) {
    95  	if b != nil {
    96  		blockPool.Put(b)
    97  	}
    98  }