github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/bloom/filter.go (about)

     1  package bloom
     2  
     3  import (
     4  	"io"
     5  	"sync"
     6  	"unsafe"
     7  )
     8  
     9  // Filter is an interface representing read-only bloom filters where programs
    10  // can probe for the possible presence of a hash key.
    11  type Filter interface {
    12  	Check(uint64) bool
    13  }
    14  
    15  // MutableFilter is an extension of the Filter interface which supports
    16  // inserting keys to the filter.
    17  type MutableFilter interface {
    18  	Filter
    19  	Reset()
    20  	Insert(uint64)
    21  	InsertBulk([]uint64)
    22  	Bytes() []byte
    23  }
    24  
    25  // SplitBlockFilter is an in-memory implementation of the parquet bloom filters.
    26  //
    27  // This type is useful to construct bloom filters that are later serialized
    28  // to a storage medium.
    29  type SplitBlockFilter []Block
    30  
    31  // MakeSplitBlockFilter constructs a SplitBlockFilter value from the data byte
    32  // slice.
    33  func MakeSplitBlockFilter(data []byte) SplitBlockFilter {
    34  	p := *(*unsafe.Pointer)(unsafe.Pointer(&data))
    35  	n := len(data) / BlockSize
    36  	return unsafe.Slice((*Block)(p), n)
    37  }
    38  
    39  // NumSplitBlocksOf returns the number of blocks in a filter intended to hold
    40  // the given number of values and bits of filter per value.
    41  //
    42  // This function is useful to determine the number of blocks when creating bloom
    43  // filters in memory, for example:
    44  //
    45  //	f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(n, 10))
    46  //
    47  func NumSplitBlocksOf(numValues int64, bitsPerValue uint) int {
    48  	numBytes := ((uint(numValues) * bitsPerValue) + 7) / 8
    49  	numBlocks := (numBytes + (BlockSize - 1)) / BlockSize
    50  	return int(numBlocks)
    51  }
    52  
    53  // Reset clears the content of the filter f.
    54  func (f SplitBlockFilter) Reset() {
    55  	for i := range f {
    56  		f[i] = Block{}
    57  	}
    58  }
    59  
    60  // Block returns a pointer to the block that the given value hashes to in the
    61  // bloom filter.
    62  func (f SplitBlockFilter) Block(x uint64) *Block { return &f[fasthash1x64(x, int32(len(f)))] }
    63  
    64  // InsertBulk adds all values from x into f.
    65  func (f SplitBlockFilter) InsertBulk(x []uint64) { filterInsertBulk(f, x) }
    66  
    67  // Insert adds x to f.
    68  func (f SplitBlockFilter) Insert(x uint64) { filterInsert(f, x) }
    69  
    70  // Check tests whether x is in f.
    71  func (f SplitBlockFilter) Check(x uint64) bool { return filterCheck(f, x) }
    72  
    73  // Bytes converts f to a byte slice.
    74  //
    75  // The returned slice shares the memory of f. The method is intended to be used
    76  // to serialize the bloom filter to a storage medium.
    77  func (f SplitBlockFilter) Bytes() []byte {
    78  	return unsafe.Slice(*(**byte)(unsafe.Pointer(&f)), len(f)*BlockSize)
    79  }
    80  
    81  // CheckSplitBlock is similar to bloom.SplitBlockFilter.Check but reads the
    82  // bloom filter of n bytes from r.
    83  //
    84  // The size n of the bloom filter is assumed to be a multiple of the block size.
    85  func CheckSplitBlock(r io.ReaderAt, n int64, x uint64) (bool, error) {
    86  	block := acquireBlock()
    87  	defer releaseBlock(block)
    88  	offset := BlockSize * fasthash1x64(x, int32(n/BlockSize))
    89  	_, err := r.ReadAt(block.Bytes(), int64(offset))
    90  	return block.Check(uint32(x)), err
    91  }
    92  
    93  var (
    94  	_ MutableFilter = (SplitBlockFilter)(nil)
    95  
    96  	blockPool sync.Pool
    97  )
    98  
    99  func acquireBlock() *Block {
   100  	b, _ := blockPool.Get().(*Block)
   101  	if b == nil {
   102  		b = new(Block)
   103  	}
   104  	return b
   105  }
   106  
   107  func releaseBlock(b *Block) {
   108  	if b != nil {
   109  		blockPool.Put(b)
   110  	}
   111  }