github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom/filter.go (about) 1 package bloom 2 3 import ( 4 "io" 5 "sync" 6 "unsafe" 7 ) 8 9 // Filter is an interface representing read-only bloom filters where programs 10 // can probe for the possible presence of a hash key. 11 type Filter interface { 12 Check(uint64) bool 13 } 14 15 // SplitBlockFilter is an in-memory implementation of the parquet bloom filters. 16 // 17 // This type is useful to construct bloom filters that are later serialized 18 // to a storage medium. 19 type SplitBlockFilter []Block 20 21 // MakeSplitBlockFilter constructs a SplitBlockFilter value from the data byte 22 // slice. 23 func MakeSplitBlockFilter(data []byte) SplitBlockFilter { 24 p := *(*unsafe.Pointer)(unsafe.Pointer(&data)) 25 n := len(data) / BlockSize 26 return unsafe.Slice((*Block)(p), n) 27 } 28 29 // NumSplitBlocksOf returns the number of blocks in a filter intended to hold 30 // the given number of values and bits of filter per value. 31 // 32 // This function is useful to determine the number of blocks when creating bloom 33 // filters in memory, for example: 34 // 35 // f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(n, 10)) 36 func NumSplitBlocksOf(numValues int64, bitsPerValue uint) int { 37 numBytes := ((uint(numValues) * bitsPerValue) + 7) / 8 38 numBlocks := (numBytes + (BlockSize - 1)) / BlockSize 39 return int(numBlocks) 40 } 41 42 // Reset clears the content of the filter f. 43 func (f SplitBlockFilter) Reset() { 44 for i := range f { 45 f[i] = Block{} 46 } 47 } 48 49 // Block returns a pointer to the block that the given value hashes to in the 50 // bloom filter. 51 func (f SplitBlockFilter) Block(x uint64) *Block { return &f[fasthash1x64(x, int32(len(f)))] } 52 53 // InsertBulk adds all values from x into f. 54 func (f SplitBlockFilter) InsertBulk(x []uint64) { filterInsertBulk(f, x) } 55 56 // Insert adds x to f. 57 func (f SplitBlockFilter) Insert(x uint64) { filterInsert(f, x) } 58 59 // Check tests whether x is in f. 60 func (f SplitBlockFilter) Check(x uint64) bool { return filterCheck(f, x) } 61 62 // Bytes converts f to a byte slice. 63 // 64 // The returned slice shares the memory of f. The method is intended to be used 65 // to serialize the bloom filter to a storage medium. 66 func (f SplitBlockFilter) Bytes() []byte { 67 return unsafe.Slice(*(**byte)(unsafe.Pointer(&f)), len(f)*BlockSize) 68 } 69 70 // CheckSplitBlock is similar to bloom.SplitBlockFilter.Check but reads the 71 // bloom filter of n bytes from r. 72 // 73 // The size n of the bloom filter is assumed to be a multiple of the block size. 74 func CheckSplitBlock(r io.ReaderAt, n int64, x uint64) (bool, error) { 75 block := acquireBlock() 76 defer releaseBlock(block) 77 offset := BlockSize * fasthash1x64(x, int32(n/BlockSize)) 78 _, err := r.ReadAt(block.Bytes(), int64(offset)) 79 return block.Check(uint32(x)), err 80 } 81 82 var ( 83 blockPool sync.Pool 84 ) 85 86 func acquireBlock() *Block { 87 b, _ := blockPool.Get().(*Block) 88 if b == nil { 89 b = new(Block) 90 } 91 return b 92 } 93 94 func releaseBlock(b *Block) { 95 if b != nil { 96 blockPool.Put(b) 97 } 98 }