github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/bloom/filter.go (about) 1 package bloom 2 3 import ( 4 "io" 5 "sync" 6 "unsafe" 7 ) 8 9 // Filter is an interface representing read-only bloom filters where programs 10 // can probe for the possible presence of a hash key. 11 type Filter interface { 12 Check(uint64) bool 13 } 14 15 // MutableFilter is an extension of the Filter interface which supports 16 // inserting keys to the filter. 17 type MutableFilter interface { 18 Filter 19 Reset() 20 Insert(uint64) 21 InsertBulk([]uint64) 22 Bytes() []byte 23 } 24 25 // SplitBlockFilter is an in-memory implementation of the parquet bloom filters. 26 // 27 // This type is useful to construct bloom filters that are later serialized 28 // to a storage medium. 29 type SplitBlockFilter []Block 30 31 // MakeSplitBlockFilter constructs a SplitBlockFilter value from the data byte 32 // slice. 33 func MakeSplitBlockFilter(data []byte) SplitBlockFilter { 34 p := *(*unsafe.Pointer)(unsafe.Pointer(&data)) 35 n := len(data) / BlockSize 36 return unsafe.Slice((*Block)(p), n) 37 } 38 39 // NumSplitBlocksOf returns the number of blocks in a filter intended to hold 40 // the given number of values and bits of filter per value. 41 // 42 // This function is useful to determine the number of blocks when creating bloom 43 // filters in memory, for example: 44 // 45 // f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(n, 10)) 46 // 47 func NumSplitBlocksOf(numValues int64, bitsPerValue uint) int { 48 numBytes := ((uint(numValues) * bitsPerValue) + 7) / 8 49 numBlocks := (numBytes + (BlockSize - 1)) / BlockSize 50 return int(numBlocks) 51 } 52 53 // Reset clears the content of the filter f. 54 func (f SplitBlockFilter) Reset() { 55 for i := range f { 56 f[i] = Block{} 57 } 58 } 59 60 // Block returns a pointer to the block that the given value hashes to in the 61 // bloom filter. 62 func (f SplitBlockFilter) Block(x uint64) *Block { return &f[fasthash1x64(x, int32(len(f)))] } 63 64 // InsertBulk adds all values from x into f. 65 func (f SplitBlockFilter) InsertBulk(x []uint64) { filterInsertBulk(f, x) } 66 67 // Insert adds x to f. 68 func (f SplitBlockFilter) Insert(x uint64) { filterInsert(f, x) } 69 70 // Check tests whether x is in f. 71 func (f SplitBlockFilter) Check(x uint64) bool { return filterCheck(f, x) } 72 73 // Bytes converts f to a byte slice. 74 // 75 // The returned slice shares the memory of f. The method is intended to be used 76 // to serialize the bloom filter to a storage medium. 77 func (f SplitBlockFilter) Bytes() []byte { 78 return unsafe.Slice(*(**byte)(unsafe.Pointer(&f)), len(f)*BlockSize) 79 } 80 81 // CheckSplitBlock is similar to bloom.SplitBlockFilter.Check but reads the 82 // bloom filter of n bytes from r. 83 // 84 // The size n of the bloom filter is assumed to be a multiple of the block size. 85 func CheckSplitBlock(r io.ReaderAt, n int64, x uint64) (bool, error) { 86 block := acquireBlock() 87 defer releaseBlock(block) 88 offset := BlockSize * fasthash1x64(x, int32(n/BlockSize)) 89 _, err := r.ReadAt(block.Bytes(), int64(offset)) 90 return block.Check(uint32(x)), err 91 } 92 93 var ( 94 _ MutableFilter = (SplitBlockFilter)(nil) 95 96 blockPool sync.Pool 97 ) 98 99 func acquireBlock() *Block { 100 b, _ := blockPool.Get().(*Block) 101 if b == nil { 102 b = new(Block) 103 } 104 return b 105 } 106 107 func releaseBlock(b *Block) { 108 if b != nil { 109 blockPool.Put(b) 110 } 111 }