github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/bloom.go (about) 1 package parquet 2 3 import ( 4 "io" 5 6 "github.com/vc42/parquet-go/bloom" 7 "github.com/vc42/parquet-go/bloom/xxhash" 8 "github.com/vc42/parquet-go/encoding" 9 "github.com/vc42/parquet-go/encoding/plain" 10 "github.com/vc42/parquet-go/format" 11 "github.com/vc42/parquet-go/internal/unsafecast" 12 ) 13 14 // BloomFilter is an interface allowing applications to test whether a key 15 // exists in a bloom filter. 16 type BloomFilter interface { 17 // Implement the io.ReaderAt interface as a mechanism to allow reading the 18 // raw bits of the filter. 19 io.ReaderAt 20 21 // Returns the size of the bloom filter (in bytes). 22 Size() int64 23 24 // Tests whether the given value is present in the filter. 25 // 26 // A non-nil error may be returned if reading the filter failed. This may 27 // happen if the filter was lazily loaded from a storage medium during the 28 // call to Check for example. Applications that can guarantee that the 29 // filter was in memory at the time Check was called can safely ignore the 30 // error, which would always be nil in this case. 31 Check(value Value) (bool, error) 32 } 33 34 type bloomFilter struct { 35 io.SectionReader 36 hash bloom.Hash 37 check func(io.ReaderAt, int64, uint64) (bool, error) 38 } 39 40 func (f *bloomFilter) Check(v Value) (bool, error) { 41 return f.check(&f.SectionReader, f.Size(), v.hash(f.hash)) 42 } 43 44 func (v Value) hash(h bloom.Hash) uint64 { 45 switch v.Kind() { 46 case Boolean: 47 return h.Sum64Uint8(uint8(v.u64)) 48 case Int32, Float: 49 return h.Sum64Uint32(uint32(v.u64)) 50 case Int64, Double: 51 return h.Sum64Uint64(v.u64) 52 case Int96: 53 return h.Sum64(v.Bytes()) 54 default: 55 return h.Sum64(v.ByteArray()) 56 } 57 } 58 59 func newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *bloomFilter { 60 if header.Algorithm.Block != nil { 61 if header.Hash.XxHash != nil { 62 if header.Compression.Uncompressed != nil { 63 return &bloomFilter{ 64 SectionReader: *io.NewSectionReader(file, offset, int64(header.NumBytes)), 65 hash: bloom.XXH64{}, 66 check: bloom.CheckSplitBlock, 67 } 68 } 69 } 70 } 71 return nil 72 } 73 74 // The BloomFilterColumn interface is a declarative representation of bloom filters 75 // used when configuring filters on a parquet writer. 76 type BloomFilterColumn interface { 77 // Returns the path of the column that the filter applies to. 78 Path() []string 79 80 // Returns the hashing algorithm used when inserting values into a bloom 81 // filter. 82 Hash() bloom.Hash 83 84 // Returns an encoding which can be used to write columns of values to the 85 // filter. 86 Encoding() encoding.Encoding 87 88 // Returns the size of the filter needed to encode values in the filter, 89 // assuming each value will be encoded with the given number of bits. 90 Size(numValues int64, bitsPerValue uint) int 91 } 92 93 // SplitBlockFilter constructs a split block bloom filter object for the column 94 // at the given path. 95 func SplitBlockFilter(path ...string) BloomFilterColumn { return splitBlockFilter(path) } 96 97 type splitBlockFilter []string 98 99 func (f splitBlockFilter) Path() []string { return f } 100 func (f splitBlockFilter) Hash() bloom.Hash { return bloom.XXH64{} } 101 func (f splitBlockFilter) Encoding() encoding.Encoding { return splitBlockEncoding{} } 102 func (f splitBlockFilter) Size(numValues int64, bitsPerValue uint) int { 103 return bloom.BlockSize * bloom.NumSplitBlocksOf(numValues, bitsPerValue) 104 } 105 106 // Creates a header from the given bloom filter. 107 // 108 // For now there is only one type of filter supported, but we provide this 109 // function to suggest a model for extending the implementation if new filters 110 // are added to the parquet specs. 111 func bloomFilterHeader(filter BloomFilterColumn) (header format.BloomFilterHeader) { 112 switch filter.(type) { 113 case splitBlockFilter: 114 header.Algorithm.Block = &format.SplitBlockAlgorithm{} 115 } 116 switch filter.Hash().(type) { 117 case bloom.XXH64: 118 header.Hash.XxHash = &format.XxHash{} 119 } 120 header.Compression.Uncompressed = &format.BloomFilterUncompressed{} 121 return header 122 } 123 124 func searchBloomFilterColumn(filters []BloomFilterColumn, path columnPath) BloomFilterColumn { 125 for _, f := range filters { 126 if path.equal(f.Path()) { 127 return f 128 } 129 } 130 return nil 131 } 132 133 const ( 134 // Size of the stack buffer used to perform bulk operations on bloom filters. 135 // 136 // This value was determined as being a good default empirically, 137 // 128 x uint64 makes a 1KiB buffer which amortizes the cost of calling 138 // methods of bloom filters while not causing too much stack growth either. 139 filterEncodeBufferSize = 128 140 ) 141 142 type splitBlockEncoding struct { 143 encoding.NotSupported 144 } 145 146 func (splitBlockEncoding) EncodeBoolean(dst, src []byte) ([]byte, error) { 147 splitBlockEncodeUint8(bloom.MakeSplitBlockFilter(dst), src) 148 return dst, nil 149 } 150 151 func (splitBlockEncoding) EncodeInt32(dst, src []byte) ([]byte, error) { 152 splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint32(src)) 153 return dst, nil 154 } 155 156 func (splitBlockEncoding) EncodeInt64(dst, src []byte) ([]byte, error) { 157 splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint64(src)) 158 return dst, nil 159 } 160 161 func (e splitBlockEncoding) EncodeInt96(dst, src []byte) ([]byte, error) { 162 splitBlockEncodeFixedLenByteArray(bloom.MakeSplitBlockFilter(dst), src, 12) 163 return dst, nil 164 } 165 166 func (splitBlockEncoding) EncodeFloat(dst, src []byte) ([]byte, error) { 167 splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint32(src)) 168 return dst, nil 169 } 170 171 func (splitBlockEncoding) EncodeDouble(dst, src []byte) ([]byte, error) { 172 splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint64(src)) 173 return dst, nil 174 } 175 176 func (splitBlockEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) { 177 filter := bloom.MakeSplitBlockFilter(dst) 178 buffer := make([]uint64, 0, filterEncodeBufferSize) 179 180 err := plain.RangeByteArray(src, func(value []byte) error { 181 if len(buffer) == cap(buffer) { 182 filter.InsertBulk(buffer) 183 buffer = buffer[:0] 184 } 185 buffer = append(buffer, xxhash.Sum64(value)) 186 return nil 187 }) 188 189 filter.InsertBulk(buffer) 190 return dst, err 191 } 192 193 func (splitBlockEncoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) { 194 filter := bloom.MakeSplitBlockFilter(dst) 195 if size == 16 { 196 splitBlockEncodeUint128(filter, unsafecast.BytesToUint128(src)) 197 } else { 198 splitBlockEncodeFixedLenByteArray(filter, src, size) 199 } 200 return dst, nil 201 } 202 203 func splitBlockEncodeFixedLenByteArray(filter bloom.SplitBlockFilter, data []byte, size int) { 204 buffer := make([]uint64, 0, filterEncodeBufferSize) 205 206 for i, j := 0, size; j <= len(data); { 207 if len(buffer) == cap(buffer) { 208 filter.InsertBulk(buffer) 209 buffer = buffer[:0] 210 } 211 buffer = append(buffer, xxhash.Sum64(data[i:j])) 212 i += size 213 j += size 214 } 215 216 filter.InsertBulk(buffer) 217 } 218 219 func splitBlockEncodeUint8(filter bloom.SplitBlockFilter, values []uint8) { 220 buffer := make([]uint64, filterEncodeBufferSize) 221 222 for i := 0; i < len(values); { 223 n := xxhash.MultiSum64Uint8(buffer, values[i:]) 224 filter.InsertBulk(buffer[:n]) 225 i += n 226 } 227 } 228 229 func splitBlockEncodeUint32(filter bloom.SplitBlockFilter, values []uint32) { 230 buffer := make([]uint64, filterEncodeBufferSize) 231 232 for i := 0; i < len(values); { 233 n := xxhash.MultiSum64Uint32(buffer, values[i:]) 234 filter.InsertBulk(buffer[:n]) 235 i += n 236 } 237 } 238 239 func splitBlockEncodeUint64(filter bloom.SplitBlockFilter, values []uint64) { 240 buffer := make([]uint64, filterEncodeBufferSize) 241 242 for i := 0; i < len(values); { 243 n := xxhash.MultiSum64Uint64(buffer, values[i:]) 244 filter.InsertBulk(buffer[:n]) 245 i += n 246 } 247 } 248 249 func splitBlockEncodeUint128(filter bloom.SplitBlockFilter, values [][16]byte) { 250 buffer := make([]uint64, filterEncodeBufferSize) 251 252 for i := 0; i < len(values); { 253 n := xxhash.MultiSum64Uint128(buffer, values[i:]) 254 filter.InsertBulk(buffer[:n]) 255 i += n 256 } 257 }