github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom.go (about) 1 package parquet 2 3 import ( 4 "io" 5 6 "github.com/segmentio/parquet-go/bloom" 7 "github.com/segmentio/parquet-go/bloom/xxhash" 8 "github.com/segmentio/parquet-go/deprecated" 9 "github.com/segmentio/parquet-go/encoding" 10 "github.com/segmentio/parquet-go/format" 11 "github.com/segmentio/parquet-go/internal/unsafecast" 12 ) 13 14 // BloomFilter is an interface allowing applications to test whether a key 15 // exists in a bloom filter. 16 type BloomFilter interface { 17 // Implement the io.ReaderAt interface as a mechanism to allow reading the 18 // raw bits of the filter. 19 io.ReaderAt 20 21 // Returns the size of the bloom filter (in bytes). 22 Size() int64 23 24 // Tests whether the given value is present in the filter. 25 // 26 // A non-nil error may be returned if reading the filter failed. This may 27 // happen if the filter was lazily loaded from a storage medium during the 28 // call to Check for example. Applications that can guarantee that the 29 // filter was in memory at the time Check was called can safely ignore the 30 // error, which would always be nil in this case. 31 Check(value Value) (bool, error) 32 } 33 34 type bloomFilter struct { 35 io.SectionReader 36 hash bloom.Hash 37 check func(io.ReaderAt, int64, uint64) (bool, error) 38 } 39 40 func (f *bloomFilter) Check(v Value) (bool, error) { 41 return f.check(&f.SectionReader, f.Size(), v.hash(f.hash)) 42 } 43 44 func (v Value) hash(h bloom.Hash) uint64 { 45 switch v.Kind() { 46 case Boolean: 47 return h.Sum64Uint8(v.byte()) 48 case Int32, Float: 49 return h.Sum64Uint32(v.uint32()) 50 case Int64, Double: 51 return h.Sum64Uint64(v.uint64()) 52 default: // Int96, ByteArray, FixedLenByteArray, or null 53 return h.Sum64(v.byteArray()) 54 } 55 } 56 57 func newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *bloomFilter { 58 if header.Algorithm.Block != nil { 59 if header.Hash.XxHash != nil { 60 if header.Compression.Uncompressed != nil { 61 return &bloomFilter{ 62 SectionReader: *io.NewSectionReader(file, offset, int64(header.NumBytes)), 63 hash: bloom.XXH64{}, 64 check: bloom.CheckSplitBlock, 65 } 66 } 67 } 68 } 69 return nil 70 } 71 72 // The BloomFilterColumn interface is a declarative representation of bloom filters 73 // used when configuring filters on a parquet writer. 74 type BloomFilterColumn interface { 75 // Returns the path of the column that the filter applies to. 76 Path() []string 77 78 // Returns the hashing algorithm used when inserting values into a bloom 79 // filter. 80 Hash() bloom.Hash 81 82 // Returns an encoding which can be used to write columns of values to the 83 // filter. 84 Encoding() encoding.Encoding 85 86 // Returns the size of the filter needed to encode values in the filter, 87 // assuming each value will be encoded with the given number of bits. 88 Size(numValues int64) int 89 } 90 91 // SplitBlockFilter constructs a split block bloom filter object for the column 92 // at the given path, with the given bitsPerValue. 93 // 94 // If you are unsure what number of bitsPerValue to use, 10 is a reasonable 95 // tradeoff between size and error rate for common datasets. 96 // 97 // For more information on the tradeoff between size and error rate, consult 98 // this website: https://hur.st/bloomfilter/?n=4000&p=0.1&m=&k=1 99 func SplitBlockFilter(bitsPerValue uint, path ...string) BloomFilterColumn { 100 return splitBlockFilter{ 101 bitsPerValue: bitsPerValue, 102 path: path, 103 } 104 } 105 106 type splitBlockFilter struct { 107 bitsPerValue uint 108 path []string 109 } 110 111 func (f splitBlockFilter) Path() []string { return f.path } 112 func (f splitBlockFilter) Hash() bloom.Hash { return bloom.XXH64{} } 113 func (f splitBlockFilter) Encoding() encoding.Encoding { return splitBlockEncoding{} } 114 115 func (f splitBlockFilter) Size(numValues int64) int { 116 return bloom.BlockSize * bloom.NumSplitBlocksOf(numValues, f.bitsPerValue) 117 } 118 119 // Creates a header from the given bloom filter. 120 // 121 // For now there is only one type of filter supported, but we provide this 122 // function to suggest a model for extending the implementation if new filters 123 // are added to the parquet specs. 124 func bloomFilterHeader(filter BloomFilterColumn) (header format.BloomFilterHeader) { 125 switch filter.(type) { 126 case splitBlockFilter: 127 header.Algorithm.Block = &format.SplitBlockAlgorithm{} 128 } 129 switch filter.Hash().(type) { 130 case bloom.XXH64: 131 header.Hash.XxHash = &format.XxHash{} 132 } 133 header.Compression.Uncompressed = &format.BloomFilterUncompressed{} 134 return header 135 } 136 137 func searchBloomFilterColumn(filters []BloomFilterColumn, path columnPath) BloomFilterColumn { 138 for _, f := range filters { 139 if path.equal(f.Path()) { 140 return f 141 } 142 } 143 return nil 144 } 145 146 const ( 147 // Size of the stack buffer used to perform bulk operations on bloom filters. 148 // 149 // This value was determined as being a good default empirically, 150 // 128 x uint64 makes a 1KiB buffer which amortizes the cost of calling 151 // methods of bloom filters while not causing too much stack growth either. 152 filterEncodeBufferSize = 128 153 ) 154 155 type splitBlockEncoding struct { 156 encoding.NotSupported 157 } 158 159 func (splitBlockEncoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) { 160 splitBlockEncodeUint8(bloom.MakeSplitBlockFilter(dst), src) 161 return dst, nil 162 } 163 164 func (splitBlockEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { 165 splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.Int32ToUint32(src)) 166 return dst, nil 167 } 168 169 func (splitBlockEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) { 170 splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.Int64ToUint64(src)) 171 return dst, nil 172 } 173 174 func (e splitBlockEncoding) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) { 175 splitBlockEncodeFixedLenByteArray(bloom.MakeSplitBlockFilter(dst), deprecated.Int96ToBytes(src), 12) 176 return dst, nil 177 } 178 179 func (splitBlockEncoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) { 180 splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.Float32ToUint32(src)) 181 return dst, nil 182 } 183 184 func (splitBlockEncoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) { 185 splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.Float64ToUint64(src)) 186 return dst, nil 187 } 188 189 func (splitBlockEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) { 190 filter := bloom.MakeSplitBlockFilter(dst) 191 buffer := make([]uint64, 0, filterEncodeBufferSize) 192 baseOffset := offsets[0] 193 194 for _, endOffset := range offsets[1:] { 195 value := src[baseOffset:endOffset:endOffset] 196 baseOffset = endOffset 197 198 if len(buffer) == cap(buffer) { 199 filter.InsertBulk(buffer) 200 buffer = buffer[:0] 201 } 202 203 buffer = append(buffer, xxhash.Sum64(value)) 204 } 205 206 filter.InsertBulk(buffer) 207 return dst, nil 208 } 209 210 func (splitBlockEncoding) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) { 211 filter := bloom.MakeSplitBlockFilter(dst) 212 if size == 16 { 213 splitBlockEncodeUint128(filter, unsafecast.BytesToUint128(src)) 214 } else { 215 splitBlockEncodeFixedLenByteArray(filter, src, size) 216 } 217 return dst, nil 218 } 219 220 func splitBlockEncodeFixedLenByteArray(filter bloom.SplitBlockFilter, data []byte, size int) { 221 buffer := make([]uint64, 0, filterEncodeBufferSize) 222 223 for i, j := 0, size; j <= len(data); { 224 if len(buffer) == cap(buffer) { 225 filter.InsertBulk(buffer) 226 buffer = buffer[:0] 227 } 228 buffer = append(buffer, xxhash.Sum64(data[i:j])) 229 i += size 230 j += size 231 } 232 233 filter.InsertBulk(buffer) 234 } 235 236 func splitBlockEncodeUint8(filter bloom.SplitBlockFilter, values []uint8) { 237 buffer := make([]uint64, filterEncodeBufferSize) 238 239 for i := 0; i < len(values); { 240 n := xxhash.MultiSum64Uint8(buffer, values[i:]) 241 filter.InsertBulk(buffer[:n]) 242 i += n 243 } 244 } 245 246 func splitBlockEncodeUint32(filter bloom.SplitBlockFilter, values []uint32) { 247 buffer := make([]uint64, filterEncodeBufferSize) 248 249 for i := 0; i < len(values); { 250 n := xxhash.MultiSum64Uint32(buffer, values[i:]) 251 filter.InsertBulk(buffer[:n]) 252 i += n 253 } 254 } 255 256 func splitBlockEncodeUint64(filter bloom.SplitBlockFilter, values []uint64) { 257 buffer := make([]uint64, filterEncodeBufferSize) 258 259 for i := 0; i < len(values); { 260 n := xxhash.MultiSum64Uint64(buffer, values[i:]) 261 filter.InsertBulk(buffer[:n]) 262 i += n 263 } 264 } 265 266 func splitBlockEncodeUint128(filter bloom.SplitBlockFilter, values [][16]byte) { 267 buffer := make([]uint64, filterEncodeBufferSize) 268 269 for i := 0; i < len(values); { 270 n := xxhash.MultiSum64Uint128(buffer, values[i:]) 271 filter.InsertBulk(buffer[:n]) 272 i += n 273 } 274 }