github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/bloom.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  
     6  	"github.com/vc42/parquet-go/bloom"
     7  	"github.com/vc42/parquet-go/bloom/xxhash"
     8  	"github.com/vc42/parquet-go/encoding"
     9  	"github.com/vc42/parquet-go/encoding/plain"
    10  	"github.com/vc42/parquet-go/format"
    11  	"github.com/vc42/parquet-go/internal/unsafecast"
    12  )
    13  
// BloomFilter is an interface allowing applications to test whether a key
// exists in a bloom filter.
type BloomFilter interface {
	// Implement the io.ReaderAt interface as a mechanism to allow reading the
	// raw bits of the filter.
	io.ReaderAt

	// Size returns the size of the bloom filter (in bytes).
	Size() int64

	// Check tests whether the given value is present in the filter.
	//
	// A non-nil error may be returned if reading the filter failed. This may
	// happen if the filter was lazily loaded from a storage medium during the
	// call to Check for example. Applications that can guarantee that the
	// filter was in memory at the time Check was called can safely ignore the
	// error, which would always be nil in this case.
	Check(value Value) (bool, error)
}
    33  
// bloomFilter is the BloomFilter implementation returned by newBloomFilter:
// it reads the raw filter bits through the embedded io.SectionReader and
// delegates membership tests to the configured check function.
type bloomFilter struct {
	io.SectionReader
	// hash is the algorithm used to hash values before testing membership.
	hash  bloom.Hash
	// check tests whether the hash (3rd argument) is present in a filter of
	// the given size in bytes (2nd argument) read from the io.ReaderAt.
	check func(io.ReaderAt, int64, uint64) (bool, error)
}
    39  
    40  func (f *bloomFilter) Check(v Value) (bool, error) {
    41  	return f.check(&f.SectionReader, f.Size(), v.hash(f.hash))
    42  }
    43  
// hash returns the hash of the value using the given hashing algorithm.
//
// The representation fed to the hash matches the width of the physical type:
// booleans use the low byte of u64, 32-bit kinds the low 4 bytes, 64-bit
// kinds the full u64, and variable-length kinds their raw byte content.
func (v Value) hash(h bloom.Hash) uint64 {
	switch v.Kind() {
	case Boolean:
		return h.Sum64Uint8(uint8(v.u64))
	case Int32, Float:
		// Int32 and Float both store their bits in the low 32 bits of u64.
		return h.Sum64Uint32(uint32(v.u64))
	case Int64, Double:
		return h.Sum64Uint64(v.u64)
	case Int96:
		return h.Sum64(v.Bytes())
	default:
		// Remaining kinds (byte arrays, etc.) hash their raw bytes.
		return h.Sum64(v.ByteArray())
	}
}
    58  
    59  func newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *bloomFilter {
    60  	if header.Algorithm.Block != nil {
    61  		if header.Hash.XxHash != nil {
    62  			if header.Compression.Uncompressed != nil {
    63  				return &bloomFilter{
    64  					SectionReader: *io.NewSectionReader(file, offset, int64(header.NumBytes)),
    65  					hash:          bloom.XXH64{},
    66  					check:         bloom.CheckSplitBlock,
    67  				}
    68  			}
    69  		}
    70  	}
    71  	return nil
    72  }
    73  
// The BloomFilterColumn interface is a declarative representation of bloom filters
// used when configuring filters on a parquet writer.
type BloomFilterColumn interface {
	// Path returns the path of the column that the filter applies to.
	Path() []string

	// Hash returns the hashing algorithm used when inserting values into a
	// bloom filter.
	Hash() bloom.Hash

	// Encoding returns an encoding which can be used to write columns of
	// values to the filter.
	Encoding() encoding.Encoding

	// Size returns the size of the filter needed to encode values in the
	// filter, assuming each value will be encoded with the given number of
	// bits.
	Size(numValues int64, bitsPerValue uint) int
}
    92  
// SplitBlockFilter constructs a split block bloom filter object for the column
// at the given path.
func SplitBlockFilter(path ...string) BloomFilterColumn { return splitBlockFilter(path) }

// splitBlockFilter implements BloomFilterColumn for split-block bloom
// filters; the slice holds the path of the column the filter applies to.
type splitBlockFilter []string

// Path returns the column path the filter applies to.
func (f splitBlockFilter) Path() []string              { return f }

// Hash returns the xxhash-64 algorithm used by split-block filters.
func (f splitBlockFilter) Hash() bloom.Hash            { return bloom.XXH64{} }

// Encoding returns the encoding which inserts values into the filter.
func (f splitBlockFilter) Encoding() encoding.Encoding { return splitBlockEncoding{} }

// Size returns the filter size in bytes, one bloom.BlockSize per split block.
func (f splitBlockFilter) Size(numValues int64, bitsPerValue uint) int {
	return bloom.BlockSize * bloom.NumSplitBlocksOf(numValues, bitsPerValue)
}
   105  
   106  // Creates a header from the given bloom filter.
   107  //
   108  // For now there is only one type of filter supported, but we provide this
   109  // function to suggest a model for extending the implementation if new filters
   110  // are added to the parquet specs.
   111  func bloomFilterHeader(filter BloomFilterColumn) (header format.BloomFilterHeader) {
   112  	switch filter.(type) {
   113  	case splitBlockFilter:
   114  		header.Algorithm.Block = &format.SplitBlockAlgorithm{}
   115  	}
   116  	switch filter.Hash().(type) {
   117  	case bloom.XXH64:
   118  		header.Hash.XxHash = &format.XxHash{}
   119  	}
   120  	header.Compression.Uncompressed = &format.BloomFilterUncompressed{}
   121  	return header
   122  }
   123  
   124  func searchBloomFilterColumn(filters []BloomFilterColumn, path columnPath) BloomFilterColumn {
   125  	for _, f := range filters {
   126  		if path.equal(f.Path()) {
   127  			return f
   128  		}
   129  	}
   130  	return nil
   131  }
   132  
const (
	// filterEncodeBufferSize is the size (in uint64 hashes) of the stack
	// buffer used to perform bulk operations on bloom filters.
	//
	// This value was determined as being a good default empirically,
	// 128 x uint64 makes a 1KiB buffer which amortizes the cost of calling
	// methods of bloom filters while not causing too much stack growth either.
	filterEncodeBufferSize = 128
)
   141  
// splitBlockEncoding is a pseudo-encoding which inserts hashed values into a
// split block bloom filter (laid out in dst) instead of producing an encoded
// representation; methods not overridden here fall back to the embedded
// encoding.NotSupported.
type splitBlockEncoding struct {
	encoding.NotSupported
}
   145  
   146  func (splitBlockEncoding) EncodeBoolean(dst, src []byte) ([]byte, error) {
   147  	splitBlockEncodeUint8(bloom.MakeSplitBlockFilter(dst), src)
   148  	return dst, nil
   149  }
   150  
   151  func (splitBlockEncoding) EncodeInt32(dst, src []byte) ([]byte, error) {
   152  	splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint32(src))
   153  	return dst, nil
   154  }
   155  
   156  func (splitBlockEncoding) EncodeInt64(dst, src []byte) ([]byte, error) {
   157  	splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint64(src))
   158  	return dst, nil
   159  }
   160  
   161  func (e splitBlockEncoding) EncodeInt96(dst, src []byte) ([]byte, error) {
   162  	splitBlockEncodeFixedLenByteArray(bloom.MakeSplitBlockFilter(dst), src, 12)
   163  	return dst, nil
   164  }
   165  
   166  func (splitBlockEncoding) EncodeFloat(dst, src []byte) ([]byte, error) {
   167  	splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint32(src))
   168  	return dst, nil
   169  }
   170  
   171  func (splitBlockEncoding) EncodeDouble(dst, src []byte) ([]byte, error) {
   172  	splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.BytesToUint64(src))
   173  	return dst, nil
   174  }
   175  
   176  func (splitBlockEncoding) EncodeByteArray(dst, src []byte) ([]byte, error) {
   177  	filter := bloom.MakeSplitBlockFilter(dst)
   178  	buffer := make([]uint64, 0, filterEncodeBufferSize)
   179  
   180  	err := plain.RangeByteArray(src, func(value []byte) error {
   181  		if len(buffer) == cap(buffer) {
   182  			filter.InsertBulk(buffer)
   183  			buffer = buffer[:0]
   184  		}
   185  		buffer = append(buffer, xxhash.Sum64(value))
   186  		return nil
   187  	})
   188  
   189  	filter.InsertBulk(buffer)
   190  	return dst, err
   191  }
   192  
   193  func (splitBlockEncoding) EncodeFixedLenByteArray(dst, src []byte, size int) ([]byte, error) {
   194  	filter := bloom.MakeSplitBlockFilter(dst)
   195  	if size == 16 {
   196  		splitBlockEncodeUint128(filter, unsafecast.BytesToUint128(src))
   197  	} else {
   198  		splitBlockEncodeFixedLenByteArray(filter, src, size)
   199  	}
   200  	return dst, nil
   201  }
   202  
   203  func splitBlockEncodeFixedLenByteArray(filter bloom.SplitBlockFilter, data []byte, size int) {
   204  	buffer := make([]uint64, 0, filterEncodeBufferSize)
   205  
   206  	for i, j := 0, size; j <= len(data); {
   207  		if len(buffer) == cap(buffer) {
   208  			filter.InsertBulk(buffer)
   209  			buffer = buffer[:0]
   210  		}
   211  		buffer = append(buffer, xxhash.Sum64(data[i:j]))
   212  		i += size
   213  		j += size
   214  	}
   215  
   216  	filter.InsertBulk(buffer)
   217  }
   218  
   219  func splitBlockEncodeUint8(filter bloom.SplitBlockFilter, values []uint8) {
   220  	buffer := make([]uint64, filterEncodeBufferSize)
   221  
   222  	for i := 0; i < len(values); {
   223  		n := xxhash.MultiSum64Uint8(buffer, values[i:])
   224  		filter.InsertBulk(buffer[:n])
   225  		i += n
   226  	}
   227  }
   228  
   229  func splitBlockEncodeUint32(filter bloom.SplitBlockFilter, values []uint32) {
   230  	buffer := make([]uint64, filterEncodeBufferSize)
   231  
   232  	for i := 0; i < len(values); {
   233  		n := xxhash.MultiSum64Uint32(buffer, values[i:])
   234  		filter.InsertBulk(buffer[:n])
   235  		i += n
   236  	}
   237  }
   238  
   239  func splitBlockEncodeUint64(filter bloom.SplitBlockFilter, values []uint64) {
   240  	buffer := make([]uint64, filterEncodeBufferSize)
   241  
   242  	for i := 0; i < len(values); {
   243  		n := xxhash.MultiSum64Uint64(buffer, values[i:])
   244  		filter.InsertBulk(buffer[:n])
   245  		i += n
   246  	}
   247  }
   248  
   249  func splitBlockEncodeUint128(filter bloom.SplitBlockFilter, values [][16]byte) {
   250  	buffer := make([]uint64, filterEncodeBufferSize)
   251  
   252  	for i := 0; i < len(values); {
   253  		n := xxhash.MultiSum64Uint128(buffer, values[i:])
   254  		filter.InsertBulk(buffer[:n])
   255  		i += n
   256  	}
   257  }