github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom.go

github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  
     6  	"github.com/segmentio/parquet-go/bloom"
     7  	"github.com/segmentio/parquet-go/bloom/xxhash"
     8  	"github.com/segmentio/parquet-go/deprecated"
     9  	"github.com/segmentio/parquet-go/encoding"
    10  	"github.com/segmentio/parquet-go/format"
    11  	"github.com/segmentio/parquet-go/internal/unsafecast"
    12  )
    13  
// BloomFilter is an interface allowing applications to test whether a key
// exists in a bloom filter.
type BloomFilter interface {
	// ReadAt is provided through io.ReaderAt as a mechanism to allow reading
	// the raw bits of the filter.
	io.ReaderAt

	// Size returns the size of the bloom filter (in bytes).
	Size() int64

	// Check tests whether the given value is present in the filter.
	//
	// A non-nil error may be returned if reading the filter failed. This may
	// happen if the filter was lazily loaded from a storage medium during the
	// call to Check, for example. Applications that can guarantee that the
	// filter was in memory at the time Check was called can safely ignore the
	// error, which would always be nil in this case.
	Check(value Value) (bool, error)
}
    33  
    34  type bloomFilter struct {
    35  	io.SectionReader
    36  	hash  bloom.Hash
    37  	check func(io.ReaderAt, int64, uint64) (bool, error)
    38  }
    39  
    40  func (f *bloomFilter) Check(v Value) (bool, error) {
    41  	return f.check(&f.SectionReader, f.Size(), v.hash(f.hash))
    42  }
    43  
    44  func (v Value) hash(h bloom.Hash) uint64 {
    45  	switch v.Kind() {
    46  	case Boolean:
    47  		return h.Sum64Uint8(v.byte())
    48  	case Int32, Float:
    49  		return h.Sum64Uint32(v.uint32())
    50  	case Int64, Double:
    51  		return h.Sum64Uint64(v.uint64())
    52  	default: // Int96, ByteArray, FixedLenByteArray, or null
    53  		return h.Sum64(v.byteArray())
    54  	}
    55  }
    56  
    57  func newBloomFilter(file io.ReaderAt, offset int64, header *format.BloomFilterHeader) *bloomFilter {
    58  	if header.Algorithm.Block != nil {
    59  		if header.Hash.XxHash != nil {
    60  			if header.Compression.Uncompressed != nil {
    61  				return &bloomFilter{
    62  					SectionReader: *io.NewSectionReader(file, offset, int64(header.NumBytes)),
    63  					hash:          bloom.XXH64{},
    64  					check:         bloom.CheckSplitBlock,
    65  				}
    66  			}
    67  		}
    68  	}
    69  	return nil
    70  }
    71  
// The BloomFilterColumn interface is a declarative representation of bloom filters
// used when configuring filters on a parquet writer.
type BloomFilterColumn interface {
	// Path returns the path of the column that the filter applies to.
	Path() []string

	// Hash returns the hashing algorithm used when inserting values into a
	// bloom filter.
	Hash() bloom.Hash

	// Encoding returns an encoding which can be used to write columns of
	// values to the filter.
	Encoding() encoding.Encoding

	// Size returns the size of the filter needed to hold numValues values.
	Size(numValues int64) int
}
    90  
    91  // SplitBlockFilter constructs a split block bloom filter object for the column
    92  // at the given path, with the given bitsPerValue.
    93  //
    94  // If you are unsure what number of bitsPerValue to use, 10 is a reasonable
    95  // tradeoff between size and error rate for common datasets.
    96  //
    97  // For more information on the tradeoff between size and error rate, consult
    98  // this website: https://hur.st/bloomfilter/?n=4000&p=0.1&m=&k=1
    99  func SplitBlockFilter(bitsPerValue uint, path ...string) BloomFilterColumn {
   100  	return splitBlockFilter{
   101  		bitsPerValue: bitsPerValue,
   102  		path:         path,
   103  	}
   104  }
   105  
// splitBlockFilter is the BloomFilterColumn value returned by
// SplitBlockFilter.
type splitBlockFilter struct {
	// Number of bits per value, passed to bloom.NumSplitBlocksOf when sizing
	// the filter.
	bitsPerValue uint
	// Path of the column that the filter applies to.
	path []string
}
   110  
   111  func (f splitBlockFilter) Path() []string              { return f.path }
   112  func (f splitBlockFilter) Hash() bloom.Hash            { return bloom.XXH64{} }
   113  func (f splitBlockFilter) Encoding() encoding.Encoding { return splitBlockEncoding{} }
   114  
   115  func (f splitBlockFilter) Size(numValues int64) int {
   116  	return bloom.BlockSize * bloom.NumSplitBlocksOf(numValues, f.bitsPerValue)
   117  }
   118  
   119  // Creates a header from the given bloom filter.
   120  //
   121  // For now there is only one type of filter supported, but we provide this
   122  // function to suggest a model for extending the implementation if new filters
   123  // are added to the parquet specs.
   124  func bloomFilterHeader(filter BloomFilterColumn) (header format.BloomFilterHeader) {
   125  	switch filter.(type) {
   126  	case splitBlockFilter:
   127  		header.Algorithm.Block = &format.SplitBlockAlgorithm{}
   128  	}
   129  	switch filter.Hash().(type) {
   130  	case bloom.XXH64:
   131  		header.Hash.XxHash = &format.XxHash{}
   132  	}
   133  	header.Compression.Uncompressed = &format.BloomFilterUncompressed{}
   134  	return header
   135  }
   136  
   137  func searchBloomFilterColumn(filters []BloomFilterColumn, path columnPath) BloomFilterColumn {
   138  	for _, f := range filters {
   139  		if path.equal(f.Path()) {
   140  			return f
   141  		}
   142  	}
   143  	return nil
   144  }
   145  
   146  const (
   147  	// Size of the stack buffer used to perform bulk operations on bloom filters.
   148  	//
   149  	// This value was determined as being a good default empirically,
   150  	// 128 x uint64 makes a 1KiB buffer which amortizes the cost of calling
   151  	// methods of bloom filters while not causing too much stack growth either.
   152  	filterEncodeBufferSize = 128
   153  )
   154  
// splitBlockEncoding is the encoding returned by splitBlockFilter.Encoding.
// Its Encode methods insert hashes of the source values into a split block
// filter constructed from the destination buffer.
type splitBlockEncoding struct {
	// Embedded so that the methods not overridden on splitBlockEncoding come
	// from encoding.NotSupported.
	encoding.NotSupported
}
   158  
   159  func (splitBlockEncoding) EncodeBoolean(dst []byte, src []byte) ([]byte, error) {
   160  	splitBlockEncodeUint8(bloom.MakeSplitBlockFilter(dst), src)
   161  	return dst, nil
   162  }
   163  
   164  func (splitBlockEncoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) {
   165  	splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.Int32ToUint32(src))
   166  	return dst, nil
   167  }
   168  
   169  func (splitBlockEncoding) EncodeInt64(dst []byte, src []int64) ([]byte, error) {
   170  	splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.Int64ToUint64(src))
   171  	return dst, nil
   172  }
   173  
   174  func (e splitBlockEncoding) EncodeInt96(dst []byte, src []deprecated.Int96) ([]byte, error) {
   175  	splitBlockEncodeFixedLenByteArray(bloom.MakeSplitBlockFilter(dst), deprecated.Int96ToBytes(src), 12)
   176  	return dst, nil
   177  }
   178  
   179  func (splitBlockEncoding) EncodeFloat(dst []byte, src []float32) ([]byte, error) {
   180  	splitBlockEncodeUint32(bloom.MakeSplitBlockFilter(dst), unsafecast.Float32ToUint32(src))
   181  	return dst, nil
   182  }
   183  
   184  func (splitBlockEncoding) EncodeDouble(dst []byte, src []float64) ([]byte, error) {
   185  	splitBlockEncodeUint64(bloom.MakeSplitBlockFilter(dst), unsafecast.Float64ToUint64(src))
   186  	return dst, nil
   187  }
   188  
   189  func (splitBlockEncoding) EncodeByteArray(dst []byte, src []byte, offsets []uint32) ([]byte, error) {
   190  	filter := bloom.MakeSplitBlockFilter(dst)
   191  	buffer := make([]uint64, 0, filterEncodeBufferSize)
   192  	baseOffset := offsets[0]
   193  
   194  	for _, endOffset := range offsets[1:] {
   195  		value := src[baseOffset:endOffset:endOffset]
   196  		baseOffset = endOffset
   197  
   198  		if len(buffer) == cap(buffer) {
   199  			filter.InsertBulk(buffer)
   200  			buffer = buffer[:0]
   201  		}
   202  
   203  		buffer = append(buffer, xxhash.Sum64(value))
   204  	}
   205  
   206  	filter.InsertBulk(buffer)
   207  	return dst, nil
   208  }
   209  
   210  func (splitBlockEncoding) EncodeFixedLenByteArray(dst []byte, src []byte, size int) ([]byte, error) {
   211  	filter := bloom.MakeSplitBlockFilter(dst)
   212  	if size == 16 {
   213  		splitBlockEncodeUint128(filter, unsafecast.BytesToUint128(src))
   214  	} else {
   215  		splitBlockEncodeFixedLenByteArray(filter, src, size)
   216  	}
   217  	return dst, nil
   218  }
   219  
   220  func splitBlockEncodeFixedLenByteArray(filter bloom.SplitBlockFilter, data []byte, size int) {
   221  	buffer := make([]uint64, 0, filterEncodeBufferSize)
   222  
   223  	for i, j := 0, size; j <= len(data); {
   224  		if len(buffer) == cap(buffer) {
   225  			filter.InsertBulk(buffer)
   226  			buffer = buffer[:0]
   227  		}
   228  		buffer = append(buffer, xxhash.Sum64(data[i:j]))
   229  		i += size
   230  		j += size
   231  	}
   232  
   233  	filter.InsertBulk(buffer)
   234  }
   235  
   236  func splitBlockEncodeUint8(filter bloom.SplitBlockFilter, values []uint8) {
   237  	buffer := make([]uint64, filterEncodeBufferSize)
   238  
   239  	for i := 0; i < len(values); {
   240  		n := xxhash.MultiSum64Uint8(buffer, values[i:])
   241  		filter.InsertBulk(buffer[:n])
   242  		i += n
   243  	}
   244  }
   245  
   246  func splitBlockEncodeUint32(filter bloom.SplitBlockFilter, values []uint32) {
   247  	buffer := make([]uint64, filterEncodeBufferSize)
   248  
   249  	for i := 0; i < len(values); {
   250  		n := xxhash.MultiSum64Uint32(buffer, values[i:])
   251  		filter.InsertBulk(buffer[:n])
   252  		i += n
   253  	}
   254  }
   255  
   256  func splitBlockEncodeUint64(filter bloom.SplitBlockFilter, values []uint64) {
   257  	buffer := make([]uint64, filterEncodeBufferSize)
   258  
   259  	for i := 0; i < len(values); {
   260  		n := xxhash.MultiSum64Uint64(buffer, values[i:])
   261  		filter.InsertBulk(buffer[:n])
   262  		i += n
   263  	}
   264  }
   265  
   266  func splitBlockEncodeUint128(filter bloom.SplitBlockFilter, values [][16]byte) {
   267  	buffer := make([]uint64, filterEncodeBufferSize)
   268  
   269  	for i := 0; i < len(values); {
   270  		n := xxhash.MultiSum64Uint128(buffer, values[i:])
   271  		filter.InsertBulk(buffer[:n])
   272  		i += n
   273  	}
   274  }