github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom/filter_test.go

github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom/filter_test.go (about)

     1  package bloom_test
     2  
     3  import (
     4  	"bytes"
     5  	"math/rand"
     6  	"testing"
     7  
     8  	"github.com/segmentio/parquet-go/bloom"
     9  )
    10  
    11  func TestSplitBlockFilter(t *testing.T) {
    12  	const N = 1000
    13  	const S = 3
    14  	f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10))
    15  	p := rand.New(rand.NewSource(S))
    16  
    17  	// Half of the values are inserted individually.
    18  	for i := 0; i < N/2; i++ {
    19  		f.Insert(p.Uint64())
    20  	}
    21  	// The other half is inserted as a bulk operation.
    22  	b := make([]uint64, N/2)
    23  	for i := range b {
    24  		b[i] = p.Uint64()
    25  	}
    26  	f.InsertBulk(b)
    27  
    28  	if f.Block(0) == nil {
    29  		t.Fatal("looking up filter block returned impossible nil value")
    30  	}
    31  
    32  	for _, test := range []struct {
    33  		scenario string
    34  		filter   bloom.Filter
    35  	}{
    36  		{scenario: "filter", filter: f},
    37  		{scenario: "reader", filter: newSerializedFilter(f.Bytes())},
    38  	} {
    39  		t.Run(test.scenario, func(t *testing.T) {
    40  			p.Seed(S)
    41  			falsePositives := 0
    42  
    43  			for i := 0; i < N; i++ {
    44  				x := p.Uint64()
    45  
    46  				if !test.filter.Check(x) {
    47  					t.Fatalf("bloom filter block does not contain the value #%d that was inserted: %d", i, x)
    48  				}
    49  				if test.filter.Check(^x) {
    50  					falsePositives++
    51  				}
    52  			}
    53  
    54  			if r := (float64(falsePositives) / N); r > 0.01 {
    55  				t.Fatalf("bloom filter triggered too many false positives: %g%%", r*100)
    56  			}
    57  		})
    58  	}
    59  
    60  	t.Run("Reset", func(t *testing.T) {
    61  		allZeros := true
    62  		for _, b := range f.Bytes() {
    63  			if b != 0 {
    64  				allZeros = false
    65  				break
    66  			}
    67  		}
    68  		if allZeros {
    69  			t.Fatal("bloom filter bytes were all zero after inserting keys")
    70  		}
    71  		f.Reset()
    72  		for i, b := range f.Bytes() {
    73  			if b != 0 {
    74  				t.Fatalf("bloom filter byte at index %d was not zero after resetting the filter: %02X", i, b)
    75  			}
    76  		}
    77  	})
    78  }
    79  
    80  func TestSplitBlockFilterBug1(t *testing.T) {
    81  	// This test exercises the case where we bulk insert a single key in the
    82  	// filter, which skips the core of the optimized assembly routines and runs
    83  	// through the loop handling tails of remaining keys after consuming groups
    84  	// of two or more.
    85  	//
    86  	// The use of quick.Check in bloom filter tests of the parquet package had
    87  	// uncovered a bug which was reproduced here in isolation when debugging.
    88  	h := [1]uint64{0b1000101001000001001001111000000100011011001000011110011100110000}
    89  	f := make(bloom.SplitBlockFilter, 1)
    90  	f.InsertBulk(h[:])
    91  	if !f.Check(h[0]) {
    92  		t.Error("value inserted in the filter was not found")
    93  	}
    94  }
    95  
    96  type serializedFilter struct {
    97  	bytes.Reader
    98  }
    99  
   100  func (f *serializedFilter) Check(x uint64) bool {
   101  	ok, _ := bloom.CheckSplitBlock(&f.Reader, f.Size(), x)
   102  	return ok
   103  }
   104  
   105  func newSerializedFilter(b []byte) *serializedFilter {
   106  	f := new(serializedFilter)
   107  	f.Reset(b)
   108  	return f
   109  }
   110  
   111  func BenchmarkFilterInsertBulk(b *testing.B) {
   112  	f := make(bloom.SplitBlockFilter, 99)
   113  	x := make([]uint64, 16)
   114  	r := rand.NewSource(0).(rand.Source64)
   115  
   116  	for i := range x {
   117  		x[i] = r.Uint64()
   118  	}
   119  
   120  	for i := 0; i < b.N; i++ {
   121  		f.InsertBulk(x)
   122  	}
   123  
   124  	b.SetBytes(bloom.BlockSize * int64(len(x)))
   125  }
   126  
   127  func BenchmarkFilterInsert(b *testing.B) {
   128  	f := make(bloom.SplitBlockFilter, 1)
   129  	for i := 0; i < b.N; i++ {
   130  		f.Insert(uint64(i))
   131  	}
   132  	b.SetBytes(bloom.BlockSize)
   133  }
   134  
   135  func BenchmarkFilterCheck(b *testing.B) {
   136  	f := make(bloom.SplitBlockFilter, 1)
   137  	f.Insert(42)
   138  	for i := 0; i < b.N; i++ {
   139  		f.Check(42)
   140  	}
   141  	b.SetBytes(bloom.BlockSize)
   142  }