github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom/filter_test.go (about) 1 package bloom_test 2 3 import ( 4 "bytes" 5 "math/rand" 6 "testing" 7 8 "github.com/segmentio/parquet-go/bloom" 9 ) 10 11 func TestSplitBlockFilter(t *testing.T) { 12 const N = 1000 13 const S = 3 14 f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10)) 15 p := rand.New(rand.NewSource(S)) 16 17 // Half of the values are inserted individually. 18 for i := 0; i < N/2; i++ { 19 f.Insert(p.Uint64()) 20 } 21 // The other half is inserted as a bulk operation. 22 b := make([]uint64, N/2) 23 for i := range b { 24 b[i] = p.Uint64() 25 } 26 f.InsertBulk(b) 27 28 if f.Block(0) == nil { 29 t.Fatal("looking up filter block returned impossible nil value") 30 } 31 32 for _, test := range []struct { 33 scenario string 34 filter bloom.Filter 35 }{ 36 {scenario: "filter", filter: f}, 37 {scenario: "reader", filter: newSerializedFilter(f.Bytes())}, 38 } { 39 t.Run(test.scenario, func(t *testing.T) { 40 p.Seed(S) 41 falsePositives := 0 42 43 for i := 0; i < N; i++ { 44 x := p.Uint64() 45 46 if !test.filter.Check(x) { 47 t.Fatalf("bloom filter block does not contain the value #%d that was inserted: %d", i, x) 48 } 49 if test.filter.Check(^x) { 50 falsePositives++ 51 } 52 } 53 54 if r := (float64(falsePositives) / N); r > 0.01 { 55 t.Fatalf("bloom filter triggered too many false positives: %g%%", r*100) 56 } 57 }) 58 } 59 60 t.Run("Reset", func(t *testing.T) { 61 allZeros := true 62 for _, b := range f.Bytes() { 63 if b != 0 { 64 allZeros = false 65 break 66 } 67 } 68 if allZeros { 69 t.Fatal("bloom filter bytes were all zero after inserting keys") 70 } 71 f.Reset() 72 for i, b := range f.Bytes() { 73 if b != 0 { 74 t.Fatalf("bloom filter byte at index %d was not zero after resetting the filter: %02X", i, b) 75 } 76 } 77 }) 78 } 79 80 func TestSplitBlockFilterBug1(t *testing.T) { 81 // This test exercises the case where we bulk insert a single key in the 82 // filter, which skips the core of the optimized assembly routines and runs 83 // through the loop handling tails of remaining keys after consuming groups 84 // of two or more. 85 // 86 // The use of quick.Check in bloom filter tests of the parquet package had 87 // uncovered a bug which was reproduced here in isolation when debugging. 88 h := [1]uint64{0b1000101001000001001001111000000100011011001000011110011100110000} 89 f := make(bloom.SplitBlockFilter, 1) 90 f.InsertBulk(h[:]) 91 if !f.Check(h[0]) { 92 t.Error("value inserted in the filter was not found") 93 } 94 } 95 96 type serializedFilter struct { 97 bytes.Reader 98 } 99 100 func (f *serializedFilter) Check(x uint64) bool { 101 ok, _ := bloom.CheckSplitBlock(&f.Reader, f.Size(), x) 102 return ok 103 } 104 105 func newSerializedFilter(b []byte) *serializedFilter { 106 f := new(serializedFilter) 107 f.Reset(b) 108 return f 109 } 110 111 func BenchmarkFilterInsertBulk(b *testing.B) { 112 f := make(bloom.SplitBlockFilter, 99) 113 x := make([]uint64, 16) 114 r := rand.NewSource(0).(rand.Source64) 115 116 for i := range x { 117 x[i] = r.Uint64() 118 } 119 120 for i := 0; i < b.N; i++ { 121 f.InsertBulk(x) 122 } 123 124 b.SetBytes(bloom.BlockSize * int64(len(x))) 125 } 126 127 func BenchmarkFilterInsert(b *testing.B) { 128 f := make(bloom.SplitBlockFilter, 1) 129 for i := 0; i < b.N; i++ { 130 f.Insert(uint64(i)) 131 } 132 b.SetBytes(bloom.BlockSize) 133 } 134 135 func BenchmarkFilterCheck(b *testing.B) { 136 f := make(bloom.SplitBlockFilter, 1) 137 f.Insert(42) 138 for i := 0; i < b.N; i++ { 139 f.Check(42) 140 } 141 b.SetBytes(bloom.BlockSize) 142 }