github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/bloom_test.go (about)

     1  package parquet
     2  
     3  import (
     4  	"math/rand"
     5  	"testing"
     6  
     7  	"github.com/segmentio/parquet-go/bloom"
     8  	"github.com/segmentio/parquet-go/deprecated"
     9  	"github.com/segmentio/parquet-go/internal/quick"
    10  	"github.com/segmentio/parquet-go/internal/unsafecast"
    11  )
    12  
    13  func TestSplitBlockFilter(t *testing.T) {
    14  	newFilter := func(numValues int) bloom.SplitBlockFilter {
    15  		return make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(int64(numValues), 11))
    16  	}
    17  
    18  	enc := SplitBlockFilter(10, "$").Encoding()
    19  
    20  	check := func(filter bloom.SplitBlockFilter, value Value) bool {
    21  		return filter.Check(value.hash(&bloom.XXH64{}))
    22  	}
    23  
    24  	tests := []struct {
    25  		scenario string
    26  		function interface{}
    27  	}{
    28  		{
    29  			scenario: "BOOLEAN",
    30  			function: func(values []bool) bool {
    31  				filter := newFilter(len(values))
    32  				enc.EncodeBoolean(filter.Bytes(), unsafecast.BoolToBytes(values))
    33  				for _, v := range values {
    34  					if !check(filter, ValueOf(v)) {
    35  						return false
    36  					}
    37  				}
    38  				return true
    39  			},
    40  		},
    41  
    42  		{
    43  			scenario: "INT32",
    44  			function: func(values []int32) bool {
    45  				filter := newFilter(len(values))
    46  				enc.EncodeInt32(filter.Bytes(), values)
    47  				for _, v := range values {
    48  					if !check(filter, ValueOf(v)) {
    49  						return false
    50  					}
    51  				}
    52  				return true
    53  			},
    54  		},
    55  
    56  		{
    57  			scenario: "INT64",
    58  			function: func(values []int64) bool {
    59  				filter := newFilter(len(values))
    60  				enc.EncodeInt64(filter.Bytes(), values)
    61  				for _, v := range values {
    62  					if !check(filter, ValueOf(v)) {
    63  						return false
    64  					}
    65  				}
    66  				return true
    67  			},
    68  		},
    69  
    70  		{
    71  			scenario: "INT96",
    72  			function: func(values []deprecated.Int96) bool {
    73  				filter := newFilter(len(values))
    74  				enc.EncodeInt96(filter.Bytes(), values)
    75  				for _, v := range values {
    76  					if !check(filter, ValueOf(v)) {
    77  						return false
    78  					}
    79  				}
    80  				return true
    81  			},
    82  		},
    83  
    84  		{
    85  			scenario: "FLOAT",
    86  			function: func(values []float32) bool {
    87  				filter := newFilter(len(values))
    88  				enc.EncodeFloat(filter.Bytes(), values)
    89  				for _, v := range values {
    90  					if !check(filter, ValueOf(v)) {
    91  						return false
    92  					}
    93  				}
    94  				return true
    95  			},
    96  		},
    97  
    98  		{
    99  			scenario: "DOUBLE",
   100  			function: func(values []float64) bool {
   101  				filter := newFilter(len(values))
   102  				enc.EncodeDouble(filter.Bytes(), values)
   103  				for _, v := range values {
   104  					if !check(filter, ValueOf(v)) {
   105  						return false
   106  					}
   107  				}
   108  				return true
   109  			},
   110  		},
   111  
   112  		{
   113  			scenario: "BYTE_ARRAY",
   114  			function: func(values [][]byte) bool {
   115  				content := make([]byte, 0, 512)
   116  				offsets := make([]uint32, len(values))
   117  				for _, value := range values {
   118  					offsets = append(offsets, uint32(len(content)))
   119  					content = append(content, value...)
   120  				}
   121  				offsets = append(offsets, uint32(len(content)))
   122  				filter := newFilter(len(values))
   123  				enc.EncodeByteArray(filter.Bytes(), content, offsets)
   124  				for _, v := range values {
   125  					if !check(filter, ValueOf(v)) {
   126  						return false
   127  					}
   128  				}
   129  				return true
   130  			},
   131  		},
   132  
   133  		{
   134  			scenario: "FIXED_LEN_BYTE_ARRAY",
   135  			function: func(values []byte) bool {
   136  				filter := newFilter(len(values))
   137  				enc.EncodeFixedLenByteArray(filter.Bytes(), values, 1)
   138  				for _, v := range values {
   139  					if !check(filter, ValueOf([1]byte{v})) {
   140  						return false
   141  					}
   142  				}
   143  				return true
   144  			},
   145  		},
   146  	}
   147  
   148  	for _, test := range tests {
   149  		t.Run(test.scenario, func(t *testing.T) {
   150  			if err := quick.Check(test.function); err != nil {
   151  				t.Error(err)
   152  			}
   153  		})
   154  	}
   155  }
   156  
   157  func BenchmarkSplitBlockFilter(b *testing.B) {
   158  	const N = 1000
   159  	f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(N, 10)).Bytes()
   160  	e := SplitBlockFilter(10, "$").Encoding()
   161  
   162  	v := make([]int64, N)
   163  	r := rand.NewSource(10)
   164  	for i := range v {
   165  		v[i] = r.Int63()
   166  	}
   167  
   168  	for i := 0; i < b.N; i++ {
   169  		e.EncodeInt64(f, v)
   170  	}
   171  
   172  	b.SetBytes(8 * N)
   173  }