github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary_test.go

github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"testing"
     7  	"time"
     8  
     9  	"github.com/vc42/parquet-go"
    10  )
    11  
    12  var dictionaryTypes = [...]parquet.Type{
    13  	parquet.BooleanType,
    14  	parquet.Int32Type,
    15  	parquet.Int64Type,
    16  	parquet.Int96Type,
    17  	parquet.FloatType,
    18  	parquet.DoubleType,
    19  	parquet.ByteArrayType,
    20  	parquet.FixedLenByteArrayType(10),
    21  	parquet.FixedLenByteArrayType(16),
    22  	parquet.Uint(32).Type(),
    23  	parquet.Uint(64).Type(),
    24  }
    25  
    26  func TestDictionary(t *testing.T) {
    27  	for _, typ := range dictionaryTypes {
    28  		for _, numValues := range []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1e2, 1e3, 1e4} {
    29  			t.Run(fmt.Sprintf("%s/N=%d", typ, numValues), func(t *testing.T) {
    30  				testDictionary(t, typ, numValues)
    31  			})
    32  		}
    33  	}
    34  }
    35  
    36  func testDictionary(t *testing.T, typ parquet.Type, numValues int) {
    37  	const columnIndex = 1
    38  
    39  	dict := typ.NewDictionary(columnIndex, 0, nil)
    40  	values := make([]parquet.Value, numValues)
    41  	indexes := make([]int32, numValues)
    42  	lookups := make([]parquet.Value, numValues)
    43  
    44  	f := randValueFuncOf(typ)
    45  	r := rand.New(rand.NewSource(int64(numValues)))
    46  
    47  	for i := range values {
    48  		values[i] = f(r)
    49  		values[i] = values[i].Level(0, 0, columnIndex)
    50  	}
    51  
    52  	mapping := make(map[int32]parquet.Value, numValues)
    53  
    54  	for i := 0; i < numValues; {
    55  		j := i + ((numValues-i)/2 + 1)
    56  		if j > numValues {
    57  			j = numValues
    58  		}
    59  
    60  		dict.Insert(indexes[i:j], values[i:j])
    61  
    62  		for k, v := range values[i:j] {
    63  			mapping[indexes[i+k]] = v
    64  		}
    65  
    66  		for _, index := range indexes[i:j] {
    67  			if index < 0 || index >= int32(dict.Len()) {
    68  				t.Fatalf("index out of bounds: %d", index)
    69  			}
    70  		}
    71  
    72  		r.Shuffle(j-i, func(a, b int) {
    73  			indexes[a+i], indexes[b+i] = indexes[b+i], indexes[a+i]
    74  		})
    75  
    76  		dict.Lookup(indexes[i:j], lookups[i:j])
    77  
    78  		for lookupIndex, valueIndex := range indexes[i:j] {
    79  			want := mapping[valueIndex]
    80  			got := lookups[lookupIndex+i]
    81  
    82  			if !parquet.DeepEqual(want, got) {
    83  				t.Fatalf("wrong value looked up at index %d: want=%#v got=%#v", valueIndex, want, got)
    84  			}
    85  		}
    86  
    87  		minValue := values[i]
    88  		maxValue := values[i]
    89  
    90  		for _, value := range values[i+1 : j] {
    91  			switch {
    92  			case typ.Compare(value, minValue) < 0:
    93  				minValue = value
    94  			case typ.Compare(value, maxValue) > 0:
    95  				maxValue = value
    96  			}
    97  		}
    98  
    99  		lowerBound, upperBound := dict.Bounds(indexes[i:j])
   100  		if !parquet.DeepEqual(lowerBound, minValue) {
   101  			t.Errorf("wrong lower bound betwen indexes %d and %d: want=%#v got=%#v", i, j, minValue, lowerBound)
   102  		}
   103  		if !parquet.DeepEqual(upperBound, maxValue) {
   104  			t.Errorf("wrong upper bound between indexes %d and %d: want=%#v got=%#v", i, j, maxValue, upperBound)
   105  		}
   106  
   107  		i = j
   108  	}
   109  
   110  	for i := range lookups {
   111  		lookups[i] = parquet.Value{}
   112  	}
   113  
   114  	dict.Lookup(indexes, lookups)
   115  
   116  	for lookupIndex, valueIndex := range indexes {
   117  		want := mapping[valueIndex]
   118  		got := lookups[lookupIndex]
   119  
   120  		if !parquet.Equal(want, got) {
   121  			t.Fatalf("wrong value looked up at index %d: want=%+v got=%+v", valueIndex, want, got)
   122  		}
   123  	}
   124  }
   125  
   126  func BenchmarkDictionary(b *testing.B) {
   127  	tests := []struct {
   128  		scenario string
   129  		init     func(parquet.Dictionary, []int32, []parquet.Value)
   130  		test     func(parquet.Dictionary, []int32, []parquet.Value)
   131  	}{
   132  		{
   133  			scenario: "Bounds",
   134  			init:     parquet.Dictionary.Insert,
   135  			test: func(dict parquet.Dictionary, indexes []int32, _ []parquet.Value) {
   136  				dict.Bounds(indexes)
   137  			},
   138  		},
   139  
   140  		{
   141  			scenario: "Insert",
   142  			test:     parquet.Dictionary.Insert,
   143  		},
   144  
   145  		{
   146  			scenario: "Lookup",
   147  			init:     parquet.Dictionary.Insert,
   148  			test:     parquet.Dictionary.Lookup,
   149  		},
   150  	}
   151  
   152  	for i, test := range tests {
   153  		b.Run(test.scenario, func(b *testing.B) {
   154  			for j, typ := range dictionaryTypes {
   155  				for _, numValues := range []int{1e2, 1e3, 1e4, 1e5, 1e6} {
   156  					dict := typ.NewDictionary(0, 0, make([]byte, 0, 4*numValues))
   157  					values := make([]parquet.Value, numValues)
   158  
   159  					f := randValueFuncOf(typ)
   160  					r := rand.New(rand.NewSource(int64(i * j * numValues)))
   161  
   162  					for i := range values {
   163  						values[i] = f(r)
   164  					}
   165  
   166  					indexes := make([]int32, len(values))
   167  					if test.init != nil {
   168  						test.init(dict, indexes, values)
   169  					}
   170  
   171  					b.Run(fmt.Sprintf("%s/N=%d", typ, numValues), func(b *testing.B) {
   172  						start := time.Now()
   173  
   174  						for i := 0; i < b.N; i++ {
   175  							test.test(dict, indexes, values)
   176  						}
   177  
   178  						seconds := time.Since(start).Seconds()
   179  						b.ReportMetric(float64(numValues*b.N)/seconds, "value/s")
   180  					})
   181  				}
   182  			}
   183  		})
   184  	}
   185  }