github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary_test.go (about) 1 package parquet_test 2 3 import ( 4 "fmt" 5 "math/rand" 6 "testing" 7 "time" 8 9 "github.com/vc42/parquet-go" 10 ) 11 12 var dictionaryTypes = [...]parquet.Type{ 13 parquet.BooleanType, 14 parquet.Int32Type, 15 parquet.Int64Type, 16 parquet.Int96Type, 17 parquet.FloatType, 18 parquet.DoubleType, 19 parquet.ByteArrayType, 20 parquet.FixedLenByteArrayType(10), 21 parquet.FixedLenByteArrayType(16), 22 parquet.Uint(32).Type(), 23 parquet.Uint(64).Type(), 24 } 25 26 func TestDictionary(t *testing.T) { 27 for _, typ := range dictionaryTypes { 28 for _, numValues := range []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1e2, 1e3, 1e4} { 29 t.Run(fmt.Sprintf("%s/N=%d", typ, numValues), func(t *testing.T) { 30 testDictionary(t, typ, numValues) 31 }) 32 } 33 } 34 } 35 36 func testDictionary(t *testing.T, typ parquet.Type, numValues int) { 37 const columnIndex = 1 38 39 dict := typ.NewDictionary(columnIndex, 0, nil) 40 values := make([]parquet.Value, numValues) 41 indexes := make([]int32, numValues) 42 lookups := make([]parquet.Value, numValues) 43 44 f := randValueFuncOf(typ) 45 r := rand.New(rand.NewSource(int64(numValues))) 46 47 for i := range values { 48 values[i] = f(r) 49 values[i] = values[i].Level(0, 0, columnIndex) 50 } 51 52 mapping := make(map[int32]parquet.Value, numValues) 53 54 for i := 0; i < numValues; { 55 j := i + ((numValues-i)/2 + 1) 56 if j > numValues { 57 j = numValues 58 } 59 60 dict.Insert(indexes[i:j], values[i:j]) 61 62 for k, v := range values[i:j] { 63 mapping[indexes[i+k]] = v 64 } 65 66 for _, index := range indexes[i:j] { 67 if index < 0 || index >= int32(dict.Len()) { 68 t.Fatalf("index out of bounds: %d", index) 69 } 70 } 71 72 r.Shuffle(j-i, func(a, b int) { 73 indexes[a+i], indexes[b+i] = indexes[b+i], indexes[a+i] 74 }) 75 76 dict.Lookup(indexes[i:j], lookups[i:j]) 77 78 for lookupIndex, valueIndex := range indexes[i:j] { 79 want := mapping[valueIndex] 80 got := lookups[lookupIndex+i] 81 82 if !parquet.DeepEqual(want, got) { 83 t.Fatalf("wrong value looked up at index %d: want=%#v got=%#v", valueIndex, want, got) 84 } 85 } 86 87 minValue := values[i] 88 maxValue := values[i] 89 90 for _, value := range values[i+1 : j] { 91 switch { 92 case typ.Compare(value, minValue) < 0: 93 minValue = value 94 case typ.Compare(value, maxValue) > 0: 95 maxValue = value 96 } 97 } 98 99 lowerBound, upperBound := dict.Bounds(indexes[i:j]) 100 if !parquet.DeepEqual(lowerBound, minValue) { 101 t.Errorf("wrong lower bound betwen indexes %d and %d: want=%#v got=%#v", i, j, minValue, lowerBound) 102 } 103 if !parquet.DeepEqual(upperBound, maxValue) { 104 t.Errorf("wrong upper bound between indexes %d and %d: want=%#v got=%#v", i, j, maxValue, upperBound) 105 } 106 107 i = j 108 } 109 110 for i := range lookups { 111 lookups[i] = parquet.Value{} 112 } 113 114 dict.Lookup(indexes, lookups) 115 116 for lookupIndex, valueIndex := range indexes { 117 want := mapping[valueIndex] 118 got := lookups[lookupIndex] 119 120 if !parquet.Equal(want, got) { 121 t.Fatalf("wrong value looked up at index %d: want=%+v got=%+v", valueIndex, want, got) 122 } 123 } 124 } 125 126 func BenchmarkDictionary(b *testing.B) { 127 tests := []struct { 128 scenario string 129 init func(parquet.Dictionary, []int32, []parquet.Value) 130 test func(parquet.Dictionary, []int32, []parquet.Value) 131 }{ 132 { 133 scenario: "Bounds", 134 init: parquet.Dictionary.Insert, 135 test: func(dict parquet.Dictionary, indexes []int32, _ []parquet.Value) { 136 dict.Bounds(indexes) 137 }, 138 }, 139 140 { 141 scenario: "Insert", 142 test: parquet.Dictionary.Insert, 143 }, 144 145 { 146 scenario: "Lookup", 147 init: parquet.Dictionary.Insert, 148 test: parquet.Dictionary.Lookup, 149 }, 150 } 151 152 for i, test := range tests { 153 b.Run(test.scenario, func(b *testing.B) { 154 for j, typ := range dictionaryTypes { 155 for _, numValues := range []int{1e2, 1e3, 1e4, 1e5, 1e6} { 156 dict := typ.NewDictionary(0, 0, make([]byte, 0, 4*numValues)) 157 values := make([]parquet.Value, numValues) 158 159 f := randValueFuncOf(typ) 160 r := rand.New(rand.NewSource(int64(i * j * numValues))) 161 162 for i := range values { 163 values[i] = f(r) 164 } 165 166 indexes := make([]int32, len(values)) 167 if test.init != nil { 168 test.init(dict, indexes, values) 169 } 170 171 b.Run(fmt.Sprintf("%s/N=%d", typ, numValues), func(b *testing.B) { 172 start := time.Now() 173 174 for i := 0; i < b.N; i++ { 175 test.test(dict, indexes, values) 176 } 177 178 seconds := time.Since(start).Seconds() 179 b.ReportMetric(float64(numValues*b.N)/seconds, "value/s") 180 }) 181 } 182 } 183 }) 184 } 185 }