github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/buffer_go18_test.go (about) 1 //go:build go1.18 2 3 package parquet_test 4 5 import ( 6 "encoding/binary" 7 "errors" 8 "fmt" 9 "io" 10 "math/rand" 11 "reflect" 12 "sort" 13 "testing" 14 15 "github.com/segmentio/parquet-go" 16 ) 17 18 func TestGenericBuffer(t *testing.T) { 19 testGenericBuffer[booleanColumn](t) 20 testGenericBuffer[int32Column](t) 21 testGenericBuffer[int64Column](t) 22 testGenericBuffer[int96Column](t) 23 testGenericBuffer[floatColumn](t) 24 testGenericBuffer[doubleColumn](t) 25 testGenericBuffer[byteArrayColumn](t) 26 testGenericBuffer[fixedLenByteArrayColumn](t) 27 testGenericBuffer[stringColumn](t) 28 testGenericBuffer[indexedStringColumn](t) 29 testGenericBuffer[uuidColumn](t) 30 testGenericBuffer[timeColumn](t) 31 testGenericBuffer[timeInMillisColumn](t) 32 testGenericBuffer[mapColumn](t) 33 testGenericBuffer[decimalColumn](t) 34 testGenericBuffer[addressBook](t) 35 testGenericBuffer[contact](t) 36 testGenericBuffer[listColumn2](t) 37 testGenericBuffer[listColumn1](t) 38 testGenericBuffer[listColumn0](t) 39 testGenericBuffer[nestedListColumn1](t) 40 testGenericBuffer[nestedListColumn](t) 41 testGenericBuffer[*contact](t) 42 testGenericBuffer[paddedBooleanColumn](t) 43 testGenericBuffer[optionalInt32Column](t) 44 testGenericBuffer[repeatedInt32Column](t) 45 } 46 47 func testGenericBuffer[Row any](t *testing.T) { 48 var model Row 49 t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) { 50 err := quickCheck(func(rows []Row) bool { 51 if len(rows) == 0 { 52 return true // TODO: fix support for parquet files with zero rows 53 } 54 if err := testGenericBufferRows(rows); err != nil { 55 t.Error(err) 56 return false 57 } 58 return true 59 }) 60 if err != nil { 61 t.Error(err) 62 } 63 }) 64 } 65 66 func testGenericBufferRows[Row any](rows []Row) error { 67 setNullPointers(rows) 68 buffer := parquet.NewGenericBuffer[Row]() 69 _, err := buffer.Write(rows) 70 if err != nil { 71 return err 72 } 73 reader := parquet.NewGenericRowGroupReader[Row](buffer) 74 result := make([]Row, len(rows)) 75 n, err := reader.Read(result) 76 if err != nil && !errors.Is(err, io.EOF) { 77 return err 78 } 79 if n < len(rows) { 80 return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n) 81 } 82 if !reflect.DeepEqual(rows, result) { 83 return fmt.Errorf("rows mismatch:\nwant: %#v\ngot: %#v", rows, result) 84 } 85 return nil 86 } 87 88 func setNullPointers[Row any](rows []Row) { 89 if len(rows) > 0 && reflect.TypeOf(rows[0]).Kind() == reflect.Pointer { 90 for i := range rows { 91 v := reflect.ValueOf(&rows[i]).Elem() 92 if v.IsNil() { 93 v.Set(reflect.New(v.Type().Elem())) 94 } 95 } 96 } 97 } 98 99 type generator[T any] interface { 100 generate(*rand.Rand) T 101 } 102 103 func BenchmarkGenericBuffer(b *testing.B) { 104 benchmarkGenericBuffer[benchmarkRowType](b) 105 benchmarkGenericBuffer[booleanColumn](b) 106 benchmarkGenericBuffer[int32Column](b) 107 benchmarkGenericBuffer[int64Column](b) 108 benchmarkGenericBuffer[floatColumn](b) 109 benchmarkGenericBuffer[doubleColumn](b) 110 benchmarkGenericBuffer[byteArrayColumn](b) 111 benchmarkGenericBuffer[fixedLenByteArrayColumn](b) 112 benchmarkGenericBuffer[stringColumn](b) 113 benchmarkGenericBuffer[indexedStringColumn](b) 114 benchmarkGenericBuffer[uuidColumn](b) 115 benchmarkGenericBuffer[timeColumn](b) 116 benchmarkGenericBuffer[timeInMillisColumn](b) 117 benchmarkGenericBuffer[mapColumn](b) 118 benchmarkGenericBuffer[decimalColumn](b) 119 benchmarkGenericBuffer[contact](b) 120 benchmarkGenericBuffer[paddedBooleanColumn](b) 121 benchmarkGenericBuffer[optionalInt32Column](b) 122 benchmarkGenericBuffer[repeatedInt32Column](b) 123 } 124 125 func benchmarkGenericBuffer[Row generator[Row]](b *testing.B) { 126 var model Row 127 b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) { 128 prng := rand.New(rand.NewSource(0)) 129 rows := make([]Row, benchmarkNumRows) 130 for i := range rows { 131 rows[i] = rows[i].generate(prng) 132 } 133 134 b.Run("go1.17", func(b *testing.B) { 135 buffer := parquet.NewBuffer(parquet.SchemaOf(rows[0])) 136 i := 0 137 benchmarkRowsPerSecond(b, func() int { 138 for j := 0; j < benchmarkRowsPerStep; j++ { 139 if err := buffer.Write(&rows[i]); err != nil { 140 b.Fatal(err) 141 } 142 } 143 144 i += benchmarkRowsPerStep 145 i %= benchmarkNumRows 146 147 if i == 0 { 148 buffer.Reset() 149 } 150 return benchmarkRowsPerStep 151 }) 152 }) 153 154 b.Run("go1.18", func(b *testing.B) { 155 buffer := parquet.NewGenericBuffer[Row]() 156 i := 0 157 benchmarkRowsPerSecond(b, func() int { 158 n, err := buffer.Write(rows[i : i+benchmarkRowsPerStep]) 159 if err != nil { 160 b.Fatal(err) 161 } 162 163 i += benchmarkRowsPerStep 164 i %= benchmarkNumRows 165 166 if i == 0 { 167 buffer.Reset() 168 } 169 return n 170 }) 171 }) 172 }) 173 } 174 175 func TestIssue327(t *testing.T) { 176 t.Run("untagged nested lists should panic", func(t *testing.T) { 177 type testType struct { 178 ListOfLists [][]int 179 } 180 181 defer func() { 182 if r := recover(); r == nil { 183 t.Errorf("Nested lists without the list tag should panic") 184 } 185 }() 186 187 _ = parquet.NewGenericBuffer[testType]() 188 }) 189 } 190 191 func TestIssue346(t *testing.T) { 192 type TestType struct { 193 Key int 194 } 195 196 schema := parquet.SchemaOf(TestType{}) 197 buffer := parquet.NewGenericBuffer[any](schema) 198 199 data := make([]any, 1) 200 data[0] = TestType{Key: 0} 201 _, _ = buffer.Write(data) 202 } 203 204 func TestIssue347(t *testing.T) { 205 type TestType struct { 206 Key int 207 } 208 209 // instantiating with concrete type shouldn't panic 210 _ = parquet.NewGenericBuffer[TestType]() 211 212 // instantiating with schema and interface type parameter shouldn't panic 213 schema := parquet.SchemaOf(TestType{}) 214 _ = parquet.NewGenericBuffer[any](schema) 215 216 defer func() { 217 if r := recover(); r == nil { 218 t.Errorf("instantiating generic buffer without schema and with interface " + 219 "type parameter should panic") 220 } 221 }() 222 _ = parquet.NewGenericBuffer[any]() 223 } 224 225 func BenchmarkSortGenericBuffer(b *testing.B) { 226 type Row struct { 227 I0 int64 228 I1 int64 229 I2 int64 230 I3 int64 231 I4 int64 232 I5 int64 233 I6 int64 234 I7 int64 235 I8 int64 236 I9 int64 237 ID [16]byte 238 } 239 240 buf := parquet.NewGenericBuffer[Row]( 241 parquet.SortingRowGroupConfig( 242 parquet.SortingColumns( 243 parquet.Ascending("ID"), 244 ), 245 ), 246 ) 247 248 rows := make([]Row, 10e3) 249 prng := rand.New(rand.NewSource(0)) 250 251 for i := range rows { 252 binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i)) 253 binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i)) 254 } 255 256 buf.Write(rows) 257 b.ResetTimer() 258 259 for i := 0; i < b.N; i++ { 260 for j := 0; j < 10; j++ { 261 buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows))) 262 } 263 264 sort.Sort(buf) 265 } 266 }