github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/parquet_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 "math/rand" 8 "reflect" 9 "strings" 10 "testing" 11 "time" 12 13 "github.com/google/uuid" 14 "github.com/vc42/parquet-go" 15 "github.com/vc42/parquet-go/deprecated" 16 "github.com/vc42/parquet-go/internal/quick" 17 ) 18 19 const ( 20 benchmarkNumRows = 10_000 21 benchmarkRowsPerStep = 1000 22 ) 23 24 type benchmarkRowType struct { 25 ID [16]byte `parquet:"id,uuid"` 26 Value float64 `parquet:"value"` 27 } 28 29 func (row benchmarkRowType) generate(prng *rand.Rand) benchmarkRowType { 30 prng.Read(row.ID[:]) 31 row.Value = prng.Float64() 32 return row 33 } 34 35 type paddedBooleanColumn struct { 36 Value bool 37 _ [3]byte 38 } 39 40 func (row paddedBooleanColumn) generate(prng *rand.Rand) paddedBooleanColumn { 41 return paddedBooleanColumn{Value: prng.Int()%2 == 0} 42 } 43 44 type booleanColumn struct { 45 Value bool 46 } 47 48 func (row booleanColumn) generate(prng *rand.Rand) booleanColumn { 49 return booleanColumn{Value: prng.Int()%2 == 0} 50 } 51 52 type int32Column struct { 53 Value int32 `parquet:",delta"` 54 } 55 56 func (row int32Column) generate(prng *rand.Rand) int32Column { 57 return int32Column{Value: prng.Int31n(100)} 58 } 59 60 type int64Column struct { 61 Value int64 `parquet:",delta"` 62 } 63 64 func (row int64Column) generate(prng *rand.Rand) int64Column { 65 return int64Column{Value: prng.Int63n(100)} 66 } 67 68 type int96Column struct { 69 Value deprecated.Int96 70 } 71 72 func (row int96Column) generate(prng *rand.Rand) int96Column { 73 row.Value[0] = prng.Uint32() 74 row.Value[1] = prng.Uint32() 75 row.Value[2] = prng.Uint32() 76 return row 77 } 78 79 type floatColumn struct { 80 Value float32 81 } 82 83 func (row floatColumn) generate(prng *rand.Rand) floatColumn { 84 return floatColumn{Value: prng.Float32()} 85 } 86 87 type doubleColumn struct { 88 Value float64 89 } 90 91 func (row doubleColumn) generate(prng *rand.Rand) doubleColumn { 92 return doubleColumn{Value: prng.Float64()} 93 } 94 95 type byteArrayColumn struct { 96 Value []byte 97 } 98 99 func (row byteArrayColumn) generate(prng *rand.Rand) byteArrayColumn { 100 row.Value = make([]byte, prng.Intn(10)) 101 prng.Read(row.Value) 102 return row 103 } 104 105 type fixedLenByteArrayColumn struct { 106 Value [10]byte 107 } 108 109 func (row fixedLenByteArrayColumn) generate(prng *rand.Rand) fixedLenByteArrayColumn { 110 prng.Read(row.Value[:]) 111 return row 112 } 113 114 type stringColumn struct { 115 Value string 116 } 117 118 func (row stringColumn) generate(prng *rand.Rand) stringColumn { 119 return stringColumn{Value: generateString(prng, 10)} 120 } 121 122 type indexedStringColumn struct { 123 Value string `parquet:",dict"` 124 } 125 126 func (row indexedStringColumn) generate(prng *rand.Rand) indexedStringColumn { 127 return indexedStringColumn{Value: generateString(prng, 10)} 128 } 129 130 type uuidColumn struct { 131 Value uuid.UUID `parquet:",delta"` 132 } 133 134 func (row uuidColumn) generate(prng *rand.Rand) uuidColumn { 135 prng.Read(row.Value[:]) 136 return row 137 } 138 139 type decimalColumn struct { 140 Value int64 `parquet:",decimal(0:3)"` 141 } 142 143 func (row decimalColumn) generate(prng *rand.Rand) decimalColumn { 144 return decimalColumn{Value: prng.Int63()} 145 } 146 147 type mapColumn struct { 148 Value map[utf8string]int 149 } 150 151 func (row mapColumn) generate(prng *rand.Rand) mapColumn { 152 n := prng.Intn(10) 153 row.Value = make(map[utf8string]int, n) 154 for i := 0; i < n; i++ { 155 row.Value[utf8string(generateString(prng, 8))] = prng.Intn(100) 156 } 157 return row 158 } 159 160 type addressBook struct { 161 Owner utf8string `parquet:",plain"` 162 OwnerPhoneNumbers []utf8string `parquet:",plain"` 163 Contacts []contact 164 } 165 166 type contact struct { 167 Name utf8string `parquet:",plain"` 168 PhoneNumber utf8string `parquet:",plain"` 169 } 170 171 func (row contact) generate(prng *rand.Rand) contact { 172 return contact{ 173 Name: utf8string(generateString(prng, 16)), 174 PhoneNumber: utf8string(generateString(prng, 10)), 175 } 176 } 177 178 type optionalInt32Column struct { 179 Value int32 `parquet:",optional"` 180 } 181 182 func (row optionalInt32Column) generate(prng *rand.Rand) optionalInt32Column { 183 return optionalInt32Column{Value: prng.Int31n(100)} 184 } 185 186 type repeatedInt32Column struct { 187 Values []int32 188 } 189 190 func (row repeatedInt32Column) generate(prng *rand.Rand) repeatedInt32Column { 191 row.Values = make([]int32, prng.Intn(10)) 192 for i := range row.Values { 193 row.Values[i] = prng.Int31n(10) 194 } 195 return row 196 } 197 198 type listColumn2 struct { 199 Value utf8string `parquet:",optional"` 200 } 201 202 type listColumn1 struct { 203 List2 []listColumn2 `parquet:",list"` 204 } 205 206 type listColumn0 struct { 207 List1 []listColumn1 `parquet:",list"` 208 } 209 210 type nestedListColumn1 struct { 211 Level3 []utf8string `parquet:"level3"` 212 } 213 214 type nestedListColumn struct { 215 Level1 []nestedListColumn1 `parquet:"level1"` 216 Level2 []utf8string `parquet:"level2"` 217 } 218 219 type utf8string string 220 221 func (utf8string) Generate(rand *rand.Rand, size int) reflect.Value { 222 const characters = "abcdefghijklmnopqrstuvwxyz1234567890" 223 const maxSize = 10 224 if size > maxSize { 225 size = maxSize 226 } 227 n := rand.Intn(size) 228 b := make([]byte, n) 229 for i := range b { 230 b[i] = characters[rand.Intn(len(characters))] 231 } 232 return reflect.ValueOf(utf8string(b)) 233 } 234 235 type Contact struct { 236 Name string `parquet:"name"` 237 PhoneNumber string `parquet:"phoneNumber,optional,zstd"` 238 } 239 240 type AddressBook struct { 241 Owner string `parquet:"owner,zstd"` 242 OwnerPhoneNumbers []string `parquet:"ownerPhoneNumbers,gzip"` 243 Contacts []Contact `parquet:"contacts"` 244 } 245 246 func forEachLeafColumn(col *parquet.Column, do func(*parquet.Column) error) error { 247 children := col.Columns() 248 249 if len(children) == 0 { 250 return do(col) 251 } 252 253 for _, child := range children { 254 if err := forEachLeafColumn(child, do); err != nil { 255 return err 256 } 257 } 258 259 return nil 260 } 261 262 func forEachPage(pages parquet.PageReader, do func(parquet.Page) error) error { 263 for { 264 p, err := pages.ReadPage() 265 if err != nil { 266 if err == io.EOF { 267 err = nil 268 } 269 return err 270 } 271 if err := do(p); err != nil { 272 return err 273 } 274 } 275 } 276 277 func forEachValue(values parquet.ValueReader, do func(parquet.Value) error) error { 278 buffer := [3]parquet.Value{} 279 for { 280 n, err := values.ReadValues(buffer[:]) 281 for _, v := range buffer[:n] { 282 if err := do(v); err != nil { 283 return err 284 } 285 } 286 if err != nil { 287 if err == io.EOF { 288 err = nil 289 } 290 return err 291 } 292 } 293 } 294 295 func forEachColumnPage(col *parquet.Column, do func(*parquet.Column, parquet.Page) error) error { 296 return forEachLeafColumn(col, func(leaf *parquet.Column) error { 297 pages := leaf.Pages() 298 defer pages.Close() 299 return forEachPage(pages, func(page parquet.Page) error { return do(leaf, page) }) 300 }) 301 } 302 303 func forEachColumnValue(col *parquet.Column, do func(*parquet.Column, parquet.Value) error) error { 304 return forEachColumnPage(col, func(leaf *parquet.Column, page parquet.Page) error { 305 return forEachValue(page.Values(), func(value parquet.Value) error { return do(leaf, value) }) 306 }) 307 } 308 309 func forEachColumnChunk(file *parquet.File, do func(*parquet.Column, parquet.ColumnChunk) error) error { 310 return forEachLeafColumn(file.Root(), func(leaf *parquet.Column) error { 311 for _, rowGroup := range file.RowGroups() { 312 if err := do(leaf, rowGroup.ColumnChunks()[leaf.Index()]); err != nil { 313 return err 314 } 315 } 316 return nil 317 }) 318 } 319 320 func createParquetFile(rows rows, options ...parquet.WriterOption) (*parquet.File, error) { 321 buffer := new(bytes.Buffer) 322 323 if err := writeParquetFile(buffer, rows, options...); err != nil { 324 return nil, err 325 } 326 327 reader := bytes.NewReader(buffer.Bytes()) 328 return parquet.OpenFile(reader, reader.Size()) 329 } 330 331 func writeParquetFile(w io.Writer, rows rows, options ...parquet.WriterOption) error { 332 writer := parquet.NewWriter(w, options...) 333 334 for _, row := range rows { 335 if err := writer.Write(row); err != nil { 336 return err 337 } 338 } 339 340 return writer.Close() 341 } 342 343 func writeParquetFileWithBuffer(w io.Writer, rows rows, options ...parquet.WriterOption) error { 344 buffer := parquet.NewBuffer() 345 for _, row := range rows { 346 if err := buffer.Write(row); err != nil { 347 return err 348 } 349 } 350 351 writer := parquet.NewWriter(w, options...) 352 numRows, err := copyRowsAndClose(writer, buffer.Rows()) 353 if err != nil { 354 return err 355 } 356 if numRows != int64(len(rows)) { 357 return fmt.Errorf("wrong number of rows written from buffer to file: want=%d got=%d", len(rows), numRows) 358 } 359 return writer.Close() 360 } 361 362 type rows []interface{} 363 364 func makeRows(any interface{}) rows { 365 if v, ok := any.([]interface{}); ok { 366 return rows(v) 367 } 368 value := reflect.ValueOf(any) 369 slice := make([]interface{}, value.Len()) 370 for i := range slice { 371 slice[i] = value.Index(i).Interface() 372 } 373 return rows(slice) 374 } 375 376 func randValueFuncOf(t parquet.Type) func(*rand.Rand) parquet.Value { 377 switch k := t.Kind(); k { 378 case parquet.Boolean: 379 return func(r *rand.Rand) parquet.Value { 380 return parquet.ValueOf(r.Float64() < 0.5) 381 } 382 383 case parquet.Int32: 384 return func(r *rand.Rand) parquet.Value { 385 return parquet.ValueOf(r.Int31()) 386 } 387 388 case parquet.Int64: 389 return func(r *rand.Rand) parquet.Value { 390 return parquet.ValueOf(r.Int63()) 391 } 392 393 case parquet.Int96: 394 return func(r *rand.Rand) parquet.Value { 395 return parquet.ValueOf(deprecated.Int96{ 396 0: r.Uint32(), 397 1: r.Uint32(), 398 2: r.Uint32(), 399 }) 400 } 401 402 case parquet.Float: 403 return func(r *rand.Rand) parquet.Value { 404 return parquet.ValueOf(r.Float32()) 405 } 406 407 case parquet.Double: 408 return func(r *rand.Rand) parquet.Value { 409 return parquet.ValueOf(r.Float64()) 410 } 411 412 case parquet.ByteArray: 413 return func(r *rand.Rand) parquet.Value { 414 n := r.Intn(49) + 1 415 b := make([]byte, n) 416 const characters = "1234567890qwertyuiopasdfghjklzxcvbnm " 417 for i := range b { 418 b[i] = characters[r.Intn(len(characters))] 419 } 420 return parquet.ValueOf(b) 421 } 422 423 case parquet.FixedLenByteArray: 424 arrayType := reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0))) 425 return func(r *rand.Rand) parquet.Value { 426 b := make([]byte, arrayType.Len()) 427 r.Read(b) 428 v := reflect.New(arrayType).Elem() 429 reflect.Copy(v, reflect.ValueOf(b)) 430 return parquet.ValueOf(v.Interface()) 431 } 432 433 default: 434 panic("NOT IMPLEMENTED") 435 } 436 } 437 438 func copyRowsAndClose(w parquet.RowWriter, r parquet.Rows) (int64, error) { 439 defer r.Close() 440 return parquet.CopyRows(w, r) 441 } 442 443 func benchmarkRowsPerSecond(b *testing.B, f func() int) { 444 b.ResetTimer() 445 start := time.Now() 446 numRows := int64(0) 447 448 for i := 0; i < b.N; i++ { 449 n := f() 450 numRows += int64(n) 451 } 452 453 seconds := time.Since(start).Seconds() 454 b.ReportMetric(float64(numRows)/seconds, "row/s") 455 } 456 457 func generateString(r *rand.Rand, n int) string { 458 const characters = "1234567890qwertyuiopasdfghjklzxcvbnm" 459 b := new(strings.Builder) 460 for i := 0; i < n; i++ { 461 b.WriteByte(characters[r.Intn(len(characters))]) 462 } 463 return b.String() 464 } 465 466 var quickCheckConfig = quick.Config{ 467 Sizes: []int{ 468 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 469 10, 20, 30, 40, 50, 123, 470 }, 471 } 472 473 func quickCheck(f interface{}) error { 474 return quickCheckConfig.Check(f) 475 }