github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/reader_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "errors" 6 "fmt" 7 "io" 8 "math" 9 "math/rand" 10 "os" 11 "reflect" 12 "testing" 13 14 "github.com/parquet-go/parquet-go" 15 "github.com/parquet-go/parquet-go/internal/quick" 16 ) 17 18 func TestGenericReader(t *testing.T) { 19 testGenericReader[booleanColumn](t) 20 testGenericReader[int32Column](t) 21 testGenericReader[int64Column](t) 22 testGenericReader[int96Column](t) 23 testGenericReader[floatColumn](t) 24 testGenericReader[doubleColumn](t) 25 testGenericReader[byteArrayColumn](t) 26 testGenericReader[fixedLenByteArrayColumn](t) 27 testGenericReader[stringColumn](t) 28 testGenericReader[indexedStringColumn](t) 29 testGenericReader[uuidColumn](t) 30 testGenericReader[timeColumn](t) 31 testGenericReader[timeInMillisColumn](t) 32 testGenericReader[mapColumn](t) 33 testGenericReader[decimalColumn](t) 34 testGenericReader[addressBook](t) 35 testGenericReader[contact](t) 36 testGenericReader[listColumn2](t) 37 testGenericReader[listColumn1](t) 38 testGenericReader[listColumn0](t) 39 testGenericReader[nestedListColumn1](t) 40 testGenericReader[nestedListColumn](t) 41 testGenericReader[*contact](t) 42 testGenericReader[paddedBooleanColumn](t) 43 testGenericReader[optionalInt32Column](t) 44 testGenericReader[repeatedInt32Column](t) 45 } 46 47 func testGenericReader[Row any](t *testing.T) { 48 var model Row 49 t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) { 50 err := quickCheck(func(rows []Row) bool { 51 if len(rows) == 0 { 52 return true // TODO: fix support for parquet files with zero rows 53 } 54 if err := testGenericReaderRows(rows); err != nil { 55 t.Error(err) 56 return false 57 } 58 return true 59 }) 60 if err != nil { 61 t.Error(err) 62 } 63 }) 64 } 65 66 func testGenericReaderRows[Row any](rows []Row) error { 67 setNullPointers(rows) 68 buffer := new(bytes.Buffer) 69 writer := parquet.NewGenericWriter[Row](buffer) 70 _, err := writer.Write(rows) 71 if err != nil { 72 return err 73 } 74 if err := writer.Close(); err != nil { 75 return err 76 } 77 reader := parquet.NewGenericReader[Row](bytes.NewReader(buffer.Bytes())) 78 result := make([]Row, len(rows)) 79 n, err := reader.Read(result) 80 if err != nil && !errors.Is(err, io.EOF) { 81 return err 82 } 83 if n < len(rows) { 84 return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n) 85 } 86 if !reflect.DeepEqual(rows, result) { 87 return fmt.Errorf("rows mismatch:\nwant: %+v\ngot: %+v", rows, result) 88 } 89 return nil 90 } 91 92 func TestIssue400(t *testing.T) { 93 type B struct { 94 Name string 95 } 96 type A struct { 97 B []B `parquet:",optional"` 98 } 99 100 b := new(bytes.Buffer) 101 w := parquet.NewGenericWriter[A](b) 102 expect := []A{ 103 { 104 B: []B{ 105 { 106 // 32 bytes random so we can see in the binary parquet if we 107 // actually wrote the value 108 Name: "9e7eb1f0-bbcc-43ec-bfad-a9fac1bb0feb", 109 }, 110 }, 111 }, 112 } 113 _, err := w.Write(expect) 114 if err != nil { 115 t.Fatal(err) 116 } 117 if err = w.Close(); err != nil { 118 t.Fatal(err) 119 } 120 121 r := parquet.NewGenericReader[A](bytes.NewReader(b.Bytes())) 122 values := make([]A, 1) 123 _, err = r.Read(values) 124 if err != nil { 125 t.Fatal(err) 126 } 127 if !reflect.DeepEqual(expect[0], values[0]) { 128 t.Errorf("want %q got %q", values[0], expect[0]) 129 } 130 } 131 132 func TestReadMinPageSize(t *testing.T) { 133 // NOTE: min page size is 307 for MyRow schema 134 t.Run("test read less than min page size", func(t *testing.T) { testReadMinPageSize(128, t) }) 135 t.Run("test read equal to min page size", func(t *testing.T) { testReadMinPageSize(307, t) }) 136 t.Run("test read more than min page size", func(t *testing.T) { testReadMinPageSize(384, t) }) 137 // NOTE: num rows is 20,000 138 t.Run("test read equal to num rows", func(t *testing.T) { testReadMinPageSize(20_000, t) }) 139 t.Run("test read more than num rows", func(t *testing.T) { testReadMinPageSize(25_000, t) }) 140 } 141 142 func testReadMinPageSize(readSize int, t *testing.T) { 143 type MyRow struct { 144 ID [16]byte `parquet:"id,delta,uuid"` 145 File string `parquet:"file,dict,zstd"` 146 Index int64 `parquet:"index,delta,zstd"` 147 } 148 149 numRows := 20_000 150 maxPageBytes := 5000 151 152 tmp, err := os.CreateTemp("/tmp", "*.parquet") 153 if err != nil { 154 t.Fatal("os.CreateTemp: ", err) 155 } 156 path := tmp.Name() 157 defer os.Remove(path) 158 t.Log("file:", path) 159 160 // The page buffer size ensures we get multiple pages out of this example. 161 w := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes)) 162 // Need to write 1 row at a time here as writing many at once disregards PageBufferSize option. 163 for i := 0; i < numRows; i++ { 164 row := MyRow{ 165 ID: [16]byte{15: byte(i)}, 166 File: "hi" + fmt.Sprint(i), 167 Index: int64(i), 168 } 169 _, err := w.Write([]MyRow{row}) 170 if err != nil { 171 t.Fatal("w.Write: ", err) 172 } 173 // Flush writes rows as row group. 4 total (20k/5k) in this file. 174 if (i+1)%maxPageBytes == 0 { 175 err = w.Flush() 176 if err != nil { 177 t.Fatal("w.Flush: ", err) 178 } 179 } 180 } 181 err = w.Close() 182 if err != nil { 183 t.Fatal("w.Close: ", err) 184 } 185 err = tmp.Close() 186 if err != nil { 187 t.Fatal("tmp.Close: ", err) 188 } 189 190 file, err := os.Open(path) 191 if err != nil { 192 t.Fatal("os.Open", err) 193 } 194 reader := parquet.NewGenericReader[MyRow](file) 195 read := int64(0) 196 nRows := reader.NumRows() 197 rows := make([]MyRow, 0, nRows) 198 buf := make([]MyRow, readSize) // NOTE: min page size is 307 for MyRow schema 199 200 for read < nRows { 201 num, err := reader.Read(buf) 202 read += int64(num) 203 if err != nil && !errors.Is(err, io.EOF) { 204 t.Fatal("Read:", err) 205 } 206 rows = append(rows, buf...) 207 } 208 209 if err := reader.Close(); err != nil { 210 t.Fatal("Close", err) 211 } 212 213 if len(rows) < numRows { 214 t.Fatalf("not enough values were read: want=%d got=%d", len(rows), numRows) 215 } 216 for i, row := range rows[:numRows] { 217 id := [16]byte{15: byte(i)} 218 file := "hi" + fmt.Sprint(i) 219 index := int64(i) 220 221 if row.ID != id || row.File != file || row.Index != index { 222 t.Fatalf("rows mismatch at index: %d got: %+v", i, row) 223 } 224 } 225 } 226 227 func BenchmarkGenericReader(b *testing.B) { 228 benchmarkGenericReader[benchmarkRowType](b) 229 benchmarkGenericReader[booleanColumn](b) 230 benchmarkGenericReader[int32Column](b) 231 benchmarkGenericReader[int64Column](b) 232 benchmarkGenericReader[floatColumn](b) 233 benchmarkGenericReader[doubleColumn](b) 234 benchmarkGenericReader[byteArrayColumn](b) 235 benchmarkGenericReader[fixedLenByteArrayColumn](b) 236 benchmarkGenericReader[stringColumn](b) 237 benchmarkGenericReader[indexedStringColumn](b) 238 benchmarkGenericReader[uuidColumn](b) 239 benchmarkGenericReader[timeColumn](b) 240 benchmarkGenericReader[timeInMillisColumn](b) 241 benchmarkGenericReader[mapColumn](b) 242 benchmarkGenericReader[decimalColumn](b) 243 benchmarkGenericReader[contact](b) 244 benchmarkGenericReader[paddedBooleanColumn](b) 245 benchmarkGenericReader[optionalInt32Column](b) 246 } 247 248 func benchmarkGenericReader[Row generator[Row]](b *testing.B) { 249 var model Row 250 b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) { 251 prng := rand.New(rand.NewSource(0)) 252 rows := make([]Row, benchmarkNumRows) 253 for i := range rows { 254 rows[i] = rows[i].generate(prng) 255 } 256 257 rowbuf := make([]Row, benchmarkRowsPerStep) 258 buffer := parquet.NewGenericBuffer[Row]() 259 buffer.Write(rows) 260 261 b.Run("go1.17", func(b *testing.B) { 262 reader := parquet.NewRowGroupReader(buffer) 263 benchmarkRowsPerSecond(b, func() int { 264 for i := range rowbuf { 265 if err := reader.Read(&rowbuf[i]); err != nil { 266 if err != io.EOF { 267 b.Fatal(err) 268 } else { 269 reader.Reset() 270 } 271 } 272 } 273 return len(rowbuf) 274 }) 275 }) 276 277 b.Run("go1.18", func(b *testing.B) { 278 reader := parquet.NewGenericRowGroupReader[Row](buffer) 279 benchmarkRowsPerSecond(b, func() int { 280 n, err := reader.Read(rowbuf) 281 if err != nil { 282 if err != io.EOF { 283 b.Fatal(err) 284 } else { 285 reader.Reset() 286 } 287 } 288 return n 289 }) 290 }) 291 }) 292 } 293 294 func rowsOf(numRows int, model interface{}) rows { 295 prng := rand.New(rand.NewSource(0)) 296 return randomRowsOf(prng, numRows, model) 297 } 298 299 func randomRowsOf(prng *rand.Rand, numRows int, model interface{}) rows { 300 typ := reflect.TypeOf(model) 301 rows := make(rows, numRows) 302 makeValue := quick.MakeValueFuncOf(typ) 303 for i := range rows { 304 v := reflect.New(typ).Elem() 305 makeValue(v, prng) 306 rows[i] = v.Interface() 307 } 308 return rows 309 } 310 311 var readerTests = []struct { 312 scenario string 313 model interface{} 314 }{ 315 { 316 scenario: "BOOLEAN", 317 model: booleanColumn{}, 318 }, 319 320 { 321 scenario: "INT32", 322 model: int32Column{}, 323 }, 324 325 { 326 scenario: "INT64", 327 model: int64Column{}, 328 }, 329 330 { 331 scenario: "INT96", 332 model: int96Column{}, 333 }, 334 335 { 336 scenario: "FLOAT", 337 model: floatColumn{}, 338 }, 339 340 { 341 scenario: "DOUBLE", 342 model: doubleColumn{}, 343 }, 344 345 { 346 scenario: "BYTE_ARRAY", 347 model: byteArrayColumn{}, 348 }, 349 350 { 351 scenario: "FIXED_LEN_BYTE_ARRAY", 352 model: fixedLenByteArrayColumn{}, 353 }, 354 355 { 356 scenario: "STRING", 357 model: stringColumn{}, 358 }, 359 360 { 361 scenario: "STRING (dict)", 362 model: indexedStringColumn{}, 363 }, 364 365 { 366 scenario: "UUID", 367 model: uuidColumn{}, 368 }, 369 370 { 371 scenario: "time.Time", 372 model: timeColumn{}, 373 }, 374 375 { 376 scenario: "time.Time in ms", 377 model: timeInMillisColumn{}, 378 }, 379 380 { 381 scenario: "DECIMAL", 382 model: decimalColumn{}, 383 }, 384 385 { 386 scenario: "AddressBook", 387 model: addressBook{}, 388 }, 389 390 { 391 scenario: "one optional level", 392 model: listColumn2{}, 393 }, 394 395 { 396 scenario: "one repeated level", 397 model: listColumn1{}, 398 }, 399 400 { 401 scenario: "two repeated levels", 402 model: listColumn0{}, 403 }, 404 405 { 406 scenario: "three repeated levels", 407 model: listColumn0{}, 408 }, 409 410 { 411 scenario: "nested lists", 412 model: nestedListColumn{}, 413 }, 414 415 { 416 scenario: "key-value pairs", 417 model: struct { 418 KeyValuePairs map[utf8string]utf8string 419 }{}, 420 }, 421 422 { 423 scenario: "multiple key-value pairs", 424 model: struct { 425 KeyValuePairs0 map[utf8string]utf8string 426 KeyValuePairs1 map[utf8string]utf8string 427 KeyValuePairs2 map[utf8string]utf8string 428 }{}, 429 }, 430 431 { 432 scenario: "repeated key-value pairs", 433 model: struct { 434 RepeatedKeyValuePairs []map[utf8string]utf8string 435 }{}, 436 }, 437 438 { 439 scenario: "map of repeated values", 440 model: struct { 441 MapOfRepeated map[utf8string][]utf8string 442 }{}, 443 }, 444 } 445 446 func TestReader(t *testing.T) { 447 buf := new(bytes.Buffer) 448 file := bytes.NewReader(nil) 449 450 for _, test := range readerTests { 451 t.Run(test.scenario, func(t *testing.T) { 452 const N = 42 453 454 rowType := reflect.TypeOf(test.model) 455 rowPtr := reflect.New(rowType) 456 rowZero := reflect.Zero(rowType) 457 rowValue := rowPtr.Elem() 458 459 for n := 1; n < N; n++ { 460 t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) { 461 defer buf.Reset() 462 rows := rowsOf(n, test.model) 463 464 if err := writeParquetFileWithBuffer(buf, rows); err != nil { 465 t.Fatal(err) 466 } 467 468 file.Reset(buf.Bytes()) 469 r := parquet.NewReader(file, parquet.SchemaOf(test.model)) 470 471 for i, v := range rows { 472 if err := r.Read(rowPtr.Interface()); err != nil { 473 t.Fatal(err) 474 } 475 if !reflect.DeepEqual(rowValue.Interface(), v) { 476 t.Errorf("row mismatch at index %d\nwant = %+v\ngot = %+v", i, v, rowValue.Interface()) 477 } 478 rowValue.Set(rowZero) 479 } 480 481 if err := r.Read(rowPtr.Interface()); err != io.EOF { 482 t.Errorf("expected EOF after reading all values but got: %v", err) 483 } 484 }) 485 } 486 }) 487 } 488 } 489 490 func BenchmarkReaderReadType(b *testing.B) { 491 buf := new(bytes.Buffer) 492 file := bytes.NewReader(nil) 493 494 for _, test := range readerTests { 495 b.Run(test.scenario, func(b *testing.B) { 496 defer buf.Reset() 497 rows := rowsOf(benchmarkNumRows, test.model) 498 499 if err := writeParquetFile(buf, rows); err != nil { 500 b.Fatal(err) 501 } 502 file.Reset(buf.Bytes()) 503 f, err := parquet.OpenFile(file, file.Size()) 504 if err != nil { 505 b.Fatal(err) 506 } 507 508 rowType := reflect.TypeOf(test.model) 509 rowPtr := reflect.New(rowType) 510 rowZero := reflect.Zero(rowType) 511 rowValue := rowPtr.Elem() 512 513 r := parquet.NewReader(f) 514 p := rowPtr.Interface() 515 516 benchmarkRowsPerSecond(b, func() (n int) { 517 for i := 0; i < benchmarkRowsPerStep; i++ { 518 if err := r.Read(p); err != nil { 519 if err == io.EOF { 520 r.Reset() 521 } else { 522 b.Fatal(err) 523 } 524 } 525 } 526 rowValue.Set(rowZero) 527 return benchmarkRowsPerStep 528 }) 529 530 b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows))) 531 }) 532 } 533 } 534 535 func BenchmarkReaderReadRow(b *testing.B) { 536 buf := new(bytes.Buffer) 537 file := bytes.NewReader(nil) 538 539 for _, test := range readerTests { 540 b.Run(test.scenario, func(b *testing.B) { 541 defer buf.Reset() 542 rows := rowsOf(benchmarkNumRows, test.model) 543 544 if err := writeParquetFile(buf, rows); err != nil { 545 b.Fatal(err) 546 } 547 file.Reset(buf.Bytes()) 548 f, err := parquet.OpenFile(file, file.Size()) 549 if err != nil { 550 b.Fatal(err) 551 } 552 553 r := parquet.NewReader(f) 554 rowbuf := make([]parquet.Row, benchmarkRowsPerStep) 555 556 benchmarkRowsPerSecond(b, func() int { 557 n, err := r.ReadRows(rowbuf) 558 if err != nil { 559 if err == io.EOF { 560 r.Reset() 561 } else { 562 b.Fatal(err) 563 } 564 } 565 return n 566 }) 567 568 b.SetBytes(int64(math.Ceil(benchmarkRowsPerStep * float64(file.Size()) / benchmarkNumRows))) 569 }) 570 } 571 } 572 573 func TestReaderReadSubset(t *testing.T) { 574 // In this example we'll write 3 columns to the file - X, Y, and Z, but 575 // we'll only read out the X and Y columns. Returns true if all writes 576 // and reads were successful, and false otherwise. 577 type Point3D struct{ X, Y, Z int64 } 578 type Point2D struct{ X, Y int64 } 579 580 err := quickCheck(func(points3D []Point3D) bool { 581 if len(points3D) == 0 { 582 return true 583 } 584 buf := new(bytes.Buffer) 585 err := writeParquetFile(buf, makeRows(points3D)) 586 if err != nil { 587 t.Error(err) 588 return false 589 } 590 reader := parquet.NewReader(bytes.NewReader(buf.Bytes())) 591 for i := 0; ; i++ { 592 row := Point2D{} 593 err := reader.Read(&row) 594 if err != nil { 595 if err == io.EOF && i == len(points3D) { 596 break 597 } 598 t.Error(err) 599 return false 600 } 601 if row != (Point2D{X: points3D[i].X, Y: points3D[i].Y}) { 602 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, points3D[i], row) 603 return false 604 } 605 } 606 return true 607 }) 608 if err != nil { 609 t.Error(err) 610 } 611 } 612 613 func TestReaderSeekToRow(t *testing.T) { 614 type rowType struct { 615 Name utf8string `parquet:",dict"` 616 } 617 618 rows := rowsOf(10, rowType{}) 619 buf := new(bytes.Buffer) 620 err := writeParquetFile(buf, rows) 621 if err != nil { 622 t.Fatal(err) 623 } 624 625 reader := parquet.NewReader(bytes.NewReader(buf.Bytes())) 626 for i := 0; i < 10; i++ { 627 if err := reader.SeekToRow(int64(i)); err != nil { 628 t.Fatalf("seek to row %d: %v", i, err) 629 } 630 631 row := new(rowType) 632 err := reader.Read(row) 633 if err != nil { 634 t.Fatalf("reading row %d: %v", i, err) 635 } 636 637 if *row != rows[i] { 638 t.Fatalf("row %d mismatch: got=%+v want=%+v", i, *row, rows[i]) 639 } 640 } 641 } 642 643 func TestSeekToRowNoDict(t *testing.T) { 644 type rowType struct { 645 Name utf8string `parquet:","` // no dictionary encoding 646 } 647 648 // write samples to in-memory buffer 649 buf := new(bytes.Buffer) 650 schema := parquet.SchemaOf(new(rowType)) 651 w := parquet.NewWriter(buf, schema) 652 sample := rowType{ 653 Name: "foo1", 654 } 655 // write two rows 656 w.Write(sample) 657 sample.Name = "foo2" 658 w.Write(sample) 659 w.Close() 660 661 // create reader 662 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 663 664 // read second row 665 r.SeekToRow(1) 666 row := new(rowType) 667 err := r.Read(row) 668 if err != nil { 669 t.Fatalf("reading row: %v", err) 670 } 671 // fmt.Println(&sample, row) 672 if *row != sample { 673 t.Fatalf("read != write") 674 } 675 } 676 677 func TestSeekToRowReadAll(t *testing.T) { 678 type rowType struct { 679 Name utf8string `parquet:",dict"` 680 } 681 682 // write samples to in-memory buffer 683 buf := new(bytes.Buffer) 684 schema := parquet.SchemaOf(new(rowType)) 685 w := parquet.NewWriter(buf, schema) 686 sample := rowType{ 687 Name: "foo1", 688 } 689 // write two rows 690 w.Write(sample) 691 sample.Name = "foo2" 692 w.Write(sample) 693 w.Close() 694 695 // create reader 696 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 697 698 // read first row 699 r.SeekToRow(0) 700 row := new(rowType) 701 err := r.Read(row) 702 if err != nil { 703 t.Fatalf("reading row: %v", err) 704 } 705 // read second row 706 r.SeekToRow(1) 707 row = new(rowType) 708 err = r.Read(row) 709 if err != nil { 710 t.Fatalf("reading row: %v", err) 711 } 712 // fmt.Println(&sample, row) 713 if *row != sample { 714 t.Fatalf("read != write") 715 } 716 } 717 718 func TestSeekToRowDictReadSecond(t *testing.T) { 719 type rowType struct { 720 Name utf8string `parquet:",dict"` 721 } 722 723 // write samples to in-memory buffer 724 buf := new(bytes.Buffer) 725 schema := parquet.SchemaOf(new(rowType)) 726 w := parquet.NewWriter(buf, schema) 727 sample := rowType{ 728 Name: "foo1", 729 } 730 // write two rows 731 w.Write(sample) 732 sample.Name = "foo2" 733 w.Write(sample) 734 w.Close() 735 736 // create reader 737 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 738 739 // read second row 740 r.SeekToRow(1) 741 row := new(rowType) 742 err := r.Read(row) 743 if err != nil { 744 t.Fatalf("reading row: %v", err) 745 } 746 // fmt.Println(&sample, row) 747 if *row != sample { 748 t.Fatalf("read != write") 749 } 750 } 751 752 func TestSeekToRowDictReadMultiplePages(t *testing.T) { 753 type rowType struct { 754 Name utf8string `parquet:",dict"` 755 } 756 757 // write samples to in-memory buffer 758 buf := new(bytes.Buffer) 759 schema := parquet.SchemaOf(new(rowType)) 760 w := parquet.NewWriter(buf, schema, &parquet.WriterConfig{ 761 PageBufferSize: 10, 762 }) 763 sample := rowType{ 764 Name: "foo1", 765 } 766 767 // write enough rows to spill over a single page 768 for i := 0; i < 10; i++ { 769 w.Write(sample) 770 } 771 sample.Name = "foo2" 772 w.Write(sample) 773 w.Close() 774 775 // create reader 776 r := parquet.NewReader(bytes.NewReader(buf.Bytes())) 777 778 // read 11th row 779 r.SeekToRow(10) 780 row := new(rowType) 781 err := r.Read(row) 782 if err != nil { 783 t.Fatalf("reading row: %v", err) 784 } 785 if *row != sample { 786 t.Fatalf("read != write") 787 } 788 }