github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/buffer_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "errors" 7 "fmt" 8 "io" 9 "math" 10 "math/rand" 11 "reflect" 12 "sort" 13 "strconv" 14 "testing" 15 16 "github.com/parquet-go/parquet-go" 17 "github.com/parquet-go/parquet-go/encoding" 18 ) 19 20 func TestGenericBuffer(t *testing.T) { 21 testGenericBuffer[booleanColumn](t) 22 testGenericBuffer[int32Column](t) 23 testGenericBuffer[int64Column](t) 24 testGenericBuffer[int96Column](t) 25 testGenericBuffer[floatColumn](t) 26 testGenericBuffer[doubleColumn](t) 27 testGenericBuffer[byteArrayColumn](t) 28 testGenericBuffer[fixedLenByteArrayColumn](t) 29 testGenericBuffer[stringColumn](t) 30 testGenericBuffer[indexedStringColumn](t) 31 testGenericBuffer[uuidColumn](t) 32 testGenericBuffer[timeColumn](t) 33 testGenericBuffer[timeInMillisColumn](t) 34 testGenericBuffer[mapColumn](t) 35 testGenericBuffer[decimalColumn](t) 36 testGenericBuffer[addressBook](t) 37 testGenericBuffer[contact](t) 38 testGenericBuffer[listColumn2](t) 39 testGenericBuffer[listColumn1](t) 40 testGenericBuffer[listColumn0](t) 41 testGenericBuffer[nestedListColumn1](t) 42 testGenericBuffer[nestedListColumn](t) 43 testGenericBuffer[*contact](t) 44 testGenericBuffer[paddedBooleanColumn](t) 45 testGenericBuffer[optionalInt32Column](t) 46 testGenericBuffer[repeatedInt32Column](t) 47 } 48 49 func testGenericBuffer[Row any](t *testing.T) { 50 var model Row 51 t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) { 52 err := quickCheck(func(rows []Row) bool { 53 if len(rows) == 0 { 54 return true // TODO: fix support for parquet files with zero rows 55 } 56 if err := testGenericBufferRows(rows); err != nil { 57 t.Error(err) 58 return false 59 } 60 return true 61 }) 62 if err != nil { 63 t.Error(err) 64 } 65 }) 66 } 67 68 func testGenericBufferRows[Row any](rows []Row) error { 69 setNullPointers(rows) 70 buffer := parquet.NewGenericBuffer[Row]() 71 _, err := buffer.Write(rows) 72 if err != nil { 73 return err 74 } 75 reader := parquet.NewGenericRowGroupReader[Row](buffer) 76 result := make([]Row, len(rows)) 77 n, err := reader.Read(result) 78 if err != nil && !errors.Is(err, io.EOF) { 79 return err 80 } 81 if n < len(rows) { 82 return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n) 83 } 84 if !reflect.DeepEqual(rows, result) { 85 return fmt.Errorf("rows mismatch:\nwant: %#v\ngot: %#v", rows, result) 86 } 87 return nil 88 } 89 90 func setNullPointers[Row any](rows []Row) { 91 if len(rows) > 0 && reflect.TypeOf(rows[0]).Kind() == reflect.Pointer { 92 for i := range rows { 93 v := reflect.ValueOf(&rows[i]).Elem() 94 if v.IsNil() { 95 v.Set(reflect.New(v.Type().Elem())) 96 } 97 } 98 } 99 } 100 101 type generator[T any] interface { 102 generate(*rand.Rand) T 103 } 104 105 func BenchmarkGenericBuffer(b *testing.B) { 106 benchmarkGenericBuffer[benchmarkRowType](b) 107 benchmarkGenericBuffer[booleanColumn](b) 108 benchmarkGenericBuffer[int32Column](b) 109 benchmarkGenericBuffer[int64Column](b) 110 benchmarkGenericBuffer[floatColumn](b) 111 benchmarkGenericBuffer[doubleColumn](b) 112 benchmarkGenericBuffer[byteArrayColumn](b) 113 benchmarkGenericBuffer[fixedLenByteArrayColumn](b) 114 benchmarkGenericBuffer[stringColumn](b) 115 benchmarkGenericBuffer[indexedStringColumn](b) 116 benchmarkGenericBuffer[uuidColumn](b) 117 benchmarkGenericBuffer[timeColumn](b) 118 benchmarkGenericBuffer[timeInMillisColumn](b) 119 benchmarkGenericBuffer[mapColumn](b) 120 benchmarkGenericBuffer[decimalColumn](b) 121 benchmarkGenericBuffer[contact](b) 122 benchmarkGenericBuffer[paddedBooleanColumn](b) 123 benchmarkGenericBuffer[optionalInt32Column](b) 124 benchmarkGenericBuffer[repeatedInt32Column](b) 125 } 126 127 func benchmarkGenericBuffer[Row generator[Row]](b *testing.B) { 128 var model Row 129 b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) { 130 prng := rand.New(rand.NewSource(0)) 131 rows := make([]Row, benchmarkNumRows) 132 for i := range rows { 133 rows[i] = rows[i].generate(prng) 134 } 135 136 b.Run("go1.17", func(b *testing.B) { 137 buffer := parquet.NewBuffer(parquet.SchemaOf(rows[0])) 138 i := 0 139 benchmarkRowsPerSecond(b, func() int { 140 for j := 0; j < benchmarkRowsPerStep; j++ { 141 if err := buffer.Write(&rows[i]); err != nil { 142 b.Fatal(err) 143 } 144 } 145 146 i += benchmarkRowsPerStep 147 i %= benchmarkNumRows 148 149 if i == 0 { 150 buffer.Reset() 151 } 152 return benchmarkRowsPerStep 153 }) 154 }) 155 156 b.Run("go1.18", func(b *testing.B) { 157 buffer := parquet.NewGenericBuffer[Row]() 158 i := 0 159 benchmarkRowsPerSecond(b, func() int { 160 n, err := buffer.Write(rows[i : i+benchmarkRowsPerStep]) 161 if err != nil { 162 b.Fatal(err) 163 } 164 165 i += benchmarkRowsPerStep 166 i %= benchmarkNumRows 167 168 if i == 0 { 169 buffer.Reset() 170 } 171 return n 172 }) 173 }) 174 }) 175 } 176 177 func TestIssue327(t *testing.T) { 178 t.Run("untagged nested lists should panic", func(t *testing.T) { 179 type testType struct { 180 ListOfLists [][]int 181 } 182 183 defer func() { 184 if r := recover(); r == nil { 185 t.Errorf("Nested lists without the list tag should panic") 186 } 187 }() 188 189 _ = parquet.NewGenericBuffer[testType]() 190 }) 191 } 192 193 func TestIssue346(t *testing.T) { 194 type TestType struct { 195 Key int 196 } 197 198 schema := parquet.SchemaOf(TestType{}) 199 buffer := parquet.NewGenericBuffer[any](schema) 200 201 data := make([]any, 1) 202 data[0] = TestType{Key: 0} 203 _, _ = buffer.Write(data) 204 } 205 206 func TestIssue347(t *testing.T) { 207 type TestType struct { 208 Key int 209 } 210 211 // instantiating with concrete type shouldn't panic 212 _ = parquet.NewGenericBuffer[TestType]() 213 214 // instantiating with schema and interface type parameter shouldn't panic 215 schema := parquet.SchemaOf(TestType{}) 216 _ = parquet.NewGenericBuffer[any](schema) 217 218 defer func() { 219 if r := recover(); r == nil { 220 t.Errorf("instantiating generic buffer without schema and with interface " + 221 "type parameter should panic") 222 } 223 }() 224 _ = parquet.NewGenericBuffer[any]() 225 } 226 227 func BenchmarkSortGenericBuffer(b *testing.B) { 228 type Row struct { 229 I0 int64 230 I1 int64 231 I2 int64 232 I3 int64 233 I4 int64 234 I5 int64 235 I6 int64 236 I7 int64 237 I8 int64 238 I9 int64 239 ID [16]byte 240 } 241 242 buf := parquet.NewGenericBuffer[Row]( 243 parquet.SortingRowGroupConfig( 244 parquet.SortingColumns( 245 parquet.Ascending("ID"), 246 ), 247 ), 248 ) 249 250 rows := make([]Row, 10e3) 251 prng := rand.New(rand.NewSource(0)) 252 253 for i := range rows { 254 binary.LittleEndian.PutUint64(rows[i].ID[:8], uint64(i)) 255 binary.LittleEndian.PutUint64(rows[i].ID[8:], ^uint64(i)) 256 } 257 258 buf.Write(rows) 259 b.ResetTimer() 260 261 for i := 0; i < b.N; i++ { 262 for j := 0; j < 10; j++ { 263 buf.Swap(prng.Intn(len(rows)), prng.Intn(len(rows))) 264 } 265 266 sort.Sort(buf) 267 } 268 } 269 270 var bufferTests = [...]struct { 271 scenario string 272 typ parquet.Type 273 values [][]interface{} 274 }{ 275 { 276 scenario: "boolean", 277 typ: parquet.BooleanType, 278 values: [][]interface{}{ 279 {}, 280 {false}, 281 {true}, 282 { 283 false, true, false, false, true, true, 284 false, false, false, true, false, true, 285 }, 286 }, 287 }, 288 289 { 290 scenario: "int32", 291 typ: parquet.Int32Type, 292 values: [][]interface{}{ 293 {}, 294 {int32(0)}, 295 {int32(1)}, 296 { 297 int32(1), int32(2), int32(3), int32(4), int32(5), int32(6), 298 int32(math.MaxInt8), int32(math.MaxInt16), int32(math.MaxInt32), 299 int32(7), int32(9), int32(9), int32(0), 300 }, 301 }, 302 }, 303 304 { 305 scenario: "int64", 306 typ: parquet.Int64Type, 307 values: [][]interface{}{ 308 {}, 309 {int64(0)}, 310 {int64(1)}, 311 { 312 int64(1), int64(2), int64(3), int64(4), int64(5), int64(6), 313 int64(math.MaxInt8), int64(math.MaxInt16), int64(math.MaxInt64), int64(7), 314 int64(9), int64(9), int64(0), 315 }, 316 }, 317 }, 318 319 { 320 scenario: "float", 321 typ: parquet.FloatType, 322 values: [][]interface{}{ 323 {}, 324 {float32(0)}, 325 {float32(1)}, 326 { 327 float32(1), float32(2), float32(3), float32(4), float32(5), float32(6), 328 float32(0.5), float32(math.SmallestNonzeroFloat32), float32(math.MaxFloat32), float32(7), 329 float32(9), float32(9), float32(0), 330 }, 331 }, 332 }, 333 334 { 335 scenario: "double", 336 typ: parquet.DoubleType, 337 values: [][]interface{}{ 338 {}, 339 {float64(0)}, 340 {float64(1)}, 341 { 342 float64(1), float64(2), float64(3), float64(4), float64(5), float64(6), 343 float64(0.5), float64(math.SmallestNonzeroFloat64), float64(math.MaxFloat64), float64(7), 344 float64(9), float64(9), float64(0), 345 }, 346 }, 347 }, 348 349 { 350 scenario: "string", 351 typ: parquet.ByteArrayType, 352 values: [][]interface{}{ 353 {}, 354 {""}, 355 {"Hello World!"}, 356 { 357 "ABCDEFG", "HIJKLMN", "OPQRSTU", "VWXZY01", "2345678", 358 "90!@#$%", "^&*()_+", "Hello World!", "Answer=42", "ABCEDFG", 359 "HIJKLMN", "OPQRSTU", "VWXYZ", 360 }, 361 }, 362 }, 363 364 { 365 scenario: "fixed length byte array", 366 typ: parquet.FixedLenByteArrayType(10), 367 values: [][]interface{}{ 368 {}, 369 {[10]byte{}}, 370 {[10]byte{0: 1}}, 371 { 372 [10]byte{0: 0}, [10]byte{0: 2}, [10]byte{0: 1}, [10]byte{0: 4}, [10]byte{0: 3}, 373 [10]byte{0: 6}, [10]byte{0: 5}, [10]byte{0: 8}, [10]byte{0: 7}, [10]byte{0: 10}, 374 [10]byte{0: 11}, [10]byte{0: 12}, [10]byte{9: 0xFF}, 375 }, 376 }, 377 }, 378 379 { 380 scenario: "uuid", 381 typ: parquet.UUID().Type(), 382 values: [][]interface{}{ 383 {}, 384 {[16]byte{}}, 385 {[16]byte{0: 1}}, 386 { 387 [16]byte{0: 0}, [16]byte{0: 2}, [16]byte{0: 1}, [16]byte{0: 4}, [16]byte{0: 3}, 388 [16]byte{0: 6}, [16]byte{0: 5}, [16]byte{0: 8}, [16]byte{0: 7}, [16]byte{0: 10}, 389 [16]byte{0: 11}, [16]byte{0: 12}, [16]byte{15: 0xFF}, 390 }, 391 }, 392 }, 393 394 { 395 scenario: "uint32", 396 typ: parquet.Uint(32).Type(), 397 values: [][]interface{}{ 398 {}, 399 {uint32(0)}, 400 {uint32(1)}, 401 { 402 uint32(1), uint32(2), uint32(3), uint32(4), uint32(5), uint32(6), 403 uint32(math.MaxInt8), uint32(math.MaxInt16), uint32(math.MaxUint32), uint32(7), 404 uint32(9), uint32(9), uint32(0), 405 }, 406 }, 407 }, 408 409 { 410 scenario: "uint64", 411 typ: parquet.Uint(64).Type(), 412 values: [][]interface{}{ 413 {}, 414 {uint64(0)}, 415 {uint64(1)}, 416 { 417 uint64(1), uint64(2), uint64(3), uint64(4), uint64(5), uint64(6), 418 uint64(math.MaxInt8), uint64(math.MaxInt16), uint64(math.MaxUint64), 419 uint64(7), uint64(9), uint64(9), uint64(0), 420 }, 421 }, 422 }, 423 } 424 425 func TestBuffer(t *testing.T) { 426 for _, test := range bufferTests { 427 t.Run(test.scenario, func(t *testing.T) { 428 for _, config := range [...]struct { 429 scenario string 430 typ parquet.Type 431 }{ 432 {scenario: "plain", typ: test.typ}, 433 {scenario: "indexed", typ: test.typ.NewDictionary(0, 0, test.typ.NewValues(nil, nil)).Type()}, 434 } { 435 t.Run(config.scenario, func(t *testing.T) { 436 for _, mod := range [...]struct { 437 scenario string 438 function func(parquet.Node) parquet.Node 439 }{ 440 {scenario: "optional", function: parquet.Optional}, 441 {scenario: "repeated", function: parquet.Repeated}, 442 {scenario: "required", function: parquet.Required}, 443 } { 444 t.Run(mod.scenario, func(t *testing.T) { 445 for _, ordering := range [...]struct { 446 scenario string 447 sorting parquet.SortingColumn 448 sortFunc func(parquet.Type, []parquet.Value) 449 }{ 450 {scenario: "unordered", sorting: nil, sortFunc: unordered}, 451 {scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending}, 452 {scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending}, 453 } { 454 t.Run(ordering.scenario, func(t *testing.T) { 455 schema := parquet.NewSchema("test", parquet.Group{ 456 "data": mod.function(parquet.Leaf(config.typ)), 457 }) 458 459 options := []parquet.RowGroupOption{ 460 schema, 461 parquet.ColumnBufferCapacity(100), 462 } 463 if ordering.sorting != nil { 464 options = append(options, 465 parquet.SortingRowGroupConfig( 466 parquet.SortingColumns(ordering.sorting), 467 ), 468 ) 469 } 470 471 content := new(bytes.Buffer) 472 buffer := parquet.NewBuffer(options...) 473 474 for _, values := range test.values { 475 t.Run("", func(t *testing.T) { 476 defer content.Reset() 477 defer buffer.Reset() 478 fields := schema.Fields() 479 testBuffer(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc) 480 }) 481 } 482 }) 483 } 484 }) 485 } 486 }) 487 } 488 }) 489 } 490 } 491 492 type sortFunc func(parquet.Type, []parquet.Value) 493 494 func unordered(typ parquet.Type, values []parquet.Value) {} 495 496 func ascending(typ parquet.Type, values []parquet.Value) { 497 sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) < 0 }) 498 } 499 500 func descending(typ parquet.Type, values []parquet.Value) { 501 sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) > 0 }) 502 } 503 504 func testBuffer(t *testing.T, node parquet.Node, buffer *parquet.Buffer, encoding encoding.Encoding, values []interface{}, sortFunc sortFunc) { 505 repetitionLevel := 0 506 definitionLevel := 0 507 if !node.Required() { 508 definitionLevel = 1 509 } 510 511 minValue := parquet.Value{} 512 maxValue := parquet.Value{} 513 batch := make([]parquet.Value, len(values)) 514 for i := range values { 515 batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0) 516 } 517 518 for i := range batch { 519 _, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]}) 520 if err != nil { 521 t.Fatalf("writing value to row group: %v", err) 522 } 523 } 524 525 numRows := buffer.NumRows() 526 if numRows != int64(len(batch)) { 527 t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows) 528 } 529 530 typ := node.Type() 531 for _, value := range batch { 532 if minValue.IsNull() || typ.Compare(value, minValue) < 0 { 533 minValue = value 534 } 535 if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 { 536 maxValue = value 537 } 538 } 539 540 sortFunc(typ, batch) 541 sort.Sort(buffer) 542 543 page := buffer.ColumnBuffers()[0].Page() 544 numValues := page.NumValues() 545 if numValues != int64(len(batch)) { 546 t.Fatalf("number of values mistmatch: want=%d got=%d", len(batch), numValues) 547 } 548 549 numNulls := page.NumNulls() 550 if numNulls != 0 { 551 t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls) 552 } 553 554 min, max, hasBounds := page.Bounds() 555 if !hasBounds && numRows > 0 { 556 t.Fatal("page bounds are missing") 557 } 558 if !parquet.Equal(min, minValue) { 559 t.Fatalf("min value mismatch: want=%v got=%v", minValue, min) 560 } 561 if !parquet.Equal(max, maxValue) { 562 t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max) 563 } 564 565 // We write a single value per row, so num values = num rows for all pages 566 // including repeated ones, which makes it OK to slice the pages using the 567 // number of values as a proxy for the row indexes. 568 halfValues := numValues / 2 569 570 for _, test := range [...]struct { 571 scenario string 572 values []parquet.Value 573 reader parquet.ValueReader 574 }{ 575 {"page", batch, page.Values()}, 576 {"head", batch[:halfValues], page.Slice(0, halfValues).Values()}, 577 {"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()}, 578 } { 579 v := [1]parquet.Value{} 580 i := 0 581 582 for { 583 n, err := test.reader.ReadValues(v[:]) 584 if n > 0 { 585 if n != 1 { 586 t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n) 587 } 588 if i < len(test.values) { 589 if !parquet.Equal(v[0], test.values[i]) { 590 t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0]) 591 } 592 } 593 i++ 594 } 595 if err != nil { 596 if err == io.EOF { 597 break 598 } 599 t.Fatalf("reading value from %q reader: %v", test.scenario, err) 600 } 601 } 602 603 if i != len(test.values) { 604 t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i) 605 } 606 } 607 } 608 609 func TestBufferGenerateBloomFilters(t *testing.T) { 610 type Point3D struct { 611 X float64 612 Y float64 613 Z float64 614 } 615 616 f := func(rows []Point3D) bool { 617 if len(rows) == 0 { // TODO: support writing files with no rows 618 return true 619 } 620 621 output := new(bytes.Buffer) 622 buffer := parquet.NewBuffer() 623 writer := parquet.NewWriter(output, 624 parquet.BloomFilters( 625 parquet.SplitBlockFilter(10, "X"), 626 parquet.SplitBlockFilter(10, "Y"), 627 parquet.SplitBlockFilter(10, "Z"), 628 ), 629 ) 630 for i := range rows { 631 buffer.Write(&rows[i]) 632 } 633 _, err := copyRowsAndClose(writer, buffer.Rows()) 634 if err != nil { 635 t.Error(err) 636 return false 637 } 638 if err := writer.Close(); err != nil { 639 t.Error(err) 640 return false 641 } 642 643 reader := bytes.NewReader(output.Bytes()) 644 f, err := parquet.OpenFile(reader, reader.Size()) 645 if err != nil { 646 t.Error(err) 647 return false 648 } 649 rowGroup := f.RowGroups()[0] 650 columns := rowGroup.ColumnChunks() 651 x := columns[0] 652 y := columns[1] 653 z := columns[2] 654 655 for i, col := range []parquet.ColumnChunk{x, y, z} { 656 if col.BloomFilter() == nil { 657 t.Errorf("column %d has no bloom filter despite being configured to have one", i) 658 return false 659 } 660 } 661 662 fx := x.BloomFilter() 663 fy := y.BloomFilter() 664 fz := z.BloomFilter() 665 666 test := func(f parquet.BloomFilter, v float64) bool { 667 if ok, err := f.Check(parquet.ValueOf(v)); err != nil { 668 t.Errorf("unexpected error checking bloom filter: %v", err) 669 return false 670 } else if !ok { 671 t.Errorf("bloom filter does not contain value %g", v) 672 return false 673 } 674 return true 675 } 676 677 for _, row := range rows { 678 if !test(fx, row.X) || !test(fy, row.Y) || !test(fz, row.Z) { 679 return false 680 } 681 } 682 683 return true 684 } 685 686 if err := quickCheck(f); err != nil { 687 t.Error(err) 688 } 689 } 690 691 func TestBufferRoundtripNestedRepeated(t *testing.T) { 692 type C struct { 693 D int 694 } 695 type B struct { 696 C []C 697 } 698 type A struct { 699 B []B 700 } 701 702 // Write enough objects to exceed first page 703 buffer := parquet.NewBuffer() 704 var objs []A 705 for i := 0; i < 6; i++ { 706 o := A{[]B{{[]C{ 707 {i}, 708 {i}, 709 }}}} 710 buffer.Write(&o) 711 objs = append(objs, o) 712 } 713 714 buf := new(bytes.Buffer) 715 w := parquet.NewWriter(buf, parquet.PageBufferSize(100)) 716 w.WriteRowGroup(buffer) 717 w.Flush() 718 w.Close() 719 720 file := bytes.NewReader(buf.Bytes()) 721 r := parquet.NewReader(file) 722 for i := 0; ; i++ { 723 o := new(A) 724 err := r.Read(o) 725 if errors.Is(err, io.EOF) { 726 if i < len(objs) { 727 t.Errorf("too few rows were read: %d<%d", i, len(objs)) 728 } 729 break 730 } 731 if !reflect.DeepEqual(*o, objs[i]) { 732 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 733 } 734 } 735 } 736 737 func TestBufferRoundtripNestedRepeatedPointer(t *testing.T) { 738 type C struct { 739 D *int 740 } 741 type B struct { 742 C []C 743 } 744 type A struct { 745 B []B 746 } 747 748 // Write enough objects to exceed first page 749 buffer := parquet.NewBuffer() 750 var objs []A 751 for i := 0; i < 6; i++ { 752 j := i 753 o := A{[]B{{[]C{ 754 {&j}, 755 {nil}, 756 }}}} 757 buffer.Write(&o) 758 objs = append(objs, o) 759 } 760 761 buf := new(bytes.Buffer) 762 w := parquet.NewWriter(buf, parquet.PageBufferSize(100)) 763 w.WriteRowGroup(buffer) 764 w.Flush() 765 w.Close() 766 767 file := bytes.NewReader(buf.Bytes()) 768 r := parquet.NewReader(file) 769 for i := 0; ; i++ { 770 o := new(A) 771 err := r.Read(o) 772 if err == io.EOF { 773 break 774 } 775 if !reflect.DeepEqual(*o, objs[i]) { 776 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 777 } 778 } 779 } 780 781 func TestRoundtripNestedRepeatedBytes(t *testing.T) { 782 type B struct { 783 C []byte 784 } 785 type A struct { 786 A string 787 B []B 788 } 789 790 var objs []A 791 for i := 0; i < 2; i++ { 792 o := A{ 793 "test" + strconv.Itoa(i), 794 []B{ 795 {[]byte{byte(i)}}, 796 }, 797 } 798 objs = append(objs, o) 799 } 800 801 buf := new(bytes.Buffer) 802 w := parquet.NewWriter(buf, parquet.PageBufferSize(100)) 803 for _, o := range objs { 804 w.Write(&o) 805 } 806 w.Close() 807 808 file := bytes.NewReader(buf.Bytes()) 809 810 r := parquet.NewReader(file) 811 for i := 0; ; i++ { 812 o := new(A) 813 err := r.Read(o) 814 if errors.Is(err, io.EOF) { 815 if i < len(objs) { 816 t.Errorf("too few rows were read: %d<%d", i, len(objs)) 817 } 818 break 819 } 820 if !reflect.DeepEqual(*o, objs[i]) { 821 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 822 } 823 } 824 } 825 826 func TestBufferSeekToRow(t *testing.T) { 827 type B struct { 828 I int 829 C []string 830 } 831 type A struct { 832 B []B 833 } 834 835 buffer := parquet.NewBuffer() 836 var objs []A 837 for i := 0; i < 2; i++ { 838 o := A{ 839 B: []B{ 840 {I: i, C: []string{"foo", strconv.Itoa(i)}}, 841 {I: i + 1, C: []string{"bar", strconv.Itoa(i + 1)}}, 842 }, 843 } 844 buffer.Write(&o) 845 objs = append(objs, o) 846 } 847 848 buf := new(bytes.Buffer) 849 w := parquet.NewWriter(buf) 850 w.WriteRowGroup(buffer) 851 w.Flush() 852 w.Close() 853 854 file := bytes.NewReader(buf.Bytes()) 855 r := parquet.NewReader(file) 856 857 i := 1 858 o := new(A) 859 if err := r.SeekToRow(int64(i)); err != nil { 860 t.Fatal(err) 861 } 862 if err := r.Read(o); err != nil { 863 t.Fatal(err) 864 } 865 if !reflect.DeepEqual(*o, objs[i]) { 866 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 867 } 868 } 869 870 type TestStruct struct { 871 A *string `parquet:"a,optional,dict"` 872 } 873 874 func TestOptionalDictWriteRowGroup(t *testing.T) { 875 s := parquet.SchemaOf(&TestStruct{}) 876 877 str1 := "test1" 878 str2 := "test2" 879 records := []*TestStruct{ 880 {A: nil}, 881 {A: &str1}, 882 {A: nil}, 883 {A: &str2}, 884 {A: nil}, 885 } 886 887 buf := parquet.NewBuffer(s) 888 for _, rec := range records { 889 row := s.Deconstruct(nil, rec) 890 _, err := buf.WriteRows([]parquet.Row{row}) 891 if err != nil { 892 t.Fatal(err) 893 } 894 } 895 896 b := bytes.NewBuffer(nil) 897 w := parquet.NewWriter(b) 898 _, err := w.WriteRowGroup(buf) 899 if err != nil { 900 t.Fatal(err) 901 } 902 } 903 904 func TestNullsSortFirst(t *testing.T) { 905 s := parquet.SchemaOf(&TestStruct{}) 906 907 str1 := "test1" 908 str2 := "test2" 909 records := []*TestStruct{ 910 {A: &str1}, 911 {A: nil}, 912 {A: &str2}, 913 } 914 buf := parquet.NewBuffer( 915 s, 916 parquet.SortingRowGroupConfig(parquet.SortingColumns(parquet.NullsFirst(parquet.Ascending(s.Columns()[0][0])))), 917 ) 918 for _, rec := range records { 919 row := s.Deconstruct(nil, rec) 920 _, err := buf.WriteRows([]parquet.Row{row}) 921 if err != nil { 922 t.Fatal(err) 923 } 924 } 925 926 sort.Sort(buf) 927 928 rows := buf.Rows() 929 defer rows.Close() 930 rowBuf := make([]parquet.Row, len(records)) 931 if _, err := rows.ReadRows(rowBuf); err != nil { 932 t.Fatal(err) 933 } 934 935 resultRecords := make([]TestStruct, len(records)) 936 for i, r := range rowBuf { 937 if err := s.Reconstruct(&resultRecords[i], r); err != nil { 938 t.Fatal(err) 939 } 940 } 941 942 if resultRecords[0].A != nil { 943 t.Fatal("expected null to sort first, but found", resultRecords) 944 } 945 } 946 947 func generateBenchmarkBufferRows(n int) (*parquet.Schema, []parquet.Row) { 948 model := new(benchmarkRowType) 949 schema := parquet.SchemaOf(model) 950 prng := rand.New(rand.NewSource(0)) 951 rows := make([]parquet.Row, n) 952 953 for i := range rows { 954 io.ReadFull(prng, model.ID[:]) 955 model.Value = prng.Float64() 956 rows[i] = make(parquet.Row, 0, 2) 957 rows[i] = schema.Deconstruct(rows[i], model) 958 } 959 960 return schema, rows 961 } 962 963 func BenchmarkBufferReadRows100x(b *testing.B) { 964 schema, rows := generateBenchmarkBufferRows(benchmarkNumRows) 965 buffer := parquet.NewBuffer(schema) 966 967 for i := 0; i < len(rows); i += benchmarkRowsPerStep { 968 j := i + benchmarkRowsPerStep 969 if _, err := buffer.WriteRows(rows[i:j]); err != nil { 970 b.Fatal(err) 971 } 972 } 973 974 bufferRows := buffer.Rows() 975 defer bufferRows.Close() 976 977 benchmarkRowsPerSecond(b, func() int { 978 n, err := bufferRows.ReadRows(rows[:benchmarkRowsPerStep]) 979 if err != nil { 980 if errors.Is(err, io.EOF) { 981 err = bufferRows.SeekToRow(0) 982 } 983 if err != nil { 984 b.Fatal(err) 985 } 986 } 987 return n 988 }) 989 } 990 991 func BenchmarkBufferWriteRows100x(b *testing.B) { 992 schema, rows := generateBenchmarkBufferRows(benchmarkNumRows) 993 buffer := parquet.NewBuffer(schema) 994 995 i := 0 996 benchmarkRowsPerSecond(b, func() int { 997 n, err := buffer.WriteRows(rows[i : i+benchmarkRowsPerStep]) 998 if err != nil { 999 b.Fatal(err) 1000 } 1001 1002 i += benchmarkRowsPerStep 1003 i %= benchmarkNumRows 1004 1005 if i == 0 { 1006 buffer.Reset() 1007 } 1008 return n 1009 }) 1010 }