github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/buffer_test.go

package parquet_test

import (
	"bytes"
	"errors"
	"io"
	"math"
	"math/rand"
	"reflect"
	"sort"
	"strconv"
	"testing"

	"github.com/segmentio/parquet-go"
	"github.com/segmentio/parquet-go/encoding"
)

var bufferTests = [...]struct {
	scenario string
	typ      parquet.Type
	values   [][]interface{}
}{
	{
		scenario: "boolean",
		typ:      parquet.BooleanType,
		values: [][]interface{}{
			{},
			{false},
			{true},
			{
				false, true, false, false, true, true,
				false, false, false, true, false, true,
			},
		},
	},

	{
		scenario: "int32",
		typ:      parquet.Int32Type,
		values: [][]interface{}{
			{},
			{int32(0)},
			{int32(1)},
			{
				int32(1), int32(2), int32(3), int32(4), int32(5), int32(6),
				int32(math.MaxInt8), int32(math.MaxInt16), int32(math.MaxInt32),
				int32(7), int32(9), int32(9), int32(0),
			},
		},
	},

	{
		scenario: "int64",
		typ:      parquet.Int64Type,
		values: [][]interface{}{
			{},
			{int64(0)},
			{int64(1)},
			{
				int64(1), int64(2), int64(3), int64(4), int64(5), int64(6),
				int64(math.MaxInt8), int64(math.MaxInt16), int64(math.MaxInt64), int64(7),
				int64(9), int64(9), int64(0),
			},
		},
	},

	{
		scenario: "float",
		typ:      parquet.FloatType,
		values: [][]interface{}{
			{},
			{float32(0)},
			{float32(1)},
			{
				float32(1), float32(2), float32(3), float32(4), float32(5), float32(6),
				float32(0.5), float32(math.SmallestNonzeroFloat32), float32(math.MaxFloat32), float32(7),
				float32(9), float32(9), float32(0),
			},
		},
	},

	{
		scenario: "double",
		typ:      parquet.DoubleType,
		values: [][]interface{}{
			{},
			{float64(0)},
			{float64(1)},
			{
				float64(1), float64(2), float64(3), float64(4), float64(5), float64(6),
				float64(0.5), float64(math.SmallestNonzeroFloat64), float64(math.MaxFloat64), float64(7),
				float64(9), float64(9), float64(0),
			},
		},
	},

	{
		scenario: "string",
		typ:      parquet.ByteArrayType,
		values: [][]interface{}{
			{},
			{""},
			{"Hello World!"},
			{
				"ABCDEFG", "HIJKLMN", "OPQRSTU", "VWXZY01", "2345678",
				"90!@#$%", "^&*()_+", "Hello World!", "Answer=42", "ABCEDFG",
				"HIJKLMN", "OPQRSTU", "VWXYZ",
			},
		},
	},

	{
		scenario: "fixed length byte array",
		typ:      parquet.FixedLenByteArrayType(10),
		values: [][]interface{}{
			{},
			{[10]byte{}},
			{[10]byte{0: 1}},
			{
				[10]byte{0: 0}, [10]byte{0: 2}, [10]byte{0: 1}, [10]byte{0: 4}, [10]byte{0: 3},
				[10]byte{0: 6}, [10]byte{0: 5}, [10]byte{0: 8}, [10]byte{0: 7}, [10]byte{0: 10},
				[10]byte{0: 11}, [10]byte{0: 12}, [10]byte{9: 0xFF},
			},
		},
	},

	{
		scenario: "uuid",
		typ:      parquet.UUID().Type(),
		values: [][]interface{}{
			{},
			{[16]byte{}},
			{[16]byte{0: 1}},
			{
				[16]byte{0: 0}, [16]byte{0: 2}, [16]byte{0: 1}, [16]byte{0: 4}, [16]byte{0: 3},
				[16]byte{0: 6}, [16]byte{0: 5}, [16]byte{0: 8}, [16]byte{0: 7}, [16]byte{0: 10},
				[16]byte{0: 11}, [16]byte{0: 12}, [16]byte{15: 0xFF},
			},
		},
	},

	{
		scenario: "uint32",
		typ:      parquet.Uint(32).Type(),
		values: [][]interface{}{
			{},
			{uint32(0)},
			{uint32(1)},
			{
				uint32(1), uint32(2), uint32(3), uint32(4), uint32(5), uint32(6),
				uint32(math.MaxInt8), uint32(math.MaxInt16), uint32(math.MaxUint32), uint32(7),
				uint32(9), uint32(9), uint32(0),
			},
		},
	},

	{
		scenario: "uint64",
		typ:      parquet.Uint(64).Type(),
		values: [][]interface{}{
			{},
			{uint64(0)},
			{uint64(1)},
			{
				uint64(1), uint64(2), uint64(3), uint64(4), uint64(5), uint64(6),
				uint64(math.MaxInt8), uint64(math.MaxInt16), uint64(math.MaxUint64),
				uint64(7), uint64(9), uint64(9), uint64(0),
			},
		},
	},
}
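// TestBuffer exercises parquet.Buffer across the full matrix of
// configurations: every value type in bufferTests is combined with plain and
// dictionary-indexed column representations, optional/repeated/required
// fields, and unordered/ascending/descending sort orders, all over a
// one-column "data" schema.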
func TestBuffer(t *testing.T) {
	for _, test := range bufferTests {
		t.Run(test.scenario, func(t *testing.T) {
			for _, config := range [...]struct {
				scenario string
				typ      parquet.Type
			}{
				{scenario: "plain", typ: test.typ},
				{scenario: "indexed", typ: test.typ.NewDictionary(0, 0, test.typ.NewValues(nil, nil)).Type()},
			} {
				t.Run(config.scenario, func(t *testing.T) {
					for _, mod := range [...]struct {
						scenario string
						function func(parquet.Node) parquet.Node
					}{
						{scenario: "optional", function: parquet.Optional},
						{scenario: "repeated", function: parquet.Repeated},
						{scenario: "required", function: parquet.Required},
					} {
						t.Run(mod.scenario, func(t *testing.T) {
							for _, ordering := range [...]struct {
								scenario string
								sorting  parquet.SortingColumn
								sortFunc func(parquet.Type, []parquet.Value)
							}{
								{scenario: "unordered", sorting: nil, sortFunc: unordered},
								{scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending},
								{scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending},
							} {
								t.Run(ordering.scenario, func(t *testing.T) {
									schema := parquet.NewSchema("test", parquet.Group{
										"data": mod.function(parquet.Leaf(config.typ)),
									})

									options := []parquet.RowGroupOption{
										schema,
										parquet.ColumnBufferCapacity(100),
									}
									if ordering.sorting != nil {
										options = append(options,
											parquet.SortingRowGroupConfig(
												parquet.SortingColumns(ordering.sorting),
											),
										)
									}

									content := new(bytes.Buffer)
									buffer := parquet.NewBuffer(options...)

									for _, values := range test.values {
										t.Run("", func(t *testing.T) {
											defer content.Reset()
											defer buffer.Reset()
											fields := schema.Fields()
											testBuffer(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc)
										})
									}
								})
							}
						})
					}
				})
			}
		})
	}
}

type sortFunc func(parquet.Type, []parquet.Value)

func unordered(typ parquet.Type, values []parquet.Value) {}

func ascending(typ parquet.Type, values []parquet.Value) {
	sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) < 0 })
}

func descending(typ parquet.Type, values []parquet.Value) {
	sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) > 0 })
}
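// testBuffer writes each value to the buffer as a single-value row, then
// verifies the buffer's row count, the page's value and null counts, its
// min/max bounds, and that the values read back from the full page and from
// its head/tail slices match the (optionally sorted) input batch.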
func testBuffer(t *testing.T, node parquet.Node, buffer *parquet.Buffer, encoding encoding.Encoding, values []interface{}, sortFunc sortFunc) {
	repetitionLevel := 0
	definitionLevel := 0
	if !node.Required() {
		definitionLevel = 1
	}

	minValue := parquet.Value{}
	maxValue := parquet.Value{}
	batch := make([]parquet.Value, len(values))
	for i := range values {
		batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0)
	}

	for i := range batch {
		_, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]})
		if err != nil {
			t.Fatalf("writing value to row group: %v", err)
		}
	}

	numRows := buffer.NumRows()
	if numRows != int64(len(batch)) {
		t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows)
	}

	typ := node.Type()
	for _, value := range batch {
		if minValue.IsNull() || typ.Compare(value, minValue) < 0 {
			minValue = value
		}
		if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 {
			maxValue = value
		}
	}

	sortFunc(typ, batch)
	sort.Sort(buffer)

	page := buffer.ColumnBuffers()[0].Page()
	numValues := page.NumValues()
	if numValues != int64(len(batch)) {
		t.Fatalf("number of values mismatch: want=%d got=%d", len(batch), numValues)
	}

	numNulls := page.NumNulls()
	if numNulls != 0 {
		t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls)
	}

	min, max, hasBounds := page.Bounds()
	if !hasBounds && numRows > 0 {
		t.Fatal("page bounds are missing")
	}
	if !parquet.Equal(min, minValue) {
		t.Fatalf("min value mismatch: want=%v got=%v", minValue, min)
	}
	if !parquet.Equal(max, maxValue) {
		t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max)
	}

	// We write a single value per row, so num values = num rows for all pages
	// including repeated ones, which makes it OK to slice the pages using the
	// number of values as a proxy for the row indexes.
	halfValues := numValues / 2

	for _, test := range [...]struct {
		scenario string
		values   []parquet.Value
		reader   parquet.ValueReader
	}{
		{"page", batch, page.Values()},
		{"head", batch[:halfValues], page.Slice(0, halfValues).Values()},
		{"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()},
	} {
		v := [1]parquet.Value{}
		i := 0

		for {
			n, err := test.reader.ReadValues(v[:])
			if n > 0 {
				if n != 1 {
					t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n)
				}
				if i < len(test.values) {
					if !parquet.Equal(v[0], test.values[i]) {
						t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0])
					}
				}
				i++
			}
			if err != nil {
				if err == io.EOF {
					break
				}
				t.Fatalf("reading value from %q reader: %v", test.scenario, err)
			}
		}

		if i != len(test.values) {
			t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i)
		}
	}
}
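// TestBufferGenerateBloomFilters is a property-based test (quickCheck and
// copyRowsAndClose are helpers defined in other test files of this package)
// checking that split-block bloom filters are generated for the X, Y, and Z
// columns and report every written value as present. Bloom filters can yield
// false positives but never false negatives, which is why only the
// membership check is asserted.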
func TestBufferGenerateBloomFilters(t *testing.T) {
	type Point3D struct {
		X float64
		Y float64
		Z float64
	}

	f := func(rows []Point3D) bool {
		if len(rows) == 0 { // TODO: support writing files with no rows
			return true
		}

		output := new(bytes.Buffer)
		buffer := parquet.NewBuffer()
		writer := parquet.NewWriter(output,
			parquet.BloomFilters(
				parquet.SplitBlockFilter(10, "X"),
				parquet.SplitBlockFilter(10, "Y"),
				parquet.SplitBlockFilter(10, "Z"),
			),
		)
		for i := range rows {
			buffer.Write(&rows[i])
		}
		_, err := copyRowsAndClose(writer, buffer.Rows())
		if err != nil {
			t.Error(err)
			return false
		}
		if err := writer.Close(); err != nil {
			t.Error(err)
			return false
		}

		reader := bytes.NewReader(output.Bytes())
		f, err := parquet.OpenFile(reader, reader.Size())
		if err != nil {
			t.Error(err)
			return false
		}
		rowGroup := f.RowGroups()[0]
		columns := rowGroup.ColumnChunks()
		x := columns[0]
		y := columns[1]
		z := columns[2]

		for i, col := range []parquet.ColumnChunk{x, y, z} {
			if col.BloomFilter() == nil {
				t.Errorf("column %d has no bloom filter despite being configured to have one", i)
				return false
			}
		}

		fx := x.BloomFilter()
		fy := y.BloomFilter()
		fz := z.BloomFilter()

		test := func(f parquet.BloomFilter, v float64) bool {
			if ok, err := f.Check(parquet.ValueOf(v)); err != nil {
				t.Errorf("unexpected error checking bloom filter: %v", err)
				return false
			} else if !ok {
				t.Errorf("bloom filter does not contain value %g", v)
				return false
			}
			return true
		}

		for _, row := range rows {
			if !test(fx, row.X) || !test(fy, row.Y) || !test(fz, row.Z) {
				return false
			}
		}

		return true
	}

	if err := quickCheck(f); err != nil {
		t.Error(err)
	}
}

func TestBufferRoundtripNestedRepeated(t *testing.T) {
	type C struct {
		D int
	}
	type B struct {
		C []C
	}
	type A struct {
		B []B
	}

	// Write enough objects to exceed the first page.
	buffer := parquet.NewBuffer()
	var objs []A
	for i := 0; i < 6; i++ {
		o := A{[]B{{[]C{
			{i},
			{i},
		}}}}
		buffer.Write(&o)
		objs = append(objs, o)
	}

	buf := new(bytes.Buffer)
	w := parquet.NewWriter(buf, parquet.PageBufferSize(100))
	w.WriteRowGroup(buffer)
	w.Flush()
	w.Close()

	file := bytes.NewReader(buf.Bytes())
	r := parquet.NewReader(file)
	for i := 0; ; i++ {
		o := new(A)
		err := r.Read(o)
		if errors.Is(err, io.EOF) {
			if i < len(objs) {
				t.Errorf("too few rows were read: %d<%d", i, len(objs))
			}
			break
		}
		if !reflect.DeepEqual(*o, objs[i]) {
			t.Errorf("rows mismatch at row index %d: want=%v got=%v", i, objs[i], o)
		}
	}
}

func TestBufferRoundtripNestedRepeatedPointer(t *testing.T) {
	type C struct {
		D *int
	}
	type B struct {
		C []C
	}
	type A struct {
		B []B
	}

	// Write enough objects to exceed the first page.
	buffer := parquet.NewBuffer()
	var objs []A
	for i := 0; i < 6; i++ {
		j := i
		o := A{[]B{{[]C{
			{&j},
			{nil},
		}}}}
		buffer.Write(&o)
		objs = append(objs, o)
	}

	buf := new(bytes.Buffer)
	w := parquet.NewWriter(buf, parquet.PageBufferSize(100))
	w.WriteRowGroup(buffer)
	w.Flush()
	w.Close()

	file := bytes.NewReader(buf.Bytes())
	r := parquet.NewReader(file)
	for i := 0; ; i++ {
		o := new(A)
		err := r.Read(o)
		if err == io.EOF {
			break
		}
		if !reflect.DeepEqual(*o, objs[i]) {
			t.Errorf("rows mismatch at row index %d: want=%v got=%v", i, objs[i], o)
		}
	}
}

func TestRoundtripNestedRepeatedBytes(t *testing.T) {
	type B struct {
		C []byte
	}
	type A struct {
		A string
		B []B
	}

	var objs []A
	for i := 0; i < 2; i++ {
		o := A{
			"test" + strconv.Itoa(i),
			[]B{
				{[]byte{byte(i)}},
			},
		}
		objs = append(objs, o)
	}

	buf := new(bytes.Buffer)
	w := parquet.NewWriter(buf, parquet.PageBufferSize(100))
	for _, o := range objs {
		w.Write(&o)
	}
	w.Close()

	file := bytes.NewReader(buf.Bytes())

	r := parquet.NewReader(file)
	for i := 0; ; i++ {
		o := new(A)
		err := r.Read(o)
		if errors.Is(err, io.EOF) {
			if i < len(objs) {
				t.Errorf("too few rows were read: %d<%d", i, len(objs))
			}
			break
		}
		if !reflect.DeepEqual(*o, objs[i]) {
			t.Errorf("rows mismatch at row index %d: want=%v got=%v", i, objs[i], o)
		}
	}
}
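// TestBufferSeekToRow writes two rows with nested repeated fields, then
// seeks directly to the second row and verifies it reconstructs correctly.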
func TestBufferSeekToRow(t *testing.T) {
	type B struct {
		I int
		C []string
	}
	type A struct {
		B []B
	}

	buffer := parquet.NewBuffer()
	var objs []A
	for i := 0; i < 2; i++ {
		o := A{
			B: []B{
				{I: i, C: []string{"foo", strconv.Itoa(i)}},
				{I: i + 1, C: []string{"bar", strconv.Itoa(i + 1)}},
			},
		}
		buffer.Write(&o)
		objs = append(objs, o)
	}

	buf := new(bytes.Buffer)
	w := parquet.NewWriter(buf)
	w.WriteRowGroup(buffer)
	w.Flush()
	w.Close()

	file := bytes.NewReader(buf.Bytes())
	r := parquet.NewReader(file)

	i := 1
	o := new(A)
	if err := r.SeekToRow(int64(i)); err != nil {
		t.Fatal(err)
	}
	if err := r.Read(o); err != nil {
		t.Fatal(err)
	}
	if !reflect.DeepEqual(*o, objs[i]) {
		t.Errorf("rows mismatch at row index %d: want=%v got=%v", i, objs[i], o)
	}
}

type TestStruct struct {
	A *string `parquet:"a,optional,dict"`
}

func TestOptionalDictWriteRowGroup(t *testing.T) {
	s := parquet.SchemaOf(&TestStruct{})

	str1 := "test1"
	str2 := "test2"
	records := []*TestStruct{
		{A: nil},
		{A: &str1},
		{A: nil},
		{A: &str2},
		{A: nil},
	}

	buf := parquet.NewBuffer(s)
	for _, rec := range records {
		row := s.Deconstruct(nil, rec)
		_, err := buf.WriteRows([]parquet.Row{row})
		if err != nil {
			t.Fatal(err)
		}
	}

	b := bytes.NewBuffer(nil)
	w := parquet.NewWriter(b)
	_, err := w.WriteRowGroup(buf)
	if err != nil {
		t.Fatal(err)
	}
}

func TestNullsSortFirst(t *testing.T) {
	s := parquet.SchemaOf(&TestStruct{})

	str1 := "test1"
	str2 := "test2"
	records := []*TestStruct{
		{A: &str1},
		{A: nil},
		{A: &str2},
	}
	buf := parquet.NewBuffer(
		s,
		parquet.SortingRowGroupConfig(parquet.SortingColumns(parquet.NullsFirst(parquet.Ascending(s.Columns()[0][0])))),
	)
	for _, rec := range records {
		row := s.Deconstruct(nil, rec)
		_, err := buf.WriteRows([]parquet.Row{row})
		if err != nil {
			t.Fatal(err)
		}
	}

	sort.Sort(buf)

	rows := buf.Rows()
	defer rows.Close()
	rowBuf := make([]parquet.Row, len(records))
	if _, err := rows.ReadRows(rowBuf); err != nil {
		t.Fatal(err)
	}

	resultRecords := make([]TestStruct, len(records))
	for i, r := range rowBuf {
		if err := s.Reconstruct(&resultRecords[i], r); err != nil {
			t.Fatal(err)
		}
	}

	if resultRecords[0].A != nil {
		t.Fatal("expected null to sort first, but found", resultRecords)
	}
}
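// The benchmarks below rely on benchmarkRowType, benchmarkNumRows,
// benchmarkRowsPerStep, and benchmarkRowsPerSecond, which are declared in
// other test files of this package.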
func generateBenchmarkBufferRows(n int) (*parquet.Schema, []parquet.Row) {
	model := new(benchmarkRowType)
	schema := parquet.SchemaOf(model)
	prng := rand.New(rand.NewSource(0))
	rows := make([]parquet.Row, n)

	for i := range rows {
		io.ReadFull(prng, model.ID[:])
		model.Value = prng.Float64()
		rows[i] = make(parquet.Row, 0, 2)
		rows[i] = schema.Deconstruct(rows[i], model)
	}

	return schema, rows
}

func BenchmarkBufferReadRows100x(b *testing.B) {
	schema, rows := generateBenchmarkBufferRows(benchmarkNumRows)
	buffer := parquet.NewBuffer(schema)

	for i := 0; i < len(rows); i += benchmarkRowsPerStep {
		j := i + benchmarkRowsPerStep
		if _, err := buffer.WriteRows(rows[i:j]); err != nil {
			b.Fatal(err)
		}
	}

	bufferRows := buffer.Rows()
	defer bufferRows.Close()

	benchmarkRowsPerSecond(b, func() int {
		n, err := bufferRows.ReadRows(rows[:benchmarkRowsPerStep])
		if err != nil {
			if errors.Is(err, io.EOF) {
				err = bufferRows.SeekToRow(0)
			}
			if err != nil {
				b.Fatal(err)
			}
		}
		return n
	})
}

func BenchmarkBufferWriteRows100x(b *testing.B) {
	schema, rows := generateBenchmarkBufferRows(benchmarkNumRows)
	buffer := parquet.NewBuffer(schema)

	i := 0
	benchmarkRowsPerSecond(b, func() int {
		n, err := buffer.WriteRows(rows[i : i+benchmarkRowsPerStep])
		if err != nil {
			b.Fatal(err)
		}

		i += benchmarkRowsPerStep
		i %= benchmarkNumRows

		if i == 0 {
			buffer.Reset()
		}
		return n
	})
}