github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/buffer_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "errors" 6 "io" 7 "math" 8 "math/rand" 9 "reflect" 10 "sort" 11 "strconv" 12 "testing" 13 14 "github.com/vc42/parquet-go" 15 "github.com/vc42/parquet-go/encoding" 16 ) 17 18 var bufferTests = [...]struct { 19 scenario string 20 typ parquet.Type 21 values [][]interface{} 22 }{ 23 { 24 scenario: "boolean", 25 typ: parquet.BooleanType, 26 values: [][]interface{}{ 27 {}, 28 {false}, 29 {true}, 30 { 31 false, true, false, false, true, true, 32 false, false, false, true, false, true, 33 }, 34 }, 35 }, 36 37 { 38 scenario: "int32", 39 typ: parquet.Int32Type, 40 values: [][]interface{}{ 41 {}, 42 {int32(0)}, 43 {int32(1)}, 44 { 45 int32(1), int32(2), int32(3), int32(4), int32(5), int32(6), 46 int32(math.MaxInt8), int32(math.MaxInt16), int32(math.MaxInt32), 47 int32(7), int32(9), int32(9), int32(0), 48 }, 49 }, 50 }, 51 52 { 53 scenario: "int64", 54 typ: parquet.Int64Type, 55 values: [][]interface{}{ 56 {}, 57 {int64(0)}, 58 {int64(1)}, 59 { 60 int64(1), int64(2), int64(3), int64(4), int64(5), int64(6), 61 int64(math.MaxInt8), int64(math.MaxInt16), int64(math.MaxInt64), int64(7), 62 int64(9), int64(9), int64(0), 63 }, 64 }, 65 }, 66 67 { 68 scenario: "float", 69 typ: parquet.FloatType, 70 values: [][]interface{}{ 71 {}, 72 {float32(0)}, 73 {float32(1)}, 74 { 75 float32(1), float32(2), float32(3), float32(4), float32(5), float32(6), 76 float32(0.5), float32(math.SmallestNonzeroFloat32), float32(math.MaxFloat32), float32(7), 77 float32(9), float32(9), float32(0), 78 }, 79 }, 80 }, 81 82 { 83 scenario: "double", 84 typ: parquet.DoubleType, 85 values: [][]interface{}{ 86 {}, 87 {float64(0)}, 88 {float64(1)}, 89 { 90 float64(1), float64(2), float64(3), float64(4), float64(5), float64(6), 91 float64(0.5), float64(math.SmallestNonzeroFloat64), float64(math.MaxFloat64), float64(7), 92 float64(9), float64(9), float64(0), 93 }, 94 }, 95 }, 96 97 { 98 scenario: "string", 99 typ: parquet.ByteArrayType, 100 values: [][]interface{}{ 101 {}, 102 {""}, 103 {"Hello World!"}, 104 { 105 "ABCDEFG", "HIJKLMN", "OPQRSTU", "VWXZY01", "2345678", 106 "90!@#$%", "^&*()_+", "Hello World!", "Answer=42", "ABCEDFG", 107 "HIJKLMN", "OPQRSTU", "VWXYZ", 108 }, 109 }, 110 }, 111 112 { 113 scenario: "fixed length byte array", 114 typ: parquet.FixedLenByteArrayType(10), 115 values: [][]interface{}{ 116 {}, 117 {[10]byte{}}, 118 {[10]byte{0: 1}}, 119 { 120 [10]byte{0: 0}, [10]byte{0: 2}, [10]byte{0: 1}, [10]byte{0: 4}, [10]byte{0: 3}, 121 [10]byte{0: 6}, [10]byte{0: 5}, [10]byte{0: 8}, [10]byte{0: 7}, [10]byte{0: 10}, 122 [10]byte{0: 11}, [10]byte{0: 12}, [10]byte{9: 0xFF}, 123 }, 124 }, 125 }, 126 127 { 128 scenario: "uuid", 129 typ: parquet.UUID().Type(), 130 values: [][]interface{}{ 131 {}, 132 {[16]byte{}}, 133 {[16]byte{0: 1}}, 134 { 135 [16]byte{0: 0}, [16]byte{0: 2}, [16]byte{0: 1}, [16]byte{0: 4}, [16]byte{0: 3}, 136 [16]byte{0: 6}, [16]byte{0: 5}, [16]byte{0: 8}, [16]byte{0: 7}, [16]byte{0: 10}, 137 [16]byte{0: 11}, [16]byte{0: 12}, [16]byte{15: 0xFF}, 138 }, 139 }, 140 }, 141 142 { 143 scenario: "uint32", 144 typ: parquet.Uint(32).Type(), 145 values: [][]interface{}{ 146 {}, 147 {uint32(0)}, 148 {uint32(1)}, 149 { 150 uint32(1), uint32(2), uint32(3), uint32(4), uint32(5), uint32(6), 151 uint32(math.MaxInt8), uint32(math.MaxInt16), uint32(math.MaxUint32), uint32(7), 152 uint32(9), uint32(9), uint32(0), 153 }, 154 }, 155 }, 156 157 { 158 scenario: "uint64", 159 typ: parquet.Uint(64).Type(), 160 values: [][]interface{}{ 161 {}, 162 {uint64(0)}, 163 {uint64(1)}, 164 { 165 uint64(1), uint64(2), uint64(3), uint64(4), uint64(5), uint64(6), 166 uint64(math.MaxInt8), uint64(math.MaxInt16), uint64(math.MaxUint64), 167 uint64(7), uint64(9), uint64(9), uint64(0), 168 }, 169 }, 170 }, 171 } 172 173 func TestBuffer(t *testing.T) { 174 for _, test := range bufferTests { 175 t.Run(test.scenario, func(t *testing.T) { 176 for _, config := range [...]struct { 177 scenario string 178 typ parquet.Type 179 }{ 180 {scenario: "plain", typ: test.typ}, 181 {scenario: "indexed", typ: test.typ.NewDictionary(0, 0, nil).Type()}, 182 } { 183 t.Run(config.scenario, func(t *testing.T) { 184 for _, mod := range [...]struct { 185 scenario string 186 function func(parquet.Node) parquet.Node 187 }{ 188 {scenario: "optional", function: parquet.Optional}, 189 {scenario: "repeated", function: parquet.Repeated}, 190 {scenario: "required", function: parquet.Required}, 191 } { 192 t.Run(mod.scenario, func(t *testing.T) { 193 for _, ordering := range [...]struct { 194 scenario string 195 sorting parquet.SortingColumn 196 sortFunc func(parquet.Type, []parquet.Value) 197 }{ 198 {scenario: "unordered", sorting: nil, sortFunc: unordered}, 199 {scenario: "ascending", sorting: parquet.Ascending("data"), sortFunc: ascending}, 200 {scenario: "descending", sorting: parquet.Descending("data"), sortFunc: descending}, 201 } { 202 t.Run(ordering.scenario, func(t *testing.T) { 203 schema := parquet.NewSchema("test", parquet.Group{ 204 "data": mod.function(parquet.Leaf(config.typ)), 205 }) 206 207 options := []parquet.RowGroupOption{ 208 schema, 209 parquet.ColumnBufferCapacity(100), 210 } 211 if ordering.sorting != nil { 212 options = append(options, parquet.SortingColumns(ordering.sorting)) 213 } 214 215 content := new(bytes.Buffer) 216 buffer := parquet.NewBuffer(options...) 217 218 for _, values := range test.values { 219 t.Run("", func(t *testing.T) { 220 defer content.Reset() 221 defer buffer.Reset() 222 fields := schema.Fields() 223 testBuffer(t, fields[0], buffer, &parquet.Plain, values, ordering.sortFunc) 224 }) 225 } 226 }) 227 } 228 }) 229 } 230 }) 231 } 232 }) 233 } 234 } 235 236 type sortFunc func(parquet.Type, []parquet.Value) 237 238 func unordered(typ parquet.Type, values []parquet.Value) {} 239 240 func ascending(typ parquet.Type, values []parquet.Value) { 241 sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) < 0 }) 242 } 243 244 func descending(typ parquet.Type, values []parquet.Value) { 245 sort.Slice(values, func(i, j int) bool { return typ.Compare(values[i], values[j]) > 0 }) 246 } 247 248 func testBuffer(t *testing.T, node parquet.Node, buffer *parquet.Buffer, encoding encoding.Encoding, values []interface{}, sortFunc sortFunc) { 249 repetitionLevel := 0 250 definitionLevel := 0 251 if !node.Required() { 252 definitionLevel = 1 253 } 254 255 minValue := parquet.Value{} 256 maxValue := parquet.Value{} 257 batch := make([]parquet.Value, len(values)) 258 for i := range values { 259 batch[i] = parquet.ValueOf(values[i]).Level(repetitionLevel, definitionLevel, 0) 260 } 261 262 for i := range batch { 263 _, err := buffer.WriteRows([]parquet.Row{batch[i : i+1]}) 264 if err != nil { 265 t.Fatalf("writing value to row group: %v", err) 266 } 267 } 268 269 numRows := buffer.NumRows() 270 if numRows != int64(len(batch)) { 271 t.Fatalf("number of rows mismatch: want=%d got=%d", len(batch), numRows) 272 } 273 274 typ := node.Type() 275 for _, value := range batch { 276 if minValue.IsNull() || typ.Compare(value, minValue) < 0 { 277 minValue = value 278 } 279 if maxValue.IsNull() || typ.Compare(value, maxValue) > 0 { 280 maxValue = value 281 } 282 } 283 284 sortFunc(typ, batch) 285 sort.Sort(buffer) 286 287 page := buffer.ColumnBuffers()[0].Page() 288 numValues := page.NumValues() 289 if numValues != int64(len(batch)) { 290 t.Fatalf("number of values mistmatch: want=%d got=%d", len(batch), numValues) 291 } 292 293 numNulls := page.NumNulls() 294 if numNulls != 0 { 295 t.Fatalf("number of nulls mismatch: want=0 got=%d", numNulls) 296 } 297 298 min, max, hasBounds := page.Bounds() 299 if !hasBounds && numRows > 0 { 300 t.Fatal("page bounds are missing") 301 } 302 if !parquet.Equal(min, minValue) { 303 t.Fatalf("min value mismatch: want=%v got=%v", minValue, min) 304 } 305 if !parquet.Equal(max, maxValue) { 306 t.Fatalf("max value mismatch: want=%v got=%v", maxValue, max) 307 } 308 309 // We write a single value per row, so num values = num rows for all pages 310 // including repeated ones, which makes it OK to slice the pages using the 311 // number of values as a proxy for the row indexes. 312 halfValues := numValues / 2 313 314 for _, test := range [...]struct { 315 scenario string 316 values []parquet.Value 317 reader parquet.ValueReader 318 }{ 319 {"page", batch, page.Values()}, 320 {"head", batch[:halfValues], page.Slice(0, halfValues).Values()}, 321 {"tail", batch[halfValues:], page.Slice(halfValues, numValues).Values()}, 322 } { 323 v := [1]parquet.Value{} 324 i := 0 325 326 for { 327 n, err := test.reader.ReadValues(v[:]) 328 if n > 0 { 329 if n != 1 { 330 t.Fatalf("reading value from %q reader returned the wrong count: want=1 got=%d", test.scenario, n) 331 } 332 if i < len(test.values) { 333 if !parquet.Equal(v[0], test.values[i]) { 334 t.Fatalf("%q value at index %d mismatches: want=%v got=%v", test.scenario, i, test.values[i], v[0]) 335 } 336 } 337 i++ 338 } 339 if err != nil { 340 if err == io.EOF { 341 break 342 } 343 t.Fatalf("reading value from %q reader: %v", test.scenario, err) 344 } 345 } 346 347 if i != len(test.values) { 348 t.Errorf("wrong number of values read from %q reader: want=%d got=%d", test.scenario, len(test.values), i) 349 } 350 } 351 } 352 353 func TestBufferGenerateBloomFilters(t *testing.T) { 354 type Point3D struct { 355 X float64 356 Y float64 357 Z float64 358 } 359 360 f := func(rows []Point3D) bool { 361 if len(rows) == 0 { // TODO: support writing files with no rows 362 return true 363 } 364 365 output := new(bytes.Buffer) 366 buffer := parquet.NewBuffer() 367 writer := parquet.NewWriter(output, 368 parquet.BloomFilters( 369 parquet.SplitBlockFilter("X"), 370 parquet.SplitBlockFilter("Y"), 371 parquet.SplitBlockFilter("Z"), 372 ), 373 ) 374 for i := range rows { 375 buffer.Write(&rows[i]) 376 } 377 _, err := copyRowsAndClose(writer, buffer.Rows()) 378 if err != nil { 379 t.Error(err) 380 return false 381 } 382 if err := writer.Close(); err != nil { 383 t.Error(err) 384 return false 385 } 386 387 reader := bytes.NewReader(output.Bytes()) 388 f, err := parquet.OpenFile(reader, reader.Size()) 389 if err != nil { 390 t.Error(err) 391 return false 392 } 393 rowGroup := f.RowGroups()[0] 394 columns := rowGroup.ColumnChunks() 395 x := columns[0] 396 y := columns[1] 397 z := columns[2] 398 399 for i, col := range []parquet.ColumnChunk{x, y, z} { 400 if col.BloomFilter() == nil { 401 t.Errorf("column %d has no bloom filter despite being configured to have one", i) 402 return false 403 } 404 } 405 406 fx := x.BloomFilter() 407 fy := y.BloomFilter() 408 fz := z.BloomFilter() 409 410 test := func(f parquet.BloomFilter, v float64) bool { 411 if ok, err := f.Check(parquet.ValueOf(v)); err != nil { 412 t.Errorf("unexpected error checking bloom filter: %v", err) 413 return false 414 } else if !ok { 415 t.Errorf("bloom filter does not contain value %g", v) 416 return false 417 } 418 return true 419 } 420 421 for _, row := range rows { 422 if !test(fx, row.X) || !test(fy, row.Y) || !test(fz, row.Z) { 423 return false 424 } 425 } 426 427 return true 428 } 429 430 if err := quickCheck(f); err != nil { 431 t.Error(err) 432 } 433 } 434 435 func TestBufferRoundtripNestedRepeated(t *testing.T) { 436 type C struct { 437 D int 438 } 439 type B struct { 440 C []C 441 } 442 type A struct { 443 B []B 444 } 445 446 // Write enough objects to exceed first page 447 buffer := parquet.NewBuffer() 448 var objs []A 449 for i := 0; i < 6; i++ { 450 o := A{[]B{{[]C{ 451 {i}, 452 {i}, 453 }}}} 454 buffer.Write(&o) 455 objs = append(objs, o) 456 } 457 458 buf := new(bytes.Buffer) 459 w := parquet.NewWriter(buf, parquet.PageBufferSize(100)) 460 w.WriteRowGroup(buffer) 461 w.Flush() 462 w.Close() 463 464 file := bytes.NewReader(buf.Bytes()) 465 r := parquet.NewReader(file) 466 for i := 0; ; i++ { 467 o := new(A) 468 err := r.Read(o) 469 if errors.Is(err, io.EOF) { 470 if i < len(objs) { 471 t.Errorf("too few rows were read: %d<%d", i, len(objs)) 472 } 473 break 474 } 475 if !reflect.DeepEqual(*o, objs[i]) { 476 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 477 } 478 } 479 } 480 481 func TestBufferRoundtripNestedRepeatedPointer(t *testing.T) { 482 type C struct { 483 D *int 484 } 485 type B struct { 486 C []C 487 } 488 type A struct { 489 B []B 490 } 491 492 // Write enough objects to exceed first page 493 buffer := parquet.NewBuffer() 494 var objs []A 495 for i := 0; i < 6; i++ { 496 j := i 497 o := A{[]B{{[]C{ 498 {&j}, 499 {nil}, 500 }}}} 501 buffer.Write(&o) 502 objs = append(objs, o) 503 } 504 505 buf := new(bytes.Buffer) 506 w := parquet.NewWriter(buf, parquet.PageBufferSize(100)) 507 w.WriteRowGroup(buffer) 508 w.Flush() 509 w.Close() 510 511 file := bytes.NewReader(buf.Bytes()) 512 r := parquet.NewReader(file) 513 for i := 0; ; i++ { 514 o := new(A) 515 err := r.Read(o) 516 if err == io.EOF { 517 break 518 } 519 if !reflect.DeepEqual(*o, objs[i]) { 520 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 521 } 522 } 523 } 524 525 func TestRoundtripNestedRepeatedBytes(t *testing.T) { 526 type B struct { 527 C []byte 528 } 529 type A struct { 530 A string 531 B []B 532 } 533 534 var objs []A 535 for i := 0; i < 2; i++ { 536 o := A{ 537 "test" + strconv.Itoa(i), 538 []B{ 539 {[]byte{byte(i)}}, 540 }, 541 } 542 objs = append(objs, o) 543 } 544 545 buf := new(bytes.Buffer) 546 w := parquet.NewWriter(buf, parquet.PageBufferSize(100)) 547 for _, o := range objs { 548 w.Write(&o) 549 } 550 w.Close() 551 552 file := bytes.NewReader(buf.Bytes()) 553 554 r := parquet.NewReader(file) 555 for i := 0; ; i++ { 556 o := new(A) 557 err := r.Read(o) 558 if errors.Is(err, io.EOF) { 559 if i < len(objs) { 560 t.Errorf("too few rows were read: %d<%d", i, len(objs)) 561 } 562 break 563 } 564 if !reflect.DeepEqual(*o, objs[i]) { 565 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 566 } 567 } 568 } 569 570 func TestBufferSeekToRow(t *testing.T) { 571 type B struct { 572 I int 573 C []string 574 } 575 type A struct { 576 B []B 577 } 578 579 buffer := parquet.NewBuffer() 580 var objs []A 581 for i := 0; i < 2; i++ { 582 o := A{ 583 B: []B{ 584 {I: i, C: []string{"foo", strconv.Itoa(i)}}, 585 {I: i + 1, C: []string{"bar", strconv.Itoa(i + 1)}}, 586 }, 587 } 588 buffer.Write(&o) 589 objs = append(objs, o) 590 } 591 592 buf := new(bytes.Buffer) 593 w := parquet.NewWriter(buf) 594 w.WriteRowGroup(buffer) 595 w.Flush() 596 w.Close() 597 598 file := bytes.NewReader(buf.Bytes()) 599 r := parquet.NewReader(file) 600 601 i := 1 602 o := new(A) 603 if err := r.SeekToRow(int64(i)); err != nil { 604 t.Fatal(err) 605 } 606 if err := r.Read(o); err != nil { 607 t.Fatal(err) 608 } 609 if !reflect.DeepEqual(*o, objs[i]) { 610 t.Errorf("points mismatch at row index %d: want=%v got=%v", i, objs[i], o) 611 } 612 } 613 614 type TestStruct struct { 615 A *string `parquet:"a,optional,dict"` 616 } 617 618 func TestOptionalDictWriteRowGroup(t *testing.T) { 619 s := parquet.SchemaOf(&TestStruct{}) 620 621 str1 := "test1" 622 str2 := "test2" 623 records := []*TestStruct{ 624 {A: nil}, 625 {A: &str1}, 626 {A: nil}, 627 {A: &str2}, 628 {A: nil}, 629 } 630 631 buf := parquet.NewBuffer(s) 632 for _, rec := range records { 633 row := s.Deconstruct(nil, rec) 634 _, err := buf.WriteRows([]parquet.Row{row}) 635 if err != nil { 636 t.Fatal(err) 637 } 638 } 639 640 b := bytes.NewBuffer(nil) 641 w := parquet.NewWriter(b) 642 _, err := w.WriteRowGroup(buf) 643 if err != nil { 644 t.Fatal(err) 645 } 646 } 647 648 func generateBenchmarkBufferRows(n int) (*parquet.Schema, []parquet.Row) { 649 model := new(benchmarkRowType) 650 schema := parquet.SchemaOf(model) 651 prng := rand.New(rand.NewSource(0)) 652 rows := make([]parquet.Row, n) 653 654 for i := range rows { 655 io.ReadFull(prng, model.ID[:]) 656 model.Value = prng.Float64() 657 rows[i] = make(parquet.Row, 0, 2) 658 rows[i] = schema.Deconstruct(rows[i], model) 659 } 660 661 return schema, rows 662 } 663 664 func BenchmarkBufferReadRows100x(b *testing.B) { 665 schema, rows := generateBenchmarkBufferRows(benchmarkNumRows) 666 buffer := parquet.NewBuffer(schema) 667 668 for i := 0; i < len(rows); i += benchmarkRowsPerStep { 669 j := i + benchmarkRowsPerStep 670 if _, err := buffer.WriteRows(rows[i:j]); err != nil { 671 b.Fatal(err) 672 } 673 } 674 675 bufferRows := buffer.Rows() 676 defer bufferRows.Close() 677 678 benchmarkRowsPerSecond(b, func() int { 679 n, err := bufferRows.ReadRows(rows[:benchmarkRowsPerStep]) 680 if err != nil { 681 if errors.Is(err, io.EOF) { 682 err = bufferRows.SeekToRow(0) 683 } 684 if err != nil { 685 b.Fatal(err) 686 } 687 } 688 return n 689 }) 690 } 691 692 func BenchmarkBufferWriteRows100x(b *testing.B) { 693 schema, rows := generateBenchmarkBufferRows(benchmarkNumRows) 694 buffer := parquet.NewBuffer(schema) 695 696 i := 0 697 benchmarkRowsPerSecond(b, func() int { 698 n, err := buffer.WriteRows(rows[i : i+benchmarkRowsPerStep]) 699 if err != nil { 700 b.Fatal(err) 701 } 702 703 i += benchmarkRowsPerStep 704 i %= benchmarkNumRows 705 706 if i == 0 { 707 buffer.Reset() 708 } 709 return n 710 }) 711 }