github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/merge_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "errors" 6 "fmt" 7 "io" 8 "math/rand" 9 "sort" 10 "testing" 11 12 "github.com/parquet-go/parquet-go" 13 ) 14 15 const ( 16 numRowGroups = 3 17 rowsPerGroup = benchmarkNumRows 18 ) 19 20 type wrappedRowGroup struct { 21 parquet.RowGroup 22 rowsCallback func(parquet.Rows) parquet.Rows 23 } 24 25 func (r wrappedRowGroup) Rows() parquet.Rows { 26 return r.rowsCallback(r.RowGroup.Rows()) 27 } 28 29 type wrappedRows struct { 30 parquet.Rows 31 closed bool 32 } 33 34 func (r *wrappedRows) Close() error { 35 r.closed = true 36 return r.Rows.Close() 37 } 38 39 func TestMergeRowGroups(t *testing.T) { 40 tests := []struct { 41 scenario string 42 options []parquet.RowGroupOption 43 input []parquet.RowGroup 44 output parquet.RowGroup 45 }{ 46 { 47 scenario: "no row groups", 48 options: []parquet.RowGroupOption{ 49 parquet.SchemaOf(Person{}), 50 }, 51 output: sortedRowGroup( 52 []parquet.RowGroupOption{ 53 parquet.SchemaOf(Person{}), 54 }, 55 ), 56 }, 57 58 { 59 scenario: "a single row group", 60 input: []parquet.RowGroup{ 61 sortedRowGroup(nil, 62 Person{FirstName: "some", LastName: "one", Age: 30}, 63 Person{FirstName: "some", LastName: "one else", Age: 31}, 64 Person{FirstName: "and", LastName: "you", Age: 32}, 65 ), 66 }, 67 output: sortedRowGroup(nil, 68 Person{FirstName: "some", LastName: "one", Age: 30}, 69 Person{FirstName: "some", LastName: "one else", Age: 31}, 70 Person{FirstName: "and", LastName: "you", Age: 32}, 71 ), 72 }, 73 74 { 75 scenario: "two row groups without ordering", 76 input: []parquet.RowGroup{ 77 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}), 78 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}), 79 }, 80 output: sortedRowGroup(nil, 81 Person{FirstName: "some", LastName: "one", Age: 30}, 82 Person{FirstName: "some", LastName: "one else", Age: 31}, 83 ), 84 }, 85 86 { 87 scenario: "three row groups without ordering", 88 input: []parquet.RowGroup{ 89 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}), 90 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}), 91 sortedRowGroup(nil, Person{FirstName: "question", LastName: "answer", Age: 42}), 92 }, 93 output: sortedRowGroup(nil, 94 Person{FirstName: "some", LastName: "one", Age: 30}, 95 Person{FirstName: "some", LastName: "one else", Age: 31}, 96 Person{FirstName: "question", LastName: "answer", Age: 42}, 97 ), 98 }, 99 100 { 101 scenario: "row groups sorted by ascending last name", 102 options: []parquet.RowGroupOption{ 103 parquet.SortingRowGroupConfig( 104 parquet.SortingColumns( 105 parquet.Ascending("LastName"), 106 ), 107 ), 108 }, 109 input: []parquet.RowGroup{ 110 sortedRowGroup( 111 []parquet.RowGroupOption{ 112 parquet.SortingRowGroupConfig( 113 parquet.SortingColumns( 114 parquet.Ascending("LastName"), 115 ), 116 ), 117 }, 118 Person{FirstName: "Han", LastName: "Solo"}, 119 Person{FirstName: "Luke", LastName: "Skywalker"}, 120 ), 121 sortedRowGroup( 122 []parquet.RowGroupOption{ 123 parquet.SortingRowGroupConfig( 124 parquet.SortingColumns( 125 parquet.Ascending("LastName"), 126 ), 127 ), 128 }, 129 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 130 ), 131 }, 132 output: sortedRowGroup(nil, 133 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 134 Person{FirstName: "Luke", LastName: "Skywalker"}, 135 Person{FirstName: "Han", LastName: "Solo"}, 136 ), 137 }, 138 { 139 scenario: "reproduce issue #66, merging rows with an empty row group", 140 options: []parquet.RowGroupOption{ 141 parquet.SortingRowGroupConfig( 142 parquet.SortingColumns( 143 parquet.Ascending("LastName"), 144 ), 145 ), 146 }, 147 input: []parquet.RowGroup{ 148 sortedRowGroup( 149 []parquet.RowGroupOption{ 150 parquet.SortingRowGroupConfig( 151 parquet.SortingColumns( 152 parquet.Ascending("LastName"), 153 ), 154 ), 155 }, 156 Person{FirstName: "Han", LastName: "Solo"}, 157 ), 158 159 sortedRowGroup( 160 []parquet.RowGroupOption{ 161 parquet.SchemaOf(Person{}), 162 parquet.SortingRowGroupConfig( 163 parquet.SortingColumns( 164 parquet.Ascending("LastName"), 165 ), 166 ), 167 }, 168 ), 169 sortedRowGroup( 170 []parquet.RowGroupOption{ 171 parquet.SortingRowGroupConfig( 172 parquet.SortingColumns( 173 parquet.Ascending("LastName"), 174 ), 175 ), 176 }, 177 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 178 ), 179 }, 180 output: sortedRowGroup(nil, 181 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 182 Person{FirstName: "Han", LastName: "Solo"}, 183 ), 184 }, 185 { 186 scenario: "row groups sorted by descending last name", 187 options: []parquet.RowGroupOption{ 188 parquet.SortingRowGroupConfig( 189 parquet.SortingColumns( 190 parquet.Descending("LastName"), 191 ), 192 ), 193 }, 194 input: []parquet.RowGroup{ 195 sortedRowGroup( 196 []parquet.RowGroupOption{ 197 parquet.SortingRowGroupConfig( 198 parquet.SortingColumns( 199 parquet.Descending("LastName"), 200 ), 201 ), 202 }, 203 Person{FirstName: "Han", LastName: "Solo"}, 204 Person{FirstName: "Luke", LastName: "Skywalker"}, 205 ), 206 sortedRowGroup( 207 []parquet.RowGroupOption{ 208 parquet.SortingRowGroupConfig( 209 parquet.SortingColumns( 210 parquet.Descending("LastName"), 211 ), 212 ), 213 }, 214 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 215 ), 216 }, 217 output: sortedRowGroup(nil, 218 Person{FirstName: "Han", LastName: "Solo"}, 219 Person{FirstName: "Luke", LastName: "Skywalker"}, 220 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 221 ), 222 }, 223 224 { 225 scenario: "row groups sorted by ascending last and first name", 226 options: []parquet.RowGroupOption{ 227 parquet.SortingRowGroupConfig( 228 parquet.SortingColumns( 229 parquet.Ascending("LastName"), 230 parquet.Ascending("FirstName"), 231 ), 232 ), 233 }, 234 input: []parquet.RowGroup{ 235 sortedRowGroup( 236 []parquet.RowGroupOption{ 237 parquet.SortingRowGroupConfig( 238 parquet.SortingColumns( 239 parquet.Ascending("LastName"), 240 parquet.Ascending("FirstName"), 241 ), 242 ), 243 }, 244 Person{FirstName: "Luke", LastName: "Skywalker"}, 245 Person{FirstName: "Han", LastName: "Solo"}, 246 ), 247 sortedRowGroup( 248 []parquet.RowGroupOption{ 249 parquet.SortingRowGroupConfig( 250 parquet.SortingColumns( 251 parquet.Ascending("LastName"), 252 parquet.Ascending("FirstName"), 253 ), 254 ), 255 }, 256 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 257 Person{FirstName: "Anakin", LastName: "Skywalker"}, 258 ), 259 }, 260 output: sortedRowGroup(nil, 261 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 262 Person{FirstName: "Anakin", LastName: "Skywalker"}, 263 Person{FirstName: "Luke", LastName: "Skywalker"}, 264 Person{FirstName: "Han", LastName: "Solo"}, 265 ), 266 }, 267 268 { 269 scenario: "row groups with conversion to a different schema", 270 options: []parquet.RowGroupOption{ 271 parquet.SchemaOf(LastNameOnly{}), 272 parquet.SortingRowGroupConfig( 273 parquet.SortingColumns( 274 parquet.Ascending("LastName"), 275 ), 276 ), 277 }, 278 input: []parquet.RowGroup{ 279 sortedRowGroup( 280 []parquet.RowGroupOption{ 281 parquet.SortingRowGroupConfig( 282 parquet.SortingColumns( 283 parquet.Ascending("LastName"), 284 ), 285 ), 286 }, 287 Person{FirstName: "Han", LastName: "Solo"}, 288 Person{FirstName: "Luke", LastName: "Skywalker"}, 289 ), 290 sortedRowGroup( 291 []parquet.RowGroupOption{ 292 parquet.SortingRowGroupConfig( 293 parquet.SortingColumns( 294 parquet.Ascending("LastName"), 295 ), 296 ), 297 }, 298 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 299 Person{FirstName: "Anakin", LastName: "Skywalker"}, 300 ), 301 }, 302 output: sortedRowGroup( 303 []parquet.RowGroupOption{ 304 parquet.SortingRowGroupConfig( 305 parquet.SortingColumns( 306 parquet.Ascending("LastName"), 307 ), 308 ), 309 }, 310 LastNameOnly{LastName: "Solo"}, 311 LastNameOnly{LastName: "Skywalker"}, 312 LastNameOnly{LastName: "Skywalker"}, 313 LastNameOnly{LastName: "Kenobi"}, 314 ), 315 }, 316 } 317 318 for _, adapter := range []struct { 319 scenario string 320 function func(parquet.RowGroup) parquet.RowGroup 321 }{ 322 {scenario: "buffer", function: selfRowGroup}, 323 {scenario: "file", function: fileRowGroup}, 324 } { 325 t.Run(adapter.scenario, func(t *testing.T) { 326 for _, test := range tests { 327 t.Run(test.scenario, func(t *testing.T) { 328 input := make([]parquet.RowGroup, len(test.input)) 329 for i := range test.input { 330 input[i] = adapter.function(test.input[i]) 331 } 332 333 merged, err := parquet.MergeRowGroups(test.input, test.options...) 334 if err != nil { 335 t.Fatal(err) 336 } 337 if merged.NumRows() != test.output.NumRows() { 338 t.Fatalf("the number of rows mismatch: want=%d got=%d", merged.NumRows(), test.output.NumRows()) 339 } 340 if merged.Schema() != test.output.Schema() { 341 t.Fatalf("the row group schemas mismatch:\n%v\n%v", test.output.Schema(), merged.Schema()) 342 } 343 344 options := []parquet.RowGroupOption{parquet.SchemaOf(Person{})} 345 options = append(options, test.options...) 346 // We test two views of the resulting row group: the one originally 347 // returned by MergeRowGroups, and one where the merged row group 348 // has been copied into a new buffer. The intent is to exercise both 349 // the row-by-row read as well as optimized code paths when CopyRows 350 // bypasses the ReadRow/WriteRow calls and the row group is written 351 // directly to the buffer by calling WriteRowsTo/WriteRowGroup. 352 mergedCopy := parquet.NewBuffer(options...) 353 354 totalRows := test.output.NumRows() 355 numRows, err := copyRowsAndClose(mergedCopy, merged.Rows()) 356 if err != nil { 357 t.Fatal(err) 358 } 359 if numRows != totalRows { 360 t.Fatalf("wrong number of rows copied: want=%d got=%d", totalRows, numRows) 361 } 362 363 for _, merge := range []struct { 364 scenario string 365 rowGroup parquet.RowGroup 366 }{ 367 {scenario: "self", rowGroup: merged}, 368 {scenario: "copy", rowGroup: mergedCopy}, 369 } { 370 t.Run(merge.scenario, func(t *testing.T) { 371 var expectedRows = test.output.Rows() 372 var mergedRows = merge.rowGroup.Rows() 373 var row1 = make([]parquet.Row, 1) 374 var row2 = make([]parquet.Row, 1) 375 var numRows int64 376 377 defer expectedRows.Close() 378 defer mergedRows.Close() 379 380 for { 381 _, err1 := expectedRows.ReadRows(row1) 382 n, err2 := mergedRows.ReadRows(row2) 383 384 if err1 != err2 { 385 // ReadRows may or may not return io.EOF 386 // when it reads the last row, so we test 387 // that the reference RowReader has also 388 // reached the end. 389 if err1 == nil && err2 == io.EOF { 390 _, err1 = expectedRows.ReadRows(row1[:0]) 391 } 392 if err1 != io.EOF { 393 t.Fatalf("errors mismatched while comparing row %d/%d: want=%v got=%v", numRows, totalRows, err1, err2) 394 } 395 } 396 397 if n != 0 { 398 if !row1[0].Equal(row2[0]) { 399 t.Errorf("row at index %d/%d mismatch: want=%+v got=%+v", numRows, totalRows, row1[0], row2[0]) 400 } 401 numRows++ 402 } 403 404 if err1 != nil { 405 break 406 } 407 } 408 409 if numRows != totalRows { 410 t.Errorf("expected to read %d rows but %d were found", totalRows, numRows) 411 } 412 }) 413 } 414 415 }) 416 } 417 }) 418 } 419 } 420 421 func TestMergeRowGroupsCursorsAreClosed(t *testing.T) { 422 type model struct { 423 A int 424 } 425 426 schema := parquet.SchemaOf(model{}) 427 options := []parquet.RowGroupOption{ 428 parquet.SortingRowGroupConfig( 429 parquet.SortingColumns( 430 parquet.Ascending(schema.Columns()[0]...), 431 ), 432 ), 433 } 434 435 prng := rand.New(rand.NewSource(0)) 436 rowGroups := make([]parquet.RowGroup, numRowGroups) 437 rows := make([]*wrappedRows, 0, numRowGroups) 438 439 for i := range rowGroups { 440 rowGroups[i] = wrappedRowGroup{ 441 RowGroup: sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, model{})...), 442 rowsCallback: func(r parquet.Rows) parquet.Rows { 443 wrapped := &wrappedRows{Rows: r} 444 rows = append(rows, wrapped) 445 return wrapped 446 }, 447 } 448 } 449 450 m, err := parquet.MergeRowGroups(rowGroups, options...) 451 if err != nil { 452 t.Fatal(err) 453 } 454 func() { 455 mergedRows := m.Rows() 456 defer mergedRows.Close() 457 458 // Add 1 more slot to the buffer to force an io.EOF on the first read. 459 rbuf := make([]parquet.Row, (numRowGroups*rowsPerGroup)+1) 460 if _, err := mergedRows.ReadRows(rbuf); !errors.Is(err, io.EOF) { 461 t.Fatal(err) 462 } 463 }() 464 465 for i, wrapped := range rows { 466 if !wrapped.closed { 467 t.Fatalf("RowGroup %d not closed", i) 468 } 469 } 470 } 471 472 func TestMergeRowGroupsSeekToRow(t *testing.T) { 473 type model struct { 474 A int 475 } 476 477 schema := parquet.SchemaOf(model{}) 478 options := []parquet.RowGroupOption{ 479 parquet.SortingRowGroupConfig( 480 parquet.SortingColumns( 481 parquet.Ascending(schema.Columns()[0]...), 482 ), 483 ), 484 } 485 486 rowGroups := make([]parquet.RowGroup, numRowGroups) 487 488 counter := 0 489 for i := range rowGroups { 490 rows := make([]interface{}, 0, rowsPerGroup) 491 for j := 0; j < rowsPerGroup; j++ { 492 rows = append(rows, model{A: counter}) 493 counter++ 494 } 495 rowGroups[i] = sortedRowGroup(options, rows...) 496 } 497 498 m, err := parquet.MergeRowGroups(rowGroups, options...) 499 if err != nil { 500 t.Fatal(err) 501 } 502 503 func() { 504 mergedRows := m.Rows() 505 defer mergedRows.Close() 506 507 rbuf := make([]parquet.Row, 1) 508 cursor := int64(0) 509 for { 510 if err := mergedRows.SeekToRow(cursor); err != nil { 511 t.Fatal(err) 512 } 513 514 if _, err := mergedRows.ReadRows(rbuf); err != nil { 515 if errors.Is(err, io.EOF) { 516 break 517 } 518 t.Fatal(err) 519 } 520 v := model{} 521 if err := schema.Reconstruct(&v, rbuf[0]); err != nil { 522 t.Fatal(err) 523 } 524 if v.A != int(cursor) { 525 t.Fatalf("expected value %d, got %d", cursor, v.A) 526 } 527 528 cursor++ 529 } 530 }() 531 } 532 533 func BenchmarkMergeRowGroups(b *testing.B) { 534 for _, test := range readerTests { 535 b.Run(test.scenario, func(b *testing.B) { 536 schema := parquet.SchemaOf(test.model) 537 538 options := []parquet.RowGroupOption{ 539 parquet.SortingRowGroupConfig( 540 parquet.SortingColumns( 541 parquet.Ascending(schema.Columns()[0]...), 542 ), 543 ), 544 } 545 546 prng := rand.New(rand.NewSource(0)) 547 rowGroups := make([]parquet.RowGroup, numRowGroups) 548 549 for i := range rowGroups { 550 rowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...) 551 } 552 553 for n := 1; n <= numRowGroups; n++ { 554 b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) { 555 mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...) 556 if err != nil { 557 b.Fatal(err) 558 } 559 560 rows := mergedRowGroup.Rows() 561 rbuf := make([]parquet.Row, benchmarkRowsPerStep) 562 defer func() { rows.Close() }() 563 564 benchmarkRowsPerSecond(b, func() int { 565 n, err := rows.ReadRows(rbuf) 566 if err != nil { 567 if !errors.Is(err, io.EOF) { 568 b.Fatal(err) 569 } 570 rows.Close() 571 rows = mergedRowGroup.Rows() 572 } 573 return n 574 }) 575 }) 576 } 577 }) 578 } 579 } 580 581 func BenchmarkMergeFiles(b *testing.B) { 582 rowGroupBuffers := make([]bytes.Buffer, numRowGroups) 583 584 for _, test := range readerTests { 585 b.Run(test.scenario, func(b *testing.B) { 586 schema := parquet.SchemaOf(test.model) 587 588 sortingOptions := []parquet.SortingOption{ 589 parquet.SortingColumns( 590 parquet.Ascending(schema.Columns()[0]...), 591 ), 592 } 593 594 options := []parquet.RowGroupOption{ 595 schema, 596 parquet.SortingRowGroupConfig( 597 sortingOptions..., 598 ), 599 } 600 601 buffer := parquet.NewBuffer(options...) 602 603 prng := rand.New(rand.NewSource(0)) 604 files := make([]*parquet.File, numRowGroups) 605 rowGroups := make([]parquet.RowGroup, numRowGroups) 606 607 for i := range files { 608 for _, row := range randomRowsOf(prng, rowsPerGroup, test.model) { 609 buffer.Write(row) 610 } 611 sort.Sort(buffer) 612 rowGroupBuffers[i].Reset() 613 writer := parquet.NewWriter(&rowGroupBuffers[i], 614 schema, 615 parquet.SortingWriterConfig( 616 sortingOptions..., 617 ), 618 ) 619 _, err := copyRowsAndClose(writer, buffer.Rows()) 620 if err != nil { 621 b.Fatal(err) 622 } 623 if err := writer.Close(); err != nil { 624 b.Fatal(err) 625 } 626 r := bytes.NewReader(rowGroupBuffers[i].Bytes()) 627 f, err := parquet.OpenFile(r, r.Size()) 628 if err != nil { 629 b.Fatal(err) 630 } 631 files[i], rowGroups[i] = f, f.RowGroups()[0] 632 } 633 634 for n := 1; n <= numRowGroups; n++ { 635 b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) { 636 mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...) 637 if err != nil { 638 b.Fatal(err) 639 } 640 641 rows := mergedRowGroup.Rows() 642 rbuf := make([]parquet.Row, benchmarkRowsPerStep) 643 defer func() { rows.Close() }() 644 645 benchmarkRowsPerSecond(b, func() int { 646 n, err := rows.ReadRows(rbuf) 647 if err != nil { 648 if !errors.Is(err, io.EOF) { 649 b.Fatal(err) 650 } 651 rows.Close() 652 rows = mergedRowGroup.Rows() 653 } 654 return n 655 }) 656 657 totalSize := int64(0) 658 for _, f := range files[:n] { 659 totalSize += f.Size() 660 } 661 }) 662 } 663 }) 664 } 665 }