github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/merge_test.go (about) 1 package parquet_test 2 3 import ( 4 "bytes" 5 "errors" 6 "fmt" 7 "io" 8 "math/rand" 9 "sort" 10 "testing" 11 12 "github.com/segmentio/parquet-go" 13 ) 14 15 const ( 16 numRowGroups = 3 17 rowsPerGroup = benchmarkNumRows 18 ) 19 20 type wrappedRowGroup struct { 21 parquet.RowGroup 22 rowsCallback func(parquet.Rows) parquet.Rows 23 } 24 25 func (r wrappedRowGroup) Rows() parquet.Rows { 26 return r.rowsCallback(r.RowGroup.Rows()) 27 } 28 29 type wrappedRows struct { 30 parquet.Rows 31 closed bool 32 } 33 34 func (r *wrappedRows) Close() error { 35 r.closed = true 36 return r.Rows.Close() 37 } 38 39 func TestMergeRowGroups(t *testing.T) { 40 tests := []struct { 41 scenario string 42 options []parquet.RowGroupOption 43 input []parquet.RowGroup 44 output parquet.RowGroup 45 }{ 46 { 47 scenario: "no row groups", 48 options: []parquet.RowGroupOption{ 49 parquet.SchemaOf(Person{}), 50 }, 51 output: sortedRowGroup( 52 []parquet.RowGroupOption{ 53 parquet.SchemaOf(Person{}), 54 }, 55 ), 56 }, 57 58 { 59 scenario: "a single row group", 60 input: []parquet.RowGroup{ 61 sortedRowGroup(nil, 62 Person{FirstName: "some", LastName: "one", Age: 30}, 63 Person{FirstName: "some", LastName: "one else", Age: 31}, 64 Person{FirstName: "and", LastName: "you", Age: 32}, 65 ), 66 }, 67 output: sortedRowGroup(nil, 68 Person{FirstName: "some", LastName: "one", Age: 30}, 69 Person{FirstName: "some", LastName: "one else", Age: 31}, 70 Person{FirstName: "and", LastName: "you", Age: 32}, 71 ), 72 }, 73 74 { 75 scenario: "two row groups without ordering", 76 input: []parquet.RowGroup{ 77 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}), 78 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}), 79 }, 80 output: sortedRowGroup(nil, 81 Person{FirstName: "some", LastName: "one", Age: 30}, 82 Person{FirstName: "some", LastName: "one else", Age: 31}, 83 ), 84 }, 85 86 { 87 scenario: "three row groups without ordering", 88 input: []parquet.RowGroup{ 89 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one", Age: 30}), 90 sortedRowGroup(nil, Person{FirstName: "some", LastName: "one else", Age: 31}), 91 sortedRowGroup(nil, Person{FirstName: "question", LastName: "answer", Age: 42}), 92 }, 93 output: sortedRowGroup(nil, 94 Person{FirstName: "some", LastName: "one", Age: 30}, 95 Person{FirstName: "some", LastName: "one else", Age: 31}, 96 Person{FirstName: "question", LastName: "answer", Age: 42}, 97 ), 98 }, 99 100 { 101 scenario: "row groups sorted by ascending last name", 102 options: []parquet.RowGroupOption{ 103 parquet.SortingRowGroupConfig( 104 parquet.SortingColumns( 105 parquet.Ascending("LastName"), 106 ), 107 ), 108 }, 109 input: []parquet.RowGroup{ 110 sortedRowGroup( 111 []parquet.RowGroupOption{ 112 parquet.SortingRowGroupConfig( 113 parquet.SortingColumns( 114 parquet.Ascending("LastName"), 115 ), 116 ), 117 }, 118 Person{FirstName: "Han", LastName: "Solo"}, 119 Person{FirstName: "Luke", LastName: "Skywalker"}, 120 ), 121 sortedRowGroup( 122 []parquet.RowGroupOption{ 123 parquet.SortingRowGroupConfig( 124 parquet.SortingColumns( 125 parquet.Ascending("LastName"), 126 ), 127 ), 128 }, 129 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 130 ), 131 }, 132 output: sortedRowGroup(nil, 133 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 134 Person{FirstName: "Luke", LastName: "Skywalker"}, 135 Person{FirstName: "Han", LastName: "Solo"}, 136 ), 137 }, 138 139 { 140 scenario: "row groups sorted by descending last name", 141 options: []parquet.RowGroupOption{ 142 parquet.SortingRowGroupConfig( 143 parquet.SortingColumns( 144 parquet.Descending("LastName"), 145 ), 146 ), 147 }, 148 input: []parquet.RowGroup{ 149 sortedRowGroup( 150 []parquet.RowGroupOption{ 151 parquet.SortingRowGroupConfig( 152 parquet.SortingColumns( 153 parquet.Descending("LastName"), 154 ), 155 ), 156 }, 157 Person{FirstName: "Han", LastName: "Solo"}, 158 Person{FirstName: "Luke", LastName: "Skywalker"}, 159 ), 160 sortedRowGroup( 161 []parquet.RowGroupOption{ 162 parquet.SortingRowGroupConfig( 163 parquet.SortingColumns( 164 parquet.Descending("LastName"), 165 ), 166 ), 167 }, 168 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 169 ), 170 }, 171 output: sortedRowGroup(nil, 172 Person{FirstName: "Han", LastName: "Solo"}, 173 Person{FirstName: "Luke", LastName: "Skywalker"}, 174 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 175 ), 176 }, 177 178 { 179 scenario: "row groups sorted by ascending last and first name", 180 options: []parquet.RowGroupOption{ 181 parquet.SortingRowGroupConfig( 182 parquet.SortingColumns( 183 parquet.Ascending("LastName"), 184 parquet.Ascending("FirstName"), 185 ), 186 ), 187 }, 188 input: []parquet.RowGroup{ 189 sortedRowGroup( 190 []parquet.RowGroupOption{ 191 parquet.SortingRowGroupConfig( 192 parquet.SortingColumns( 193 parquet.Ascending("LastName"), 194 parquet.Ascending("FirstName"), 195 ), 196 ), 197 }, 198 Person{FirstName: "Luke", LastName: "Skywalker"}, 199 Person{FirstName: "Han", LastName: "Solo"}, 200 ), 201 sortedRowGroup( 202 []parquet.RowGroupOption{ 203 parquet.SortingRowGroupConfig( 204 parquet.SortingColumns( 205 parquet.Ascending("LastName"), 206 parquet.Ascending("FirstName"), 207 ), 208 ), 209 }, 210 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 211 Person{FirstName: "Anakin", LastName: "Skywalker"}, 212 ), 213 }, 214 output: sortedRowGroup(nil, 215 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 216 Person{FirstName: "Anakin", LastName: "Skywalker"}, 217 Person{FirstName: "Luke", LastName: "Skywalker"}, 218 Person{FirstName: "Han", LastName: "Solo"}, 219 ), 220 }, 221 222 { 223 scenario: "row groups with conversion to a different schema", 224 options: []parquet.RowGroupOption{ 225 parquet.SchemaOf(LastNameOnly{}), 226 parquet.SortingRowGroupConfig( 227 parquet.SortingColumns( 228 parquet.Ascending("LastName"), 229 ), 230 ), 231 }, 232 input: []parquet.RowGroup{ 233 sortedRowGroup( 234 []parquet.RowGroupOption{ 235 parquet.SortingRowGroupConfig( 236 parquet.SortingColumns( 237 parquet.Ascending("LastName"), 238 ), 239 ), 240 }, 241 Person{FirstName: "Han", LastName: "Solo"}, 242 Person{FirstName: "Luke", LastName: "Skywalker"}, 243 ), 244 sortedRowGroup( 245 []parquet.RowGroupOption{ 246 parquet.SortingRowGroupConfig( 247 parquet.SortingColumns( 248 parquet.Ascending("LastName"), 249 ), 250 ), 251 }, 252 Person{FirstName: "Obiwan", LastName: "Kenobi"}, 253 Person{FirstName: "Anakin", LastName: "Skywalker"}, 254 ), 255 }, 256 output: sortedRowGroup( 257 []parquet.RowGroupOption{ 258 parquet.SortingRowGroupConfig( 259 parquet.SortingColumns( 260 parquet.Ascending("LastName"), 261 ), 262 ), 263 }, 264 LastNameOnly{LastName: "Solo"}, 265 LastNameOnly{LastName: "Skywalker"}, 266 LastNameOnly{LastName: "Skywalker"}, 267 LastNameOnly{LastName: "Kenobi"}, 268 ), 269 }, 270 } 271 272 for _, adapter := range []struct { 273 scenario string 274 function func(parquet.RowGroup) parquet.RowGroup 275 }{ 276 {scenario: "buffer", function: selfRowGroup}, 277 {scenario: "file", function: fileRowGroup}, 278 } { 279 t.Run(adapter.scenario, func(t *testing.T) { 280 for _, test := range tests { 281 t.Run(test.scenario, func(t *testing.T) { 282 input := make([]parquet.RowGroup, len(test.input)) 283 for i := range test.input { 284 input[i] = adapter.function(test.input[i]) 285 } 286 287 merged, err := parquet.MergeRowGroups(test.input, test.options...) 288 if err != nil { 289 t.Fatal(err) 290 } 291 if merged.NumRows() != test.output.NumRows() { 292 t.Fatalf("the number of rows mismatch: want=%d got=%d", merged.NumRows(), test.output.NumRows()) 293 } 294 if merged.Schema() != test.output.Schema() { 295 t.Fatalf("the row group schemas mismatch:\n%v\n%v", test.output.Schema(), merged.Schema()) 296 } 297 298 options := []parquet.RowGroupOption{parquet.SchemaOf(Person{})} 299 options = append(options, test.options...) 300 // We test two views of the resulting row group: the one originally 301 // returned by MergeRowGroups, and one where the merged row group 302 // has been copied into a new buffer. The intent is to exercise both 303 // the row-by-row read as well as optimized code paths when CopyRows 304 // bypasses the ReadRow/WriteRow calls and the row group is written 305 // directly to the buffer by calling WriteRowsTo/WriteRowGroup. 306 mergedCopy := parquet.NewBuffer(options...) 307 308 totalRows := test.output.NumRows() 309 numRows, err := copyRowsAndClose(mergedCopy, merged.Rows()) 310 if err != nil { 311 t.Fatal(err) 312 } 313 if numRows != totalRows { 314 t.Fatalf("wrong number of rows copied: want=%d got=%d", totalRows, numRows) 315 } 316 317 for _, merge := range []struct { 318 scenario string 319 rowGroup parquet.RowGroup 320 }{ 321 {scenario: "self", rowGroup: merged}, 322 {scenario: "copy", rowGroup: mergedCopy}, 323 } { 324 t.Run(merge.scenario, func(t *testing.T) { 325 var expectedRows = test.output.Rows() 326 var mergedRows = merge.rowGroup.Rows() 327 var row1 = make([]parquet.Row, 1) 328 var row2 = make([]parquet.Row, 1) 329 var numRows int64 330 331 defer expectedRows.Close() 332 defer mergedRows.Close() 333 334 for { 335 _, err1 := expectedRows.ReadRows(row1) 336 n, err2 := mergedRows.ReadRows(row2) 337 338 if err1 != err2 { 339 // ReadRows may or may not return io.EOF 340 // when it reads the last row, so we test 341 // that the reference RowReader has also 342 // reached the end. 343 if err1 == nil && err2 == io.EOF { 344 _, err1 = expectedRows.ReadRows(row1[:0]) 345 } 346 if err1 != io.EOF { 347 t.Fatalf("errors mismatched while comparing row %d/%d: want=%v got=%v", numRows, totalRows, err1, err2) 348 } 349 } 350 351 if n != 0 { 352 if !row1[0].Equal(row2[0]) { 353 t.Errorf("row at index %d/%d mismatch: want=%+v got=%+v", numRows, totalRows, row1[0], row2[0]) 354 } 355 numRows++ 356 } 357 358 if err1 != nil { 359 break 360 } 361 } 362 363 if numRows != totalRows { 364 t.Errorf("expected to read %d rows but %d were found", totalRows, numRows) 365 } 366 }) 367 } 368 369 }) 370 } 371 }) 372 } 373 } 374 375 func TestMergeRowGroupsCursorsAreClosed(t *testing.T) { 376 type model struct { 377 A int 378 } 379 380 schema := parquet.SchemaOf(model{}) 381 options := []parquet.RowGroupOption{ 382 parquet.SortingRowGroupConfig( 383 parquet.SortingColumns( 384 parquet.Ascending(schema.Columns()[0]...), 385 ), 386 ), 387 } 388 389 prng := rand.New(rand.NewSource(0)) 390 rowGroups := make([]parquet.RowGroup, numRowGroups) 391 rows := make([]*wrappedRows, 0, numRowGroups) 392 393 for i := range rowGroups { 394 rowGroups[i] = wrappedRowGroup{ 395 RowGroup: sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, model{})...), 396 rowsCallback: func(r parquet.Rows) parquet.Rows { 397 wrapped := &wrappedRows{Rows: r} 398 rows = append(rows, wrapped) 399 return wrapped 400 }, 401 } 402 } 403 404 m, err := parquet.MergeRowGroups(rowGroups, options...) 405 if err != nil { 406 t.Fatal(err) 407 } 408 func() { 409 mergedRows := m.Rows() 410 defer mergedRows.Close() 411 412 // Add 1 more slot to the buffer to force an io.EOF on the first read. 413 rbuf := make([]parquet.Row, (numRowGroups*rowsPerGroup)+1) 414 if _, err := mergedRows.ReadRows(rbuf); !errors.Is(err, io.EOF) { 415 t.Fatal(err) 416 } 417 }() 418 419 for i, wrapped := range rows { 420 if !wrapped.closed { 421 t.Fatalf("RowGroup %d not closed", i) 422 } 423 } 424 } 425 426 func TestMergeRowGroupsSeekToRow(t *testing.T) { 427 type model struct { 428 A int 429 } 430 431 schema := parquet.SchemaOf(model{}) 432 options := []parquet.RowGroupOption{ 433 parquet.SortingRowGroupConfig( 434 parquet.SortingColumns( 435 parquet.Ascending(schema.Columns()[0]...), 436 ), 437 ), 438 } 439 440 rowGroups := make([]parquet.RowGroup, numRowGroups) 441 442 counter := 0 443 for i := range rowGroups { 444 rows := make([]interface{}, 0, rowsPerGroup) 445 for j := 0; j < rowsPerGroup; j++ { 446 rows = append(rows, model{A: counter}) 447 counter++ 448 } 449 rowGroups[i] = sortedRowGroup(options, rows...) 450 } 451 452 m, err := parquet.MergeRowGroups(rowGroups, options...) 453 if err != nil { 454 t.Fatal(err) 455 } 456 457 func() { 458 mergedRows := m.Rows() 459 defer mergedRows.Close() 460 461 rbuf := make([]parquet.Row, 1) 462 cursor := int64(0) 463 for { 464 if err := mergedRows.SeekToRow(cursor); err != nil { 465 t.Fatal(err) 466 } 467 468 if _, err := mergedRows.ReadRows(rbuf); err != nil { 469 if errors.Is(err, io.EOF) { 470 break 471 } 472 t.Fatal(err) 473 } 474 v := model{} 475 if err := schema.Reconstruct(&v, rbuf[0]); err != nil { 476 t.Fatal(err) 477 } 478 if v.A != int(cursor) { 479 t.Fatalf("expected value %d, got %d", cursor, v.A) 480 } 481 482 cursor++ 483 } 484 }() 485 } 486 487 func BenchmarkMergeRowGroups(b *testing.B) { 488 for _, test := range readerTests { 489 b.Run(test.scenario, func(b *testing.B) { 490 schema := parquet.SchemaOf(test.model) 491 492 options := []parquet.RowGroupOption{ 493 parquet.SortingRowGroupConfig( 494 parquet.SortingColumns( 495 parquet.Ascending(schema.Columns()[0]...), 496 ), 497 ), 498 } 499 500 prng := rand.New(rand.NewSource(0)) 501 rowGroups := make([]parquet.RowGroup, numRowGroups) 502 503 for i := range rowGroups { 504 rowGroups[i] = sortedRowGroup(options, randomRowsOf(prng, rowsPerGroup, test.model)...) 505 } 506 507 for n := 1; n <= numRowGroups; n++ { 508 b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) { 509 mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...) 510 if err != nil { 511 b.Fatal(err) 512 } 513 514 rows := mergedRowGroup.Rows() 515 rbuf := make([]parquet.Row, benchmarkRowsPerStep) 516 defer func() { rows.Close() }() 517 518 benchmarkRowsPerSecond(b, func() int { 519 n, err := rows.ReadRows(rbuf) 520 if err != nil { 521 if !errors.Is(err, io.EOF) { 522 b.Fatal(err) 523 } 524 rows.Close() 525 rows = mergedRowGroup.Rows() 526 } 527 return n 528 }) 529 }) 530 } 531 }) 532 } 533 } 534 535 func BenchmarkMergeFiles(b *testing.B) { 536 rowGroupBuffers := make([]bytes.Buffer, numRowGroups) 537 538 for _, test := range readerTests { 539 b.Run(test.scenario, func(b *testing.B) { 540 schema := parquet.SchemaOf(test.model) 541 542 sortingOptions := []parquet.SortingOption{ 543 parquet.SortingColumns( 544 parquet.Ascending(schema.Columns()[0]...), 545 ), 546 } 547 548 options := []parquet.RowGroupOption{ 549 schema, 550 parquet.SortingRowGroupConfig( 551 sortingOptions..., 552 ), 553 } 554 555 buffer := parquet.NewBuffer(options...) 556 557 prng := rand.New(rand.NewSource(0)) 558 files := make([]*parquet.File, numRowGroups) 559 rowGroups := make([]parquet.RowGroup, numRowGroups) 560 561 for i := range files { 562 for _, row := range randomRowsOf(prng, rowsPerGroup, test.model) { 563 buffer.Write(row) 564 } 565 sort.Sort(buffer) 566 rowGroupBuffers[i].Reset() 567 writer := parquet.NewWriter(&rowGroupBuffers[i], 568 schema, 569 parquet.SortingWriterConfig( 570 sortingOptions..., 571 ), 572 ) 573 _, err := copyRowsAndClose(writer, buffer.Rows()) 574 if err != nil { 575 b.Fatal(err) 576 } 577 if err := writer.Close(); err != nil { 578 b.Fatal(err) 579 } 580 r := bytes.NewReader(rowGroupBuffers[i].Bytes()) 581 f, err := parquet.OpenFile(r, r.Size()) 582 if err != nil { 583 b.Fatal(err) 584 } 585 files[i], rowGroups[i] = f, f.RowGroups()[0] 586 } 587 588 for n := 1; n <= numRowGroups; n++ { 589 b.Run(fmt.Sprintf("groups=%d,rows=%d", n, n*rowsPerGroup), func(b *testing.B) { 590 mergedRowGroup, err := parquet.MergeRowGroups(rowGroups[:n], options...) 591 if err != nil { 592 b.Fatal(err) 593 } 594 595 rows := mergedRowGroup.Rows() 596 rbuf := make([]parquet.Row, benchmarkRowsPerStep) 597 defer func() { rows.Close() }() 598 599 benchmarkRowsPerSecond(b, func() int { 600 n, err := rows.ReadRows(rbuf) 601 if err != nil { 602 if !errors.Is(err, io.EOF) { 603 b.Fatal(err) 604 } 605 rows.Close() 606 rows = mergedRowGroup.Rows() 607 } 608 return n 609 }) 610 611 totalSize := int64(0) 612 for _, f := range files[:n] { 613 totalSize += f.Size() 614 } 615 }) 616 } 617 }) 618 } 619 }