github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/row_test.go (about) 1 package parquet_test 2 3 import ( 4 "io" 5 "reflect" 6 "testing" 7 8 "github.com/google/uuid" 9 "github.com/segmentio/parquet-go" 10 ) 11 12 type bufferedRows struct { 13 rows []parquet.Row 14 } 15 16 func (r *bufferedRows) ReadRows(rows []parquet.Row) (int, error) { 17 for i := range rows { 18 if len(r.rows) == 0 { 19 return i, io.EOF 20 } 21 rows[i] = append(rows[i][:0], r.rows[0]...) 22 r.rows = r.rows[1:] 23 } 24 return len(rows), nil 25 } 26 27 func (w *bufferedRows) WriteRows(rows []parquet.Row) (int, error) { 28 for _, row := range rows { 29 w.rows = append(w.rows, row.Clone()) 30 } 31 return len(rows), nil 32 } 33 34 func TestMultiRowWriter(t *testing.T) { 35 b1 := new(bufferedRows) 36 b2 := new(bufferedRows) 37 mw := parquet.MultiRowWriter(b1, b2) 38 39 rows := []parquet.Row{ 40 { 41 parquet.Int32Value(10).Level(0, 0, 0), 42 parquet.Int32Value(11).Level(0, 0, 1), 43 parquet.Int32Value(12).Level(0, 0, 2), 44 }, 45 { 46 parquet.Int32Value(20).Level(0, 0, 0), 47 parquet.Int32Value(21).Level(0, 0, 1), 48 parquet.Int32Value(22).Level(0, 0, 2), 49 }, 50 } 51 52 n, err := mw.WriteRows(rows) 53 if err != nil { 54 t.Fatal(err) 55 } 56 if n != len(rows) { 57 t.Fatalf("number of rows written mismatch: got=%d want=%d", n, len(rows)) 58 } 59 60 assertEqualRows(t, rows, b1.rows) 61 assertEqualRows(t, rows, b2.rows) 62 } 63 64 func TestRowClone(t *testing.T) { 65 row := parquet.Row{ 66 parquet.ValueOf(42).Level(0, 1, 0), 67 parquet.ValueOf("Hello World").Level(1, 1, 1), 68 } 69 if clone := row.Clone(); !row.Equal(clone) { 70 t.Error("row and its clone are not equal") 71 } 72 } 73 74 func TestDeconstructionReconstruction(t *testing.T) { 75 type Person struct { 76 FirstName string 77 LastName string 78 Age int `parquet:",optional"` 79 Weight float64 `parquet:",optional"` 80 } 81 82 type Details struct { 83 Person *Person 84 } 85 86 type Friend struct { 87 ID [16]byte `parquet:",uuid"` 88 Details *Details 89 } 90 91 type User struct { 92 ID [16]byte `parquet:",uuid"` 93 Details *Details 94 Friends []Friend `parquet:",list,optional"` 95 } 96 97 type List2 struct { 98 Value string `parquet:",optional"` 99 } 100 101 type List1 struct { 102 List2 []List2 `parquet:",list"` 103 } 104 105 type List0 struct { 106 List1 []List1 `parquet:",list"` 107 } 108 109 type nestedListsLevel1 struct { 110 Level2 []string `parquet:"level2"` 111 } 112 113 type nestedLists struct { 114 Level1 []nestedListsLevel1 `parquet:"level1"` 115 } 116 117 tests := []struct { 118 scenario string 119 input interface{} 120 values [][]parquet.Value 121 }{ 122 { 123 scenario: "single field", 124 input: struct { 125 Name string 126 }{Name: "Luke"}, 127 values: [][]parquet.Value{ 128 0: {parquet.ValueOf("Luke").Level(0, 0, 0)}, 129 }, 130 }, 131 132 { 133 scenario: "multiple fields", 134 input: Person{ 135 FirstName: "Han", 136 LastName: "Solo", 137 Age: 42, 138 Weight: 81.5, 139 }, 140 values: [][]parquet.Value{ 141 0: {parquet.ValueOf("Han").Level(0, 0, 0)}, 142 1: {parquet.ValueOf("Solo").Level(0, 0, 1)}, 143 2: {parquet.ValueOf(42).Level(0, 1, 2)}, 144 3: {parquet.ValueOf(81.5).Level(0, 1, 3)}, 145 }, 146 }, 147 148 { 149 scenario: "empty repeated field", 150 input: struct { 151 Symbols []string 152 }{ 153 Symbols: []string{}, 154 }, 155 values: [][]parquet.Value{ 156 0: {parquet.ValueOf(nil).Level(0, 0, 0)}, 157 }, 158 }, 159 160 { 161 scenario: "single repeated field", 162 input: struct { 163 Symbols []string 164 }{ 165 Symbols: []string{"EUR", "USD", "GBP", "JPY"}, 166 }, 167 values: [][]parquet.Value{ 168 0: { 169 parquet.ValueOf("EUR").Level(0, 1, 0), 170 parquet.ValueOf("USD").Level(1, 1, 0), 171 parquet.ValueOf("GBP").Level(1, 1, 0), 172 parquet.ValueOf("JPY").Level(1, 1, 0), 173 }, 174 }, 175 }, 176 177 { 178 scenario: "multiple repeated field", 179 input: struct { 180 Symbols []string 181 Values []float32 182 }{ 183 Symbols: []string{"EUR", "USD", "GBP", "JPY"}, 184 Values: []float32{0.1, 0.2, 0.3, 0.4}, 185 }, 186 values: [][]parquet.Value{ 187 0: { 188 parquet.ValueOf("EUR").Level(0, 1, 0), 189 parquet.ValueOf("USD").Level(1, 1, 0), 190 parquet.ValueOf("GBP").Level(1, 1, 0), 191 parquet.ValueOf("JPY").Level(1, 1, 0), 192 }, 193 1: { 194 parquet.ValueOf(float32(0.1)).Level(0, 1, 0), 195 parquet.ValueOf(float32(0.2)).Level(1, 1, 0), 196 parquet.ValueOf(float32(0.3)).Level(1, 1, 0), 197 parquet.ValueOf(float32(0.4)).Level(1, 1, 0), 198 }, 199 }, 200 }, 201 202 { 203 scenario: "top level nil pointer field", 204 input: struct { 205 Person *Person 206 }{ 207 Person: nil, 208 }, 209 // Here there are four nil values because the Person type has four 210 // fields but it is nil. 211 values: [][]parquet.Value{ 212 0: {parquet.ValueOf(nil).Level(0, 0, 0)}, 213 1: {parquet.ValueOf(nil).Level(0, 0, 0)}, 214 2: {parquet.ValueOf(nil).Level(0, 0, 0)}, 215 3: {parquet.ValueOf(nil).Level(0, 0, 0)}, 216 }, 217 }, 218 219 { 220 scenario: "top level slice pointer", 221 input: struct { 222 List []*List2 223 }{ 224 List: []*List2{ 225 {Value: "foo"}, 226 {Value: "bar"}, 227 }, 228 }, 229 values: [][]parquet.Value{ 230 0: { 231 parquet.ValueOf("foo").Level(0, 2, 0), 232 parquet.ValueOf("bar").Level(1, 2, 0), 233 }, 234 }, 235 }, 236 237 { 238 scenario: "sub level nil pointer field", 239 input: User{ 240 ID: uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"), 241 Details: &Details{ 242 Person: nil, 243 }, 244 }, 245 // Here there are four nil values because the Person type has four 246 // fields but it is nil. 247 values: [][]parquet.Value{ 248 // User.ID 249 0: {parquet.ValueOf(uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"))}, 250 // User.Details.Person 251 1: {parquet.ValueOf(nil).Level(0, 1, 0)}, 252 2: {parquet.ValueOf(nil).Level(0, 1, 0)}, 253 3: {parquet.ValueOf(nil).Level(0, 1, 0)}, 254 4: {parquet.ValueOf(nil).Level(0, 1, 0)}, 255 // User.Friends.ID 256 5: {parquet.ValueOf(nil).Level(0, 0, 0)}, 257 // User.Friends.Details.Person 258 6: {parquet.ValueOf(nil).Level(0, 0, 0)}, 259 7: {parquet.ValueOf(nil).Level(0, 0, 0)}, 260 8: {parquet.ValueOf(nil).Level(0, 0, 0)}, 261 9: {parquet.ValueOf(nil).Level(0, 0, 0)}, 262 }, 263 }, 264 265 { 266 scenario: "deeply nested structure", 267 input: struct { 268 User User 269 }{ 270 User: User{ 271 ID: uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"), 272 Details: &Details{ 273 Person: &Person{ 274 FirstName: "Luke", 275 LastName: "Skywalker", 276 }, 277 }, 278 Friends: []Friend{ 279 { 280 ID: uuid.MustParse("1B76F8D0-82C6-403F-A104-DCDA69207220"), 281 Details: &Details{ 282 Person: &Person{ 283 FirstName: "Han", 284 LastName: "Solo", 285 }, 286 }, 287 }, 288 289 { 290 ID: uuid.MustParse("C43C8852-CCE5-40E6-B0DF-7212A5633346"), 291 Details: &Details{ 292 Person: &Person{ 293 FirstName: "Leia", 294 LastName: "Skywalker", 295 }, 296 }, 297 }, 298 299 { 300 ID: uuid.MustParse("E78642A8-0931-4D5F-918F-24DC8FF445B0"), 301 Details: &Details{ 302 Person: &Person{ 303 FirstName: "C3PO", 304 LastName: "Droid", 305 }, 306 }, 307 }, 308 }, 309 }, 310 }, 311 312 values: [][]parquet.Value{ 313 // User.ID 314 0: {parquet.ValueOf(uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"))}, 315 316 // User.Details 317 1: {parquet.ValueOf("Luke").Level(0, 2, 0)}, 318 2: {parquet.ValueOf("Skywalker").Level(0, 2, 0)}, 319 3: {parquet.ValueOf(nil).Level(0, 2, 0)}, 320 4: {parquet.ValueOf(nil).Level(0, 2, 0)}, 321 322 5: { // User.Friends.ID 323 parquet.ValueOf(uuid.MustParse("1B76F8D0-82C6-403F-A104-DCDA69207220")).Level(0, 2, 0), 324 parquet.ValueOf(uuid.MustParse("C43C8852-CCE5-40E6-B0DF-7212A5633346")).Level(1, 2, 0), 325 parquet.ValueOf(uuid.MustParse("E78642A8-0931-4D5F-918F-24DC8FF445B0")).Level(1, 2, 0), 326 }, 327 328 6: { // User.Friends.Details.Person.FirstName 329 parquet.ValueOf("Han").Level(0, 4, 0), 330 parquet.ValueOf("Leia").Level(1, 4, 0), 331 parquet.ValueOf("C3PO").Level(1, 4, 0), 332 }, 333 334 7: { // User.Friends.Details.Person.LastName 335 parquet.ValueOf("Solo").Level(0, 4, 0), 336 parquet.ValueOf("Skywalker").Level(1, 4, 0), 337 parquet.ValueOf("Droid").Level(1, 4, 0), 338 }, 339 340 8: { // User.Friends.Details.Person.Age 341 parquet.ValueOf(nil).Level(0, 4, 0), 342 parquet.ValueOf(nil).Level(1, 4, 0), 343 parquet.ValueOf(nil).Level(1, 4, 0), 344 }, 345 346 9: { // User.Friends.Details.Person.Weight 347 parquet.ValueOf(nil).Level(0, 4, 0), 348 parquet.ValueOf(nil).Level(1, 4, 0), 349 parquet.ValueOf(nil).Level(1, 4, 0), 350 }, 351 }, 352 }, 353 354 { 355 scenario: "multiple repeated levels", 356 input: List0{ 357 List1: []List1{ 358 {List2: []List2{{Value: "A"}, {Value: "B"}}}, 359 {List2: []List2{}}, // parquet doesn't differentiate between empty repeated and a nil list 360 {List2: []List2{{Value: "C"}}}, 361 {List2: []List2{}}, 362 {List2: []List2{{Value: "D"}, {Value: "E"}, {Value: "F"}}}, 363 {List2: []List2{{Value: "G"}, {Value: "H"}, {Value: "I"}}}, 364 }, 365 }, 366 values: [][]parquet.Value{ 367 { 368 parquet.ValueOf("A").Level(0, 3, 0), 369 parquet.ValueOf("B").Level(2, 3, 0), 370 parquet.ValueOf(nil).Level(1, 1, 0), 371 parquet.ValueOf("C").Level(1, 3, 0), 372 parquet.ValueOf(nil).Level(1, 1, 0), 373 parquet.ValueOf("D").Level(1, 3, 0), 374 parquet.ValueOf("E").Level(2, 3, 0), 375 parquet.ValueOf("F").Level(2, 3, 0), 376 parquet.ValueOf("G").Level(1, 3, 0), 377 parquet.ValueOf("H").Level(2, 3, 0), 378 parquet.ValueOf("I").Level(2, 3, 0), 379 }, 380 }, 381 }, 382 383 // https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet 384 385 // message nestedLists { 386 // repeated group level1 { 387 // repeated string level2; 388 // } 389 // } 390 // --- 391 // { 392 // level1: { 393 // level2: a 394 // level2: b 395 // level2: c 396 // }, 397 // level1: { 398 // level2: d 399 // level2: e 400 // level2: f 401 // level2: g 402 // } 403 // } 404 // 405 { 406 scenario: "twitter blog example 1", 407 input: nestedLists{ 408 Level1: []nestedListsLevel1{ 409 {Level2: []string{"a", "b", "c"}}, 410 {Level2: []string{"d", "e", "f", "g"}}, 411 }, 412 }, 413 values: [][]parquet.Value{ 414 0: { 415 parquet.ValueOf("a").Level(0, 2, 0), 416 parquet.ValueOf("b").Level(2, 2, 0), 417 parquet.ValueOf("c").Level(2, 2, 0), 418 parquet.ValueOf("d").Level(1, 2, 0), 419 parquet.ValueOf("e").Level(2, 2, 0), 420 parquet.ValueOf("f").Level(2, 2, 0), 421 parquet.ValueOf("g").Level(2, 2, 0), 422 }, 423 }, 424 }, 425 426 // message nestedLists { 427 // repeated group level1 { 428 // repeated string level2; 429 // } 430 // } 431 // --- 432 // { 433 // level1: { 434 // level2: h 435 // }, 436 // level1: { 437 // level2: i 438 // level2: j 439 // } 440 // } 441 // 442 { 443 scenario: "twitter blog example 2", 444 input: nestedLists{ 445 Level1: []nestedListsLevel1{ 446 {Level2: []string{"h"}}, 447 {Level2: []string{"i", "j"}}, 448 }, 449 }, 450 values: [][]parquet.Value{ 451 0: { 452 parquet.ValueOf("h").Level(0, 2, 0), 453 parquet.ValueOf("i").Level(1, 2, 0), 454 parquet.ValueOf("j").Level(2, 2, 0), 455 }, 456 }, 457 }, 458 459 // message AddressBook { 460 // required string owner; 461 // repeated string ownerPhoneNumbers; 462 // repeated group contacts { 463 // required string name; 464 // optional string phoneNumber; 465 // } 466 // } 467 // --- 468 // AddressBook { 469 // owner: "Julien Le Dem", 470 // ownerPhoneNumbers: "555 123 4567", 471 // ownerPhoneNumbers: "555 666 1337", 472 // contacts: { 473 // name: "Dmitriy Ryaboy", 474 // phoneNumber: "555 987 6543", 475 // }, 476 // contacts: { 477 // name: "Chris Aniszczyk" 478 // } 479 // } 480 { 481 scenario: "twitter blog example 3", 482 input: AddressBook{ 483 Owner: "Julien Le Dem", 484 OwnerPhoneNumbers: []string{ 485 "555 123 4567", 486 "555 666 1337", 487 }, 488 Contacts: []Contact{ 489 { 490 Name: "Dmitriy Ryaboy", 491 PhoneNumber: "555 987 6543", 492 }, 493 { 494 Name: "Chris Aniszczyk", 495 }, 496 }, 497 }, 498 values: [][]parquet.Value{ 499 0: { // AddressBook.owner 500 parquet.ValueOf("Julien Le Dem").Level(0, 0, 0), 501 }, 502 1: { // AddressBook.ownerPhoneNumbers 503 parquet.ValueOf("555 123 4567").Level(0, 1, 0), 504 parquet.ValueOf("555 666 1337").Level(1, 1, 0), 505 }, 506 2: { // AddressBook.contacts.name 507 parquet.ValueOf("Dmitriy Ryaboy").Level(0, 1, 0), 508 parquet.ValueOf("Chris Aniszczyk").Level(1, 1, 0), 509 }, 510 3: { // AddressBook.contacts.phoneNumber 511 parquet.ValueOf("555 987 6543").Level(0, 2, 0), 512 parquet.ValueOf(nil).Level(1, 1, 0), 513 }, 514 }, 515 }, 516 } 517 518 for _, test := range tests { 519 t.Run(test.scenario, func(t *testing.T) { 520 schema := parquet.SchemaOf(test.input) 521 row := schema.Deconstruct(nil, test.input) 522 values := columnsOf(row) 523 524 t.Logf("\n%s", schema) 525 526 for columnIndex, expect := range test.values { 527 assertEqualValues(t, columnIndex, expect, values[columnIndex]) 528 } 529 530 newValue := reflect.New(reflect.TypeOf(test.input)) 531 if err := schema.Reconstruct(newValue.Interface(), row); err != nil { 532 t.Errorf("reconstruction of the parquet row into a go value failed:\n\t%v", err) 533 } else if !reflect.DeepEqual(newValue.Elem().Interface(), test.input) { 534 t.Errorf("reconstruction of the parquet row into a go value produced the wrong output:\nwant = %#v\ngot = %#v", test.input, newValue.Elem()) 535 } 536 537 for columnIndex := range test.values { 538 values[columnIndex] = nil 539 } 540 541 for columnIndex, unexpected := range values { 542 if unexpected != nil { 543 t.Errorf("unexpected column index %d found with %d values in it", columnIndex, len(unexpected)) 544 } 545 } 546 }) 547 } 548 } 549 550 func columnsOf(row parquet.Row) [][]parquet.Value { 551 columns := make([][]parquet.Value, 0) 552 row.Range(func(_ int, c []parquet.Value) bool { 553 columns = append(columns, c) 554 return true 555 }) 556 return columns 557 } 558 559 func assertEqualRows(t *testing.T, want, got []parquet.Row) { 560 if len(want) != len(got) { 561 t.Errorf("number of rows mismatch: want=%d got=%d", len(want), len(got)) 562 return 563 } 564 565 for i := range want { 566 row1, row2 := want[i], got[i] 567 568 if len(row1) != len(row2) { 569 t.Errorf("number of values in row %d mismatch: want=%d got=%d", i, len(row1), len(row2)) 570 continue 571 } 572 573 for j := range row1 { 574 if value1, value2 := row1[j], row2[j]; !parquet.DeepEqual(value1, value2) { 575 t.Errorf("values of row %d at index %d mismatch: want=%+v got=%+v", i, j, value1, value2) 576 } 577 } 578 } 579 } 580 581 func assertEqualValues(t *testing.T, columnIndex int, want, got []parquet.Value) { 582 n := len(want) 583 584 if len(want) != len(got) { 585 t.Errorf("wrong number of values in column %d: want=%d got=%d", columnIndex, len(want), len(got)) 586 if len(want) > len(got) { 587 n = len(got) 588 } 589 } 590 591 for i := 0; i < n; i++ { 592 v1, v2 := want[i], got[i] 593 594 if !parquet.Equal(v1, v2) { 595 t.Errorf("values at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2) 596 } 597 if columnIndex != int(v2.Column()) { 598 t.Errorf("column index mismatch in column %d: want=%d got=%#v", i, columnIndex, v2) 599 } 600 if v1.RepetitionLevel() != v2.RepetitionLevel() { 601 t.Errorf("repetition levels at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2) 602 } 603 if v1.DefinitionLevel() != v2.DefinitionLevel() { 604 t.Errorf("definition levels at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2) 605 } 606 } 607 } 608 609 func BenchmarkDeconstruct(b *testing.B) { 610 row := &AddressBook{ 611 Owner: "Julien Le Dem", 612 OwnerPhoneNumbers: []string{ 613 "555 123 4567", 614 "555 666 1337", 615 }, 616 Contacts: []Contact{ 617 { 618 Name: "Dmitriy Ryaboy", 619 PhoneNumber: "555 987 6543", 620 }, 621 { 622 Name: "Chris Aniszczyk", 623 }, 624 }, 625 } 626 627 schema := parquet.SchemaOf(row) 628 buffer := parquet.Row{} 629 630 for i := 0; i < b.N; i++ { 631 buffer = schema.Deconstruct(buffer[:0], row) 632 } 633 } 634 635 func BenchmarkReconstruct(b *testing.B) { 636 row := &AddressBook{ 637 Owner: "Julien Le Dem", 638 OwnerPhoneNumbers: []string{ 639 "555 123 4567", 640 "555 666 1337", 641 }, 642 Contacts: []Contact{ 643 { 644 Name: "Dmitriy Ryaboy", 645 PhoneNumber: "555 987 6543", 646 }, 647 { 648 Name: "Chris Aniszczyk", 649 }, 650 }, 651 } 652 653 schema := parquet.SchemaOf(row) 654 values := schema.Deconstruct(nil, row) 655 buffer := AddressBook{} 656 657 for i := 0; i < b.N; i++ { 658 buffer = AddressBook{} 659 660 if err := schema.Reconstruct(&buffer, values); err != nil { 661 b.Fatal(err) 662 } 663 } 664 }