github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/row_test.go (about) 1 package parquet_test 2 3 import ( 4 "reflect" 5 "testing" 6 7 "github.com/google/uuid" 8 "github.com/vc42/parquet-go" 9 ) 10 11 func TestRowClone(t *testing.T) { 12 row := parquet.Row{ 13 parquet.ValueOf(42).Level(0, 1, 0), 14 parquet.ValueOf("Hello World").Level(1, 1, 1), 15 } 16 if clone := row.Clone(); !row.Equal(clone) { 17 t.Error("row and its clone are not equal") 18 } 19 } 20 21 func TestDeconstructionReconstruction(t *testing.T) { 22 type Person struct { 23 FirstName string 24 LastName string 25 Age int `parquet:",optional"` 26 Weight float64 `parquet:",optional"` 27 } 28 29 type Details struct { 30 Person *Person 31 } 32 33 type Friend struct { 34 ID [16]byte `parquet:",uuid"` 35 Details *Details 36 } 37 38 type User struct { 39 ID [16]byte `parquet:",uuid"` 40 Details *Details 41 Friends []Friend `parquet:",list,optional"` 42 } 43 44 type List2 struct { 45 Value string `parquet:",optional"` 46 } 47 48 type List1 struct { 49 List2 []List2 `parquet:",list"` 50 } 51 52 type List0 struct { 53 List1 []List1 `parquet:",list"` 54 } 55 56 type nestedListsLevel1 struct { 57 Level2 []string `parquet:"level2"` 58 } 59 60 type nestedLists struct { 61 Level1 []nestedListsLevel1 `parquet:"level1"` 62 } 63 64 tests := []struct { 65 scenario string 66 input interface{} 67 values [][]parquet.Value 68 }{ 69 { 70 scenario: "single field", 71 input: struct { 72 Name string 73 }{Name: "Luke"}, 74 values: [][]parquet.Value{ 75 0: {parquet.ValueOf("Luke")}, 76 }, 77 }, 78 79 { 80 scenario: "multiple fields", 81 input: Person{ 82 FirstName: "Han", 83 LastName: "Solo", 84 Age: 42, 85 Weight: 81.5, 86 }, 87 values: [][]parquet.Value{ 88 0: {parquet.ValueOf("Han")}, 89 1: {parquet.ValueOf("Solo")}, 90 2: {parquet.ValueOf(42).Level(0, 1, 0)}, 91 3: {parquet.ValueOf(81.5).Level(0, 1, 0)}, 92 }, 93 }, 94 95 { 96 scenario: "empty repeated field", 97 input: struct { 98 Symbols []string 99 }{ 100 Symbols: []string{}, 101 }, 102 values: [][]parquet.Value{ 103 0: {parquet.ValueOf(nil).Level(0, 0, 0)}, 104 }, 105 }, 106 107 { 108 scenario: "single repeated field", 109 input: struct { 110 Symbols []string 111 }{ 112 Symbols: []string{"EUR", "USD", "GBP", "JPY"}, 113 }, 114 values: [][]parquet.Value{ 115 0: { 116 parquet.ValueOf("EUR").Level(0, 1, 0), 117 parquet.ValueOf("USD").Level(1, 1, 0), 118 parquet.ValueOf("GBP").Level(1, 1, 0), 119 parquet.ValueOf("JPY").Level(1, 1, 0), 120 }, 121 }, 122 }, 123 124 { 125 scenario: "multiple repeated field", 126 input: struct { 127 Symbols []string 128 Values []float32 129 }{ 130 Symbols: []string{"EUR", "USD", "GBP", "JPY"}, 131 Values: []float32{0.1, 0.2, 0.3, 0.4}, 132 }, 133 values: [][]parquet.Value{ 134 0: { 135 parquet.ValueOf("EUR").Level(0, 1, 0), 136 parquet.ValueOf("USD").Level(1, 1, 0), 137 parquet.ValueOf("GBP").Level(1, 1, 0), 138 parquet.ValueOf("JPY").Level(1, 1, 0), 139 }, 140 1: { 141 parquet.ValueOf(float32(0.1)).Level(0, 1, 0), 142 parquet.ValueOf(float32(0.2)).Level(1, 1, 0), 143 parquet.ValueOf(float32(0.3)).Level(1, 1, 0), 144 parquet.ValueOf(float32(0.4)).Level(1, 1, 0), 145 }, 146 }, 147 }, 148 149 { 150 scenario: "top level nil pointer field", 151 input: struct { 152 Person *Person 153 }{ 154 Person: nil, 155 }, 156 // Here there are four nil values because the Person type has four 157 // fields but it is nil. 158 values: [][]parquet.Value{ 159 0: {parquet.ValueOf(nil).Level(0, 0, 0)}, 160 1: {parquet.ValueOf(nil).Level(0, 0, 0)}, 161 2: {parquet.ValueOf(nil).Level(0, 0, 0)}, 162 3: {parquet.ValueOf(nil).Level(0, 0, 0)}, 163 }, 164 }, 165 166 { 167 scenario: "sub level nil pointer field", 168 input: User{ 169 ID: uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"), 170 Details: &Details{ 171 Person: nil, 172 }, 173 }, 174 // Here there are four nil values because the Person type has four 175 // fields but it is nil. 176 values: [][]parquet.Value{ 177 // User.ID 178 0: {parquet.ValueOf(uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"))}, 179 // User.Details.Person 180 1: {parquet.ValueOf(nil).Level(0, 1, 0)}, 181 2: {parquet.ValueOf(nil).Level(0, 1, 0)}, 182 3: {parquet.ValueOf(nil).Level(0, 1, 0)}, 183 4: {parquet.ValueOf(nil).Level(0, 1, 0)}, 184 // User.Friends.ID 185 5: {parquet.ValueOf(nil).Level(0, 0, 0)}, 186 // User.Friends.Details.Person 187 6: {parquet.ValueOf(nil).Level(0, 0, 0)}, 188 7: {parquet.ValueOf(nil).Level(0, 0, 0)}, 189 8: {parquet.ValueOf(nil).Level(0, 0, 0)}, 190 9: {parquet.ValueOf(nil).Level(0, 0, 0)}, 191 }, 192 }, 193 194 { 195 scenario: "deeply nested structure", 196 input: struct { 197 User User 198 }{ 199 User: User{ 200 ID: uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"), 201 Details: &Details{ 202 Person: &Person{ 203 FirstName: "Luke", 204 LastName: "Skywalker", 205 }, 206 }, 207 Friends: []Friend{ 208 { 209 ID: uuid.MustParse("1B76F8D0-82C6-403F-A104-DCDA69207220"), 210 Details: &Details{ 211 Person: &Person{ 212 FirstName: "Han", 213 LastName: "Solo", 214 }, 215 }, 216 }, 217 218 { 219 ID: uuid.MustParse("C43C8852-CCE5-40E6-B0DF-7212A5633346"), 220 Details: &Details{ 221 Person: &Person{ 222 FirstName: "Leia", 223 LastName: "Skywalker", 224 }, 225 }, 226 }, 227 228 { 229 ID: uuid.MustParse("E78642A8-0931-4D5F-918F-24DC8FF445B0"), 230 Details: &Details{ 231 Person: &Person{ 232 FirstName: "C3PO", 233 LastName: "Droid", 234 }, 235 }, 236 }, 237 }, 238 }, 239 }, 240 241 values: [][]parquet.Value{ 242 // User.ID 243 0: {parquet.ValueOf(uuid.MustParse("A65B576D-9299-4769-9D93-04BE0583F027"))}, 244 245 // User.Details 246 1: {parquet.ValueOf("Luke").Level(0, 2, 0)}, 247 2: {parquet.ValueOf("Skywalker").Level(0, 2, 0)}, 248 3: {parquet.ValueOf(nil).Level(0, 2, 0)}, 249 4: {parquet.ValueOf(nil).Level(0, 2, 0)}, 250 251 5: { // User.Friends.ID 252 parquet.ValueOf(uuid.MustParse("1B76F8D0-82C6-403F-A104-DCDA69207220")).Level(0, 2, 0), 253 parquet.ValueOf(uuid.MustParse("C43C8852-CCE5-40E6-B0DF-7212A5633346")).Level(1, 2, 0), 254 parquet.ValueOf(uuid.MustParse("E78642A8-0931-4D5F-918F-24DC8FF445B0")).Level(1, 2, 0), 255 }, 256 257 6: { // User.Friends.Details.Person.FirstName 258 parquet.ValueOf("Han").Level(0, 4, 0), 259 parquet.ValueOf("Leia").Level(1, 4, 0), 260 parquet.ValueOf("C3PO").Level(1, 4, 0), 261 }, 262 263 7: { // User.Friends.Details.Person.LastName 264 parquet.ValueOf("Solo").Level(0, 4, 0), 265 parquet.ValueOf("Skywalker").Level(1, 4, 0), 266 parquet.ValueOf("Droid").Level(1, 4, 0), 267 }, 268 269 8: { // User.Friends.Details.Person.Age 270 parquet.ValueOf(nil).Level(0, 4, 0), 271 parquet.ValueOf(nil).Level(1, 4, 0), 272 parquet.ValueOf(nil).Level(1, 4, 0), 273 }, 274 275 9: { // User.Friends.Details.Person.Weight 276 parquet.ValueOf(nil).Level(0, 4, 0), 277 parquet.ValueOf(nil).Level(1, 4, 0), 278 parquet.ValueOf(nil).Level(1, 4, 0), 279 }, 280 }, 281 }, 282 283 { 284 scenario: "multiple repeated levels", 285 input: List0{ 286 List1: []List1{ 287 {List2: []List2{{Value: "A"}, {Value: "B"}}}, 288 {List2: []List2{}}, // parquet doesn't differentiate between empty repeated and a nil list 289 {List2: []List2{{Value: "C"}}}, 290 {List2: []List2{}}, 291 {List2: []List2{{Value: "D"}, {Value: "E"}, {Value: "F"}}}, 292 {List2: []List2{{Value: "G"}, {Value: "H"}, {Value: "I"}}}, 293 }, 294 }, 295 values: [][]parquet.Value{ 296 { 297 parquet.ValueOf("A").Level(0, 3, 0), 298 parquet.ValueOf("B").Level(2, 3, 0), 299 parquet.ValueOf(nil).Level(1, 1, 0), 300 parquet.ValueOf("C").Level(1, 3, 0), 301 parquet.ValueOf(nil).Level(1, 1, 0), 302 parquet.ValueOf("D").Level(1, 3, 0), 303 parquet.ValueOf("E").Level(2, 3, 0), 304 parquet.ValueOf("F").Level(2, 3, 0), 305 parquet.ValueOf("G").Level(1, 3, 0), 306 parquet.ValueOf("H").Level(2, 3, 0), 307 parquet.ValueOf("I").Level(2, 3, 0), 308 }, 309 }, 310 }, 311 312 // https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet 313 314 // message nestedLists { 315 // repeated group level1 { 316 // repeated string level2; 317 // } 318 // } 319 // --- 320 // { 321 // level1: { 322 // level2: a 323 // level2: b 324 // level2: c 325 // }, 326 // level1: { 327 // level2: d 328 // level2: e 329 // level2: f 330 // level2: g 331 // } 332 // } 333 // 334 { 335 scenario: "twitter blog example 1", 336 input: nestedLists{ 337 Level1: []nestedListsLevel1{ 338 {Level2: []string{"a", "b", "c"}}, 339 {Level2: []string{"d", "e", "f", "g"}}, 340 }, 341 }, 342 values: [][]parquet.Value{ 343 0: { 344 parquet.ValueOf("a").Level(0, 2, 0), 345 parquet.ValueOf("b").Level(2, 2, 0), 346 parquet.ValueOf("c").Level(2, 2, 0), 347 parquet.ValueOf("d").Level(1, 2, 0), 348 parquet.ValueOf("e").Level(2, 2, 0), 349 parquet.ValueOf("f").Level(2, 2, 0), 350 parquet.ValueOf("g").Level(2, 2, 0), 351 }, 352 }, 353 }, 354 355 // message nestedLists { 356 // repeated group level1 { 357 // repeated string level2; 358 // } 359 // } 360 // --- 361 // { 362 // level1: { 363 // level2: h 364 // }, 365 // level1: { 366 // level2: i 367 // level2: j 368 // } 369 // } 370 // 371 { 372 scenario: "twitter blog example 2", 373 input: nestedLists{ 374 Level1: []nestedListsLevel1{ 375 {Level2: []string{"h"}}, 376 {Level2: []string{"i", "j"}}, 377 }, 378 }, 379 values: [][]parquet.Value{ 380 0: { 381 parquet.ValueOf("h").Level(0, 2, 0), 382 parquet.ValueOf("i").Level(1, 2, 0), 383 parquet.ValueOf("j").Level(2, 2, 0), 384 }, 385 }, 386 }, 387 388 // message AddressBook { 389 // required string owner; 390 // repeated string ownerPhoneNumbers; 391 // repeated group contacts { 392 // required string name; 393 // optional string phoneNumber; 394 // } 395 // } 396 // --- 397 // AddressBook { 398 // owner: "Julien Le Dem", 399 // ownerPhoneNumbers: "555 123 4567", 400 // ownerPhoneNumbers: "555 666 1337", 401 // contacts: { 402 // name: "Dmitriy Ryaboy", 403 // phoneNumber: "555 987 6543", 404 // }, 405 // contacts: { 406 // name: "Chris Aniszczyk" 407 // } 408 // } 409 { 410 scenario: "twitter blog example 3", 411 input: AddressBook{ 412 Owner: "Julien Le Dem", 413 OwnerPhoneNumbers: []string{ 414 "555 123 4567", 415 "555 666 1337", 416 }, 417 Contacts: []Contact{ 418 { 419 Name: "Dmitriy Ryaboy", 420 PhoneNumber: "555 987 6543", 421 }, 422 { 423 Name: "Chris Aniszczyk", 424 }, 425 }, 426 }, 427 values: [][]parquet.Value{ 428 0: { // AddressBook.owner 429 parquet.ValueOf("Julien Le Dem").Level(0, 0, 0), 430 }, 431 1: { // AddressBook.ownerPhoneNumbers 432 parquet.ValueOf("555 123 4567").Level(0, 1, 0), 433 parquet.ValueOf("555 666 1337").Level(1, 1, 0), 434 }, 435 2: { // AddressBook.contacts.name 436 parquet.ValueOf("Dmitriy Ryaboy").Level(0, 1, 0), 437 parquet.ValueOf("Chris Aniszczyk").Level(1, 1, 0), 438 }, 439 3: { // AddressBook.contacts.phoneNumber 440 parquet.ValueOf("555 987 6543").Level(0, 2, 0), 441 parquet.ValueOf(nil).Level(1, 1, 0), 442 }, 443 }, 444 }, 445 } 446 447 for _, test := range tests { 448 t.Run(test.scenario, func(t *testing.T) { 449 schema := parquet.SchemaOf(test.input) 450 row := schema.Deconstruct(nil, test.input) 451 values := columnsOf(row) 452 453 t.Logf("\n%s\n", schema) 454 455 for columnIndex, expect := range test.values { 456 assertEqualValues(t, columnIndex, expect, values[columnIndex]) 457 } 458 459 newValue := reflect.New(reflect.TypeOf(test.input)) 460 if err := schema.Reconstruct(newValue.Interface(), row); err != nil { 461 t.Errorf("reconstruction of the parquet row into a go value failed:\n\t%v", err) 462 } else if !reflect.DeepEqual(newValue.Elem().Interface(), test.input) { 463 t.Errorf("reconstruction of the parquet row into a go value produced the wrong output:\nwant = %#v\ngot = %#v", test.input, newValue.Elem()) 464 } 465 466 for columnIndex := range test.values { 467 values[columnIndex] = nil 468 } 469 470 for columnIndex, unexpected := range values { 471 if unexpected != nil { 472 t.Errorf("unexpected column index %d found with %d values in it", columnIndex, len(unexpected)) 473 } 474 } 475 }) 476 } 477 } 478 479 func columnsOf(row parquet.Row) [][]parquet.Value { 480 maxColumnIndex := 0 481 for _, value := range row { 482 if columnIndex := int(value.Column()); columnIndex > maxColumnIndex { 483 maxColumnIndex = columnIndex 484 } 485 } 486 columns := make([][]parquet.Value, maxColumnIndex+1) 487 for _, value := range row { 488 columnIndex := value.Column() 489 columns[columnIndex] = append(columns[columnIndex], value) 490 } 491 return columns 492 } 493 494 func assertEqualValues(t *testing.T, columnIndex int, want, got []parquet.Value) { 495 n := len(want) 496 497 if len(want) != len(got) { 498 t.Errorf("wrong number of values in column %d: want=%d got=%d", columnIndex, len(want), len(got)) 499 if len(want) > len(got) { 500 n = len(got) 501 } 502 } 503 504 for i := 0; i < n; i++ { 505 v1, v2 := want[i], got[i] 506 507 if !parquet.Equal(v1, v2) { 508 t.Errorf("values at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2) 509 } 510 if columnIndex != int(v2.Column()) { 511 t.Errorf("column index mismatch in column %d: want=%d got=%#v", i, columnIndex, v2) 512 } 513 if v1.RepetitionLevel() != v2.RepetitionLevel() { 514 t.Errorf("repetition levels at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2) 515 } 516 if v1.DefinitionLevel() != v2.DefinitionLevel() { 517 t.Errorf("definition levels at index %d mismatch in column %d: want=%#v got=%#v", i, columnIndex, v1, v2) 518 } 519 } 520 } 521 522 func BenchmarkDeconstruct(b *testing.B) { 523 row := &AddressBook{ 524 Owner: "Julien Le Dem", 525 OwnerPhoneNumbers: []string{ 526 "555 123 4567", 527 "555 666 1337", 528 }, 529 Contacts: []Contact{ 530 { 531 Name: "Dmitriy Ryaboy", 532 PhoneNumber: "555 987 6543", 533 }, 534 { 535 Name: "Chris Aniszczyk", 536 }, 537 }, 538 } 539 540 schema := parquet.SchemaOf(row) 541 buffer := parquet.Row{} 542 543 for i := 0; i < b.N; i++ { 544 buffer = schema.Deconstruct(buffer[:0], row) 545 } 546 } 547 548 func BenchmarkReconstruct(b *testing.B) { 549 row := &AddressBook{ 550 Owner: "Julien Le Dem", 551 OwnerPhoneNumbers: []string{ 552 "555 123 4567", 553 "555 666 1337", 554 }, 555 Contacts: []Contact{ 556 { 557 Name: "Dmitriy Ryaboy", 558 PhoneNumber: "555 987 6543", 559 }, 560 { 561 Name: "Chris Aniszczyk", 562 }, 563 }, 564 } 565 566 schema := parquet.SchemaOf(row) 567 values := schema.Deconstruct(nil, row) 568 buffer := AddressBook{} 569 570 for i := 0; i < b.N; i++ { 571 buffer = AddressBook{} 572 573 if err := schema.Reconstruct(&buffer, values); err != nil { 574 b.Fatal(err) 575 } 576 } 577 }