github.com/fraugster/parquet-go@v0.12.0/floor/reader_test.go (about) 1 package floor 2 3 import ( 4 "fmt" 5 "os" 6 "reflect" 7 "testing" 8 "time" 9 10 "github.com/davecgh/go-spew/spew" 11 goparquet "github.com/fraugster/parquet-go" 12 "github.com/fraugster/parquet-go/floor/interfaces" 13 "github.com/fraugster/parquet-go/parquet" 14 "github.com/fraugster/parquet-go/parquetschema" 15 "github.com/stretchr/testify/require" 16 ) 17 18 func TestNewReaderFailures(t *testing.T) { 19 _, err := NewFileReader("file-does-not-exist.parquet") 20 require.Error(t, err) 21 22 _, err = NewFileReader("/dev/null") 23 require.Error(t, err) 24 } 25 26 func TestReadFile(t *testing.T) { 27 _ = os.Mkdir("files", 0755) 28 29 sd, err := parquetschema.ParseSchemaDefinition( 30 `message test_msg { 31 required int64 foo; 32 optional binary bar (STRING); 33 optional group baz { 34 required int64 value; 35 } 36 }`) 37 require.NoError(t, err, "parsing schema definition failed") 38 39 t.Logf("schema definition: %s", spew.Sdump(sd)) 40 41 hlWriter, err := NewFileWriter( 42 "files/readtest.parquet", 43 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 44 goparquet.WithCreator("floor-unittest"), 45 goparquet.WithSchemaDefinition(sd), 46 ) 47 require.NoError(t, err, "creating parquet file writer failed") 48 49 type bazMsg struct { 50 Value uint32 51 } 52 53 type testMsg struct { 54 Foo int64 55 Bar *string 56 Baz *bazMsg 57 } 58 59 // Baz doesn't seem to get written correctly. when dumping the resulting file, baz.value is wrong. 60 require.NoError(t, hlWriter.Write(testMsg{Foo: 1, Bar: strPtr("hello"), Baz: &bazMsg{Value: 9001}})) 61 require.NoError(t, hlWriter.Write(&testMsg{Foo: 23})) 62 require.NoError(t, hlWriter.Write(testMsg{Foo: 42, Bar: strPtr("world!")})) 63 require.NoError(t, hlWriter.Close()) 64 65 hlReader, err := NewFileReader("files/readtest.parquet") 66 require.NoError(t, err) 67 68 count := 0 69 70 var result []testMsg 71 72 for hlReader.Next() { 73 var msg testMsg 74 75 require.Error(t, hlReader.Scan(int(1)), "%d. Scan into int unexpectedly succeeded", count) 76 require.Error(t, hlReader.Scan(new(int)), "%d. Scan into *int unexpectedly succeeded", count) 77 78 require.NoError(t, hlReader.Scan(&msg), "%d. Scan failed", count) 79 t.Logf("%d. data = %#v", count, hlReader.data) 80 81 result = append(result, msg) 82 83 count++ 84 } 85 86 require.NoError(t, hlReader.Err(), "hlReader returned error") 87 require.False(t, hlReader.Next(), "hlReader returned true after it had returned false") 88 89 t.Logf("count = %d", count) 90 t.Logf("result = %s", spew.Sdump(result)) 91 92 require.NoError(t, hlReader.Err(), "hlReader returned an error") 93 94 require.NoError(t, hlReader.Close()) 95 } 96 97 func TestReadWriteAthenaList(t *testing.T) { 98 _ = os.Mkdir("files", 0755) 99 100 sd, err := parquetschema.ParseSchemaDefinition( 101 `message test_msg { 102 required group emails (LIST) { 103 repeated group bag { 104 required binary array_element (STRING); 105 } 106 } 107 }`) 108 require.NoError(t, err, "parsing schema definition failed") 109 110 t.Logf("schema definition: %s", spew.Sdump(sd)) 111 112 hlWriter, err := NewFileWriter( 113 "files/athena_list.parquet", 114 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 115 goparquet.WithCreator("floor-unittest"), 116 goparquet.WithSchemaDefinition(sd), 117 ) 118 require.NoError(t, err) 119 120 testData := []string{"foo@example.com", "bar@example.com"} 121 122 require.NoError(t, hlWriter.Write(&emailList{emails: testData})) 123 124 require.NoError(t, hlWriter.Close()) 125 126 hlReader, err := NewFileReader("files/athena_list.parquet") 127 require.NoError(t, err) 128 129 var l emailList 130 131 require.True(t, hlReader.Next()) 132 133 require.NoError(t, hlReader.Scan(&l)) 134 135 require.Equal(t, testData, l.emails) 136 } 137 138 type emailList struct { 139 emails []string 140 } 141 142 func (l *emailList) MarshalParquet(obj interfaces.MarshalObject) error { 143 list := obj.AddField("emails").List() 144 for _, email := range l.emails { 145 list.Add().SetByteArray([]byte(email)) 146 } 147 return nil 148 } 149 150 func (l *emailList) UnmarshalParquet(obj interfaces.UnmarshalObject) error { 151 list, err := obj.GetField("emails").List() 152 if err != nil { 153 return fmt.Errorf("couldn't get emails as list: %w", err) 154 } 155 156 for list.Next() { 157 v, err := list.Value() 158 if err != nil { 159 return fmt.Errorf("couldn't get list value: %w", err) 160 } 161 vv, err := v.ByteArray() 162 if err != nil { 163 return fmt.Errorf("couldn't get list value as byte array: %w", err) 164 } 165 l.emails = append(l.emails, string(vv)) 166 } 167 168 return nil 169 } 170 171 func TestReadWriteMap(t *testing.T) { 172 _ = os.Mkdir("files", 0755) 173 174 sd, err := parquetschema.ParseSchemaDefinition( 175 `message test_msg { 176 required group foo (MAP) { 177 repeated group key_value (MAP_KEY_VALUE) { 178 required binary key (STRING); 179 required int32 value; 180 } 181 } 182 }`) 183 require.NoError(t, err, "parsing schema definition failed") 184 185 t.Logf("schema definition: %s", spew.Sdump(sd)) 186 187 hlWriter, err := NewFileWriter( 188 "files/map.parquet", 189 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 190 goparquet.WithCreator("floor-unittest"), 191 goparquet.WithSchemaDefinition(sd), 192 ) 193 require.NoError(t, err) 194 195 type testMsg struct { 196 Foo map[string]int32 197 } 198 199 testData := []testMsg{ 200 {Foo: map[string]int32{"foo": 23, "bar": 42, "baz": 9001}}, 201 {Foo: map[string]int32{"a": 61, "c": 63}}, 202 } 203 204 for _, tt := range testData { 205 require.NoError(t, hlWriter.Write(tt)) 206 } 207 require.NoError(t, hlWriter.Close()) 208 209 hlReader, err := NewFileReader("files/map.parquet") 210 require.NoError(t, err) 211 212 count := 0 213 214 var result []testMsg 215 216 for hlReader.Next() { 217 var msg testMsg 218 219 require.NoError(t, hlReader.Scan(&msg), "%d. Scan failed", count) 220 t.Logf("%d. data = %#v", count, hlReader.data) 221 222 result = append(result, msg) 223 224 count++ 225 } 226 227 require.NoError(t, hlReader.Err(), "hlReader returned an error") 228 t.Logf("count = %d", count) 229 230 for idx, elem := range result { 231 require.Equal(t, testData[idx], elem, "%d. read result doesn't match expected data", idx) 232 } 233 234 require.NoError(t, hlReader.Close()) 235 } 236 237 func TestReadWriteSlice(t *testing.T) { 238 _ = os.Mkdir("files", 0755) 239 240 sd, err := parquetschema.ParseSchemaDefinition( 241 `message test_msg { 242 required group foo (LIST) { 243 repeated group list { 244 required binary element (STRING); 245 } 246 } 247 }`) 248 require.NoError(t, err, "parsing schema definition failed") 249 250 t.Logf("schema definition: %s", spew.Sdump(sd)) 251 252 hlWriter, err := NewFileWriter( 253 "files/list.parquet", 254 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 255 goparquet.WithCreator("floor-unittest"), 256 goparquet.WithSchemaDefinition(sd), 257 ) 258 require.NoError(t, err) 259 260 type testMsg struct { 261 Foo []string 262 } 263 264 testData := []testMsg{ 265 {Foo: []string{"hello", "world!"}}, 266 {Foo: []string{"these", "are", "just", "my", "tokens"}}, 267 {Foo: []string{"bla"}}, 268 } 269 270 for _, tt := range testData { 271 require.NoError(t, hlWriter.Write(tt)) 272 } 273 require.NoError(t, hlWriter.Close()) 274 275 hlReader, err := NewFileReader("files/list.parquet") 276 require.NoError(t, err) 277 278 count := 0 279 280 var result []testMsg 281 282 for hlReader.Next() { 283 var msg testMsg 284 285 require.NoError(t, hlReader.Scan(&msg), "%d. Scan failed", count) 286 t.Logf("%d. data = %#v", count, hlReader.data) 287 288 result = append(result, msg) 289 290 count++ 291 } 292 293 require.NoError(t, hlReader.Err(), "hlReader returned an error") 294 t.Logf("count = %d", count) 295 296 for idx, elem := range result { 297 require.Equal(t, testData[idx], elem, "%d. read result doesn't match expected data") 298 } 299 300 require.NoError(t, hlReader.Close()) 301 } 302 303 func TestReadWriteArray(t *testing.T) { 304 _ = os.Mkdir("files", 0755) 305 306 sd, err := parquetschema.ParseSchemaDefinition( 307 `message test_msg { 308 required group foo (LIST) { 309 repeated group list { 310 required binary element (STRING); 311 } 312 } 313 }`) 314 require.NoError(t, err, "parsing schema definition failed") 315 316 t.Logf("schema definition: %s", spew.Sdump(sd)) 317 318 hlWriter, err := NewFileWriter( 319 "files/array.parquet", 320 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 321 goparquet.WithCreator("floor-unittest"), 322 goparquet.WithSchemaDefinition(sd), 323 ) 324 require.NoError(t, err) 325 326 type testMsg struct { 327 Foo [2]string 328 } 329 330 testData := []testMsg{ 331 {Foo: [2]string{"hello", "world!"}}, 332 {Foo: [2]string{"good morning", "vietnam!"}}, 333 {Foo: [2]string{"Berlin", "Zehlendorf"}}, 334 } 335 336 for _, tt := range testData { 337 require.NoError(t, hlWriter.Write(tt)) 338 } 339 require.NoError(t, hlWriter.Close()) 340 341 hlReader, err := NewFileReader("files/array.parquet") 342 require.NoError(t, err) 343 344 count := 0 345 346 var result []testMsg 347 348 for hlReader.Next() { 349 var msg testMsg 350 351 require.NoError(t, hlReader.Scan(&msg), "%d. Scan failed", count) 352 t.Logf("%d. data = %#v", count, hlReader.data) 353 354 result = append(result, msg) 355 356 count++ 357 } 358 359 require.NoError(t, hlReader.Err(), "hlReader returned an error") 360 361 t.Logf("count = %d", count) 362 363 for idx, elem := range result { 364 require.Equal(t, testData[idx], elem, "%d. read result doesn't match expected data") 365 } 366 367 require.NoError(t, hlReader.Close()) 368 } 369 370 func TestReadWriteSpecialTypes(t *testing.T) { 371 _ = os.Mkdir("files", 0755) 372 373 sd, err := parquetschema.ParseSchemaDefinition( 374 `message test_msg { 375 required fixed_len_byte_array(16) theid (UUID); 376 required binary clientstr (ENUM); 377 required binary client (ENUM); 378 required binary datastr (JSON); 379 required binary data (JSON); 380 optional int64 ignored; 381 }`) 382 require.NoError(t, err, "parsing schema definition failed") 383 384 t.Logf("schema definition: %s", spew.Sdump(sd)) 385 386 hlWriter, err := NewFileWriter( 387 "files/specialtypes.parquet", 388 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 389 goparquet.WithCreator("floor-unittest"), 390 goparquet.WithSchemaDefinition(sd), 391 ) 392 require.NoError(t, err) 393 394 type testMsg struct { 395 TheID [16]byte 396 ClientStr string 397 Client []byte 398 DataStr string 399 Data []byte 400 ignored int64 // ignored because it's private and therefore not settable. 401 NotInSchema int64 // does not match up with anything in schema, therefore there shall be no attempt to fill it. 402 } 403 404 testData := []testMsg{ 405 { 406 TheID: [16]byte{0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x0A, 0x0B, 0x0C, 0x0E, 0x0F, 0x10}, 407 ClientStr: "hello", 408 Client: []byte("world"), 409 DataStr: `{"foo":"bar","baz":23}`, 410 Data: []byte(`{"quux":{"foo":"bar"}}`), 411 ignored: 23, 412 }, 413 } 414 415 for _, tt := range testData { 416 require.NoError(t, hlWriter.Write(tt)) 417 } 418 require.NoError(t, hlWriter.Close()) 419 420 testData[0].ignored = 0 421 422 hlReader, err := NewFileReader("files/specialtypes.parquet") 423 require.NoError(t, err) 424 425 count := 0 426 427 var result []testMsg 428 429 for hlReader.Next() { 430 var msg testMsg 431 432 require.NoError(t, hlReader.Scan(&msg), "%d. Scan failed", count) 433 t.Logf("%d. data = %#v", count, hlReader.data) 434 435 result = append(result, msg) 436 437 count++ 438 } 439 440 require.NoError(t, hlReader.Err(), "hlReader returned an error") 441 442 t.Logf("count = %d", count) 443 444 for idx, elem := range result { 445 require.Equal(t, testData[idx], elem, "%d. read result doesn't match expected data") 446 } 447 448 require.NoError(t, hlReader.Close()) 449 } 450 451 func elem(data interface{}) interfaces.UnmarshalElement { 452 return interfaces.NewUnmarshallElement(data) 453 } 454 455 func TestReflectUnmarshaller(t *testing.T) { 456 obj1 := struct { 457 Foo int64 458 }{} 459 460 sd, err := parquetschema.ParseSchemaDefinition(`message test { required int64 foo; }`) 461 require.NoError(t, err) 462 463 um := &reflectUnmarshaller{obj: obj1, schemaDef: sd} 464 465 data := interfaces.NewUnmarshallObject(map[string]interface{}{"foo": int64(42)}) 466 467 err = um.UnmarshalParquet(data) 468 require.EqualError(t, err, "you need to provide an object of type *struct { Foo int64 } to unmarshal into") 469 470 i64 := int64(23) 471 obj2 := &i64 472 473 um.obj = obj2 474 475 err = um.UnmarshalParquet(data) 476 require.EqualError(t, err, "provided object of type *int64 is not a struct") 477 478 um.obj = &obj1 479 480 err = um.UnmarshalParquet(data) 481 require.NoError(t, err) 482 } 483 484 func TestUnmarshallerFieldNameStructTag(t *testing.T) { 485 obj1 := struct { 486 Foo int64 `parquet:"bar"` 487 }{} 488 489 sd, err := parquetschema.ParseSchemaDefinition(`message test { required int64 bar; }`) 490 require.NoError(t, err) 491 492 um := &reflectUnmarshaller{obj: &obj1, schemaDef: sd} 493 494 data := interfaces.NewUnmarshallObject(map[string]interface{}{"bar": int64(42)}) 495 496 err = um.UnmarshalParquet(data) 497 require.NoError(t, err) 498 } 499 500 func TestFillValue(t *testing.T) { 501 um := &reflectUnmarshaller{} 502 503 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(true)).Elem(), elem(false), nil)) 504 require.Error(t, um.fillValue(reflect.New(reflect.TypeOf(true)).Elem(), elem(23), nil)) 505 506 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(int32(0))).Elem(), elem(int64(23)), nil)) 507 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(int32(0))).Elem(), elem(int32(23)), nil)) 508 require.Error(t, um.fillValue(reflect.New(reflect.TypeOf(int32(0))).Elem(), elem(3.5), nil)) 509 510 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(uint32(0))).Elem(), elem(int64(42)), nil)) 511 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(uint32(0))).Elem(), elem(int32(42)), nil)) 512 require.Error(t, um.fillValue(reflect.New(reflect.TypeOf(uint32(0))).Elem(), elem("9001"), nil)) 513 514 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(float32(0.0))).Elem(), elem(float64(23.5)), nil)) 515 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf(float32(0.0))).Elem(), elem(float32(23.5)), nil)) 516 517 require.Error(t, um.fillValue(reflect.New(reflect.TypeOf(float32(0.0))).Elem(), elem(false), nil)) 518 519 require.NoError(t, um.fillValue(reflect.New(reflect.TypeOf([]byte{})).Elem(), elem([]byte("hello world!")), nil)) 520 require.Error(t, um.fillValue(reflect.New(reflect.TypeOf([]byte{})).Elem(), elem(int64(1000000)), nil)) 521 522 sd, err := parquetschema.ParseSchemaDefinition(`message test { 523 required int32 date (DATE); 524 required int64 tsnano (TIMESTAMP(NANOS, true)); 525 required int64 tsmicro (TIMESTAMP(MICROS, true)); 526 required int64 tsmilli (TIMESTAMP(MILLIS, true)); 527 required int96 tshive; 528 required int64 tnano (TIME(NANOS, true)); 529 required int64 tmicro (TIME(MICROS, true)); 530 required int32 tmilli (TIME(MILLIS, true)); 531 }`) 532 require.NoError(t, err) 533 534 date := time.Unix(0, 0) 535 require.NoError(t, um.fillValue(reflect.ValueOf(&date).Elem(), elem(int32(9)), sd.SubSchema("date"))) 536 require.Equal(t, date, time.Date(1970, 1, 10, 0, 0, 0, 0, time.UTC)) 537 538 ts := time.Unix(0, 0) 539 require.NoError(t, um.fillValue(reflect.ValueOf(&ts).Elem(), elem(int64(42000000000)), sd.SubSchema("tsnano"))) 540 require.Equal(t, ts, time.Date(1970, 1, 1, 0, 0, 42, 0, time.UTC)) 541 542 require.NoError(t, um.fillValue(reflect.ValueOf(&ts).Elem(), elem(int64(1423000000)), sd.SubSchema("tsmicro"))) 543 require.Equal(t, ts, time.Date(1970, 1, 1, 0, 23, 43, 0, time.UTC)) 544 545 require.NoError(t, um.fillValue(reflect.ValueOf(&ts).Elem(), elem(int64(45299450)), sd.SubSchema("tsmilli"))) 546 require.Equal(t, ts, time.Date(1970, 1, 1, 12, 34, 59, 450000000, time.UTC)) 547 548 require.NoError(t, um.fillValue(reflect.ValueOf(&ts).Elem(), elem([12]byte{00, 0x60, 0xFD, 0x4B, 0x32, 0x29, 0x00, 0x00, 0x59, 0x68, 0x25, 0x00}), sd.SubSchema("tshive"))) 549 require.Equal(t, ts, time.Date(2000, 1, 1, 12, 34, 56, 0, time.UTC)) 550 551 var tt Time 552 require.NoError(t, um.fillValue(reflect.ValueOf(&tt).Elem(), elem(int64(30000000010)), sd.SubSchema("tnano"))) 553 require.Equal(t, tt, MustTime(NewTime(0, 0, 30, 10)).UTC()) 554 555 require.NoError(t, um.fillValue(reflect.ValueOf(&tt).Elem(), elem(int64(210000020)), sd.SubSchema("tmicro"))) 556 require.Equal(t, tt, MustTime(NewTime(0, 3, 30, 20000)).UTC()) 557 558 require.NoError(t, um.fillValue(reflect.ValueOf(&tt).Elem(), elem(int32(14620200)), sd.SubSchema("tmilli"))) 559 require.Equal(t, tt, MustTime(NewTime(4, 3, 40, 200000000)).UTC()) 560 } 561 562 func BenchmarkReadFile(b *testing.B) { 563 _ = os.Mkdir("files", 0755) 564 565 sd, err := parquetschema.ParseSchemaDefinition( 566 `message test_msg { 567 required int64 foo; 568 optional binary bar (STRING); 569 optional group baz { 570 required int64 value; 571 } 572 }`) 573 require.NoError(b, err, "parsing schema definition failed") 574 575 hlWriter, err := NewFileWriter( 576 "files/readtest.parquet", 577 goparquet.WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 578 goparquet.WithCreator("floor-unittest"), 579 goparquet.WithSchemaDefinition(sd), 580 ) 581 require.NoError(b, err, "creating parquet file writer failed") 582 583 type bazMsg struct { 584 Value uint32 585 } 586 587 type testMsg struct { 588 Foo int64 589 Bar *string 590 Baz *bazMsg 591 } 592 593 // Baz doesn't seem to get written correctly. when dumping the resulting file, baz.value is wrong. 594 require.NoError(b, hlWriter.Write(testMsg{Foo: 1, Bar: strPtr("hello"), Baz: &bazMsg{Value: 9001}})) 595 require.NoError(b, hlWriter.Close()) 596 597 hlReader, err := NewFileReader("files/readtest.parquet") 598 require.NoError(b, err) 599 defer func() { 600 require.NoError(b, hlReader.Close()) 601 }() 602 require.True(b, hlReader.Next()) 603 604 b.ResetTimer() 605 for i := 0; i < b.N; i++ { 606 var msg testMsg 607 _ = hlReader.Scan(&msg) 608 } 609 }