github.com/fraugster/parquet-go@v0.12.0/readwrite_test.go (about) 1 package goparquet 2 3 import ( 4 "bytes" 5 "context" 6 "errors" 7 "fmt" 8 "io" 9 "math" 10 "math/rand" 11 "os" 12 "testing" 13 "time" 14 15 "github.com/fraugster/parquet-go/parquet" 16 "github.com/fraugster/parquet-go/parquetschema" 17 "github.com/stretchr/testify/assert" 18 "github.com/stretchr/testify/require" 19 ) 20 21 func TestWriteThenReadFile(t *testing.T) { 22 ctx := context.Background() 23 24 testFunc := func(t *testing.T, name string, opts []FileWriterOption, ropts []FileReaderOption) { 25 _ = os.Mkdir("files", 0755) 26 27 filename := fmt.Sprintf("files/test1_%s.parquet", name) 28 29 wf, err := os.OpenFile(filename, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 30 require.NoError(t, err, "creating file failed") 31 32 w := NewFileWriter(wf, opts...) 33 34 fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 35 require.NoError(t, err, "failed to create fooStore") 36 37 barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 38 require.NoError(t, err, "failed to create barStore") 39 40 bazStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 41 require.NoError(t, err, "failed to create bazStore") 42 43 require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_REQUIRED))) 44 require.NoError(t, w.AddColumn("bar", NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL))) 45 require.NoError(t, w.AddColumn("baz", NewDataColumn(bazStore, parquet.FieldRepetitionType_OPTIONAL))) 46 47 const ( 48 numRecords = 10000 49 flushLimit = 1000 50 ) 51 52 for idx := 0; idx < numRecords; idx++ { 53 if idx > 0 && idx%flushLimit == 0 { 54 require.NoError(t, w.FlushRowGroup(), "%d. 
AddData failed", idx) 55 } 56 57 data := map[string]interface{}{"foo": int64(idx), "bar": []byte("value" + fmt.Sprint(idx))} 58 if idx%20 != 0 { 59 data["baz"] = int32(idx % 16) 60 } 61 62 require.NoError(t, w.AddData(data), "%d. AddData failed", idx) 63 } 64 65 assert.NoError(t, w.Close(), "Close failed") 66 67 require.NoError(t, wf.Close()) 68 69 rf, err := os.Open(filename) 70 require.NoError(t, err, "opening file failed") 71 defer rf.Close() 72 73 r, err := NewFileReaderWithOptions(rf, ropts...) 74 require.NoError(t, err, "creating file reader failed") 75 76 cols := r.Columns() 77 require.Len(t, cols, 3, "got %d column", len(cols)) 78 require.Equal(t, "foo", cols[0].Name()) 79 require.Equal(t, "foo", cols[0].FlatName()) 80 require.Equal(t, "bar", cols[1].Name()) 81 require.Equal(t, "bar", cols[1].FlatName()) 82 require.Equal(t, "baz", cols[2].Name()) 83 require.Equal(t, "baz", cols[2].FlatName()) 84 for g := 0; g < r.RowGroupCount(); g++ { 85 require.NoError(t, r.readRowGroup(ctx), "Reading row group failed") 86 for i := 0; i < int(r.schemaReader.rowGroupNumRecords()); i++ { 87 data, err := r.schemaReader.getData() 88 require.NoError(t, err) 89 _, ok := data["foo"] 90 require.True(t, ok) 91 } 92 } 93 } 94 95 tests := []struct { 96 Name string 97 WriteOpts []FileWriterOption 98 ReadOpts []FileReaderOption 99 }{ 100 { 101 Name: "datapagev1", 102 WriteOpts: []FileWriterOption{ 103 WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 104 WithCreator("parquet-go-unittest"), 105 }, 106 ReadOpts: []FileReaderOption{}, 107 }, 108 { 109 Name: "datapagev2", 110 WriteOpts: []FileWriterOption{ 111 WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 112 WithCreator("parquet-go-unittest"), WithDataPageV2(), 113 }, 114 ReadOpts: []FileReaderOption{}, 115 }, 116 { 117 Name: "datapagev1_crc", 118 WriteOpts: []FileWriterOption{ 119 WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 120 WithCreator("parquet-go-unittest"), 121 WithCRC(true), 122 }, 123 ReadOpts: 
[]FileReaderOption{WithCRC32Validation(true)}, 124 }, 125 { 126 Name: "datapagev2_crc", 127 WriteOpts: []FileWriterOption{ 128 WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 129 WithCreator("parquet-go-unittest"), 130 WithDataPageV2(), 131 WithCRC(true), 132 }, 133 ReadOpts: []FileReaderOption{WithCRC32Validation(true)}, 134 }, 135 } 136 137 for _, tt := range tests { 138 t.Run(tt.Name, func(t *testing.T) { 139 testFunc(t, tt.Name, tt.WriteOpts, tt.ReadOpts) 140 }) 141 } 142 } 143 144 func TestWriteThenReadFileRepeated(t *testing.T) { 145 ctx := context.Background() 146 147 _ = os.Mkdir("files", 0755) 148 149 wf, err := os.OpenFile("files/test2.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 150 require.NoError(t, err, "creating file failed") 151 152 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest")) 153 154 fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 155 require.NoError(t, err, "failed to create fooStore") 156 157 require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_REPEATED))) 158 159 data := []map[string]interface{}{ 160 {"foo": []int64{1}}, 161 {"foo": []int64{1, 2, 3, 1}}, 162 {}, 163 {"foo": []int64{1, 3, 1, 1}}, 164 {}, 165 {"foo": []int64{1, 2, 2, 1}}, 166 } 167 168 for i := range data { 169 require.NoError(t, w.AddData(data[i])) 170 } 171 172 assert.NoError(t, w.Close(), "Close failed") 173 174 require.NoError(t, wf.Close()) 175 176 rf, err := os.Open("files/test2.parquet") 177 require.NoError(t, err, "opening file failed") 178 defer rf.Close() 179 180 r, err := NewFileReader(rf) 181 require.NoError(t, err, "creating file reader failed") 182 require.NoError(t, r.readRowGroup(ctx)) 183 184 require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords()) 185 for i := range data { 186 d, err := r.schemaReader.getData() 187 require.NoError(t, err) 188 require.Equal(t, data[i], d) 189 } 190 } 191 192 func 
TestWriteThenReadFileOptional(t *testing.T) { 193 ctx := context.Background() 194 _ = os.Mkdir("files", 0755) 195 196 wf, err := os.OpenFile("files/test3.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 197 require.NoError(t, err, "creating file failed") 198 199 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest")) 200 201 fooStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 202 require.NoError(t, err, "failed to create fooStore") 203 204 require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_OPTIONAL))) 205 206 data := []map[string]interface{}{ 207 {"foo": []byte("1")}, 208 {"foo": []byte("2")}, 209 {}, 210 {"foo": []byte("3")}, 211 {}, 212 {"foo": []byte("4")}, 213 } 214 215 for i := range data { 216 require.NoError(t, w.AddData(data[i])) 217 } 218 219 assert.NoError(t, w.Close(), "Close failed") 220 221 require.NoError(t, wf.Close()) 222 223 rf, err := os.Open("files/test3.parquet") 224 require.NoError(t, err, "opening file failed") 225 defer rf.Close() 226 227 r, err := NewFileReader(rf) 228 require.NoError(t, err, "creating file reader failed") 229 require.NoError(t, r.readRowGroup(ctx)) 230 231 require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords()) 232 root := r.schemaReader.root 233 for i := range data { 234 _, ok := data[i]["foo"] 235 rL, dL, b := root.getFirstRDLevel() 236 if ok { 237 assert.False(t, b) 238 assert.Equal(t, int32(0), rL) 239 assert.Equal(t, int32(1), dL) 240 } else { 241 assert.False(t, b) 242 assert.Equal(t, int32(0), rL) 243 assert.Equal(t, int32(0), dL) 244 } 245 246 get, err := r.schemaReader.getData() 247 require.NoError(t, err) 248 require.Equal(t, data[i], get) 249 } 250 } 251 252 func TestWriteThenReadFileNested(t *testing.T) { 253 ctx := context.Background() 254 _ = os.Mkdir("files", 0755) 255 256 wf, err := os.OpenFile("files/test4.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 
257 require.NoError(t, err, "creating file failed") 258 259 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest")) 260 261 fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 262 require.NoError(t, err, "failed to create fooStore") 263 barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 264 require.NoError(t, err, "failed to create barStore") 265 266 require.NoError(t, w.AddGroupByPath(ColumnPath{"baz"}, parquet.FieldRepetitionType_REPEATED)) 267 require.NoError(t, w.AddColumnByPath(ColumnPath{"baz", "foo"}, NewDataColumn(fooStore, parquet.FieldRepetitionType_REQUIRED))) 268 require.NoError(t, w.AddColumnByPath(ColumnPath{"baz", "bar"}, NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL))) 269 270 data := []map[string]interface{}{ 271 { 272 "baz": []map[string]interface{}{ 273 {"foo": int64(10)}, 274 }, 275 }, 276 } 277 278 for i := range data { 279 require.NoError(t, w.AddData(data[i])) 280 } 281 282 assert.NoError(t, w.Close(), "Close failed") 283 284 require.NoError(t, wf.Close()) 285 286 rf, err := os.Open("files/test4.parquet") 287 require.NoError(t, err, "opening file failed") 288 defer rf.Close() 289 290 r, err := NewFileReader(rf) 291 require.NoError(t, err, "creating file reader failed") 292 require.NoError(t, r.readRowGroup(ctx)) 293 294 require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords()) 295 for i := range data { 296 d, err := r.schemaReader.getData() 297 require.NoError(t, err) 298 require.Equal(t, data[i], d) 299 } 300 } 301 302 func TestWriteThenReadFileNested2(t *testing.T) { 303 ctx := context.Background() 304 _ = os.Mkdir("files", 0755) 305 306 wf, err := os.OpenFile("files/test5.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 307 require.NoError(t, err, "creating file failed") 308 309 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 
WithCreator("parquet-go-unittest")) 310 311 blaStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 312 require.NoError(t, err, "failed to create fooStore") 313 barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 314 require.NoError(t, err, "failed to create barStore") 315 316 require.NoError(t, w.AddGroup("foo", parquet.FieldRepetitionType_REPEATED)) 317 require.NoError(t, w.AddColumn("foo.bla", NewDataColumn(blaStore, parquet.FieldRepetitionType_REQUIRED))) 318 require.NoError(t, w.AddColumn("foo.bar", NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL))) 319 320 data := []map[string]interface{}{ 321 { 322 "foo": []map[string]interface{}{ 323 { 324 "bla": int64(23), 325 "bar": []byte("foobar"), 326 }, 327 }, 328 }, 329 { 330 "foo": []map[string]interface{}{ 331 { 332 "bla": int64(24), 333 "bar": []byte("hello"), 334 }, 335 }, 336 }, 337 { 338 "foo": []map[string]interface{}{ 339 { 340 "bla": int64(25), 341 }, 342 { 343 "bla": int64(26), 344 "bar": []byte("bye!"), 345 }, 346 { 347 "bla": int64(27), 348 }, 349 }, 350 }, 351 } 352 for i := range data { 353 require.NoError(t, w.AddData(data[i])) 354 } 355 356 assert.NoError(t, w.Close(), "Close failed") 357 358 require.NoError(t, wf.Close()) 359 360 rf, err := os.Open("files/test5.parquet") 361 require.NoError(t, err, "opening file failed") 362 defer rf.Close() 363 364 r, err := NewFileReader(rf) 365 require.NoError(t, err, "creating file reader failed") 366 require.NoError(t, r.readRowGroup(ctx)) 367 368 require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords()) 369 for i := range data { 370 d, err := r.schemaReader.getData() 371 require.NoError(t, err) 372 require.Equal(t, data[i], d) 373 } 374 } 375 376 func TestWriteThenReadFileMap(t *testing.T) { 377 ctx := context.Background() 378 _ = os.Mkdir("files", 0755) 379 380 wf, err := os.OpenFile("files/test6.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 381 require.NoError(t, err, 
"creating file failed") 382 383 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest")) 384 385 fooStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 386 require.NoError(t, err, "failed to create fooStore") 387 barStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 388 require.NoError(t, err, "failed to create barStore") 389 elementStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 390 require.NoError(t, err, "failed to create elementStore") 391 392 elementCol := NewDataColumn(elementStore, parquet.FieldRepetitionType_REQUIRED) 393 list, err := NewListColumn(elementCol, parquet.FieldRepetitionType_OPTIONAL) 394 require.NoError(t, err) 395 396 quuxParams := &ColumnParameters{ 397 LogicalType: parquet.NewLogicalType(), 398 } 399 quuxParams.LogicalType.DECIMAL = parquet.NewDecimalType() 400 quuxParams.LogicalType.DECIMAL.Scale = 3 401 quuxParams.LogicalType.DECIMAL.Precision = 5 402 403 quuxStore, err := NewInt32Store(parquet.Encoding_PLAIN, true, quuxParams) 404 require.NoError(t, err) 405 406 require.NoError(t, w.AddColumn("foo", NewDataColumn(fooStore, parquet.FieldRepetitionType_REQUIRED))) 407 require.NoError(t, w.AddColumn("bar", NewDataColumn(barStore, parquet.FieldRepetitionType_OPTIONAL))) 408 require.NoError(t, w.AddColumn("baz", list)) 409 require.NoError(t, w.AddColumn("quux", NewDataColumn(quuxStore, parquet.FieldRepetitionType_OPTIONAL))) 410 411 /* `message test_msg { 412 required int64 foo; 413 optional binary bar (STRING); 414 optional group baz (LIST) { 415 repeated group list { 416 required int32 element; 417 } 418 } 419 optional int32 quux (DECIMAL(3, 5)); 420 }` */ 421 data := []map[string]interface{}{ 422 { 423 "foo": int64(500), 424 }, 425 { 426 "foo": int64(23), 427 "bar": []byte("hello!"), 428 "baz": map[string]interface{}{ 429 "list": []map[string]interface{}{ 430 {"element": int32(23)}, 431 }, 432 
}, 433 "quux": int32(123456), 434 }, 435 { 436 "foo": int64(42), 437 "bar": []byte("world!"), 438 "baz": map[string]interface{}{ 439 "list": []map[string]interface{}{ 440 {"element": int32(1)}, 441 {"element": int32(1)}, 442 {"element": int32(2)}, 443 {"element": int32(3)}, 444 {"element": int32(5)}, 445 }, 446 }, 447 }, 448 { 449 "foo": int64(1000), 450 "bar": []byte("bye!"), 451 "baz": map[string]interface{}{ 452 "list": []map[string]interface{}{ 453 {"element": int32(2)}, 454 {"element": int32(3)}, 455 {"element": int32(5)}, 456 {"element": int32(7)}, 457 {"element": int32(11)}, 458 }, 459 }, 460 }, 461 } 462 463 for i := range data { 464 require.NoError(t, w.AddData(data[i])) 465 } 466 467 assert.NoError(t, w.Close(), "Close failed") 468 469 require.NoError(t, wf.Close()) 470 471 rf, err := os.Open("files/test6.parquet") 472 require.NoError(t, err, "opening file failed") 473 defer rf.Close() 474 475 r, err := NewFileReader(rf) 476 require.NoError(t, err, "creating file reader failed") 477 require.NoError(t, r.readRowGroup(ctx)) 478 479 require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords()) 480 for i := range data { 481 d, err := r.schemaReader.getData() 482 require.NoError(t, err) 483 require.Equal(t, data[i], d) 484 } 485 } 486 487 func TestWriteThenReadFileNested3(t *testing.T) { 488 ctx := context.Background() 489 _ = os.Mkdir("files", 0755) 490 491 wf, err := os.OpenFile("files/test7.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 492 require.NoError(t, err, "creating file failed") 493 494 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest")) 495 valueStore, err := NewInt64Store(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 496 require.NoError(t, err, "failed to create valueStore") 497 require.NoError(t, w.AddGroup("baz", parquet.FieldRepetitionType_OPTIONAL)) 498 require.NoError(t, w.AddColumn("baz.value", NewDataColumn(valueStore, parquet.FieldRepetitionType_REQUIRED))) 
499 500 data := []map[string]interface{}{ 501 { 502 "baz": map[string]interface{}{ 503 "value": int64(9001), 504 }, 505 }, 506 {}, 507 {}, 508 } 509 510 for i := range data { 511 require.NoError(t, w.AddData(data[i])) 512 } 513 514 assert.NoError(t, w.Close(), "Close failed") 515 516 require.NoError(t, wf.Close()) 517 518 rf, err := os.Open("files/test7.parquet") 519 require.NoError(t, err, "opening file failed") 520 defer rf.Close() 521 522 r, err := NewFileReader(rf) 523 require.NoError(t, err, "creating file reader failed") 524 require.NoError(t, r.readRowGroup(ctx)) 525 526 require.Equal(t, int64(len(data)), r.schemaReader.rowGroupNumRecords()) 527 for i := range data { 528 d, err := r.schemaReader.getData() 529 require.NoError(t, err) 530 require.Equal(t, data[i], d) 531 } 532 } 533 534 func TestWriteEmptyDict(t *testing.T) { 535 ctx := context.Background() 536 _ = os.Mkdir("files", 0755) 537 538 wf, err := os.OpenFile("files/test8.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 539 require.NoError(t, err, "creating file failed") 540 541 w := NewFileWriter(wf, WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithCreator("parquet-go-unittest")) 542 valueStore, err := NewByteArrayStore(parquet.Encoding_PLAIN, true, &ColumnParameters{}) 543 require.NoError(t, err, "failed to create valueStore") 544 require.NoError(t, w.AddColumn("value", NewDataColumn(valueStore, parquet.FieldRepetitionType_OPTIONAL))) 545 546 for i := 0; i < 1000; i++ { 547 require.NoError(t, w.AddData(nil)) 548 } 549 550 assert.NoError(t, w.Close(), "Close failed") 551 552 require.NoError(t, wf.Close()) 553 554 rf, err := os.Open("files/test8.parquet") 555 require.NoError(t, err, "opening file failed") 556 defer rf.Close() 557 558 r, err := NewFileReader(rf) 559 require.NoError(t, err, "creating file reader failed") 560 require.NoError(t, r.readRowGroup(ctx)) 561 562 require.Equal(t, int64(1000), r.schemaReader.rowGroupNumRecords()) 563 for i := 0; i < 1000; i++ { 564 d, err := 
r.schemaReader.getData() 565 require.NoError(t, err) 566 require.Equal(t, map[string]interface{}{}, d) 567 } 568 } 569 570 func TestWriteTimeData(t *testing.T) { 571 _ = os.Mkdir("files", 0755) 572 573 wf, err := os.OpenFile("files/test9.parquet", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) 574 require.NoError(t, err, "creating file failed") 575 576 sd, err := parquetschema.ParseSchemaDefinition(` 577 message foo { 578 required int64 ts_nanos (TIMESTAMP(NANOS, true)); 579 required int64 ts_micros (TIMESTAMP(MICROS, true)); 580 required int64 ts_millis (TIMESTAMP(MILLIS, true)); 581 required int32 date (DATE); 582 required int64 t_nanos (TIME(NANOS, false)); 583 required int64 t_micros (TIME(MICROS, false)); 584 required int32 t_millis (TIME(MILLIS, false)); 585 optional int32 t_alwaysnull (TIME(MILLIS, false)); 586 } 587 `) 588 require.NoError(t, err) 589 590 w := NewFileWriter(wf, WithSchemaDefinition(sd), WithCompressionCodec(parquet.CompressionCodec_GZIP)) 591 testData := []time.Time{ 592 time.Date(2015, 5, 9, 14, 15, 45, 666777888, time.UTC), 593 time.Date(1983, 10, 18, 11, 45, 16, 123456789, time.UTC), 594 } 595 596 for _, tt := range testData { 597 require.NoError(t, w.AddData(map[string]interface{}{ 598 "ts_nanos": tt.UnixNano(), 599 "ts_micros": tt.UnixNano() / 1000, 600 "ts_millis": tt.UnixNano() / 1000000, 601 "date": int32(tt.UnixNano() / (86400 * 1000000000)), 602 "t_nanos": int64((tt.Hour()*3600+tt.Minute()*60+tt.Second())*1000000000 + tt.Nanosecond()), 603 "t_micros": int64((tt.Hour()*3600+tt.Minute()*60+tt.Second())*1000000 + tt.Nanosecond()/1000), 604 "t_millis": int32((tt.Hour()*3600+tt.Minute()*60+tt.Second())*1000 + tt.Nanosecond()/1000000), 605 })) 606 } 607 608 require.NoError(t, w.FlushRowGroup()) 609 require.NoError(t, w.Close()) 610 require.NoError(t, wf.Close()) 611 612 rf, err := os.Open("files/test9.parquet") 613 require.NoError(t, err, "opening file failed") 614 defer rf.Close() 615 616 r, err := NewFileReader(rf) 617 require.NoError(t, 
err, "creating file reader failed") 618 619 require.NoError(t, r.PreLoad()) 620 621 rg := r.CurrentRowGroup() 622 623 verificationData := []struct { 624 pathInSchema []string 625 maxValue []byte 626 minValue []byte 627 nullCount int64 628 distinctCount int64 629 }{ 630 { 631 []string{"ts_nanos"}, 632 []byte{0x20, 0xa3, 0xc6, 0xc3, 0x7c, 0x93, 0xdc, 0x13}, 633 []byte{0x15, 0xc5, 0x33, 0x1e, 0x40, 0x96, 0xa, 0x6}, 634 0, 635 2, 636 }, 637 { 638 []string{"ts_micros"}, 639 []byte{0xd9, 0x32, 0xe0, 0xc7, 0xa6, 0x15, 0x5, 0x0}, 640 []byte{0x40, 0xd, 0xc0, 0x1e, 0xed, 0x8b, 0x1, 0x0}, 641 0, 642 2, 643 }, 644 { 645 []string{"ts_millis"}, 646 []byte{0x2, 0x29, 0x8, 0x39, 0x4d, 0x1, 0x0, 0x0}, 647 []byte{0x5b, 0x39, 0x6c, 0x5b, 0x65, 0x0, 0x0, 0x0}, 648 0, 649 2, 650 }, 651 { 652 []string{"date"}, 653 []byte{0xb4, 0x40, 0x0, 0x0}, 654 []byte{0xae, 0x13, 0x0, 0x0}, 655 0, 656 2, 657 }, 658 { 659 []string{"t_nanos"}, 660 []byte{0x20, 0xa3, 0x3a, 0xd8, 0xb2, 0x2e, 0x0, 0x0}, 661 []byte{0x15, 0xc5, 0x81, 0x7d, 0x7c, 0x26, 0x0, 0x0}, 662 0, 663 2, 664 }, 665 { 666 []string{"t_micros"}, 667 []byte{0xd9, 0xb2, 0x70, 0xf4, 0xb, 0x0, 0x0, 0x0}, 668 []byte{0x40, 0xcd, 0x3c, 0xda, 0x9, 0x0, 0x0, 0x0}, 669 0, 670 2, 671 }, 672 { 673 []string{"t_millis"}, 674 []byte{0x2, 0x79, 0xf, 0x3}, 675 []byte{0x5b, 0xb1, 0x85, 0x2}, 676 0, 677 2, 678 }, 679 { 680 []string{"t_alwaysnull"}, 681 nil, 682 nil, 683 2, 684 0, 685 }, 686 } 687 688 for idx, tt := range verificationData { 689 assert.Equal(t, tt.pathInSchema, rg.Columns[idx].MetaData.PathInSchema, "%d. path in schema doesn't match", idx) 690 assert.Equal(t, tt.maxValue, rg.Columns[idx].MetaData.Statistics.MaxValue, "%d. max value doesn't match", idx) 691 assert.Equal(t, tt.minValue, rg.Columns[idx].MetaData.Statistics.MinValue, "%d. min value doesn't match", idx) 692 assert.Equal(t, tt.nullCount, rg.Columns[idx].MetaData.Statistics.GetNullCount(), "%d. 
null count doesn't match", idx) 693 assert.Equal(t, tt.distinctCount, rg.Columns[idx].MetaData.Statistics.GetDistinctCount(), "%d. distinct count doesn't match", idx) 694 } 695 } 696 697 func TestReadWriteMultiLevel(t *testing.T) { 698 sc := `message txn { 699 optional group cluster (LIST) { 700 repeated group list { 701 required group element { 702 optional group cluster_step (LIST) { 703 repeated group list { 704 required group element { 705 optional group story_point { 706 required binary type (STRING); 707 } 708 } 709 } 710 } 711 } 712 } 713 } 714 } 715 ` 716 buf := &bytes.Buffer{} 717 sd, err := parquetschema.ParseSchemaDefinition(sc) 718 require.NoError(t, err) 719 w := NewFileWriter(buf, WithSchemaDefinition(sd)) 720 721 require.NoError(t, w.AddData(map[string]interface{}{})) 722 require.NoError(t, w.Close()) 723 buf2 := bytes.NewReader(buf.Bytes()) 724 r, err := NewFileReader(buf2) 725 require.NoError(t, err) 726 data, err := r.NextRow() 727 require.NoError(t, err) 728 require.Equal(t, map[string]interface{}{}, data) 729 730 _, err = r.NextRow() 731 require.Equal(t, io.EOF, err) 732 } 733 734 func TestWriteFileWithMarshallerThenReadWithUnmarshaller(t *testing.T) { 735 sd, err := parquetschema.ParseSchemaDefinition( 736 `message test_msg { 737 required group baz (LIST) { 738 repeated group list { 739 required group element { 740 required int64 quux; 741 } 742 } 743 } 744 }`) 745 746 require.NoError(t, err, "parsing schema definition failed") 747 748 buf := &bytes.Buffer{} 749 hlWriter := NewFileWriter( 750 buf, 751 WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 752 WithCreator("floor-unittest"), 753 WithSchemaDefinition(sd), 754 ) 755 756 require.NoError(t, err, "creating new file writer failed") 757 758 testData := map[string]interface{}{ 759 "baz": map[string]interface{}{ 760 "list": []map[string]interface{}{ 761 { 762 "element": map[string]interface{}{ 763 "quux": int64(23), 764 }, 765 }, 766 { 767 "element": map[string]interface{}{ 768 "quux": 
int64(42), 769 }, 770 }, 771 }, 772 }, 773 } 774 775 require.NoError(t, hlWriter.AddData(testData), "writing object using marshaller failed") 776 777 require.NoError(t, hlWriter.Close()) 778 779 hlReader, err := NewFileReader(bytes.NewReader(buf.Bytes())) 780 require.NoError(t, err, "opening file failed") 781 782 readData, err := hlReader.NextRow() 783 require.NoError(t, err) 784 require.Equal(t, testData, readData, "written and read data don't match") 785 } 786 787 func TestWriteWithFlushGroupMetaDataThenRead(t *testing.T) { 788 sd, err := parquetschema.ParseSchemaDefinition( 789 `message test_msg { 790 required int64 foo; 791 required group x { 792 required int64 bar; 793 } 794 }`) 795 796 require.NoError(t, err, "parsing schema definition failed") 797 798 buf := &bytes.Buffer{} 799 hlWriter := NewFileWriter( 800 buf, 801 WithCompressionCodec(parquet.CompressionCodec_SNAPPY), 802 WithCreator("floor-unittest"), 803 WithSchemaDefinition(sd), 804 WithMetaData(map[string]string{"global": "metadata"}), 805 ) 806 807 require.NoError(t, err, "creating new file writer failed") 808 809 testData := map[string]interface{}{ 810 "foo": int64(23), 811 "x": map[string]interface{}{ 812 "bar": int64(42), 813 }, 814 } 815 816 require.NoError(t, hlWriter.AddData(testData), "writing object using marshaller failed") 817 818 require.NoError(t, hlWriter.Close( 819 WithRowGroupMetaData(map[string]string{"a": "hello", "b": "world"}), 820 WithRowGroupMetaDataForColumn("foo", map[string]string{"b": "friendo", "c": "!"}), 821 WithRowGroupMetaDataForColumn("x.bar", map[string]string{"a": "goodbye"}), 822 )) 823 824 hlReader, err := NewFileReader(bytes.NewReader(buf.Bytes())) 825 require.NoError(t, err) 826 827 require.Equal(t, map[string]string{"global": "metadata"}, hlReader.MetaData()) 828 829 require.NoError(t, hlReader.PreLoad()) 830 831 // the low-level way of inspecting column metadata: 832 rg := hlReader.CurrentRowGroup() 833 cols := rg.GetColumns() 834 require.Equal(t, 2, len(cols)) 
835 836 require.Equal(t, []string{"foo"}, cols[0].MetaData.PathInSchema) 837 require.Equal(t, []*parquet.KeyValue{ 838 {Key: "a", Value: strPtr("hello")}, 839 {Key: "b", Value: strPtr("friendo")}, 840 {Key: "c", Value: strPtr("!")}, 841 }, cols[0].MetaData.KeyValueMetadata) 842 843 require.Equal(t, []string{"x", "bar"}, cols[1].MetaData.PathInSchema) 844 require.Equal(t, []*parquet.KeyValue{ 845 {Key: "a", Value: strPtr("goodbye")}, 846 {Key: "b", Value: strPtr("world")}, 847 }, cols[1].MetaData.KeyValueMetadata) 848 849 // the high-level way of inspecting column metadata: 850 fooMetaData, err := hlReader.ColumnMetaData("foo") 851 require.NoError(t, err) 852 require.Equal(t, map[string]string{"a": "hello", "b": "friendo", "c": "!"}, fooMetaData) 853 854 xbarMetaData, err := hlReader.ColumnMetaData("x.bar") 855 require.NoError(t, err) 856 require.Equal(t, map[string]string{"a": "goodbye", "b": "world"}, xbarMetaData) 857 858 _, err = hlReader.ColumnMetaData("does.not.exist") 859 require.Error(t, err) 860 } 861 862 func TestReadWriteColumeEncodings(t *testing.T) { 863 buf := &bytes.Buffer{} 864 865 w := NewFileWriter(buf) 866 867 s, err := NewBooleanStore(parquet.Encoding_RLE, &ColumnParameters{}) 868 require.NoError(t, err) 869 require.NoError(t, w.AddColumn("a", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED))) 870 871 s, err = NewBooleanStore(parquet.Encoding_PLAIN, &ColumnParameters{}) 872 require.NoError(t, err) 873 require.NoError(t, w.AddColumn("b", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED))) 874 875 s, err = NewByteArrayStore(parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, false, &ColumnParameters{}) 876 require.NoError(t, err) 877 require.NoError(t, w.AddColumn("c", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED))) 878 879 s, err = NewByteArrayStore(parquet.Encoding_DELTA_BYTE_ARRAY, false, &ColumnParameters{}) 880 require.NoError(t, err) 881 require.NoError(t, w.AddColumn("d", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED))) 882 
883 s, err = NewFloatStore(parquet.Encoding_PLAIN, false, &ColumnParameters{}) 884 require.NoError(t, err) 885 require.NoError(t, w.AddColumn("e", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED))) 886 887 s, err = NewDoubleStore(parquet.Encoding_PLAIN, false, &ColumnParameters{}) 888 require.NoError(t, err) 889 require.NoError(t, w.AddColumn("f", NewDataColumn(s, parquet.FieldRepetitionType_REQUIRED))) 890 891 testData := map[string]interface{}{ 892 "a": true, 893 "b": false, 894 "c": []byte("hello"), 895 "d": []byte("world"), 896 "e": float32(23.0), 897 "f": float64(42.0), 898 } 899 900 require.NoError(t, w.AddData(testData)) 901 902 require.NoError(t, w.Close()) 903 904 r, err := NewFileReader(bytes.NewReader(buf.Bytes())) 905 require.NoError(t, err) 906 907 data, err := r.NextRow() 908 require.NoError(t, err) 909 910 require.Equal(t, testData, data) 911 912 _, err = r.NextRow() 913 require.Equal(t, io.EOF, err) 914 } 915 916 func strPtr(s string) *string { 917 return &s 918 } 919 920 func TestWriteThenReadFileUnsetOptional(t *testing.T) { 921 sd, err := parquetschema.ParseSchemaDefinition(` 922 message foo { 923 optional group a (LIST) { 924 repeated group list { 925 optional group element { 926 optional int64 b; 927 } 928 } 929 } 930 }`) 931 require.NoError(t, err) 932 933 var buf bytes.Buffer 934 require.NoError(t, err) 935 w := NewFileWriter(&buf, WithSchemaDefinition(sd)) 936 testData := map[string]interface{}{ 937 "a": map[string]interface{}{ 938 "list": []map[string]interface{}{ 939 {}, 940 { 941 "element": map[string]interface{}{}, 942 }, 943 { 944 "element": map[string]interface{}{ 945 "b": int64(2), 946 }, 947 }, 948 }, 949 }, 950 } 951 require.NoError(t, w.AddData(testData)) 952 require.NoError(t, w.Close()) 953 954 r, err := NewFileReader(bytes.NewReader(buf.Bytes())) 955 require.NoError(t, err) 956 957 data, err := r.NextRow() 958 require.NoError(t, err) 959 require.Equal(t, testData, data) 960 961 _, err = r.NextRow() 962 require.Equal(t, 
io.EOF, err)
}

// TestReadWriteFixedLenByteArrayEncodings verifies that fixed-length byte
// array values survive a write-then-read round trip for each supported
// encoding, with and without dictionary encoding.
func TestReadWriteFixedLenByteArrayEncodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   []byte
	}{
		{name: "delta_byte_array_with_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: true, input: []byte{1, 3, 2, 14, 99, 42}},
		{name: "delta_byte_array_no_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: false, input: []byte{7, 5, 254, 127, 42, 23}},
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: []byte{9, 8, 7, 6, 5, 4}},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer
			wr := NewFileWriter(&buf)

			// The fixed type length must match the test input exactly.
			l := int32(len(tt.input))
			store, err := NewFixedByteArrayStore(tt.enc, tt.useDict, &ColumnParameters{TypeLength: &l})
			require.NoError(t, err)

			require.NoError(t, wr.AddColumn("value", NewDataColumn(store, parquet.FieldRepetitionType_REQUIRED)))

			inputRow := map[string]interface{}{"value": tt.input}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestReadWriteByteArrayEncodings verifies that variable-length byte array
// values survive a write-then-read round trip for each supported encoding,
// with and without dictionary encoding.
func TestReadWriteByteArrayEncodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   []byte
	}{
		{name: "delta_byte_array_with_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: true, input: []byte{1, 3, 2, 14, 99, 42}},
		{name: "delta_byte_array_no_dict", enc: parquet.Encoding_DELTA_BYTE_ARRAY, useDict: false, input: []byte{7, 5, 254, 127, 42, 23}},
		{name: "delta_length_byte_array_with_dict", enc: parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, useDict: true, input: []byte{1, 5, 15, 25, 35, 75}},
		{name: "delta_length_byte_array_no_dict", enc: parquet.Encoding_DELTA_LENGTH_BYTE_ARRAY, useDict: false, input: []byte{75, 25, 5, 35, 15, 1}},
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: []byte{9, 8, 7, 6, 5, 4}},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer
			wr := NewFileWriter(&buf)

			store, err := NewByteArrayStore(tt.enc, tt.useDict, &ColumnParameters{})
			require.NoError(t, err)

			require.NoError(t, wr.AddColumn("value", NewDataColumn(store, parquet.FieldRepetitionType_REQUIRED)))

			inputRow := map[string]interface{}{"value": tt.input}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestReadWriteInt64Encodings verifies that int64 values survive a
// write-then-read round trip for each supported encoding.
func TestReadWriteInt64Encodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   int64
	}{
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 87743737636726},
		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 42},
		{name: "delta_binary_packed", enc: parquet.Encoding_DELTA_BINARY_PACKED, useDict: false, input: 6363228832},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer

			wr := NewFileWriter(&buf)

			bas, err := NewInt64Store(tt.enc, tt.useDict, &ColumnParameters{})
			require.NoError(t, err)

			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
			require.NoError(t, wr.AddColumn("number", col))

			inputRow := map[string]interface{}{
				"number": tt.input,
			}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			// use require here for consistency with the other round-trip
			// tests instead of a manual t.Fatal.
			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestReadWriteInt32Encodings verifies that int32 values survive a
// write-then-read round trip for each supported encoding.
func TestReadWriteInt32Encodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   int32
	}{
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 3628282},
		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 23},
		{name: "delta_binary_packed", enc: parquet.Encoding_DELTA_BINARY_PACKED, useDict: false, input: 9361082},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer

			wr := NewFileWriter(&buf)

			bas, err := NewInt32Store(tt.enc, tt.useDict, &ColumnParameters{})
			require.NoError(t, err)

			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
			require.NoError(t, wr.AddColumn("number", col))

			inputRow := map[string]interface{}{
				"number": tt.input,
			}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			// use require here for consistency with the other round-trip
			// tests instead of a manual t.Fatal.
			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestReadWriteInt96Encodings verifies that int96 timestamp values survive a
// write-then-read round trip for each supported encoding.
func TestReadWriteInt96Encodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   [12]byte
	}{
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: TimeToInt96(time.Date(2020, 3, 16, 14, 30, 0, 0, time.UTC))},
		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: TimeToInt96(time.Now())},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer

			wr := NewFileWriter(&buf)

			bas, err := NewInt96Store(tt.enc, tt.useDict, &ColumnParameters{})
			require.NoError(t, err)

			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
			require.NoError(t, wr.AddColumn("ts", col))

			inputRow := map[string]interface{}{
				"ts": tt.input,
			}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			// use require here for consistency with the other round-trip
			// tests instead of a manual t.Fatal.
			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestReadWriteFloatEncodings verifies that float32 values survive a
// write-then-read round trip for each supported encoding.
func TestReadWriteFloatEncodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   float32
	}{
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 1.1111},
		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 2.2222},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer

			wr := NewFileWriter(&buf)

			bas, err := NewFloatStore(tt.enc, tt.useDict, &ColumnParameters{})
			require.NoError(t, err)

			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
			require.NoError(t, wr.AddColumn("number", col))

			inputRow := map[string]interface{}{
				"number": tt.input,
			}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			// use require here for consistency with the other round-trip
			// tests instead of a manual t.Fatal.
			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestReadWriteDoubleEncodings verifies that float64 values survive a
// write-then-read round trip for each supported encoding.
func TestReadWriteDoubleEncodings(t *testing.T) {
	testData := []struct {
		name    string
		enc     parquet.Encoding
		useDict bool
		input   float64
	}{
		{name: "plain_no_dict", enc: parquet.Encoding_PLAIN, useDict: false, input: 42.123456},
		{name: "plain_with_dict", enc: parquet.Encoding_PLAIN, useDict: true, input: 32.98765},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			var buf bytes.Buffer

			wr := NewFileWriter(&buf)

			bas, err := NewDoubleStore(tt.enc, tt.useDict, &ColumnParameters{})
			require.NoError(t, err)

			col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
			require.NoError(t, wr.AddColumn("number", col))

			inputRow := map[string]interface{}{
				"number": tt.input,
			}

			require.NoError(t, wr.AddData(inputRow))

			require.NoError(t, wr.Close())

			// use require here for consistency with the other round-trip
			// tests instead of a manual t.Fatal.
			rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
			require.NoError(t, err)

			outputRow, err := rd.NextRow()
			require.NoError(t, err)

			require.Equal(t, inputRow, outputRow)

			// exactly one row was written, so the next read must hit EOF.
			_, err = rd.NextRow()
			require.Error(t, err)
			require.True(t, errors.Is(err, io.EOF))
		})
	}
}

// TestWriteThenReadMultiplePages writes enough rows that a small
// WithMaxPageSize forces multiple data pages, then verifies every row reads
// back exactly as written.
func TestWriteThenReadMultiplePages(t *testing.T) {
	const mySchema = `message msg {
	required binary ts_str (STRING);
}`

	sd, err := parquetschema.ParseSchemaDefinition(mySchema)
	require.NoError(t, err)

	testData := []struct {
		name    string
		options []FileWriterOption
	}{
		{
			name: "snappy",
			options: []FileWriterOption{
				WithSchemaDefinition(sd), WithCompressionCodec(parquet.CompressionCodec_SNAPPY),
			},
		},
		{
			name: "snappy_1kb_page",
			options: []FileWriterOption{
				WithSchemaDefinition(sd), WithCompressionCodec(parquet.CompressionCodec_SNAPPY), WithMaxPageSize(1 * 1024),
			},
		},
	}

	for _, tt := range testData {
		t.Run(tt.name, func(t *testing.T) {
			f := new(bytes.Buffer)

			fw := NewFileWriter(f, tt.options...)
			// safety net in case a require below aborts the subtest before
			// the explicit Close; NOTE(review): this means Close may run
			// twice on the success path — assumed harmless, confirm.
			defer fw.Close()

			const numRows = 75

			records := []map[string]interface{}{}

			for i := 0; i < numRows; i++ {
				tsStr := time.Now().Add(time.Duration(1+rand.Int63n(300)) * time.Second).Format(time.RFC3339)
				rec := map[string]interface{}{"ts_str": []byte(tsStr)}
				records = append(records, rec)
				require.NoError(t, fw.AddData(rec))
			}

			require.NoError(t, fw.Close())

			r, err := NewFileReader(bytes.NewReader(f.Bytes()))
			require.NoError(t, err)

			rowCount := r.NumRows()
			require.Equal(t, int64(numRows), rowCount)

			for i := int64(0); i < rowCount; i++ {
				data, err := r.NextRow()
				require.NoError(t, err)
				require.Equal(t, records[i], data, "%d. records don't match", i)
			}
		})
	}
}

// TestReadWriteDoubleNaN verifies that NaN and ±Inf float64 values survive a
// write-then-read round trip. NaN needs a separate check because NaN != NaN,
// so require.Equal cannot be used for those entries.
func TestReadWriteDoubleNaN(t *testing.T) {
	var buf bytes.Buffer

	wr := NewFileWriter(&buf)

	bas, err := NewDoubleStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
	require.NoError(t, err)

	col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
	require.NoError(t, wr.AddColumn("value", col))

	data := []float64{42.23, math.NaN(), math.NaN(), 23.42, math.Inf(1), math.Inf(-1), 1.111}

	for _, f := range data {
		require.NoError(t, wr.AddData(map[string]interface{}{
			"value": f,
		}))
	}

	require.NoError(t, wr.Close())

	// use require here for consistency with the other round-trip tests
	// instead of a manual t.Fatal.
	rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	for i := range data {
		outputRow, err := rd.NextRow()
		require.NoError(t, err)
		if math.IsNaN(data[i]) {
			require.True(t, math.IsNaN(outputRow["value"].(float64)))
		} else {
			require.Equal(t, data[i], outputRow["value"].(float64))
		}
	}

	// all rows consumed, so the next read must hit EOF.
	_, err = rd.NextRow()
	require.Error(t, err)
	require.True(t, errors.Is(err, io.EOF))
}

// TestReadWriteFloatNaN verifies that NaN and ±Inf float32 values survive a
// write-then-read round trip. NaN needs a separate check because NaN != NaN,
// so require.Equal cannot be used for those entries.
func TestReadWriteFloatNaN(t *testing.T) {
	var buf bytes.Buffer

	wr := NewFileWriter(&buf)

	bas, err := NewFloatStore(parquet.Encoding_PLAIN, true, &ColumnParameters{})
	require.NoError(t, err)

	col := NewDataColumn(bas, parquet.FieldRepetitionType_REQUIRED)
	require.NoError(t, wr.AddColumn("value", col))

	data := []float32{42.23, float32(math.NaN()), float32(math.NaN()), 23.42, float32(math.Inf(1)), float32(math.Inf(-1)), 1.111}

	for _, f := range data {
		require.NoError(t, wr.AddData(map[string]interface{}{
			"value": f,
		}))
	}

	require.NoError(t, wr.Close())

	// use require here for consistency with the other round-trip tests
	// instead of a manual t.Fatal.
	rd, err := NewFileReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	for i := range data {
		outputRow, err := rd.NextRow()
		require.NoError(t, err)
		if math.IsNaN(float64(data[i])) {
			require.True(t, math.IsNaN(float64(outputRow["value"].(float32))))
		} else {
			require.Equal(t, data[i], outputRow["value"].(float32))
		}
	}

	// all rows consumed, so the next read must hit EOF.
	_, err = rd.NextRow()
	require.Error(t, err)
	require.True(t, errors.Is(err, io.EOF))
}

// TestWriteThenReadSetSchemaDefinition checks that a schema set via
// SetSchemaDefinition is used by the writer, is queryable through the
// writer's accessor methods, and is recovered intact by the reader.
func TestWriteThenReadSetSchemaDefinition(t *testing.T) {
	var buf bytes.Buffer

	wr := NewFileWriter(&buf)

	sd, err := parquetschema.ParseSchemaDefinition(`message msg { required int64 foo; }`)
	require.NoError(t, err)

	require.NoError(t, wr.SetSchemaDefinition(sd))

	require.NoError(t, wr.AddData(map[string]interface{}{"foo": int64(23)}))

	require.NoError(t, wr.Close())

	require.Equal(t, sd.String(), wr.GetSchemaDefinition().String())

	// column accessors must reflect the schema: "foo" exists, "bar" doesn't.
	require.Equal(t, 1, len(wr.Columns()))
	require.Equal(t, parquet.TypePtr(parquet.Type_INT64), wr.GetColumnByName("foo").Type())
	require.Nil(t, wr.GetColumnByName("bar"))
	require.Nil(t, wr.GetColumnByPath(ColumnPath{"bar"}))
	require.NotNil(t, wr.GetColumnByPath(ColumnPath{"foo"}))

	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	sd2 := r.GetSchemaDefinition()

	require.Equal(t, sd.String(), sd2.String())

	row, err := r.NextRow()
	require.NoError(t, err)
	require.Equal(t, map[string]interface{}{"foo": int64(23)}, row)

	// exactly one row was written, so the next read must hit EOF.
	_, err = r.NextRow()
	require.Error(t, err)
	require.True(t, errors.Is(err, io.EOF))
}

// TestRepeatedInt32 verifies that a repeated int32 column round-trips as a
// []int32 with the original values in order.
func TestRepeatedInt32(t *testing.T) {
	// this is here to somehow reproduce the issue discussed in https://github.com/fraugster/parquet-go/pull/8
	sd, err := parquetschema.ParseSchemaDefinition(`message msg {
	repeated int32 foo;
}`)
	require.NoError(t, err)

	var buf bytes.Buffer
	fw := NewFileWriter(&buf, WithSchemaDefinition(sd))

	err = fw.AddData(map[string]interface{}{
		"foo": []int32{
			int32(23),
			int32(42),
			int32(9001),
		},
	})
	require.NoError(t, err)

	require.NoError(t, fw.Close())

	r, err := NewFileReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	row, err := r.NextRow()
	require.NoError(t, err)

	// the repeated column must come back exactly as written, as a []int32.
	require.Equal(t, []int32{
		int32(23),
		int32(42),
		int32(9001),
	}, row["foo"])
}