github.com/fraugster/parquet-go@v0.12.0/schema.go (about) 1 package goparquet 2 3 import ( 4 "errors" 5 "fmt" 6 "strings" 7 8 "github.com/fraugster/parquet-go/parquet" 9 "github.com/fraugster/parquet-go/parquetschema" 10 ) 11 12 const ( 13 _ int = iota 14 listParent 15 mapParent 16 ) 17 18 // Column is composed of a schema definition for the column, a column store 19 // that contains the implementation to write the data to a parquet file, and 20 // any additional parameters that are necessary to correctly write the data. 21 // Please the NewDataColumn, NewListColumn or NewMapColumn functions to create 22 // a Column object correctly. 23 type Column struct { 24 index int 25 name string 26 path ColumnPath 27 28 // one of the following should be not null. data or children 29 data *ColumnStore 30 children []*Column 31 32 rep parquet.FieldRepetitionType 33 34 maxR, maxD uint16 35 36 parent int // one of noParent, listParent, mapParent 37 // for the reader we should read this element from the meta, for the writer we need to build this element 38 element *parquet.SchemaElement 39 40 params *ColumnParameters 41 42 alloc *allocTracker 43 } 44 45 // ColumnPath describes the path through the hierarchy of the schema for a particular column. For a top-level 46 // column of the schema, the column path only contains one element, while for nested columns, the path consists 47 // of multiple elements. 48 type ColumnPath []string 49 50 func parseColumnPath(s string) ColumnPath { 51 return strings.Split(s, ".") 52 } 53 54 func (c ColumnPath) flatName() string { 55 return strings.Join(c, ".") 56 } 57 58 // Equal returns true if all path elements of this ColumnPath are equal to the 59 // corresponding path elements of the ColumnPath provided as parameter, false 60 // otherwise. 61 func (c ColumnPath) Equal(d ColumnPath) bool { 62 if len(c) != len(d) { 63 return false 64 } 65 for i := range c { 66 if c[i] != d[i] { 67 return false 68 } 69 } 70 return true 71 } 72 73 // HasPrefix returns true if all path elements of the ColumnPath provided as parameter 74 // are equal to the corresponding path elements of this ColumnPath. 75 func (c ColumnPath) HasPrefix(d ColumnPath) bool { 76 if len(d) > len(c) { 77 return false 78 } 79 for i := range d { 80 if c[i] != d[i] { 81 return false 82 } 83 } 84 return true 85 } 86 87 // Children returns the column's child columns. 88 func (c *Column) Children() []*Column { 89 return c.children 90 } 91 92 func (c *Column) getSchemaArray() []*parquet.SchemaElement { 93 ret := []*parquet.SchemaElement{c.Element()} 94 if c.data != nil { 95 return ret 96 } 97 98 for i := range c.children { 99 ret = append(ret, c.children[i].getSchemaArray()...) 100 } 101 102 return ret 103 } 104 105 // MaxDefinitionLevel returns the maximum definition level for this column. 106 func (c *Column) MaxDefinitionLevel() uint16 { 107 return c.maxD 108 } 109 110 // MaxRepetitionLevel returns the maximum repetition value for this column. 111 func (c *Column) MaxRepetitionLevel() uint16 { 112 return c.maxR 113 } 114 115 // FlatName returns the name of the column and its parents in dotted notation. 116 // 117 // Deprecated: use Path instead. If a column or group name contains '.', the returned 118 // flat name cannot be used to properly address them. 119 func (c *Column) FlatName() string { 120 return c.path.flatName() 121 } 122 123 // Path returns the full column path of the column. 124 func (c *Column) Path() ColumnPath { 125 return c.path 126 } 127 128 // Name returns the column name. 129 func (c *Column) Name() string { 130 return c.name 131 } 132 133 // Index returns the index of the column in schema, zero based. 134 func (c *Column) Index() int { 135 return c.index 136 } 137 138 // Element returns schema element definition of the column. 139 func (c *Column) Element() *parquet.SchemaElement { 140 if c.element == nil { 141 // If this is a no-element node, we need to re-create element every time to make sure the content is always up-to-date 142 return c.buildElement() 143 } 144 return c.element 145 } 146 147 // Type returns the parquet type of the value. If the column is a group, then the 148 // method will return nil. 149 func (c *Column) Type() *parquet.Type { 150 if c.data == nil { 151 return nil 152 } 153 154 return parquet.TypePtr(c.data.parquetType()) 155 } 156 157 // RepetitionType returns the repetition type for the current column. 158 func (c *Column) RepetitionType() *parquet.FieldRepetitionType { 159 return &c.rep 160 } 161 162 // DataColumn returns true if the column is data column, false otherwise. 163 func (c *Column) DataColumn() bool { 164 return c.data != nil 165 } 166 167 // ChildrenCount returns the number of children in a group. If the column is 168 // a data column, it returns -1. 169 func (c *Column) ChildrenCount() int { 170 if c.data != nil { 171 return -1 172 } 173 174 return len(c.children) 175 } 176 177 func (c *Column) getColumnStore() *ColumnStore { 178 return c.data 179 } 180 181 func (c *Column) buildElement() *parquet.SchemaElement { 182 rep := c.rep 183 elem := &parquet.SchemaElement{ 184 RepetitionType: &rep, 185 Name: c.name, 186 } 187 188 if c.params != nil { 189 elem.FieldID = c.params.FieldID 190 elem.ConvertedType = c.params.ConvertedType 191 elem.LogicalType = c.params.LogicalType 192 } 193 194 if c.data != nil { 195 elem.Type = parquet.TypePtr(c.data.parquetType()) 196 elem.TypeLength = c.params.TypeLength 197 elem.Scale = c.params.Scale 198 elem.Precision = c.params.Precision 199 } else { 200 nc := int32(len(c.children)) 201 elem.NumChildren = &nc 202 } 203 204 return elem 205 } 206 207 func (c *Column) getDataSize() int64 { 208 if _, ok := c.data.typedColumnStore.(*booleanStore); ok { 209 // Booleans are stored in one bit, so the result is the number of items / 8 210 return int64(c.data.values.numValues())/8 + 1 211 } 212 _, dataSize := c.data.values.sizes() 213 return dataSize 214 } 215 216 func (c *Column) getNextData() (map[string]interface{}, int32, error) { 217 if c.children == nil { 218 return nil, 0, errors.New("bug: call getNextData on non group node") 219 } 220 ret := make(map[string]interface{}) 221 notNil := 0 222 var maxD int32 223 for i := range c.children { 224 data, dl, err := c.children[i].getData() 225 if err != nil { 226 return nil, 0, err 227 } 228 if dl > maxD { 229 maxD = dl 230 } 231 232 // https://golang.org/doc/faq#nil_error 233 if m, ok := data.(map[string]interface{}); ok && m == nil { 234 data = nil 235 } 236 237 // if the data is not nil, then its ok, but if its nil, we need to know in which definition level is this nil is. 238 // if its exactly one below max definition level, then the parent is there 239 if data != nil { 240 ret[c.children[i].name] = data 241 if c.children[i].data != nil { 242 c.alloc.register(data, uint64(c.children[i].data.sizeOf(data))) 243 } 244 notNil++ 245 } 246 var diff int32 247 if c.children[i].rep != parquet.FieldRepetitionType_REQUIRED { 248 diff++ 249 } 250 if dl == int32(c.children[i].maxD)-diff { 251 notNil++ 252 } 253 } 254 255 if notNil == 0 { 256 return nil, maxD, nil 257 } 258 259 return ret, int32(c.maxD), nil 260 } 261 262 func (c *Column) getFirstRDLevel() (int32, int32, bool) { 263 if c.data != nil { 264 return c.data.getRDLevelAt(-1) 265 } 266 267 // there should be at lease 1 child, 268 for i := range c.children { 269 rl, dl, last := c.children[i].getFirstRDLevel() 270 if last { 271 return rl, dl, last 272 } 273 274 // if this value is not nil, rLevel or dLevel less than this level is not interesting 275 if rl >= int32(c.children[i].maxR) || dl >= int32(c.children[i].maxD) { 276 return rl, dl, last 277 } 278 } 279 280 return -1, -1, false 281 } 282 283 func (c *Column) getData() (interface{}, int32, error) { 284 if c.children != nil { 285 data, maxD, err := c.getNextData() 286 if err != nil { 287 return nil, 0, err 288 } 289 290 if c.rep != parquet.FieldRepetitionType_REPEATED || data == nil { 291 return data, maxD, nil 292 } 293 294 ret := []map[string]interface{}{data} 295 for { 296 rl, _, last := c.getFirstRDLevel() 297 if last || rl < int32(c.maxR) || rl == 0 { 298 // end of this object 299 return ret, maxD, nil 300 } 301 302 data, _, err := c.getNextData() 303 if err != nil { 304 return nil, maxD, err 305 } 306 307 ret = append(ret, data) 308 } 309 } 310 311 return c.data.get(int32(c.maxD), int32(c.maxR)) 312 } 313 314 type schema struct { 315 schemaDef *parquetschema.SchemaDefinition 316 root *Column 317 numRecords int64 318 readOnly int 319 320 maxPageSize int64 321 322 // selected columns in reading. if the size is zero, it means all the columns 323 selectedColumns []ColumnPath 324 325 enableCRC bool // if true, CRC32 checksums will be computed for pages upon writing. 326 validateCRC bool // if true, CRC32 checksums will be validated for pages upon reading. 327 328 alloc *allocTracker 329 } 330 331 func (r *schema) ensureRoot() { 332 if r.root == nil { 333 r.root = &Column{ 334 index: 0, 335 name: "msg", 336 data: nil, 337 children: []*Column{}, 338 rep: 0, 339 maxR: 0, 340 maxD: 0, 341 element: nil, 342 alloc: r.alloc, 343 } 344 } 345 } 346 347 func (r *schema) SetSelectedColumns(cols ...ColumnPath) { 348 r.selectedColumns = cols 349 } 350 351 func (r *schema) isSelectedByPath(path ColumnPath) bool { 352 if len(r.selectedColumns) == 0 { 353 return true 354 } 355 356 for _, p := range r.selectedColumns { 357 if p.Equal(path) { 358 return true 359 } 360 361 if path.HasPrefix(p) { 362 return true 363 } 364 } 365 366 return false 367 } 368 369 func (r *schema) getSchemaArray() []*parquet.SchemaElement { 370 r.ensureRoot() 371 elem := r.root.getSchemaArray() 372 // the root doesn't have repetition type 373 elem[0].RepetitionType = nil 374 return elem 375 } 376 377 func (r *schema) Columns() []*Column { 378 var ret []*Column 379 var fn func([]*Column) 380 381 fn = func(columns []*Column) { 382 for i := range columns { 383 if columns[i].data != nil { 384 ret = append(ret, columns[i]) 385 } else { 386 fn(columns[i].children) 387 } 388 } 389 } 390 r.ensureRoot() 391 fn(r.root.children) 392 return ret 393 } 394 395 func (r *schema) GetColumnByName(path string) *Column { 396 data := r.Columns() 397 for i := range data { 398 if data[i].path.flatName() == path { 399 return data[i] 400 } 401 } 402 403 return nil 404 } 405 406 func (r *schema) GetColumnByPath(path ColumnPath) *Column { 407 return r.getColumnByPath(r.root, path) 408 } 409 410 func (r *schema) getColumnByPath(col *Column, path ColumnPath) *Column { 411 if len(path) == 0 { 412 return nil 413 } 414 415 for _, c := range col.children { 416 if c.name == path[0] { 417 if len(path) == 1 { 418 return c 419 } 420 return r.getColumnByPath(c, path[1:]) 421 } 422 } 423 424 return nil 425 } 426 427 // resetData is useful for resetting data after writing a chunk, to collect data for the next chunk 428 func (r *schema) resetData() { 429 data := r.Columns() 430 for i := range data { 431 data[i].data.reset(data[i].rep, data[i].maxR, data[i].maxD) 432 } 433 434 r.numRecords = 0 435 } 436 437 func (r *schema) setNumRecords(n int64) { 438 r.numRecords = n 439 } 440 441 func (r *schema) sortIndex() { 442 var ( 443 idx int 444 fn func(c *[]*Column) 445 ) 446 447 fn = func(c *[]*Column) { 448 if c == nil { 449 return 450 } 451 for data := range *c { 452 if (*c)[data].data != nil { 453 (*c)[data].index = idx 454 idx++ 455 } else { 456 fn(&(*c)[data].children) 457 } 458 } 459 } 460 r.ensureRoot() 461 fn(&r.root.children) 462 } 463 464 func (r *schema) SetSchemaDefinition(sd *parquetschema.SchemaDefinition) error { 465 r.schemaDef = sd 466 467 root, err := r.createColumnFromColumnDefinition(r.schemaDef.RootColumn) 468 if err != nil { 469 return err 470 } 471 472 r.root = root 473 474 for _, c := range r.root.children { 475 recursiveFix(c, ColumnPath{}, 0, 0, r.alloc) 476 } 477 478 return nil 479 } 480 481 func (r *schema) createColumnFromColumnDefinition(root *parquetschema.ColumnDefinition) (*Column, error) { 482 params := &ColumnParameters{ 483 LogicalType: root.SchemaElement.LogicalType, 484 ConvertedType: root.SchemaElement.ConvertedType, 485 TypeLength: root.SchemaElement.TypeLength, 486 FieldID: root.SchemaElement.FieldID, 487 Scale: root.SchemaElement.Scale, 488 Precision: root.SchemaElement.Precision, 489 } 490 491 col := &Column{ 492 name: root.SchemaElement.GetName(), 493 rep: root.SchemaElement.GetRepetitionType(), 494 params: params, 495 alloc: r.alloc, 496 } 497 498 if len(root.Children) > 0 { 499 for _, c := range root.Children { 500 childColumn, err := r.createColumnFromColumnDefinition(c) 501 if err != nil { 502 return nil, err 503 } 504 col.children = append(col.children, childColumn) 505 } 506 } else { 507 dataColumn, err := r.getColumnStore(root.SchemaElement, params) 508 if err != nil { 509 return nil, err 510 } 511 col.data = dataColumn 512 } 513 514 col.element = col.buildElement() 515 516 return col, nil 517 } 518 519 func (r *schema) getColumnStore(elem *parquet.SchemaElement, params *ColumnParameters) (*ColumnStore, error) { 520 if elem.Type == nil { 521 return nil, nil 522 } 523 524 var ( 525 colStore *ColumnStore 526 err error 527 ) 528 529 typ := elem.GetType() 530 531 switch typ { 532 case parquet.Type_BYTE_ARRAY: 533 colStore, err = NewByteArrayStore(parquet.Encoding_PLAIN, true, params) 534 case parquet.Type_FLOAT: 535 colStore, err = NewFloatStore(parquet.Encoding_PLAIN, true, params) 536 case parquet.Type_DOUBLE: 537 colStore, err = NewDoubleStore(parquet.Encoding_PLAIN, true, params) 538 case parquet.Type_BOOLEAN: 539 colStore, err = NewBooleanStore(parquet.Encoding_PLAIN, params) 540 case parquet.Type_INT32: 541 colStore, err = NewInt32Store(parquet.Encoding_PLAIN, true, params) 542 case parquet.Type_INT64: 543 colStore, err = NewInt64Store(parquet.Encoding_PLAIN, true, params) 544 case parquet.Type_INT96: 545 colStore, err = NewInt96Store(parquet.Encoding_PLAIN, true, params) 546 case parquet.Type_FIXED_LEN_BYTE_ARRAY: 547 colStore, err = NewFixedByteArrayStore(parquet.Encoding_PLAIN, true, params) 548 default: 549 return nil, fmt.Errorf("unsupported type %q when creating Column store", typ.String()) 550 } 551 if err != nil { 552 return nil, fmt.Errorf("creating Column store for type %q failed: %v", typ.String(), err) 553 } 554 555 colStore.maxPageSize = r.maxPageSize 556 557 return colStore, nil 558 } 559 560 // ColumnParameters contains common parameters related to a column. 561 type ColumnParameters struct { 562 LogicalType *parquet.LogicalType 563 ConvertedType *parquet.ConvertedType 564 TypeLength *int32 565 FieldID *int32 566 Scale *int32 567 Precision *int32 568 } 569 570 // NewDataColumn creates a new data column of the provided field repetition type, using 571 // the provided column store to write data. Do not use this function to create a group. 572 func NewDataColumn(store *ColumnStore, rep parquet.FieldRepetitionType) *Column { 573 return &Column{ 574 data: store, 575 children: nil, 576 rep: rep, 577 params: store.typedColumnStore.params(), 578 alloc: store.alloc, 579 } 580 } 581 582 // NewListColumn return a new LIST column, which is a group of converted type LIST 583 // with a repeated group named "list" as child which then contains a child which is 584 // the element column. 585 func NewListColumn(element *Column, rep parquet.FieldRepetitionType) (*Column, error) { 586 // the higher level element doesn't need name, but all lower level does. 587 element.name = "element" 588 return &Column{ 589 data: nil, 590 rep: rep, 591 parent: listParent, 592 children: []*Column{ 593 { 594 name: "list", 595 data: nil, 596 rep: parquet.FieldRepetitionType_REPEATED, 597 children: []*Column{element}, 598 }, 599 }, 600 params: &ColumnParameters{ 601 LogicalType: &parquet.LogicalType{ 602 LIST: parquet.NewListType(), 603 }, 604 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), 605 }, 606 alloc: element.alloc, 607 }, nil 608 } 609 610 // NewMapColumn returns a new MAP column, which is a group of converted type LIST 611 // with a repeated group named "key_value" of converted type MAP_KEY_VALUE. This 612 // group in turn contains two columns "key" and "value". 613 func NewMapColumn(key, value *Column, rep parquet.FieldRepetitionType) (*Column, error) { 614 // the higher level element doesn't need name, but all lower level does. 615 if key.rep != parquet.FieldRepetitionType_REQUIRED { 616 return nil, errors.New("the key repetition type should be REQUIRED") 617 } 618 619 key.name = "key" 620 value.name = "value" 621 return &Column{ 622 data: nil, 623 rep: rep, 624 parent: mapParent, 625 children: []*Column{ 626 { 627 name: "key_value", 628 data: nil, 629 rep: parquet.FieldRepetitionType_REPEATED, 630 children: []*Column{ 631 key, 632 value, 633 }, 634 params: &ColumnParameters{ 635 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_MAP_KEY_VALUE), 636 }, 637 }, 638 }, 639 params: &ColumnParameters{ 640 LogicalType: &parquet.LogicalType{ 641 MAP: parquet.NewMapType(), 642 }, 643 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_MAP), 644 }, 645 alloc: key.alloc, 646 }, nil 647 } 648 649 func (r *schema) AddGroupByPath(path ColumnPath, rep parquet.FieldRepetitionType) error { 650 return r.addColumnOrGroupByPath(path, &Column{ 651 children: []*Column{}, 652 data: nil, 653 rep: rep, 654 params: &ColumnParameters{}, 655 alloc: r.alloc, 656 }) 657 } 658 659 func (r *schema) AddColumn(path string, col *Column) error { 660 return r.addColumnOrGroupByPath(parseColumnPath(path), col) 661 } 662 663 func (r *schema) AddColumnByPath(path ColumnPath, col *Column) error { 664 return r.addColumnOrGroupByPath(path, col) 665 } 666 667 func recursiveFix(col *Column, colPath ColumnPath, maxR, maxD uint16, alloc *allocTracker) { 668 if col.alloc == nil { 669 col.alloc = alloc 670 } 671 if col.data != nil && col.data.alloc == nil { 672 col.data.alloc = alloc 673 } 674 675 if col.rep != parquet.FieldRepetitionType_REQUIRED { 676 maxD++ 677 } 678 if col.rep == parquet.FieldRepetitionType_REPEATED { 679 maxR++ 680 } 681 682 col.maxR = maxR 683 col.maxD = maxD 684 col.path = append(colPath, col.name) 685 if col.data != nil { 686 col.data.reset(col.rep, col.maxR, col.maxD) 687 return 688 } 689 690 for i := range col.children { 691 recursiveFix(col.children[i], col.path, maxR, maxD, alloc) 692 } 693 } 694 695 func (r *schema) addColumnOrGroupByPath(pa ColumnPath, col *Column) error { 696 if r.readOnly != 0 { 697 return errors.New("the schema is read only") 698 } 699 700 r.ensureRoot() 701 702 name := pa[len(pa)-1] 703 704 col.name = name 705 c := r.root 706 for i := 0; i < len(pa)-1; i++ { 707 found := false 708 if c.children == nil { 709 break 710 } 711 for j := range c.children { 712 if c.children[j].name == pa[i] { 713 found = true 714 c = c.children[j] 715 break 716 } 717 } 718 719 if !found { 720 return fmt.Errorf("path %s failed on %q", pa, pa[i]) 721 } 722 723 if c.parent != 0 { 724 return errors.New("can not add a new Column to a list or map logical type") 725 } 726 727 if c.children == nil && i < len(pa)-1 { 728 return fmt.Errorf("path %s is not parent at %q", pa, pa[i]) 729 } 730 } 731 732 if c.children == nil { 733 return errors.New("the children are nil") 734 } 735 736 recursiveFix(col, c.path, c.maxR, c.maxD, col.alloc) 737 738 c.children = append(c.children, col) 739 r.sortIndex() 740 741 return nil 742 } 743 744 func (r *schema) findDataColumn(path string) (*Column, error) { 745 pa := parseColumnPath(path) 746 r.ensureRoot() 747 c := r.root.children 748 var ret *Column 749 for i := 0; i < len(pa); i++ { 750 found := false 751 for j := range c { 752 if c[j].name == pa[i] { 753 found = true 754 ret = c[j] 755 c = c[j].children 756 break 757 } 758 } 759 if !found { 760 return nil, fmt.Errorf("path %s failed on %q", path, pa[i]) 761 } 762 if c == nil && i < len(pa)-1 { 763 return nil, fmt.Errorf("path %s is not parent at %q", path, pa[i]) 764 } 765 } 766 767 if ret == nil || ret.data == nil { 768 return nil, fmt.Errorf("path %s doesnt end on data", path) 769 } 770 771 return ret, nil 772 } 773 774 func (r *schema) AddData(m map[string]interface{}) error { 775 r.readOnly = 1 776 r.ensureRoot() 777 err := r.recursiveAddColumnData(r.root.children, m, 0, 0, 0) 778 if err != nil { 779 return err 780 } 781 782 if err := r.recursiveFlushPages(r.root.children); err != nil { 783 return err 784 } 785 786 r.numRecords++ 787 return nil 788 } 789 790 func (r *schema) getData() (map[string]interface{}, error) { 791 d, _, err := r.root.getData() 792 if err != nil { 793 return nil, err 794 } 795 if d.(map[string]interface{}) == nil { 796 d = make(map[string]interface{}) // just non nil root doc 797 } 798 799 return d.(map[string]interface{}), nil 800 } 801 802 func (r *schema) recursiveAddColumnNil(c []*Column, defLvl, maxRepLvl uint16, repLvl uint16) error { 803 for i := range c { 804 if c[i].data != nil { 805 if c[i].rep == parquet.FieldRepetitionType_REQUIRED && defLvl == c[i].maxD { 806 return fmt.Errorf("the value %q is required", c[i].path.flatName()) 807 } 808 if err := c[i].data.add(nil, defLvl, maxRepLvl, repLvl); err != nil { 809 return err 810 } 811 } 812 if c[i].children != nil { 813 if err := r.recursiveAddColumnNil(c[i].children, defLvl, maxRepLvl, repLvl); err != nil { 814 return err 815 } 816 } 817 } 818 return nil 819 } 820 821 func (r *schema) recursiveFlushPages(c []*Column) error { 822 for i := range c { 823 if c[i].data != nil { 824 if err := c[i].data.flushPage(r, false); err != nil { 825 return err 826 } 827 } 828 if c[i].children != nil { 829 if err := r.recursiveFlushPages(c[i].children); err != nil { 830 return err 831 } 832 } 833 } 834 return nil 835 } 836 837 func (r *schema) recursiveAddColumnData(c []*Column, m interface{}, defLvl uint16, maxRepLvl uint16, repLvl uint16) error { 838 var data = m.(map[string]interface{}) 839 for i := range c { 840 d := data[c[i].name] 841 if c[i].data != nil { 842 if err := c[i].data.add(d, defLvl, maxRepLvl, repLvl); err != nil { 843 return err 844 } 845 } 846 if c[i].children != nil { 847 l := defLvl 848 // In case of required value, there is no need to add a definition value, since it should be there always, 849 // also for nil value, it means we should skip from this level to the lowest level 850 if c[i].rep != parquet.FieldRepetitionType_REQUIRED && d != nil { 851 l++ 852 } 853 854 switch v := d.(type) { 855 case nil: 856 if err := r.recursiveAddColumnNil(c[i].children, l, maxRepLvl, repLvl); err != nil { 857 return err 858 } 859 case map[string]interface{}: // Not repeated 860 if c[i].rep == parquet.FieldRepetitionType_REPEATED { 861 return fmt.Errorf("repeated group should be array") 862 } 863 if err := r.recursiveAddColumnData(c[i].children, v, l, maxRepLvl, repLvl); err != nil { 864 return err 865 } 866 case []map[string]interface{}: 867 if c[i].rep != parquet.FieldRepetitionType_REPEATED { 868 return fmt.Errorf("no repeated group should not be array") 869 } 870 m := maxRepLvl + 1 871 rL := repLvl 872 if len(v) == 0 { 873 return r.recursiveAddColumnNil(c[i].children, l, m, rL) 874 } 875 for vi := range v { 876 if vi > 0 { 877 rL = m 878 } 879 if err := r.recursiveAddColumnData(c[i].children, v[vi], l, m, rL); err != nil { 880 return err 881 } 882 } 883 884 default: 885 return fmt.Errorf("data is not a map or array of map, its a %T", v) 886 } 887 } 888 } 889 890 return nil 891 } 892 893 func (c *Column) readColumnSchema(schema []*parquet.SchemaElement, path ColumnPath, idx int, dLevel, rLevel uint16) (int, error) { 894 s := schema[idx] 895 896 if s.Name == "" { 897 return 0, fmt.Errorf("name in schema on index %d is empty", idx) 898 } 899 900 if s.RepetitionType == nil { 901 return 0, fmt.Errorf("field RepetitionType is nil in index %d", idx) 902 } 903 904 if *s.RepetitionType != parquet.FieldRepetitionType_REQUIRED { 905 dLevel++ 906 } 907 908 if *s.RepetitionType == parquet.FieldRepetitionType_REPEATED { 909 rLevel++ 910 } 911 912 c.element = s 913 c.maxR = rLevel 914 c.maxD = dLevel 915 data, err := getValuesStore(s, c.alloc) 916 if err != nil { 917 return 0, err 918 } 919 c.rep = *s.RepetitionType 920 c.data = data 921 c.path = append(path, s.Name) 922 c.name = s.Name 923 return idx + 1, nil 924 } 925 926 func (c *Column) readGroupSchema(schema []*parquet.SchemaElement, path ColumnPath, idx int, dLevel, rLevel uint16) (int, error) { 927 if len(schema) <= idx { 928 return 0, errors.New("schema index out of bound") 929 } 930 931 s := schema[idx] 932 if s.Type != nil { 933 return 0, fmt.Errorf("field Type is not nil in index %d", idx) 934 } 935 if s.NumChildren == nil { 936 return 0, fmt.Errorf("the field NumChildren is invalid in index %d", idx) 937 } 938 939 if *s.NumChildren <= 0 { 940 return 0, fmt.Errorf("the field NumChildren is zero in index %d", idx) 941 } 942 l := int(*s.NumChildren) 943 944 if len(schema) <= idx+l { 945 return 0, fmt.Errorf("not enough element in the schema list in index %d", idx) 946 } 947 948 if s.RepetitionType != nil && *s.RepetitionType != parquet.FieldRepetitionType_REQUIRED { 949 dLevel++ 950 } 951 952 if s.RepetitionType != nil && *s.RepetitionType == parquet.FieldRepetitionType_REPEATED { 953 rLevel++ 954 } 955 956 c.maxD = dLevel 957 c.maxR = rLevel 958 959 c.path = append(path, s.Name) 960 c.name = s.Name 961 c.element = s 962 c.children = make([]*Column, 0, l) 963 c.rep = s.GetRepetitionType() 964 965 var err error 966 idx++ // move idx from this group to next 967 for i := 0; i < l; i++ { 968 if len(schema) <= idx { 969 return 0, fmt.Errorf("schema index %d is out of bounds", idx) 970 } 971 if schema[idx].Type == nil { 972 // another group 973 child := &Column{alloc: c.alloc} 974 idx, err = child.readGroupSchema(schema, c.path, idx, dLevel, rLevel) 975 if err != nil { 976 return 0, err 977 } 978 c.children = append(c.children, child) 979 } else { 980 child := &Column{alloc: c.alloc} 981 idx, err = child.readColumnSchema(schema, c.path, idx, dLevel, rLevel) 982 if err != nil { 983 return 0, err 984 } 985 c.children = append(c.children, child) 986 } 987 } 988 989 return idx, nil 990 } 991 992 func (r *schema) readSchema(schema []*parquet.SchemaElement) error { 993 r.readOnly = 1 994 var err error 995 for idx := 0; idx < len(schema); { 996 if schema[idx].Type == nil { 997 c := &Column{alloc: r.alloc} 998 idx, err = c.readGroupSchema(schema, ColumnPath{}, idx, 0, 0) 999 if err != nil { 1000 return err 1001 } 1002 r.root.children = append(r.root.children, c) 1003 } else { 1004 c := &Column{alloc: r.alloc} 1005 idx, err = c.readColumnSchema(schema, ColumnPath{}, idx, 0, 0) 1006 if err != nil { 1007 return err 1008 } 1009 r.root.children = append(r.root.children, c) 1010 } 1011 } 1012 r.sortIndex() 1013 r.schemaDef = parquetschema.SchemaDefinitionFromColumnDefinition(createColumnDefinitionFromColumn(r.root)) 1014 return nil 1015 } 1016 1017 func createColumnDefinitionFromColumn(c *Column) *parquetschema.ColumnDefinition { 1018 col := &parquetschema.ColumnDefinition{ 1019 SchemaElement: c.Element(), 1020 } 1021 1022 for _, child := range c.Children() { 1023 col.Children = append(col.Children, createColumnDefinitionFromColumn(child)) 1024 } 1025 1026 return col 1027 } 1028 1029 func (r *schema) GetSchemaDefinition() *parquetschema.SchemaDefinition { 1030 return r.schemaDef 1031 } 1032 1033 // DataSize return the size of data stored in the schema right now 1034 func (r *schema) DataSize() int64 { 1035 cols := r.Columns() 1036 var size int64 1037 for i := range cols { 1038 size += cols[i].getDataSize() 1039 } 1040 1041 return size 1042 } 1043 1044 func (r *schema) rowGroupNumRecords() int64 { 1045 return r.numRecords 1046 } 1047 1048 func makeSchema(meta *parquet.FileMetaData, validateCRC bool, alloc *allocTracker) (*schema, error) { 1049 if len(meta.Schema) < 1 { 1050 return nil, errors.New("no schema element found") 1051 } 1052 s := &schema{ 1053 root: &Column{ 1054 index: 0, 1055 name: meta.Schema[0].Name, 1056 data: nil, 1057 children: make([]*Column, 0, len(meta.Schema)-1), 1058 rep: 0, 1059 maxR: 0, 1060 maxD: 0, 1061 element: meta.Schema[0], 1062 params: &ColumnParameters{ 1063 LogicalType: meta.Schema[0].LogicalType, 1064 ConvertedType: meta.Schema[0].ConvertedType, 1065 TypeLength: meta.Schema[0].TypeLength, 1066 FieldID: meta.Schema[0].FieldID, 1067 }, 1068 alloc: alloc, 1069 }, 1070 validateCRC: validateCRC, 1071 alloc: alloc, 1072 } 1073 err := s.readSchema(meta.Schema[1:]) 1074 if err != nil { 1075 return nil, err 1076 } 1077 1078 return s, nil 1079 }