github.com/fraugster/parquet-go@v0.12.0/schema.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"strings"
     7  
     8  	"github.com/fraugster/parquet-go/parquet"
     9  	"github.com/fraugster/parquet-go/parquetschema"
    10  )
    11  
// Values stored in Column.parent. The zero value (skipped below with the
// blank identifier) is what every column other than the top-level group built
// by NewListColumn or NewMapColumn carries.
const (
	_ int = iota
	listParent // set by NewListColumn on the outer LIST group
	mapParent  // set by NewMapColumn on the outer MAP group
)
    17  
// Column is composed of a schema definition for the column, a column store
// that contains the implementation to write the data to a parquet file, and
// any additional parameters that are necessary to correctly write the data.
// Please use the NewDataColumn, NewListColumn or NewMapColumn functions to
// create a Column object correctly.
type Column struct {
	index int
	name  string
	path  ColumnPath

	// One of the following should be non-nil: data for a data column,
	// children for a group.
	data     *ColumnStore
	children []*Column

	rep parquet.FieldRepetitionType

	// Maximum repetition and definition levels of this column.
	maxR, maxD uint16

	parent int // zero (no special parent), listParent or mapParent
	// for the reader we should read this element from the meta, for the writer we need to build this element
	element *parquet.SchemaElement

	params *ColumnParameters

	alloc *allocTracker
}
    44  
    45  // ColumnPath describes the path through the hierarchy of the schema for a particular column. For a top-level
    46  // column of the schema, the column path only contains one element, while for nested columns, the path consists
    47  // of multiple elements.
    48  type ColumnPath []string
    49  
    50  func parseColumnPath(s string) ColumnPath {
    51  	return strings.Split(s, ".")
    52  }
    53  
    54  func (c ColumnPath) flatName() string {
    55  	return strings.Join(c, ".")
    56  }
    57  
    58  // Equal returns true if all path elements of this ColumnPath are equal to the
    59  // corresponding path elements of the ColumnPath provided as parameter, false
    60  // otherwise.
    61  func (c ColumnPath) Equal(d ColumnPath) bool {
    62  	if len(c) != len(d) {
    63  		return false
    64  	}
    65  	for i := range c {
    66  		if c[i] != d[i] {
    67  			return false
    68  		}
    69  	}
    70  	return true
    71  }
    72  
    73  // HasPrefix returns true if all path elements of the ColumnPath provided as parameter
    74  // are equal to the corresponding path elements of this ColumnPath.
    75  func (c ColumnPath) HasPrefix(d ColumnPath) bool {
    76  	if len(d) > len(c) {
    77  		return false
    78  	}
    79  	for i := range d {
    80  		if c[i] != d[i] {
    81  			return false
    82  		}
    83  	}
    84  	return true
    85  }
    86  
// Children returns the column's child columns. The returned slice is the
// column's own slice, not a copy.
func (c *Column) Children() []*Column {
	return c.children
}
    91  
    92  func (c *Column) getSchemaArray() []*parquet.SchemaElement {
    93  	ret := []*parquet.SchemaElement{c.Element()}
    94  	if c.data != nil {
    95  		return ret
    96  	}
    97  
    98  	for i := range c.children {
    99  		ret = append(ret, c.children[i].getSchemaArray()...)
   100  	}
   101  
   102  	return ret
   103  }
   104  
// MaxDefinitionLevel returns the maximum definition level for this column,
// as stored in the column's maxD field.
func (c *Column) MaxDefinitionLevel() uint16 {
	return c.maxD
}
   109  
// MaxRepetitionLevel returns the maximum repetition level for this column,
// as stored in the column's maxR field.
func (c *Column) MaxRepetitionLevel() uint16 {
	return c.maxR
}
   114  
   115  // FlatName returns the name of the column and its parents in dotted notation.
   116  //
   117  // Deprecated: use Path instead. If a column or group name contains '.', the returned
   118  // flat name cannot be used to properly address them.
   119  func (c *Column) FlatName() string {
   120  	return c.path.flatName()
   121  }
   122  
// Path returns the full column path of the column. The returned slice is the
// column's own path, not a copy.
func (c *Column) Path() ColumnPath {
	return c.path
}
   127  
// Name returns the column name, i.e. the name of this node without its
// parents.
func (c *Column) Name() string {
	return c.name
}
   132  
// Index returns the zero-based index of the column in the schema, as stored
// in the column's index field.
func (c *Column) Index() int {
	return c.index
}
   137  
   138  // Element returns schema element definition of the column.
   139  func (c *Column) Element() *parquet.SchemaElement {
   140  	if c.element == nil {
   141  		// If this is a no-element node, we need to re-create element every time to make sure the content is always up-to-date
   142  		return c.buildElement()
   143  	}
   144  	return c.element
   145  }
   146  
   147  // Type returns the parquet type of the value. If the column is a group, then the
   148  // method will return nil.
   149  func (c *Column) Type() *parquet.Type {
   150  	if c.data == nil {
   151  		return nil
   152  	}
   153  
   154  	return parquet.TypePtr(c.data.parquetType())
   155  }
   156  
// RepetitionType returns the repetition type for the current column. The
// returned pointer refers to the column's own field, not to a copy.
func (c *Column) RepetitionType() *parquet.FieldRepetitionType {
	return &c.rep
}
   161  
// DataColumn returns true if the column is a data column (it has a column
// store), false otherwise.
func (c *Column) DataColumn() bool {
	return c.data != nil
}
   166  
   167  // ChildrenCount returns the number of children in a group. If the column is
   168  // a data column, it returns -1.
   169  func (c *Column) ChildrenCount() int {
   170  	if c.data != nil {
   171  		return -1
   172  	}
   173  
   174  	return len(c.children)
   175  }
   176  
// getColumnStore returns the column store of this column; it is nil for
// columns that have no store (groups).
func (c *Column) getColumnStore() *ColumnStore {
	return c.data
}
   180  
   181  func (c *Column) buildElement() *parquet.SchemaElement {
   182  	rep := c.rep
   183  	elem := &parquet.SchemaElement{
   184  		RepetitionType: &rep,
   185  		Name:           c.name,
   186  	}
   187  
   188  	if c.params != nil {
   189  		elem.FieldID = c.params.FieldID
   190  		elem.ConvertedType = c.params.ConvertedType
   191  		elem.LogicalType = c.params.LogicalType
   192  	}
   193  
   194  	if c.data != nil {
   195  		elem.Type = parquet.TypePtr(c.data.parquetType())
   196  		elem.TypeLength = c.params.TypeLength
   197  		elem.Scale = c.params.Scale
   198  		elem.Precision = c.params.Precision
   199  	} else {
   200  		nc := int32(len(c.children))
   201  		elem.NumChildren = &nc
   202  	}
   203  
   204  	return elem
   205  }
   206  
   207  func (c *Column) getDataSize() int64 {
   208  	if _, ok := c.data.typedColumnStore.(*booleanStore); ok {
   209  		// Booleans are stored in one bit, so the result is the number of items / 8
   210  		return int64(c.data.values.numValues())/8 + 1
   211  	}
   212  	_, dataSize := c.data.values.sizes()
   213  	return dataSize
   214  }
   215  
// getNextData reads one value from every child of this group and assembles
// them into a map keyed by child name.
//
// It returns an error when called on a column whose children slice is nil.
// When no child produced a value and no child reported a definition level
// indicating that this group itself is present, it returns a nil map together
// with the highest definition level reported by the children. Otherwise it
// returns the assembled map and this column's own maximum definition level.
func (c *Column) getNextData() (map[string]interface{}, int32, error) {
	if c.children == nil {
		return nil, 0, errors.New("bug: call getNextData on non group node")
	}
	ret := make(map[string]interface{})
	// notNil counts the evidence that this group is present: non-nil child
	// values and children that are nil exactly one level below themselves.
	notNil := 0
	// maxD tracks the highest definition level returned by any child.
	var maxD int32
	for i := range c.children {
		data, dl, err := c.children[i].getData()
		if err != nil {
			return nil, 0, err
		}
		if dl > maxD {
			maxD = dl
		}

		// A nil map wrapped in an interface value is not equal to nil, so
		// normalize it. See https://golang.org/doc/faq#nil_error
		if m, ok := data.(map[string]interface{}); ok && m == nil {
			data = nil
		}

		// If the data is not nil, it is simply stored. If it is nil, we need to know at which
		// definition level the nil occurred: if it is exactly the level of the child's parent
		// (checked below), then the parent, i.e. this group, is present.
		if data != nil {
			ret[c.children[i].name] = data
			// Only values of data columns are registered with the allocation tracker.
			if c.children[i].data != nil {
				c.alloc.register(data, uint64(c.children[i].data.sizeOf(data)))
			}
			notNil++
		}
		// A non-required child adds one definition level of its own, so its
		// parent's level is the child's maximum level minus one.
		var diff int32
		if c.children[i].rep != parquet.FieldRepetitionType_REQUIRED {
			diff++
		}
		if dl == int32(c.children[i].maxD)-diff {
			notNil++
		}
	}

	if notNil == 0 {
		return nil, maxD, nil
	}

	return ret, int32(c.maxD), nil
}
   261  
   262  func (c *Column) getFirstRDLevel() (int32, int32, bool) {
   263  	if c.data != nil {
   264  		return c.data.getRDLevelAt(-1)
   265  	}
   266  
   267  	// there should be at lease 1 child,
   268  	for i := range c.children {
   269  		rl, dl, last := c.children[i].getFirstRDLevel()
   270  		if last {
   271  			return rl, dl, last
   272  		}
   273  
   274  		// if this value is not nil, rLevel or dLevel less than this level is not interesting
   275  		if rl >= int32(c.children[i].maxR) || dl >= int32(c.children[i].maxD) {
   276  			return rl, dl, last
   277  		}
   278  	}
   279  
   280  	return -1, -1, false
   281  }
   282  
// getData reads the next value of this column together with a definition
// level.
//
// For a data column it delegates to the column store. For a group it reads one
// entry via getNextData; a group that is not REPEATED (or whose entry is nil)
// returns that entry directly as a map. A REPEATED group returns a slice of
// maps, collecting further entries for as long as getFirstRDLevel indicates
// that the next value still belongs to the same repeated object.
func (c *Column) getData() (interface{}, int32, error) {
	if c.children != nil {
		data, maxD, err := c.getNextData()
		if err != nil {
			return nil, 0, err
		}

		if c.rep != parquet.FieldRepetitionType_REPEATED || data == nil {
			return data, maxD, nil
		}

		ret := []map[string]interface{}{data}
		for {
			rl, _, last := c.getFirstRDLevel()
			// Stop when the data is exhausted, when the next repetition level
			// is below this column's level, or when it is 0.
			if last || rl < int32(c.maxR) || rl == 0 {
				// end of this object
				return ret, maxD, nil
			}

			// Note: data and err are deliberately scoped to this iteration;
			// maxD still refers to the level of the first entry.
			data, _, err := c.getNextData()
			if err != nil {
				return nil, maxD, err
			}

			ret = append(ret, data)
		}
	}

	return c.data.get(int32(c.maxD), int32(c.maxR))
}
   313  
// schema holds the tree of columns of a parquet file together with the
// settings that apply while reading or writing it.
type schema struct {
	schemaDef *parquetschema.SchemaDefinition
	// root is the root group of the column tree; it is created lazily by
	// ensureRoot when it has not been set otherwise.
	root *Column
	// numRecords is incremented by AddData and set back to 0 by resetData.
	numRecords int64
	// readOnly is non-zero once columns may no longer be added; it is set to
	// 1 by AddData and readSchema.
	readOnly int

	// maxPageSize is copied into every column store created by getColumnStore.
	maxPageSize int64

	// selected columns in reading. if the size is zero, it means all the columns
	selectedColumns []ColumnPath

	enableCRC   bool // if true, CRC32 checksums will be computed for pages upon writing.
	validateCRC bool // if true, CRC32 checksums will be validated for pages upon reading.

	alloc *allocTracker
}
   330  
   331  func (r *schema) ensureRoot() {
   332  	if r.root == nil {
   333  		r.root = &Column{
   334  			index:    0,
   335  			name:     "msg",
   336  			data:     nil,
   337  			children: []*Column{},
   338  			rep:      0,
   339  			maxR:     0,
   340  			maxD:     0,
   341  			element:  nil,
   342  			alloc:    r.alloc,
   343  		}
   344  	}
   345  }
   346  
// SetSelectedColumns replaces the list of selected column paths. Calling it
// without arguments leaves the list empty, which isSelectedByPath treats as
// "all columns selected".
func (r *schema) SetSelectedColumns(cols ...ColumnPath) {
	r.selectedColumns = cols
}
   350  
   351  func (r *schema) isSelectedByPath(path ColumnPath) bool {
   352  	if len(r.selectedColumns) == 0 {
   353  		return true
   354  	}
   355  
   356  	for _, p := range r.selectedColumns {
   357  		if p.Equal(path) {
   358  			return true
   359  		}
   360  
   361  		if path.HasPrefix(p) {
   362  			return true
   363  		}
   364  	}
   365  
   366  	return false
   367  }
   368  
   369  func (r *schema) getSchemaArray() []*parquet.SchemaElement {
   370  	r.ensureRoot()
   371  	elem := r.root.getSchemaArray()
   372  	// the root doesn't have repetition type
   373  	elem[0].RepetitionType = nil
   374  	return elem
   375  }
   376  
   377  func (r *schema) Columns() []*Column {
   378  	var ret []*Column
   379  	var fn func([]*Column)
   380  
   381  	fn = func(columns []*Column) {
   382  		for i := range columns {
   383  			if columns[i].data != nil {
   384  				ret = append(ret, columns[i])
   385  			} else {
   386  				fn(columns[i].children)
   387  			}
   388  		}
   389  	}
   390  	r.ensureRoot()
   391  	fn(r.root.children)
   392  	return ret
   393  }
   394  
   395  func (r *schema) GetColumnByName(path string) *Column {
   396  	data := r.Columns()
   397  	for i := range data {
   398  		if data[i].path.flatName() == path {
   399  			return data[i]
   400  		}
   401  	}
   402  
   403  	return nil
   404  }
   405  
   406  func (r *schema) GetColumnByPath(path ColumnPath) *Column {
   407  	return r.getColumnByPath(r.root, path)
   408  }
   409  
   410  func (r *schema) getColumnByPath(col *Column, path ColumnPath) *Column {
   411  	if len(path) == 0 {
   412  		return nil
   413  	}
   414  
   415  	for _, c := range col.children {
   416  		if c.name == path[0] {
   417  			if len(path) == 1 {
   418  				return c
   419  			}
   420  			return r.getColumnByPath(c, path[1:])
   421  		}
   422  	}
   423  
   424  	return nil
   425  }
   426  
   427  // resetData is useful for resetting data after writing a chunk, to collect data for the next chunk
   428  func (r *schema) resetData() {
   429  	data := r.Columns()
   430  	for i := range data {
   431  		data[i].data.reset(data[i].rep, data[i].maxR, data[i].maxD)
   432  	}
   433  
   434  	r.numRecords = 0
   435  }
   436  
// setNumRecords overwrites the schema's record counter with n.
func (r *schema) setNumRecords(n int64) {
	r.numRecords = n
}
   440  
   441  func (r *schema) sortIndex() {
   442  	var (
   443  		idx int
   444  		fn  func(c *[]*Column)
   445  	)
   446  
   447  	fn = func(c *[]*Column) {
   448  		if c == nil {
   449  			return
   450  		}
   451  		for data := range *c {
   452  			if (*c)[data].data != nil {
   453  				(*c)[data].index = idx
   454  				idx++
   455  			} else {
   456  				fn(&(*c)[data].children)
   457  			}
   458  		}
   459  	}
   460  	r.ensureRoot()
   461  	fn(&r.root.children)
   462  }
   463  
   464  func (r *schema) SetSchemaDefinition(sd *parquetschema.SchemaDefinition) error {
   465  	r.schemaDef = sd
   466  
   467  	root, err := r.createColumnFromColumnDefinition(r.schemaDef.RootColumn)
   468  	if err != nil {
   469  		return err
   470  	}
   471  
   472  	r.root = root
   473  
   474  	for _, c := range r.root.children {
   475  		recursiveFix(c, ColumnPath{}, 0, 0, r.alloc)
   476  	}
   477  
   478  	return nil
   479  }
   480  
   481  func (r *schema) createColumnFromColumnDefinition(root *parquetschema.ColumnDefinition) (*Column, error) {
   482  	params := &ColumnParameters{
   483  		LogicalType:   root.SchemaElement.LogicalType,
   484  		ConvertedType: root.SchemaElement.ConvertedType,
   485  		TypeLength:    root.SchemaElement.TypeLength,
   486  		FieldID:       root.SchemaElement.FieldID,
   487  		Scale:         root.SchemaElement.Scale,
   488  		Precision:     root.SchemaElement.Precision,
   489  	}
   490  
   491  	col := &Column{
   492  		name:   root.SchemaElement.GetName(),
   493  		rep:    root.SchemaElement.GetRepetitionType(),
   494  		params: params,
   495  		alloc:  r.alloc,
   496  	}
   497  
   498  	if len(root.Children) > 0 {
   499  		for _, c := range root.Children {
   500  			childColumn, err := r.createColumnFromColumnDefinition(c)
   501  			if err != nil {
   502  				return nil, err
   503  			}
   504  			col.children = append(col.children, childColumn)
   505  		}
   506  	} else {
   507  		dataColumn, err := r.getColumnStore(root.SchemaElement, params)
   508  		if err != nil {
   509  			return nil, err
   510  		}
   511  		col.data = dataColumn
   512  	}
   513  
   514  	col.element = col.buildElement()
   515  
   516  	return col, nil
   517  }
   518  
   519  func (r *schema) getColumnStore(elem *parquet.SchemaElement, params *ColumnParameters) (*ColumnStore, error) {
   520  	if elem.Type == nil {
   521  		return nil, nil
   522  	}
   523  
   524  	var (
   525  		colStore *ColumnStore
   526  		err      error
   527  	)
   528  
   529  	typ := elem.GetType()
   530  
   531  	switch typ {
   532  	case parquet.Type_BYTE_ARRAY:
   533  		colStore, err = NewByteArrayStore(parquet.Encoding_PLAIN, true, params)
   534  	case parquet.Type_FLOAT:
   535  		colStore, err = NewFloatStore(parquet.Encoding_PLAIN, true, params)
   536  	case parquet.Type_DOUBLE:
   537  		colStore, err = NewDoubleStore(parquet.Encoding_PLAIN, true, params)
   538  	case parquet.Type_BOOLEAN:
   539  		colStore, err = NewBooleanStore(parquet.Encoding_PLAIN, params)
   540  	case parquet.Type_INT32:
   541  		colStore, err = NewInt32Store(parquet.Encoding_PLAIN, true, params)
   542  	case parquet.Type_INT64:
   543  		colStore, err = NewInt64Store(parquet.Encoding_PLAIN, true, params)
   544  	case parquet.Type_INT96:
   545  		colStore, err = NewInt96Store(parquet.Encoding_PLAIN, true, params)
   546  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   547  		colStore, err = NewFixedByteArrayStore(parquet.Encoding_PLAIN, true, params)
   548  	default:
   549  		return nil, fmt.Errorf("unsupported type %q when creating Column store", typ.String())
   550  	}
   551  	if err != nil {
   552  		return nil, fmt.Errorf("creating Column store for type %q failed: %v", typ.String(), err)
   553  	}
   554  
   555  	colStore.maxPageSize = r.maxPageSize
   556  
   557  	return colStore, nil
   558  }
   559  
// ColumnParameters contains common parameters related to a column. All fields
// are optional pointers; they are copied into the column's schema element
// when that element is built.
type ColumnParameters struct {
	LogicalType   *parquet.LogicalType
	ConvertedType *parquet.ConvertedType
	TypeLength    *int32
	FieldID       *int32
	Scale         *int32
	Precision     *int32
}
   569  
   570  // NewDataColumn creates a new data column of the provided field repetition type, using
   571  // the provided column store to write data. Do not use this function to create a group.
   572  func NewDataColumn(store *ColumnStore, rep parquet.FieldRepetitionType) *Column {
   573  	return &Column{
   574  		data:     store,
   575  		children: nil,
   576  		rep:      rep,
   577  		params:   store.typedColumnStore.params(),
   578  		alloc:    store.alloc,
   579  	}
   580  }
   581  
   582  // NewListColumn return a new LIST column, which is a group of converted type LIST
   583  // with a repeated group named "list" as child which then contains a child which is
   584  // the element column.
   585  func NewListColumn(element *Column, rep parquet.FieldRepetitionType) (*Column, error) {
   586  	// the higher level element doesn't need name, but all lower level does.
   587  	element.name = "element"
   588  	return &Column{
   589  		data:   nil,
   590  		rep:    rep,
   591  		parent: listParent,
   592  		children: []*Column{
   593  			{
   594  				name:     "list",
   595  				data:     nil,
   596  				rep:      parquet.FieldRepetitionType_REPEATED,
   597  				children: []*Column{element},
   598  			},
   599  		},
   600  		params: &ColumnParameters{
   601  			LogicalType: &parquet.LogicalType{
   602  				LIST: parquet.NewListType(),
   603  			},
   604  			ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
   605  		},
   606  		alloc: element.alloc,
   607  	}, nil
   608  }
   609  
   610  // NewMapColumn returns a new MAP column, which is a group of converted type LIST
   611  // with a repeated group named "key_value" of converted type MAP_KEY_VALUE. This
   612  // group in turn contains two columns "key" and "value".
   613  func NewMapColumn(key, value *Column, rep parquet.FieldRepetitionType) (*Column, error) {
   614  	// the higher level element doesn't need name, but all lower level does.
   615  	if key.rep != parquet.FieldRepetitionType_REQUIRED {
   616  		return nil, errors.New("the key repetition type should be REQUIRED")
   617  	}
   618  
   619  	key.name = "key"
   620  	value.name = "value"
   621  	return &Column{
   622  		data:   nil,
   623  		rep:    rep,
   624  		parent: mapParent,
   625  		children: []*Column{
   626  			{
   627  				name: "key_value",
   628  				data: nil,
   629  				rep:  parquet.FieldRepetitionType_REPEATED,
   630  				children: []*Column{
   631  					key,
   632  					value,
   633  				},
   634  				params: &ColumnParameters{
   635  					ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_MAP_KEY_VALUE),
   636  				},
   637  			},
   638  		},
   639  		params: &ColumnParameters{
   640  			LogicalType: &parquet.LogicalType{
   641  				MAP: parquet.NewMapType(),
   642  			},
   643  			ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
   644  		},
   645  		alloc: key.alloc,
   646  	}, nil
   647  }
   648  
   649  func (r *schema) AddGroupByPath(path ColumnPath, rep parquet.FieldRepetitionType) error {
   650  	return r.addColumnOrGroupByPath(path, &Column{
   651  		children: []*Column{},
   652  		data:     nil,
   653  		rep:      rep,
   654  		params:   &ColumnParameters{},
   655  		alloc:    r.alloc,
   656  	})
   657  }
   658  
// AddColumn adds col to the schema at the location given by the dotted path,
// which is split on '.' into its path elements. Use AddColumnByPath when a
// name itself contains a '.'.
func (r *schema) AddColumn(path string, col *Column) error {
	return r.addColumnOrGroupByPath(parseColumnPath(path), col)
}
   662  
// AddColumnByPath adds col to the schema at the location given by path. The
// last path element becomes the name of the column.
func (r *schema) AddColumnByPath(path ColumnPath, col *Column) error {
	return r.addColumnOrGroupByPath(path, col)
}
   666  
   667  func recursiveFix(col *Column, colPath ColumnPath, maxR, maxD uint16, alloc *allocTracker) {
   668  	if col.alloc == nil {
   669  		col.alloc = alloc
   670  	}
   671  	if col.data != nil && col.data.alloc == nil {
   672  		col.data.alloc = alloc
   673  	}
   674  
   675  	if col.rep != parquet.FieldRepetitionType_REQUIRED {
   676  		maxD++
   677  	}
   678  	if col.rep == parquet.FieldRepetitionType_REPEATED {
   679  		maxR++
   680  	}
   681  
   682  	col.maxR = maxR
   683  	col.maxD = maxD
   684  	col.path = append(colPath, col.name)
   685  	if col.data != nil {
   686  		col.data.reset(col.rep, col.maxR, col.maxD)
   687  		return
   688  	}
   689  
   690  	for i := range col.children {
   691  		recursiveFix(col.children[i], col.path, maxR, maxD, alloc)
   692  	}
   693  }
   694  
   695  func (r *schema) addColumnOrGroupByPath(pa ColumnPath, col *Column) error {
   696  	if r.readOnly != 0 {
   697  		return errors.New("the schema is read only")
   698  	}
   699  
   700  	r.ensureRoot()
   701  
   702  	name := pa[len(pa)-1]
   703  
   704  	col.name = name
   705  	c := r.root
   706  	for i := 0; i < len(pa)-1; i++ {
   707  		found := false
   708  		if c.children == nil {
   709  			break
   710  		}
   711  		for j := range c.children {
   712  			if c.children[j].name == pa[i] {
   713  				found = true
   714  				c = c.children[j]
   715  				break
   716  			}
   717  		}
   718  
   719  		if !found {
   720  			return fmt.Errorf("path %s failed on %q", pa, pa[i])
   721  		}
   722  
   723  		if c.parent != 0 {
   724  			return errors.New("can not add a new Column to a list or map logical type")
   725  		}
   726  
   727  		if c.children == nil && i < len(pa)-1 {
   728  			return fmt.Errorf("path %s is not parent at %q", pa, pa[i])
   729  		}
   730  	}
   731  
   732  	if c.children == nil {
   733  		return errors.New("the children are nil")
   734  	}
   735  
   736  	recursiveFix(col, c.path, c.maxR, c.maxD, col.alloc)
   737  
   738  	c.children = append(c.children, col)
   739  	r.sortIndex()
   740  
   741  	return nil
   742  }
   743  
   744  func (r *schema) findDataColumn(path string) (*Column, error) {
   745  	pa := parseColumnPath(path)
   746  	r.ensureRoot()
   747  	c := r.root.children
   748  	var ret *Column
   749  	for i := 0; i < len(pa); i++ {
   750  		found := false
   751  		for j := range c {
   752  			if c[j].name == pa[i] {
   753  				found = true
   754  				ret = c[j]
   755  				c = c[j].children
   756  				break
   757  			}
   758  		}
   759  		if !found {
   760  			return nil, fmt.Errorf("path %s failed on %q", path, pa[i])
   761  		}
   762  		if c == nil && i < len(pa)-1 {
   763  			return nil, fmt.Errorf("path %s is not parent at %q", path, pa[i])
   764  		}
   765  	}
   766  
   767  	if ret == nil || ret.data == nil {
   768  		return nil, fmt.Errorf("path %s doesnt end on data", path)
   769  	}
   770  
   771  	return ret, nil
   772  }
   773  
   774  func (r *schema) AddData(m map[string]interface{}) error {
   775  	r.readOnly = 1
   776  	r.ensureRoot()
   777  	err := r.recursiveAddColumnData(r.root.children, m, 0, 0, 0)
   778  	if err != nil {
   779  		return err
   780  	}
   781  
   782  	if err := r.recursiveFlushPages(r.root.children); err != nil {
   783  		return err
   784  	}
   785  
   786  	r.numRecords++
   787  	return nil
   788  }
   789  
   790  func (r *schema) getData() (map[string]interface{}, error) {
   791  	d, _, err := r.root.getData()
   792  	if err != nil {
   793  		return nil, err
   794  	}
   795  	if d.(map[string]interface{}) == nil {
   796  		d = make(map[string]interface{}) // just non nil root doc
   797  	}
   798  
   799  	return d.(map[string]interface{}), nil
   800  }
   801  
   802  func (r *schema) recursiveAddColumnNil(c []*Column, defLvl, maxRepLvl uint16, repLvl uint16) error {
   803  	for i := range c {
   804  		if c[i].data != nil {
   805  			if c[i].rep == parquet.FieldRepetitionType_REQUIRED && defLvl == c[i].maxD {
   806  				return fmt.Errorf("the value %q is required", c[i].path.flatName())
   807  			}
   808  			if err := c[i].data.add(nil, defLvl, maxRepLvl, repLvl); err != nil {
   809  				return err
   810  			}
   811  		}
   812  		if c[i].children != nil {
   813  			if err := r.recursiveAddColumnNil(c[i].children, defLvl, maxRepLvl, repLvl); err != nil {
   814  				return err
   815  			}
   816  		}
   817  	}
   818  	return nil
   819  }
   820  
   821  func (r *schema) recursiveFlushPages(c []*Column) error {
   822  	for i := range c {
   823  		if c[i].data != nil {
   824  			if err := c[i].data.flushPage(r, false); err != nil {
   825  				return err
   826  			}
   827  		}
   828  		if c[i].children != nil {
   829  			if err := r.recursiveFlushPages(c[i].children); err != nil {
   830  				return err
   831  			}
   832  		}
   833  	}
   834  	return nil
   835  }
   836  
// recursiveAddColumnData distributes the values of one (sub-)record m over
// the columns c.
//
// m must hold a map[string]interface{}; the value for each column is looked
// up by the column's name. Data columns receive their value directly. For
// groups the value must be nil, a map (non-repeated group) or a slice of maps
// (REPEATED group), and its content is added recursively. defLvl, maxRepLvl
// and repLvl are the definition level, maximum repetition level and repetition
// level that apply at this point of the record.
func (r *schema) recursiveAddColumnData(c []*Column, m interface{}, defLvl uint16, maxRepLvl uint16, repLvl uint16) error {
	// Unchecked assertion: every caller in this file passes a map.
	var data = m.(map[string]interface{})
	for i := range c {
		// A missing key yields nil, which is handled like an explicit nil.
		d := data[c[i].name]
		if c[i].data != nil {
			if err := c[i].data.add(d, defLvl, maxRepLvl, repLvl); err != nil {
				return err
			}
		}
		if c[i].children != nil {
			l := defLvl
			// In case of required value, there is no need to add a definition value, since it should be there always,
			// also for nil value, it means we should skip from this level to the lowest level
			if c[i].rep != parquet.FieldRepetitionType_REQUIRED && d != nil {
				l++
			}

			switch v := d.(type) {
			case nil:
				// The whole group is absent: mark all columns below it as nil.
				if err := r.recursiveAddColumnNil(c[i].children, l, maxRepLvl, repLvl); err != nil {
					return err
				}
			case map[string]interface{}: // Not repeated
				if c[i].rep == parquet.FieldRepetitionType_REPEATED {
					return fmt.Errorf("repeated group should be array")
				}
				if err := r.recursiveAddColumnData(c[i].children, v, l, maxRepLvl, repLvl); err != nil {
					return err
				}
			case []map[string]interface{}:
				if c[i].rep != parquet.FieldRepetitionType_REPEATED {
					return fmt.Errorf("no repeated group should not be array")
				}
				// Note: this m shadows the parameter m; it is the maximum
				// repetition level inside the repeated group.
				m := maxRepLvl + 1
				rL := repLvl
				// An empty list is recorded as nil values below the group.
				if len(v) == 0 {
					return r.recursiveAddColumnNil(c[i].children, l, m, rL)
				}
				for vi := range v {
					// The first entry keeps the incoming repetition level, all
					// following entries repeat at the level of this group.
					if vi > 0 {
						rL = m
					}
					if err := r.recursiveAddColumnData(c[i].children, v[vi], l, m, rL); err != nil {
						return err
					}
				}

			default:
				return fmt.Errorf("data is not a map or array of map, its a %T", v)
			}
		}
	}

	return nil
}
   892  
   893  func (c *Column) readColumnSchema(schema []*parquet.SchemaElement, path ColumnPath, idx int, dLevel, rLevel uint16) (int, error) {
   894  	s := schema[idx]
   895  
   896  	if s.Name == "" {
   897  		return 0, fmt.Errorf("name in schema on index %d is empty", idx)
   898  	}
   899  
   900  	if s.RepetitionType == nil {
   901  		return 0, fmt.Errorf("field RepetitionType is nil in index %d", idx)
   902  	}
   903  
   904  	if *s.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
   905  		dLevel++
   906  	}
   907  
   908  	if *s.RepetitionType == parquet.FieldRepetitionType_REPEATED {
   909  		rLevel++
   910  	}
   911  
   912  	c.element = s
   913  	c.maxR = rLevel
   914  	c.maxD = dLevel
   915  	data, err := getValuesStore(s, c.alloc)
   916  	if err != nil {
   917  		return 0, err
   918  	}
   919  	c.rep = *s.RepetitionType
   920  	c.data = data
   921  	c.path = append(path, s.Name)
   922  	c.name = s.Name
   923  	return idx + 1, nil
   924  }
   925  
   926  func (c *Column) readGroupSchema(schema []*parquet.SchemaElement, path ColumnPath, idx int, dLevel, rLevel uint16) (int, error) {
   927  	if len(schema) <= idx {
   928  		return 0, errors.New("schema index out of bound")
   929  	}
   930  
   931  	s := schema[idx]
   932  	if s.Type != nil {
   933  		return 0, fmt.Errorf("field Type is not nil in index %d", idx)
   934  	}
   935  	if s.NumChildren == nil {
   936  		return 0, fmt.Errorf("the field NumChildren is invalid in index %d", idx)
   937  	}
   938  
   939  	if *s.NumChildren <= 0 {
   940  		return 0, fmt.Errorf("the field NumChildren is zero in index %d", idx)
   941  	}
   942  	l := int(*s.NumChildren)
   943  
   944  	if len(schema) <= idx+l {
   945  		return 0, fmt.Errorf("not enough element in the schema list in index %d", idx)
   946  	}
   947  
   948  	if s.RepetitionType != nil && *s.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
   949  		dLevel++
   950  	}
   951  
   952  	if s.RepetitionType != nil && *s.RepetitionType == parquet.FieldRepetitionType_REPEATED {
   953  		rLevel++
   954  	}
   955  
   956  	c.maxD = dLevel
   957  	c.maxR = rLevel
   958  
   959  	c.path = append(path, s.Name)
   960  	c.name = s.Name
   961  	c.element = s
   962  	c.children = make([]*Column, 0, l)
   963  	c.rep = s.GetRepetitionType()
   964  
   965  	var err error
   966  	idx++ // move idx from this group to next
   967  	for i := 0; i < l; i++ {
   968  		if len(schema) <= idx {
   969  			return 0, fmt.Errorf("schema index %d is out of bounds", idx)
   970  		}
   971  		if schema[idx].Type == nil {
   972  			// another group
   973  			child := &Column{alloc: c.alloc}
   974  			idx, err = child.readGroupSchema(schema, c.path, idx, dLevel, rLevel)
   975  			if err != nil {
   976  				return 0, err
   977  			}
   978  			c.children = append(c.children, child)
   979  		} else {
   980  			child := &Column{alloc: c.alloc}
   981  			idx, err = child.readColumnSchema(schema, c.path, idx, dLevel, rLevel)
   982  			if err != nil {
   983  				return 0, err
   984  			}
   985  			c.children = append(c.children, child)
   986  		}
   987  	}
   988  
   989  	return idx, nil
   990  }
   991  
   992  func (r *schema) readSchema(schema []*parquet.SchemaElement) error {
   993  	r.readOnly = 1
   994  	var err error
   995  	for idx := 0; idx < len(schema); {
   996  		if schema[idx].Type == nil {
   997  			c := &Column{alloc: r.alloc}
   998  			idx, err = c.readGroupSchema(schema, ColumnPath{}, idx, 0, 0)
   999  			if err != nil {
  1000  				return err
  1001  			}
  1002  			r.root.children = append(r.root.children, c)
  1003  		} else {
  1004  			c := &Column{alloc: r.alloc}
  1005  			idx, err = c.readColumnSchema(schema, ColumnPath{}, idx, 0, 0)
  1006  			if err != nil {
  1007  				return err
  1008  			}
  1009  			r.root.children = append(r.root.children, c)
  1010  		}
  1011  	}
  1012  	r.sortIndex()
  1013  	r.schemaDef = parquetschema.SchemaDefinitionFromColumnDefinition(createColumnDefinitionFromColumn(r.root))
  1014  	return nil
  1015  }
  1016  
  1017  func createColumnDefinitionFromColumn(c *Column) *parquetschema.ColumnDefinition {
  1018  	col := &parquetschema.ColumnDefinition{
  1019  		SchemaElement: c.Element(),
  1020  	}
  1021  
  1022  	for _, child := range c.Children() {
  1023  		col.Children = append(col.Children, createColumnDefinitionFromColumn(child))
  1024  	}
  1025  
  1026  	return col
  1027  }
  1028  
// GetSchemaDefinition returns the schema definition stored in the schema. It
// is nil until one has been set by SetSchemaDefinition or readSchema.
func (r *schema) GetSchemaDefinition() *parquetschema.SchemaDefinition {
	return r.schemaDef
}
  1032  
  1033  // DataSize return the size of data stored in the schema right now
  1034  func (r *schema) DataSize() int64 {
  1035  	cols := r.Columns()
  1036  	var size int64
  1037  	for i := range cols {
  1038  		size += cols[i].getDataSize()
  1039  	}
  1040  
  1041  	return size
  1042  }
  1043  
// rowGroupNumRecords returns the current value of the schema's record
// counter.
func (r *schema) rowGroupNumRecords() int64 {
	return r.numRecords
}
  1047  
  1048  func makeSchema(meta *parquet.FileMetaData, validateCRC bool, alloc *allocTracker) (*schema, error) {
  1049  	if len(meta.Schema) < 1 {
  1050  		return nil, errors.New("no schema element found")
  1051  	}
  1052  	s := &schema{
  1053  		root: &Column{
  1054  			index:    0,
  1055  			name:     meta.Schema[0].Name,
  1056  			data:     nil,
  1057  			children: make([]*Column, 0, len(meta.Schema)-1),
  1058  			rep:      0,
  1059  			maxR:     0,
  1060  			maxD:     0,
  1061  			element:  meta.Schema[0],
  1062  			params: &ColumnParameters{
  1063  				LogicalType:   meta.Schema[0].LogicalType,
  1064  				ConvertedType: meta.Schema[0].ConvertedType,
  1065  				TypeLength:    meta.Schema[0].TypeLength,
  1066  				FieldID:       meta.Schema[0].FieldID,
  1067  			},
  1068  			alloc: alloc,
  1069  		},
  1070  		validateCRC: validateCRC,
  1071  		alloc:       alloc,
  1072  	}
  1073  	err := s.readSchema(meta.Schema[1:])
  1074  	if err != nil {
  1075  		return nil, err
  1076  	}
  1077  
  1078  	return s, nil
  1079  }