github.com/fraugster/parquet-go@v0.12.0/parquetschema/schema_parser.go

github.com/fraugster/parquet-go@v0.12.0/parquetschema/schema_parser.go (about)

     1  package parquetschema
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"math"
     7  	"runtime"
     8  	"strconv"
     9  	"strings"
    10  	"unicode"
    11  	"unicode/utf8"
    12  
    13  	"github.com/fraugster/parquet-go/parquet"
    14  )
    15  
// item is one token handed from the lexer to the parser. emit builds it with
// a positional literal, so the field order below must not change.
type item struct {
	typ  itemType // kind of token
	pos  pos      // byte offset in the input at which the token starts
	val  string   // text of the token as sliced from the input
	line int      // line number on which the token starts
}
    22  
// pos is a byte offset into the lexer's input string.
type pos int
    24  
    25  func (i item) String() string {
    26  	switch {
    27  	case i.typ == itemEOF:
    28  		return "EOF"
    29  	case i.typ == itemError:
    30  		return i.val
    31  	case len(i.val) > 10:
    32  		return fmt.Sprintf("%.10q...", i.val)
    33  	}
    34  	return fmt.Sprintf("%q", i.val)
    35  }
    36  
// itemType identifies the kind of a lexed token.
type itemType int

const (
	// itemError is the zero value of itemType; it is therefore also what a
	// failed lookup in the key map yields.
	itemError itemType = iota
	// itemEOF marks the end of the input.
	itemEOF

	itemLeftParen
	itemRightParen
	itemLeftBrace
	itemRightBrace
	itemEqual
	itemSemicolon
	itemComma
	itemNumber
	itemIdentifier
	// itemKeyword is only used as a boundary: the lexer and the parser treat
	// every item type greater than it as a reserved word.
	itemKeyword
	itemMessage
	itemRepeated
	itemOptional
	itemRequired
	itemGroup
)
    59  
    60  func (i itemType) String() string {
    61  	typeNames := map[itemType]string{
    62  		itemError:      "error",
    63  		itemEOF:        "EOF",
    64  		itemLeftParen:  "(",
    65  		itemRightParen: ")",
    66  		itemLeftBrace:  "{",
    67  		itemRightBrace: "}",
    68  		itemEqual:      "=",
    69  		itemSemicolon:  ";",
    70  		itemComma:      ",",
    71  		itemNumber:     "number",
    72  		itemIdentifier: "identifier",
    73  		itemKeyword:    "<keyword>",
    74  		itemMessage:    "message",
    75  		itemRepeated:   "repeated",
    76  		itemOptional:   "optional",
    77  		itemRequired:   "required",
    78  		itemGroup:      "group",
    79  	}
    80  
    81  	n, ok := typeNames[i]
    82  	if !ok {
    83  		return fmt.Sprintf("<type:%d>", int(i))
    84  	}
    85  	return n
    86  }
    87  
// key maps each reserved word to its item type. Looking up any other word
// yields the zero value, itemError.
var key = map[string]itemType{
	"message":  itemMessage,
	"repeated": itemRepeated,
	"optional": itemOptional,
	"required": itemRequired,
	"group":    itemGroup,
}
    95  
// eof is the sentinel rune value that next returns once the input is exhausted.
const eof = -1
    97  
// stateFn is one state of the lexer: it scans part of the input and returns
// the state to run next. The lexer's run loop stops when a state returns nil.
type stateFn func(*schemaLexer) stateFn
    99  
// schemaLexer holds the scanning state for one schema text and the channel
// on which the resulting tokens are delivered.
type schemaLexer struct {
	input     string    // text being scanned
	pos       pos       // byte offset of the next rune to read
	start     pos       // byte offset at which the pending token begins
	width     pos       // byte width of the rune last returned by next; 0 at end of input
	items     chan item // channel carrying tokens to the consumer; closed by run
	line      int       // line number at pos; lex initialises it to 1
	startLine int       // line number at start; lex initialises it to 1
}
   109  
   110  func (l *schemaLexer) next() rune {
   111  	if int(l.pos) >= len(l.input) {
   112  		l.width = 0
   113  		return eof
   114  	}
   115  
   116  	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
   117  	l.width = pos(w)
   118  	l.pos += l.width
   119  	if r == '\n' {
   120  		l.line++
   121  	}
   122  	return r
   123  }
   124  
   125  func (l *schemaLexer) peek() rune {
   126  	r := l.next()
   127  	l.backup()
   128  	return r
   129  }
   130  
   131  func (l *schemaLexer) backup() {
   132  	l.pos -= l.width
   133  	if l.width == 1 && l.input[l.pos] == '\n' {
   134  		l.line--
   135  	}
   136  }
   137  
   138  func (l *schemaLexer) ignore() {
   139  	l.start = l.pos
   140  	l.startLine = l.line
   141  }
   142  
   143  func (l *schemaLexer) emit(t itemType) {
   144  	l.items <- item{t, l.start, l.input[l.start:l.pos], l.startLine}
   145  	l.start = l.pos
   146  	l.startLine = l.line
   147  }
   148  
   149  func (l *schemaLexer) acceptRun(valid string) {
   150  	for strings.ContainsRune(valid, l.next()) {
   151  	}
   152  	l.backup()
   153  }
   154  
   155  func (l *schemaLexer) nextItem() item {
   156  	return <-l.items
   157  }
   158  
   159  func (l *schemaLexer) drain() {
   160  	for range l.items {
   161  	}
   162  }
   163  
   164  func lex(input string) *schemaLexer {
   165  	l := &schemaLexer{
   166  		input:     input,
   167  		items:     make(chan item),
   168  		line:      1,
   169  		startLine: 1,
   170  	}
   171  
   172  	go l.run()
   173  	return l
   174  }
   175  
   176  func (l *schemaLexer) run() {
   177  	for state := lexText; state != nil; {
   178  		state = state(l)
   179  	}
   180  	close(l.items)
   181  }
   182  
   183  func lexText(l *schemaLexer) stateFn {
   184  	switch r := l.next(); {
   185  	case r == eof:
   186  		l.emit(itemEOF)
   187  		return nil
   188  	case isSpace(r):
   189  		return lexSpace
   190  	case r == '(':
   191  		l.emit(itemLeftParen)
   192  	case r == ')':
   193  		l.emit(itemRightParen)
   194  	case r == '{':
   195  		l.emit(itemLeftBrace)
   196  	case r == '}':
   197  		l.emit(itemRightBrace)
   198  	case isDigit(r):
   199  		return lexNumber
   200  	case r == '=':
   201  		l.emit(itemEqual)
   202  	case r == ';':
   203  		l.emit(itemSemicolon)
   204  	case r == ',':
   205  		l.emit(itemComma)
   206  	default:
   207  		return lexIdentifier
   208  	}
   209  	return lexText
   210  }
   211  
   212  func isSpace(r rune) bool {
   213  	return r == ' ' || r == '\t' || r == '\n' || r == '\r'
   214  }
   215  
   216  func isDigit(r rune) bool {
   217  	return unicode.IsDigit(r)
   218  }
   219  
   220  func isSchemaDelim(r rune) bool {
   221  	return r == ' ' || r == ';' || r == '{' || r == '}' || r == '(' || r == ')' || r == '=' || r == ','
   222  }
   223  
   224  func lexSpace(l *schemaLexer) stateFn {
   225  	for isSpace(l.peek()) {
   226  		l.next()
   227  	}
   228  	l.ignore()
   229  	return lexText
   230  }
   231  
   232  func lexNumber(l *schemaLexer) stateFn {
   233  	l.acceptRun("0123456789")
   234  	l.emit(itemNumber)
   235  	return lexText
   236  }
   237  
   238  func lexIdentifier(l *schemaLexer) stateFn {
   239  loop:
   240  	for {
   241  		switch r := l.next(); {
   242  		case !isSchemaDelim(r): // the = is there to accept it as part of the identifiers being read within type annotations.
   243  			// absorb.
   244  		default:
   245  			l.backup()
   246  			word := l.input[l.start:l.pos]
   247  			switch {
   248  			case key[word] > itemKeyword:
   249  				l.emit(key[word])
   250  			default:
   251  				l.emit(itemIdentifier)
   252  			}
   253  			break loop
   254  		}
   255  	}
   256  	return lexText
   257  }
   258  
// schemaParser turns the token stream of a schemaLexer into a tree of
// column definitions rooted at root.
type schemaParser struct {
	l     *schemaLexer      // source of tokens
	token item              // token most recently read by next
	root  *ColumnDefinition // root column; parseMessage sets its name and children
}
   264  
   265  func newSchemaParser(text string) *schemaParser {
   266  	return &schemaParser{
   267  		l:    lex(text),
   268  		root: &ColumnDefinition{SchemaElement: &parquet.SchemaElement{}},
   269  	}
   270  }
   271  
// parse reads the whole schema text: one message definition that must be
// followed directly by the end of the input, after which the resulting root
// column is validated in non-strict mode.
//
// The parsing helpers report problems by panicking through errorf; the
// deferred p.recover call converts such a panic into the returned error.
func (p *schemaParser) parse() (err error) {
	defer p.recover(&err)

	p.parseMessage()

	// Nothing but the end of the input may follow the message's closing brace.
	p.next()
	p.expect(itemEOF)

	p.validate(p.root, false)

	return nil
}
   284  
   285  func (p *schemaParser) recover(errp *error) {
   286  	if e := recover(); e != nil {
   287  		if _, ok := e.(runtime.Error); ok {
   288  			panic(e)
   289  		}
   290  		p.l.drain()
   291  		*errp = e.(error)
   292  	}
   293  }
   294  
   295  func (p *schemaParser) errorf(msg string, args ...interface{}) {
   296  	msg = fmt.Sprintf("line %d: %s", p.token.line, msg)
   297  	panic(fmt.Errorf(msg, args...))
   298  }
   299  
   300  func (p *schemaParser) expect(typ itemType) {
   301  	if typ == itemIdentifier && p.token.typ > itemKeyword {
   302  		return
   303  	}
   304  
   305  	if p.token.typ != typ {
   306  		p.errorf("expected %s, got %s instead", typ, p.token)
   307  	}
   308  }
   309  
   310  func (p *schemaParser) next() {
   311  	p.token = p.l.nextItem()
   312  }
   313  
   314  func (p *schemaParser) parseMessage() {
   315  	p.next()
   316  	p.expect(itemMessage)
   317  
   318  	p.next()
   319  	p.expect(itemIdentifier)
   320  
   321  	p.root.SchemaElement.Name = p.token.val
   322  
   323  	p.next()
   324  	p.expect(itemLeftBrace)
   325  
   326  	p.root.Children = p.parseMessageBody()
   327  	for _, c := range p.root.Children {
   328  		recursiveFix(c)
   329  	}
   330  
   331  	p.expect(itemRightBrace)
   332  }
   333  
   334  func recursiveFix(col *ColumnDefinition) {
   335  	if nc := int32(len(col.Children)); nc > 0 {
   336  		col.SchemaElement.NumChildren = &nc
   337  	}
   338  
   339  	for i := range col.Children {
   340  		recursiveFix(col.Children[i])
   341  	}
   342  }
   343  
   344  func (p *schemaParser) parseMessageBody() []*ColumnDefinition {
   345  	var cols []*ColumnDefinition
   346  	p.expect(itemLeftBrace)
   347  	for {
   348  		p.next()
   349  		if p.token.typ == itemRightBrace {
   350  			return cols
   351  		}
   352  
   353  		cols = append(cols, p.parseColumnDefinition())
   354  	}
   355  }
   356  
   357  func (p *schemaParser) parseColumnDefinition() *ColumnDefinition {
   358  	col := &ColumnDefinition{
   359  		SchemaElement: &parquet.SchemaElement{},
   360  	}
   361  
   362  	switch p.token.typ {
   363  	case itemRepeated:
   364  		col.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REPEATED)
   365  	case itemOptional:
   366  		col.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_OPTIONAL)
   367  	case itemRequired:
   368  		col.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED)
   369  	default:
   370  		p.errorf("invalid field repetition type %q", p.token.val)
   371  	}
   372  
   373  	p.next()
   374  
   375  	if p.token.typ == itemGroup {
   376  		p.next()
   377  		p.expect(itemIdentifier)
   378  		col.SchemaElement.Name = p.token.val
   379  
   380  		p.next()
   381  		if p.token.typ == itemLeftParen {
   382  			col.SchemaElement.ConvertedType = p.parseConvertedType()
   383  			p.next()
   384  		}
   385  
   386  		col.Children = p.parseMessageBody()
   387  
   388  		p.expect(itemRightBrace)
   389  	} else {
   390  		col.SchemaElement.Type = p.getTokenType()
   391  
   392  		if col.SchemaElement.GetType() == parquet.Type_FIXED_LEN_BYTE_ARRAY {
   393  			p.next()
   394  			p.expect(itemLeftParen)
   395  			p.next()
   396  			p.expect(itemNumber)
   397  
   398  			i, err := strconv.ParseUint(p.token.val, 10, 32)
   399  			if err != nil {
   400  				p.errorf("invalid fixed_len_byte_array length %q: %v", p.token.val, err)
   401  			}
   402  
   403  			byteArraySize := int32(i)
   404  
   405  			col.SchemaElement.TypeLength = &byteArraySize
   406  
   407  			p.next()
   408  			p.expect(itemRightParen)
   409  		}
   410  
   411  		p.next()
   412  		p.expect(itemIdentifier)
   413  		col.SchemaElement.Name = p.token.val
   414  
   415  		p.next()
   416  		if p.token.typ == itemLeftParen {
   417  			col.SchemaElement.LogicalType, col.SchemaElement.ConvertedType = p.parseLogicalOrConvertedType()
   418  			if col.SchemaElement.LogicalType != nil && col.SchemaElement.LogicalType.IsSetDECIMAL() {
   419  				col.SchemaElement.Scale = &col.SchemaElement.LogicalType.DECIMAL.Scale
   420  				col.SchemaElement.Precision = &col.SchemaElement.LogicalType.DECIMAL.Precision
   421  			}
   422  			p.next()
   423  		}
   424  
   425  		if p.token.typ == itemEqual {
   426  			col.SchemaElement.FieldID = p.parseFieldID()
   427  			p.next()
   428  		}
   429  
   430  		p.expect(itemSemicolon)
   431  	}
   432  
   433  	return col
   434  }
   435  
   436  func (p *schemaParser) isValidType(typ string) {
   437  	validTypes := []string{"binary", "float", "double", "boolean", "int32", "int64", "int96", "fixed_len_byte_array"}
   438  	for _, vt := range validTypes {
   439  		if vt == typ {
   440  			return
   441  		}
   442  	}
   443  	p.errorf("invalid type %q", typ)
   444  }
   445  
   446  func (p *schemaParser) getTokenType() *parquet.Type {
   447  	p.isValidType(p.token.val)
   448  
   449  	switch p.token.val {
   450  	case "binary":
   451  		return parquet.TypePtr(parquet.Type_BYTE_ARRAY)
   452  	case "float":
   453  		return parquet.TypePtr(parquet.Type_FLOAT)
   454  	case "double":
   455  		return parquet.TypePtr(parquet.Type_DOUBLE)
   456  	case "boolean":
   457  		return parquet.TypePtr(parquet.Type_BOOLEAN)
   458  	case "int32":
   459  		return parquet.TypePtr(parquet.Type_INT32)
   460  	case "int64":
   461  		return parquet.TypePtr(parquet.Type_INT64)
   462  	case "int96":
   463  		return parquet.TypePtr(parquet.Type_INT96)
   464  	case "fixed_len_byte_array":
   465  		return parquet.TypePtr(parquet.Type_FIXED_LEN_BYTE_ARRAY)
   466  	default:
   467  		p.errorf("unsupported type %q", p.token.val)
   468  		return nil
   469  	}
   470  }
   471  
// parseLogicalOrConvertedType parses a parenthesised type annotation of a
// primitive field. The current token must be the opening parenthesis; on
// return it is the closing parenthesis.
//
// The annotation name is matched case-insensitively. For the names handled
// explicitly below a logical type is returned, together with a converted type
// where the case sets one. Any other name is looked up as a converted type
// only, in which case the returned logical type is nil. Unknown names are
// reported through errorf.
func (p *schemaParser) parseLogicalOrConvertedType() (*parquet.LogicalType, *parquet.ConvertedType) {
	p.expect(itemLeftParen)
	p.next()
	p.expect(itemIdentifier)

	typStr := p.token.val

	lt := parquet.NewLogicalType()
	var ct *parquet.ConvertedType

	// Every case must leave the closing parenthesis as the current token for
	// the expect call after the switch.
	switch strings.ToUpper(typStr) {
	case "STRING":
		lt.STRING = parquet.NewStringType()
		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8)
		p.next()
	case "DATE":
		lt.DATE = parquet.NewDateType()
		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_DATE)
		p.next()
	case "TIMESTAMP":
		ct = p.parseTimestampLogicalType(lt)
		p.next()
	case "TIME":
		ct = p.parseTimeLogicalType(lt)
		p.next()
	case "INT":
		ct = p.parseIntLogicalType(lt)
		p.next()
	case "UUID":
		// No converted type is set for UUID.
		lt.UUID = parquet.NewUUIDType()
		p.next()
	case "ENUM":
		lt.ENUM = parquet.NewEnumType()
		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_ENUM)
		p.next()
	case "JSON":
		lt.JSON = parquet.NewJsonType()
		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_JSON)
		p.next()
	case "BSON":
		lt.BSON = parquet.NewBsonType()
		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_BSON)
		p.next()
	case "DECIMAL":
		ct = p.parseDecimalLogicalType(lt)
		// No p.next() here: parseDecimalLogicalType already leaves the outer
		// closing parenthesis as the current token, both for a bare DECIMAL
		// and for DECIMAL(precision, scale).
	default:
		// Not a logical type handled above: treat the name as a converted type.
		convertedType, err := parquet.ConvertedTypeFromString(strings.ToUpper(typStr))
		if err != nil {
			p.errorf("unsupported logical type or converted type %q", typStr)
		}
		lt = nil
		ct = &convertedType
		p.next()
	}

	p.expect(itemRightParen)

	return lt, ct
}
   532  
   533  func (p *schemaParser) parseTimestampLogicalType(lt *parquet.LogicalType) (ct *parquet.ConvertedType) {
   534  	lt.TIMESTAMP = parquet.NewTimestampType()
   535  	p.next()
   536  	p.expect(itemLeftParen)
   537  
   538  	p.next()
   539  	p.expect(itemIdentifier)
   540  
   541  	lt.TIMESTAMP.Unit = parquet.NewTimeUnit()
   542  	switch p.token.val {
   543  	case "MILLIS":
   544  		lt.TIMESTAMP.Unit.MILLIS = parquet.NewMilliSeconds()
   545  		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_TIMESTAMP_MILLIS)
   546  	case "MICROS":
   547  		lt.TIMESTAMP.Unit.MICROS = parquet.NewMicroSeconds()
   548  		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_TIMESTAMP_MICROS)
   549  	case "NANOS":
   550  		lt.TIMESTAMP.Unit.NANOS = parquet.NewNanoSeconds()
   551  	default:
   552  		p.errorf("unknown unit annotation %q for TIMESTAMP", p.token.val)
   553  	}
   554  
   555  	p.next()
   556  	p.expect(itemComma)
   557  
   558  	p.next()
   559  	p.expect(itemIdentifier)
   560  
   561  	switch p.token.val {
   562  	case "true", "false":
   563  		lt.TIMESTAMP.IsAdjustedToUTC, _ = strconv.ParseBool(p.token.val)
   564  	default:
   565  		p.errorf("invalid isAdjustedToUTC annotation %q for TIMESTAMP", p.token.val)
   566  	}
   567  
   568  	p.next()
   569  	p.expect(itemRightParen)
   570  
   571  	return ct
   572  }
   573  
   574  func (p *schemaParser) parseTimeLogicalType(lt *parquet.LogicalType) (ct *parquet.ConvertedType) {
   575  	lt.TIME = parquet.NewTimeType()
   576  	p.next()
   577  	p.expect(itemLeftParen)
   578  
   579  	p.next()
   580  	p.expect(itemIdentifier)
   581  
   582  	lt.TIME.Unit = parquet.NewTimeUnit()
   583  	switch p.token.val {
   584  	case "MILLIS":
   585  		lt.TIME.Unit.MILLIS = parquet.NewMilliSeconds()
   586  		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_TIME_MILLIS)
   587  	case "MICROS":
   588  		lt.TIME.Unit.MICROS = parquet.NewMicroSeconds()
   589  		ct = parquet.ConvertedTypePtr(parquet.ConvertedType_TIME_MICROS)
   590  	case "NANOS":
   591  		lt.TIME.Unit.NANOS = parquet.NewNanoSeconds()
   592  	default:
   593  		p.errorf("unknown unit annotation %q for TIME", p.token.val)
   594  	}
   595  
   596  	p.next()
   597  	p.expect(itemComma)
   598  
   599  	p.next()
   600  	p.expect(itemIdentifier)
   601  
   602  	switch p.token.val {
   603  	case "true", "false":
   604  		lt.TIME.IsAdjustedToUTC, _ = strconv.ParseBool(p.token.val)
   605  	default:
   606  		p.errorf("invalid isAdjustedToUTC annotation %q for TIME", p.token.val)
   607  	}
   608  
   609  	p.next()
   610  	p.expect(itemRightParen)
   611  
   612  	return ct
   613  }
   614  
   615  func (p *schemaParser) parseIntLogicalType(lt *parquet.LogicalType) *parquet.ConvertedType {
   616  	lt.INTEGER = parquet.NewIntType()
   617  	p.next()
   618  	p.expect(itemLeftParen)
   619  
   620  	p.next()
   621  	p.expect(itemNumber)
   622  
   623  	bitWidth, _ := strconv.ParseInt(p.token.val, 10, 64)
   624  	if bitWidth != 8 && bitWidth != 16 && bitWidth != 32 && bitWidth != 64 {
   625  		p.errorf("INT: unsupported bitwidth %d", bitWidth)
   626  	}
   627  
   628  	lt.INTEGER.BitWidth = int8(bitWidth)
   629  
   630  	p.next()
   631  	p.expect(itemComma)
   632  
   633  	p.next()
   634  	p.expect(itemIdentifier)
   635  	switch p.token.val {
   636  	case "true", "false":
   637  		lt.INTEGER.IsSigned, _ = strconv.ParseBool(p.token.val)
   638  	default:
   639  		p.errorf("invalid isSigned annotation %q for INT", p.token.val)
   640  	}
   641  
   642  	p.next()
   643  	p.expect(itemRightParen)
   644  
   645  	convertedTypeStr := fmt.Sprintf("INT_%d", bitWidth)
   646  	if !lt.INTEGER.IsSigned {
   647  		convertedTypeStr = "U" + convertedTypeStr
   648  	}
   649  
   650  	convertedType, err := parquet.ConvertedTypeFromString(convertedTypeStr)
   651  	if err != nil {
   652  		p.errorf("couldn't convert INT(%d, %t) annotation to converted type %s: %v", bitWidth, lt.INTEGER.IsSigned, convertedTypeStr, err)
   653  	}
   654  	return parquet.ConvertedTypePtr(convertedType)
   655  }
   656  
   657  func (p *schemaParser) parseDecimalLogicalType(lt *parquet.LogicalType) *parquet.ConvertedType {
   658  	ct := parquet.ConvertedTypePtr(parquet.ConvertedType_DECIMAL)
   659  	p.next()
   660  
   661  	if p.token.typ == itemRightParen { // if the next token is ), skip parsing precision and scale because we only got a converted type.
   662  		return ct
   663  	}
   664  
   665  	lt.DECIMAL = parquet.NewDecimalType()
   666  
   667  	p.expect(itemLeftParen)
   668  
   669  	p.next()
   670  	p.expect(itemNumber)
   671  
   672  	prec, _ := strconv.ParseInt(p.token.val, 10, 64)
   673  	lt.DECIMAL.Precision = int32(prec)
   674  
   675  	p.next()
   676  	p.expect(itemComma)
   677  
   678  	p.next()
   679  	p.expect(itemNumber)
   680  
   681  	scale, _ := strconv.ParseInt(p.token.val, 10, 64)
   682  	lt.DECIMAL.Scale = int32(scale)
   683  
   684  	p.next()
   685  	p.expect(itemRightParen)
   686  
   687  	p.next() // here, we're pre-loading the next token for the caller.
   688  	return ct
   689  }
   690  
   691  func (p *schemaParser) parseConvertedType() *parquet.ConvertedType {
   692  	p.expect(itemLeftParen)
   693  	p.next()
   694  	p.expect(itemIdentifier)
   695  
   696  	typStr := p.token.val
   697  
   698  	convertedType, err := parquet.ConvertedTypeFromString(typStr)
   699  	if err != nil {
   700  		p.errorf("invalid converted type %q", typStr)
   701  	}
   702  
   703  	p.next()
   704  	p.expect(itemRightParen)
   705  
   706  	return parquet.ConvertedTypePtr(convertedType)
   707  }
   708  
   709  func (p *schemaParser) parseFieldID() *int32 {
   710  	p.expect(itemEqual)
   711  	p.next()
   712  	p.expect(itemNumber)
   713  
   714  	i, err := strconv.ParseInt(p.token.val, 10, 32)
   715  	if err != nil {
   716  		p.errorf("couldn't parse field ID %q: %v", p.token.val, err)
   717  	}
   718  
   719  	i32 := int32(i)
   720  
   721  	return &i32
   722  }
   723  
   724  func (p *schemaParser) validate(col *ColumnDefinition, strictMode bool) {
   725  	if err := col.validate(true, strictMode); err != nil {
   726  		p.errorf("%v", err)
   727  	}
   728  }
   729  
   730  // Validate conducts a validation of the schema definition. This is
   731  // useful when the schema definition has been constructed programmatically
   732  // by other means than the schema parser to ensure that it is still
   733  // valid.
   734  func (sd *SchemaDefinition) Validate() error {
   735  	if sd == nil {
   736  		return errors.New("schema definition is nil")
   737  	}
   738  
   739  	return sd.RootColumn.validate(true, false)
   740  }
   741  
   742  // ValidateStrict conducts a stricter validation of the schema definition.
   743  // This includes the validation as done by Validate, but prohibits backwards-
   744  // compatible definitions of LIST and MAP.
   745  func (sd *SchemaDefinition) ValidateStrict() error {
   746  	if sd == nil {
   747  		return errors.New("schema definition is nil")
   748  	}
   749  	return sd.RootColumn.validate(true, true)
   750  }
   751  
   752  func (col *ColumnDefinition) validateColumn(isRoot, strictMode bool) error {
   753  	if col == nil {
   754  		return errors.New("column definition is nil")
   755  	}
   756  
   757  	if col.SchemaElement == nil {
   758  		return errors.New("column has no schema element")
   759  	}
   760  
   761  	if col.SchemaElement.Name == "" {
   762  		return errors.New("column has no name")
   763  	}
   764  
   765  	if !isRoot && len(col.Children) == 0 && col.SchemaElement.Type == nil {
   766  		return fmt.Errorf("field %s has neither children nor a type", col.SchemaElement.Name)
   767  	}
   768  
   769  	if col.SchemaElement.Type != nil && len(col.Children) > 0 {
   770  		return fmt.Errorf("field %s has a type but also children", col.SchemaElement.Name)
   771  	}
   772  
   773  	return nil
   774  }
   775  
   776  func (col *ColumnDefinition) validateListLogicalType(strictMode bool) error {
   777  	if col.SchemaElement.Type != nil {
   778  		return fmt.Errorf("field %s is not a group but annotated as LIST", col.SchemaElement.Name)
   779  	}
   780  	if rep := col.SchemaElement.GetRepetitionType(); rep != parquet.FieldRepetitionType_OPTIONAL && rep != parquet.FieldRepetitionType_REQUIRED {
   781  		return fmt.Errorf("field %s is a LIST but has repetition type %s", col.SchemaElement.Name, rep)
   782  	}
   783  	if len(col.Children) != 1 {
   784  		return fmt.Errorf("field %s is a LIST but has %d children", col.SchemaElement.Name, len(col.Children))
   785  	}
   786  	if col.Children[0].SchemaElement.Name != "list" {
   787  		if strictMode {
   788  			return fmt.Errorf("field %s is a LIST but its child is not named \"list\"", col.SchemaElement.Name)
   789  		}
   790  
   791  		if col.Children[0].SchemaElement.Type != nil {
   792  			// backwards compatibility rule 1: repeated field is not a group, its type is the element type and elements are required.
   793  		} else {
   794  			repeatedGroup := col.Children[0]
   795  			switch len(repeatedGroup.Children) {
   796  			case 0:
   797  				return fmt.Errorf("field %s is a LIST but the repeated group inside it is not called \"list\" and contains no fields", col.SchemaElement.Name)
   798  			case 1:
   799  				// if col.Children[0].SchemaElement.Name == "array" or
   800  				//	col.Children[0].SchemaElement.Name == col.SchemaElement.Name+"_tuple" or
   801  				//	col.Children[0].SchemaElement.Name == "bag":
   802  				// backwards compatibility rule 3: repeated field is a group with one field and is named either array or uses the LIST-annotated
   803  				// group's name with _tuple appended then the repeated type is the element type and elements are required.
   804  				// also added "bag" because that's what we see generated on AWS Athena.
   805  				// else: backwards compatibility rule 4: the repeated field's type is the element type with the repeated field's repetition.
   806  			default:
   807  				// backwards compatbility rule 2: repeated field is a group with multiple fields, its type is the element type and elements are required.
   808  			}
   809  		}
   810  	} else {
   811  		if col.Children[0].SchemaElement.Type != nil || col.Children[0].SchemaElement.GetRepetitionType() != parquet.FieldRepetitionType_REPEATED {
   812  			return fmt.Errorf("field %s is a LIST but its child is not a repeated group", col.SchemaElement.Name)
   813  		}
   814  		if len(col.Children[0].Children) != 1 {
   815  			return fmt.Errorf("field %s.list has %d children", col.SchemaElement.Name, len(col.Children[0].Children))
   816  		}
   817  		if col.Children[0].Children[0].SchemaElement.Name != "element" {
   818  			return fmt.Errorf("%s.list has a child but it's called %q, not \"element\"", col.SchemaElement.Name, col.Children[0].Children[0].SchemaElement.Name)
   819  		}
   820  		if rep := col.Children[0].Children[0].SchemaElement.GetRepetitionType(); rep != parquet.FieldRepetitionType_OPTIONAL && rep != parquet.FieldRepetitionType_REQUIRED {
   821  			return fmt.Errorf("%s.list.element has disallowed repetition type %s", col.SchemaElement.Name, rep)
   822  		}
   823  	}
   824  
   825  	for _, c := range col.Children[0].Children {
   826  		if err := c.validate(false, strictMode); err != nil {
   827  			return err
   828  		}
   829  	}
   830  
   831  	return nil
   832  }
   833  
   834  func (col *ColumnDefinition) validateMapLogicalType(strictMode bool) error {
   835  	if col.SchemaElement.GetConvertedType() == parquet.ConvertedType_MAP_KEY_VALUE {
   836  		if strictMode {
   837  			return fmt.Errorf("field %s is incorrectly annotated as MAP_KEY_VALUE", col.SchemaElement.Name)
   838  		}
   839  	}
   840  
   841  	if col.SchemaElement.Type != nil {
   842  		return fmt.Errorf("field %s is not a group but annotated as MAP", col.SchemaElement.Name)
   843  	}
   844  	if len(col.Children) != 1 {
   845  		return fmt.Errorf("field %s is a MAP but has %d children", col.SchemaElement.Name, len(col.Children))
   846  	}
   847  	if col.Children[0].SchemaElement.Type != nil || col.Children[0].SchemaElement.GetRepetitionType() != parquet.FieldRepetitionType_REPEATED {
   848  		return fmt.Errorf("filed %s is a MAP but its child is not a repeated group", col.SchemaElement.Name)
   849  	}
   850  	if strictMode && col.Children[0].SchemaElement.Name != "key_value" {
   851  		return fmt.Errorf("field %s is a MAP but its child is not named \"key_value\"", col.SchemaElement.Name)
   852  	}
   853  
   854  	if strictMode {
   855  		foundKey := false
   856  		foundValue := false
   857  		for _, c := range col.Children[0].Children {
   858  			switch c.SchemaElement.Name {
   859  			case "key":
   860  				if c.SchemaElement.GetRepetitionType() != parquet.FieldRepetitionType_REQUIRED {
   861  					return fmt.Errorf("field %s.key_value.key is not of repetition type \"required\"", col.SchemaElement.Name)
   862  				}
   863  				foundKey = true
   864  			case "value":
   865  				foundValue = true
   866  				// nothing else to check.
   867  			default:
   868  				return fmt.Errorf("field %[1]s is a MAP so %[1]s.key_value.%[2]s is not allowed", col.SchemaElement.Name, c.SchemaElement.Name)
   869  			}
   870  		}
   871  		if !foundKey {
   872  			return fmt.Errorf("field %[1]s is missing %[1]s.key_value.key", col.SchemaElement.Name)
   873  		}
   874  		if !foundValue {
   875  			return fmt.Errorf("field %[1]s is missing %[1]s.key_value.value", col.SchemaElement.Name)
   876  		}
   877  	} else {
   878  		if len(col.Children[0].Children) != 2 {
   879  			return fmt.Errorf("field %[1]s is a MAP but %[1]s.%[2]s contains %[3]d children (expected 2)", col.SchemaElement.Name, col.Children[0].SchemaElement.Name, len(col.Children[0].Children))
   880  		}
   881  	}
   882  
   883  	for _, c := range col.Children[0].Children {
   884  		if err := c.validate(false, strictMode); err != nil {
   885  			return err
   886  		}
   887  	}
   888  
   889  	return nil
   890  }
   891  
   892  func (col *ColumnDefinition) validateTimeLogicalType() error {
   893  	t := col.SchemaElement.GetLogicalType().TIME
   894  	switch {
   895  	case t.Unit.IsSetNANOS():
   896  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
   897  			return fmt.Errorf("field %s is annotated as TIME(NANOS, %t) but is not an int64", col.SchemaElement.Name, t.IsAdjustedToUTC)
   898  		}
   899  	case t.Unit.IsSetMICROS():
   900  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
   901  			return fmt.Errorf("field %s is annotated as TIME(MICROS, %t) but is not an int64", col.SchemaElement.Name, t.IsAdjustedToUTC)
   902  		}
   903  	case t.Unit.IsSetMILLIS():
   904  		if col.SchemaElement.GetType() != parquet.Type_INT32 {
   905  			return fmt.Errorf("field %s is annotated as TIME(MILLIS, %t) but is not an int32", col.SchemaElement.Name, t.IsAdjustedToUTC)
   906  		}
   907  	}
   908  	return nil
   909  }
   910  
   911  func (col *ColumnDefinition) validateDecimalLogicalType() error {
   912  	dec := col.SchemaElement.GetLogicalType().DECIMAL
   913  	switch col.SchemaElement.GetType() {
   914  	case parquet.Type_INT32:
   915  		if dec.Precision < 1 || dec.Precision > 9 {
   916  			return fmt.Errorf("field %s is int32 and annotated as DECIMAL but precision %d is out of bounds; needs to be 1 <= precision <= 9", col.SchemaElement.Name, dec.Precision)
   917  		}
   918  	case parquet.Type_INT64:
   919  		if dec.Precision < 1 || dec.Precision > 18 {
   920  			return fmt.Errorf("field %s is int64 and annotated as DECIMAL but precision %d is out of bounds; needs to be 1 <= precision <= 18", col.SchemaElement.Name, dec.Precision)
   921  		}
   922  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   923  		n := *col.SchemaElement.TypeLength
   924  		maxDigits := int32(math.Floor(math.Log10(math.Exp2(8*float64(n)-1) - 1)))
   925  		if dec.Precision < 1 || dec.Precision > maxDigits {
   926  			return fmt.Errorf("field %s is fixed_len_byte_array(%d) and annotated as DECIMAL but precision %d is out of bounds; needs to be 0 <= precision <= %d", col.SchemaElement.Name, n, dec.Precision, maxDigits)
   927  		}
   928  	case parquet.Type_BYTE_ARRAY:
   929  		if dec.Precision < 1 {
   930  			return fmt.Errorf("field %s is int64 and annotated as DECIMAL but precision %d is out of bounds; needs to be 1 <= precision", col.SchemaElement.Name, dec.Precision)
   931  		}
   932  	default:
   933  		return fmt.Errorf("field %s is annotated as DECIMAL but type %s is unsupported", col.SchemaElement.Name, col.SchemaElement.GetType().String())
   934  	}
   935  	return nil
   936  }
   937  
   938  func (col *ColumnDefinition) validateIntegerLogicalType() error {
   939  	bitWidth := col.SchemaElement.LogicalType.INTEGER.BitWidth
   940  	isSigned := col.SchemaElement.LogicalType.INTEGER.IsSigned
   941  	switch bitWidth {
   942  	case 8, 16, 32:
   943  		if col.SchemaElement.GetType() != parquet.Type_INT32 {
   944  			return fmt.Errorf("field %s is annotated as INT(%d, %t) but element type is %s", col.SchemaElement.Name, bitWidth, isSigned, col.SchemaElement.GetType().String())
   945  		}
   946  	case 64:
   947  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
   948  			return fmt.Errorf("field %s is annotated as INT(%d, %t) but element type is %s", col.SchemaElement.Name, bitWidth, isSigned, col.SchemaElement.GetType().String())
   949  		}
   950  	default:
   951  		return fmt.Errorf("invalid bitWidth %d", bitWidth)
   952  	}
   953  	return nil
   954  }
   955  
   956  func (col *ColumnDefinition) validate(isRoot bool, strictMode bool) error {
   957  	if err := col.validateColumn(isRoot, strictMode); err != nil {
   958  		return err
   959  	}
   960  
   961  	switch {
   962  	case (col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetLIST()) || col.SchemaElement.GetConvertedType() == parquet.ConvertedType_LIST:
   963  		if err := col.validateListLogicalType(strictMode); err != nil {
   964  			return err
   965  		}
   966  	case (col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetMAP()) || col.SchemaElement.GetConvertedType() == parquet.ConvertedType_MAP || col.SchemaElement.GetConvertedType() == parquet.ConvertedType_MAP_KEY_VALUE:
   967  		if err := col.validateMapLogicalType(strictMode); err != nil {
   968  			return err
   969  		}
   970  	case (col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetDATE()) || col.SchemaElement.GetConvertedType() == parquet.ConvertedType_DATE:
   971  		if col.SchemaElement.GetType() != parquet.Type_INT32 {
   972  			return fmt.Errorf("field %[1]s is annotated as DATE but is not an int32", col.SchemaElement.Name)
   973  		}
   974  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetTIMESTAMP():
   975  		if col.SchemaElement.GetType() != parquet.Type_INT64 && col.SchemaElement.GetType() != parquet.Type_INT96 {
   976  			return fmt.Errorf("field %s is annotated as TIMESTAMP but is not an int64/int96", col.SchemaElement.Name)
   977  		}
   978  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetTIME():
   979  		if err := col.validateTimeLogicalType(); err != nil {
   980  			return err
   981  		}
   982  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetUUID():
   983  		if col.SchemaElement.GetType() != parquet.Type_FIXED_LEN_BYTE_ARRAY || col.SchemaElement.GetTypeLength() != 16 {
   984  			return fmt.Errorf("field %s is annotated as UUID but is not a fixed_len_byte_array(16)", col.SchemaElement.Name)
   985  		}
   986  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetENUM():
   987  		if col.SchemaElement.GetType() != parquet.Type_BYTE_ARRAY {
   988  			return fmt.Errorf("field %s is annotated as ENUM but is not a binary", col.SchemaElement.Name)
   989  		}
   990  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetJSON():
   991  		if col.SchemaElement.GetType() != parquet.Type_BYTE_ARRAY {
   992  			return fmt.Errorf("field %s is annotated as JSON but is not a binary", col.SchemaElement.Name)
   993  		}
   994  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetBSON():
   995  		if col.SchemaElement.GetType() != parquet.Type_BYTE_ARRAY {
   996  			return fmt.Errorf("field %s is annotated as BSON but is not a binary", col.SchemaElement.Name)
   997  		}
   998  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetDECIMAL():
   999  		if err := col.validateDecimalLogicalType(); err != nil {
  1000  			return err
  1001  		}
  1002  	case col.SchemaElement.LogicalType != nil && col.SchemaElement.GetLogicalType().IsSetINTEGER():
  1003  		if err := col.validateIntegerLogicalType(); err != nil {
  1004  			return err
  1005  		}
  1006  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_UTF8:
  1007  		if col.SchemaElement.GetType() != parquet.Type_BYTE_ARRAY {
  1008  			return fmt.Errorf("field %s is annotated as UTF8 but element type is %s, not binary", col.SchemaElement.Name, col.SchemaElement.GetType().String())
  1009  		}
  1010  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_TIME_MILLIS:
  1011  		if col.SchemaElement.GetType() != parquet.Type_INT32 {
  1012  			return fmt.Errorf("field %s is annotated as TIME_MILLIS but element type is %s, not int32", col.SchemaElement.Name, col.SchemaElement.GetType().String())
  1013  		}
  1014  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_TIME_MICROS:
  1015  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
  1016  			return fmt.Errorf("field %s is annotated as TIME_MICROS but element type is %s, not int64", col.SchemaElement.Name, col.SchemaElement.GetType().String())
  1017  		}
  1018  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_TIMESTAMP_MILLIS:
  1019  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
  1020  			return fmt.Errorf("field %s is annotated as TIMESTAMP_MILLIS but element type is %s, not int64", col.SchemaElement.Name, col.SchemaElement.GetType().String())
  1021  		}
  1022  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_TIMESTAMP_MICROS:
  1023  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
  1024  			return fmt.Errorf("field %s is annotated as TIMESTAMP_MICROS but element type is %s, not int64", col.SchemaElement.Name, col.SchemaElement.GetType().String())
  1025  		}
  1026  	case col.SchemaElement.ConvertedType != nil &&
  1027  		col.SchemaElement.GetConvertedType() == parquet.ConvertedType_UINT_8 ||
  1028  		col.SchemaElement.GetConvertedType() == parquet.ConvertedType_UINT_16 ||
  1029  		col.SchemaElement.GetConvertedType() == parquet.ConvertedType_UINT_32 ||
  1030  		col.SchemaElement.GetConvertedType() == parquet.ConvertedType_INT_8 ||
  1031  		col.SchemaElement.GetConvertedType() == parquet.ConvertedType_INT_16 ||
  1032  		col.SchemaElement.GetConvertedType() == parquet.ConvertedType_INT_32:
  1033  		if col.SchemaElement.GetType() != parquet.Type_INT32 {
  1034  			return fmt.Errorf("field %s is annotated as %s but element type is %s, not int32", col.SchemaElement.Name, col.SchemaElement.GetConvertedType().String(), col.SchemaElement.GetType().String())
  1035  		}
  1036  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_UINT_64 || col.SchemaElement.GetConvertedType() == parquet.ConvertedType_INT_64:
  1037  		if col.SchemaElement.GetType() != parquet.Type_INT64 {
  1038  			return fmt.Errorf("field %s is annotated as %s but element type is %s, not int64", col.SchemaElement.Name, col.SchemaElement.GetConvertedType().String(), col.SchemaElement.GetType().String())
  1039  		}
  1040  	case col.SchemaElement.ConvertedType != nil && col.SchemaElement.GetConvertedType() == parquet.ConvertedType_INTERVAL:
  1041  		if col.SchemaElement.GetType() != parquet.Type_FIXED_LEN_BYTE_ARRAY || col.SchemaElement.GetTypeLength() != 12 {
  1042  			return fmt.Errorf("field %s is annotated as INTERVAL but element type is %s, not fixed_len_byte_array(12)", col.SchemaElement.Name, col.SchemaElement.GetType().String())
  1043  		}
  1044  	default:
  1045  		for _, c := range col.Children {
  1046  			if err := c.validate(false, strictMode); err != nil {
  1047  				return err
  1048  			}
  1049  		}
  1050  	}
  1051  
  1052  	return nil
  1053  }