github.com/apache/arrow/go/v14@v14.0.1/parquet/schema/reflection.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package schema
    18  
    19  import (
    20  	"fmt"
    21  	"reflect"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"github.com/apache/arrow/go/v14/parquet"
    26  	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    27  	"golang.org/x/xerrors"
    28  )
    29  
    30  type taggedInfo struct {
    31  	Name string
    32  
    33  	Type      parquet.Type
    34  	KeyType   parquet.Type
    35  	ValueType parquet.Type
    36  
    37  	Length      int32
    38  	KeyLength   int32
    39  	ValueLength int32
    40  
    41  	Scale      int32
    42  	KeyScale   int32
    43  	ValueScale int32
    44  
    45  	Precision      int32
    46  	KeyPrecision   int32
    47  	ValuePrecision int32
    48  
    49  	FieldID      int32
    50  	KeyFieldID   int32
    51  	ValueFieldID int32
    52  
    53  	RepetitionType  parquet.Repetition
    54  	ValueRepetition parquet.Repetition
    55  
    56  	Converted      ConvertedType
    57  	KeyConverted   ConvertedType
    58  	ValueConverted ConvertedType
    59  
    60  	LogicalFields      map[string]string
    61  	KeyLogicalFields   map[string]string
    62  	ValueLogicalFields map[string]string
    63  
    64  	LogicalType      LogicalType
    65  	KeyLogicalType   LogicalType
    66  	ValueLogicalType LogicalType
    67  
    68  	Exclude bool
    69  }
    70  
    71  func (t *taggedInfo) CopyForKey() (ret taggedInfo) {
    72  	ret = *t
    73  	ret.Type = t.KeyType
    74  	ret.Length = t.KeyLength
    75  	ret.Scale = t.KeyScale
    76  	ret.Precision = t.KeyPrecision
    77  	ret.FieldID = t.KeyFieldID
    78  	ret.RepetitionType = parquet.Repetitions.Required
    79  	ret.Converted = t.KeyConverted
    80  	ret.LogicalType = t.KeyLogicalType
    81  	return
    82  }
    83  
    84  func (t *taggedInfo) CopyForValue() (ret taggedInfo) {
    85  	ret = *t
    86  	ret.Type = t.ValueType
    87  	ret.Length = t.ValueLength
    88  	ret.Scale = t.ValueScale
    89  	ret.Precision = t.ValuePrecision
    90  	ret.FieldID = t.ValueFieldID
    91  	ret.RepetitionType = t.ValueRepetition
    92  	ret.Converted = t.ValueConverted
    93  	ret.LogicalType = t.ValueLogicalType
    94  	return
    95  }
    96  
    97  func (t *taggedInfo) UpdateLogicalTypes() {
    98  	processLogicalType := func(fields map[string]string, precision, scale int32) LogicalType {
    99  		t, ok := fields["type"]
   100  		if !ok {
   101  			return NoLogicalType{}
   102  		}
   103  
   104  		switch strings.ToLower(t) {
   105  		case "string":
   106  			return StringLogicalType{}
   107  		case "map":
   108  			return MapLogicalType{}
   109  		case "list":
   110  			return ListLogicalType{}
   111  		case "enum":
   112  			return EnumLogicalType{}
   113  		case "decimal":
   114  			if v, ok := fields["precision"]; ok {
   115  				precision = int32FromType(v)
   116  			}
   117  			if v, ok := fields["scale"]; ok {
   118  				scale = int32FromType(v)
   119  			}
   120  			return NewDecimalLogicalType(precision, scale)
   121  		case "date":
   122  			return DateLogicalType{}
   123  		case "time":
   124  			unit, ok := fields["unit"]
   125  			if !ok {
   126  				panic("must specify unit for time logical type")
   127  			}
   128  			adjustedToUtc, ok := fields["isadjustedutc"]
   129  			if !ok {
   130  				adjustedToUtc = "true"
   131  			}
   132  			return NewTimeLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(strings.ToLower(unit)))
   133  		case "timestamp":
   134  			unit, ok := fields["unit"]
   135  			if !ok {
   136  				panic("must specify unit for time logical type")
   137  			}
   138  			adjustedToUtc, ok := fields["isadjustedutc"]
   139  			if !ok {
   140  				adjustedToUtc = "true"
   141  			}
   142  			return NewTimestampLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(unit))
   143  		case "integer":
   144  			width, ok := fields["bitwidth"]
   145  			if !ok {
   146  				panic("must specify bitwidth if explicitly setting integer logical type")
   147  			}
   148  			signed, ok := fields["signed"]
   149  			if !ok {
   150  				signed = "true"
   151  			}
   152  
   153  			return NewIntLogicalType(int8(int32FromType(width)), boolFromStr(signed))
   154  		case "null":
   155  			return NullLogicalType{}
   156  		case "json":
   157  			return JSONLogicalType{}
   158  		case "bson":
   159  			return BSONLogicalType{}
   160  		case "uuid":
   161  			return UUIDLogicalType{}
   162  		default:
   163  			panic(fmt.Errorf("invalid logical type specified: %s", t))
   164  		}
   165  	}
   166  
   167  	t.LogicalType = processLogicalType(t.LogicalFields, t.Precision, t.Scale)
   168  	t.KeyLogicalType = processLogicalType(t.KeyLogicalFields, t.KeyPrecision, t.KeyScale)
   169  	t.ValueLogicalType = processLogicalType(t.ValueLogicalFields, t.ValuePrecision, t.ValueScale)
   170  }
   171  
   172  func newTaggedInfo() taggedInfo {
   173  	return taggedInfo{
   174  		Type:               parquet.Types.Undefined,
   175  		KeyType:            parquet.Types.Undefined,
   176  		ValueType:          parquet.Types.Undefined,
   177  		RepetitionType:     parquet.Repetitions.Undefined,
   178  		ValueRepetition:    parquet.Repetitions.Undefined,
   179  		Converted:          ConvertedTypes.NA,
   180  		KeyConverted:       ConvertedTypes.NA,
   181  		ValueConverted:     ConvertedTypes.NA,
   182  		FieldID:            -1,
   183  		KeyFieldID:         -1,
   184  		ValueFieldID:       -1,
   185  		LogicalFields:      make(map[string]string),
   186  		KeyLogicalFields:   make(map[string]string),
   187  		ValueLogicalFields: make(map[string]string),
   188  		LogicalType:        NoLogicalType{},
   189  		KeyLogicalType:     NoLogicalType{},
   190  		ValueLogicalType:   NoLogicalType{},
   191  		Exclude:            false,
   192  	}
   193  }
   194  
   195  var int32FromType = func(v string) int32 {
   196  	val, err := strconv.Atoi(v)
   197  	if err != nil {
   198  		panic(err)
   199  	}
   200  	return int32(val)
   201  }
   202  
   203  var boolFromStr = func(v string) bool {
   204  	val, err := strconv.ParseBool(v)
   205  	if err != nil {
   206  		panic(err)
   207  	}
   208  	return val
   209  }
   210  
   211  func infoFromTags(f reflect.StructTag) *taggedInfo {
   212  	typeFromStr := func(v string) parquet.Type {
   213  		t, err := format.TypeFromString(strings.ToUpper(v))
   214  		if err != nil {
   215  			panic(fmt.Errorf("invalid type specified: %s", v))
   216  		}
   217  		return parquet.Type(t)
   218  	}
   219  
   220  	repFromStr := func(v string) parquet.Repetition {
   221  		r, err := format.FieldRepetitionTypeFromString(strings.ToUpper(v))
   222  		if err != nil {
   223  			panic(err)
   224  		}
   225  		return parquet.Repetition(r)
   226  	}
   227  
   228  	convertedFromStr := func(v string) ConvertedType {
   229  		c, err := format.ConvertedTypeFromString(strings.ToUpper(v))
   230  		if err != nil {
   231  			panic(err)
   232  		}
   233  		return ConvertedType(c)
   234  	}
   235  
   236  	if ptags, ok := f.Lookup("parquet"); ok {
   237  		info := newTaggedInfo()
   238  		if ptags == "-" {
   239  			info.Exclude = true
   240  			return &info
   241  		}
   242  		for _, tag := range strings.Split(strings.Replace(ptags, "\t", "", -1), ",") {
   243  			tag = strings.TrimSpace(tag)
   244  			kv := strings.SplitN(tag, "=", 2)
   245  			key := strings.TrimSpace(strings.ToLower(kv[0]))
   246  			value := strings.TrimSpace(kv[1])
   247  
   248  			switch key {
   249  			case "name":
   250  				info.Name = value
   251  			case "type":
   252  				info.Type = typeFromStr(value)
   253  			case "keytype":
   254  				info.KeyType = typeFromStr(value)
   255  			case "valuetype":
   256  				info.ValueType = typeFromStr(value)
   257  			case "length":
   258  				info.Length = int32FromType(value)
   259  			case "keylength":
   260  				info.KeyLength = int32FromType(value)
   261  			case "valuelength":
   262  				info.ValueLength = int32FromType(value)
   263  			case "scale":
   264  				info.Scale = int32FromType(value)
   265  			case "keyscale":
   266  				info.KeyScale = int32FromType(value)
   267  			case "valuescale":
   268  				info.ValueScale = int32FromType(value)
   269  			case "precision":
   270  				info.Precision = int32FromType(value)
   271  			case "keyprecision":
   272  				info.KeyPrecision = int32FromType(value)
   273  			case "valueprecision":
   274  				info.ValuePrecision = int32FromType(value)
   275  			case "fieldid":
   276  				info.FieldID = int32FromType(value)
   277  			case "keyfieldid":
   278  				info.KeyFieldID = int32FromType(value)
   279  			case "valuefieldid":
   280  				info.ValueFieldID = int32FromType(value)
   281  			case "repetition":
   282  				info.RepetitionType = repFromStr(value)
   283  			case "valuerepetition":
   284  				info.ValueRepetition = repFromStr(value)
   285  			case "converted":
   286  				info.Converted = convertedFromStr(value)
   287  			case "keyconverted":
   288  				info.KeyConverted = convertedFromStr(value)
   289  			case "valueconverted":
   290  				info.ValueConverted = convertedFromStr(value)
   291  			case "logical":
   292  				info.LogicalFields["type"] = value
   293  			case "keylogical":
   294  				info.KeyLogicalFields["type"] = value
   295  			case "valuelogical":
   296  				info.ValueLogicalFields["type"] = value
   297  			default:
   298  				switch {
   299  				case strings.HasPrefix(key, "logical."):
   300  					info.LogicalFields[strings.TrimPrefix(key, "logical.")] = value
   301  				case strings.HasPrefix(key, "keylogical."):
   302  					info.KeyLogicalFields[strings.TrimPrefix(key, "keylogical.")] = value
   303  				case strings.HasPrefix(key, "valuelogical."):
   304  					info.ValueLogicalFields[strings.TrimPrefix(key, "valuelogical.")] = value
   305  				}
   306  			}
   307  		}
   308  		info.UpdateLogicalTypes()
   309  		return &info
   310  	}
   311  	return nil
   312  }
   313  
   314  // typeToNode recurseively converts a physical type and the tag info into parquet Nodes
   315  //
   316  // to avoid having to propagate errors up potentially high numbers of recursive calls
   317  // we use panics and then recover in the public function NewSchemaFromStruct so that a
   318  // failure very far down the stack quickly unwinds.
   319  func typeToNode(name string, typ reflect.Type, repType parquet.Repetition, info *taggedInfo) Node {
   320  	// set up our default values for everything
   321  	var (
   322  		converted             = ConvertedTypes.None
   323  		logical   LogicalType = NoLogicalType{}
   324  		fieldID               = int32(-1)
   325  		physical              = parquet.Types.Undefined
   326  		typeLen               = 0
   327  		precision             = 0
   328  		scale                 = 0
   329  	)
   330  	if info != nil { // we have struct tag info to process
   331  		fieldID = info.FieldID
   332  		if info.Converted != ConvertedTypes.NA {
   333  			converted = info.Converted
   334  		}
   335  		logical = info.LogicalType
   336  		physical = info.Type
   337  		typeLen = int(info.Length)
   338  		precision = int(info.Precision)
   339  		scale = int(info.Scale)
   340  
   341  		if info.Name != "" {
   342  			name = info.Name
   343  		}
   344  		if info.RepetitionType != parquet.Repetitions.Undefined {
   345  			repType = info.RepetitionType
   346  		}
   347  	}
   348  
   349  	// simplify the logic by switching based on the reflection Kind
   350  	switch typ.Kind() {
   351  	case reflect.Map:
   352  		// a map must have a logical type of MAP or have no tag for logical type in which case
   353  		// we assume MAP logical type.
   354  		if !logical.IsNone() && !logical.Equals(MapLogicalType{}) {
   355  			panic("cannot set logical type to something other than map for a map")
   356  		}
   357  
   358  		infoCopy := newTaggedInfo()
   359  		if info != nil { // populate any value specific tags to propagate for the value type
   360  			infoCopy = info.CopyForValue()
   361  		}
   362  
   363  		// create the node for the value type of the map
   364  		value := typeToNode("value", typ.Elem(), parquet.Repetitions.Required, &infoCopy)
   365  		if info != nil { // change our copy to now use the key specific tags if they exist
   366  			infoCopy = info.CopyForKey()
   367  		}
   368  
   369  		// create the node for the key type of the map
   370  		key := typeToNode("key", typ.Key(), parquet.Repetitions.Required, &infoCopy)
   371  		if key.RepetitionType() != parquet.Repetitions.Required { // key cannot be optional
   372  			panic("key type of map must be Required")
   373  		}
   374  		return Must(MapOf(name, key, value, repType, fieldID))
   375  	case reflect.Struct:
   376  		// structs are Group nodes
   377  		fields := make(FieldList, 0)
   378  		for i := 0; i < typ.NumField(); i++ {
   379  			f := typ.Field(i)
   380  			tags := infoFromTags(f.Tag)
   381  			if tags == nil || !tags.Exclude {
   382  				fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, tags))
   383  			}
   384  		}
   385  		// group nodes don't have a physical type
   386  		if physical != parquet.Types.Undefined {
   387  			panic("cannot specify custom type on struct")
   388  		}
   389  		// group nodes don't have converted or logical types
   390  		if converted != ConvertedTypes.None {
   391  			panic("cannot specify converted types for a struct")
   392  		}
   393  		if !logical.IsNone() {
   394  			panic("cannot specify logicaltype for a struct")
   395  		}
   396  		return Must(NewGroupNode(name, repType, fields, fieldID))
   397  	case reflect.Ptr: // if we encounter a pointer create a node for the type it points to, but mark it as optional
   398  		return typeToNode(name, typ.Elem(), parquet.Repetitions.Optional, info)
   399  	case reflect.Array:
   400  		// arrays are repeated or fixed size
   401  		if typ == reflect.TypeOf(parquet.Int96{}) {
   402  			return NewInt96Node(name, repType, fieldID)
   403  		}
   404  
   405  		if typ.Elem() == reflect.TypeOf(byte(0)) { // something like [12]byte translates to FixedLenByteArray with length 12
   406  			if physical == parquet.Types.Undefined {
   407  				physical = parquet.Types.FixedLenByteArray
   408  			}
   409  			if typeLen == 0 { // if there was no type length specified in the tag, use the length of the type.
   410  				typeLen = typ.Len()
   411  			}
   412  			if !logical.IsNone() {
   413  				return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID))
   414  			}
   415  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID))
   416  		}
   417  		fallthrough // if it's not a fixed len byte array type, then just treat it like a slice
   418  	case reflect.Slice:
   419  		// for slices, we default to treating them as lists unless the repetition type is set to REPEATED or they are
   420  		// a bytearray/fixedlenbytearray
   421  		switch {
   422  		case repType == parquet.Repetitions.Repeated:
   423  			return typeToNode(name, typ.Elem(), parquet.Repetitions.Repeated, info)
   424  		case physical == parquet.Types.FixedLenByteArray || physical == parquet.Types.ByteArray:
   425  			if typ.Elem() != reflect.TypeOf(byte(0)) {
   426  				panic("slice with physical type ByteArray or FixedLenByteArray must be []byte")
   427  			}
   428  			fallthrough
   429  		case typ.Elem() == reflect.TypeOf(byte(0)):
   430  			if physical == parquet.Types.Undefined {
   431  				physical = parquet.Types.ByteArray
   432  			}
   433  			if !logical.IsNone() {
   434  				return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID))
   435  			}
   436  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID))
   437  		default:
   438  			var elemInfo *taggedInfo
   439  			if info != nil {
   440  				elemInfo = &taggedInfo{}
   441  				*elemInfo = info.CopyForValue()
   442  			}
   443  
   444  			if !logical.IsNone() && !logical.Equals(ListLogicalType{}) {
   445  				panic("slice must either be repeated or a List type")
   446  			}
   447  			if converted != ConvertedTypes.None && converted != ConvertedTypes.List {
   448  				panic("slice must either be repeated or a List type")
   449  			}
   450  			return Must(ListOf(typeToNode(name, typ.Elem(), parquet.Repetitions.Required, elemInfo), repType, fieldID))
   451  		}
   452  	case reflect.String:
   453  		// strings are byte arrays or fixedlen byte array
   454  		t := parquet.Types.ByteArray
   455  		switch physical {
   456  		case parquet.Types.Undefined, parquet.Types.ByteArray:
   457  		case parquet.Types.FixedLenByteArray:
   458  			t = parquet.Types.FixedLenByteArray
   459  		default:
   460  			panic("string fields should be of type bytearray or fixedlenbytearray only")
   461  		}
   462  
   463  		if !logical.IsNone() {
   464  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, t, typeLen, fieldID))
   465  		}
   466  
   467  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, t, converted, typeLen, precision, scale, fieldID))
   468  	case reflect.Int, reflect.Int32, reflect.Int8, reflect.Int16, reflect.Int64:
   469  		// handle integer types, default to setting the corresponding logical type
   470  		ptyp := parquet.Types.Int32
   471  		if typ.Bits() == 64 {
   472  			ptyp = parquet.Types.Int64
   473  		}
   474  
   475  		if physical != parquet.Types.Undefined {
   476  			ptyp = physical
   477  		}
   478  
   479  		if !logical.IsNone() {
   480  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID))
   481  		}
   482  
   483  		bitwidth := int8(typ.Bits())
   484  		if physical != parquet.Types.Undefined {
   485  			if ptyp == parquet.Types.Int32 {
   486  				bitwidth = 32
   487  			} else if ptyp == parquet.Types.Int64 {
   488  				bitwidth = 64
   489  			}
   490  		}
   491  
   492  		if converted != ConvertedTypes.None {
   493  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID))
   494  		}
   495  
   496  		return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, true), ptyp, 0, fieldID))
   497  	case reflect.Uint, reflect.Uint32, reflect.Uint8, reflect.Uint16, reflect.Uint64:
   498  		// handle unsigned integer types and default to the corresponding logical type for it.
   499  		ptyp := parquet.Types.Int32
   500  		if typ.Bits() == 64 {
   501  			ptyp = parquet.Types.Int64
   502  		}
   503  
   504  		if physical != parquet.Types.Undefined {
   505  			ptyp = physical
   506  		}
   507  
   508  		if !logical.IsNone() {
   509  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID))
   510  		}
   511  
   512  		bitwidth := int8(typ.Bits())
   513  		if physical != parquet.Types.Undefined {
   514  			if ptyp == parquet.Types.Int32 {
   515  				bitwidth = 32
   516  			} else if ptyp == parquet.Types.Int64 {
   517  				bitwidth = 64
   518  			}
   519  		}
   520  
   521  		if converted != ConvertedTypes.None {
   522  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID))
   523  		}
   524  
   525  		return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, false), ptyp, 0, fieldID))
   526  	case reflect.Bool:
   527  		if !logical.IsNone() {
   528  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Boolean, typeLen, fieldID))
   529  		}
   530  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Boolean, converted, typeLen, precision, scale, fieldID))
   531  	case reflect.Float32:
   532  		if !logical.IsNone() {
   533  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Float, typeLen, fieldID))
   534  		}
   535  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Float, converted, typeLen, precision, scale, fieldID))
   536  	case reflect.Float64:
   537  		if !logical.IsNone() {
   538  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Double, typeLen, fieldID))
   539  		}
   540  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Double, converted, typeLen, precision, scale, fieldID))
   541  	}
   542  	return nil
   543  }
   544  
   545  // NewSchemaFromStruct generates a schema from an object type via reflection of
   546  // the type and reading struct tags for "parquet".
   547  //
   548  // Rules
   549  //
   550  // Everything defaults to Required repetition, unless otherwise specified.
   551  // Pointer types become Optional repetition.
   552  // Arrays and Slices become logical List types unless using the tag `repetition=repeated`.
   553  //
   554  // A length specified byte field (like [5]byte) becomes a fixed_len_byte_array of that length
   555  // unless otherwise specified by tags.
   556  //
   557  // string and []byte both become ByteArray unless otherwise specified.
   558  //
   559  // Integer types will default to having a logical type of the appropriate bit width
   560  // and signedness rather than having no logical type, ie: an int8 will become an int32
   561  // node with logical type Int(bitWidth=8, signed=true).
   562  //
   563  // Structs will become group nodes with the fields of the struct as the fields of the group,
   564  // recursively creating the nodes.
   565  //
   566  // maps will become appropriate Map structures in the schema of the defined key and values.
   567  //
   568  // Available Tags
   569  //
   570  // name: by default the node will have the same name as the field, this tag let's you specify a name
   571  //
   572  // type: Specify the physical type instead of using the field type
   573  //
   574  // length: specify the type length of the node, only relevant for fixed_len_byte_array
   575  //
   576  // scale: specify the scale for a decimal field
   577  //
   578  // precision: specify the precision for a decimal field
   579  //
   580  // fieldid: specify the field ID for that node, defaults to -1 which means it is not set in the parquet file.
   581  //
   582  // repetition: specify the repetition as something other than what is determined by the type
   583  //
   584  // converted: specify the Converted Type of the field
   585  //
   586  // logical: specify the logical type of the field, if using decimal then the scale and precision
   587  // will be determined by the precision and scale fields, or by the logical.precision / logical.scale fields
   588  // with the logical. prefixed versions taking precedence. For Time or Timestamp logical types,
   589  // use logical.unit=<millis|micros|nanos> and logical.isadjustedutc=<true|false> to set those. Unit is required
   590  // isadjustedutc defaults to true. For Integer logical type, use logical.bitwidth and logical.signed to specify
   591  // those values, with bitwidth being required, and signed defaulting to true.
   592  //
   593  // All tags other than name can use a prefix of "key<tagname>=<value>" to refer to the type of the key for a map
   594  // and "value<tagname>=<value>" to refer to the value type of a map or the element of a list (such as the type of a slice)
   595  func NewSchemaFromStruct(obj interface{}) (sc *Schema, err error) {
   596  	ot := reflect.TypeOf(obj)
   597  	if ot.Kind() == reflect.Ptr {
   598  		ot = ot.Elem()
   599  	}
   600  
   601  	// typeToNode uses panics to fail fast / fail early instead of propagating
   602  	// errors up recursive stacks. so we recover here and return it as an error
   603  	defer func() {
   604  		if r := recover(); r != nil {
   605  			sc = nil
   606  			switch x := r.(type) {
   607  			case string:
   608  				err = xerrors.New(x)
   609  			case error:
   610  				err = x
   611  			default:
   612  				err = xerrors.New("unknown panic")
   613  			}
   614  		}
   615  	}()
   616  
   617  	root := typeToNode(ot.Name(), ot, parquet.Repetitions.Repeated, nil)
   618  	return NewSchema(root.(*GroupNode)), nil
   619  }
   620  
   621  var parquetTypeToReflect = map[parquet.Type]reflect.Type{
   622  	parquet.Types.Boolean:           reflect.TypeOf(true),
   623  	parquet.Types.Int32:             reflect.TypeOf(int32(0)),
   624  	parquet.Types.Int64:             reflect.TypeOf(int64(0)),
   625  	parquet.Types.Float:             reflect.TypeOf(float32(0)),
   626  	parquet.Types.Double:            reflect.TypeOf(float64(0)),
   627  	parquet.Types.Int96:             reflect.TypeOf(parquet.Int96{}),
   628  	parquet.Types.ByteArray:         reflect.TypeOf(parquet.ByteArray{}),
   629  	parquet.Types.FixedLenByteArray: reflect.TypeOf(parquet.FixedLenByteArray{}),
   630  }
   631  
   632  func typeFromNode(n Node) reflect.Type {
   633  	switch n.Type() {
   634  	case Primitive:
   635  		typ := parquetTypeToReflect[n.(*PrimitiveNode).PhysicalType()]
   636  		// if a bytearray field is annoted as a String logical type or a UTF8 converted type
   637  		// then use a string instead of parquet.ByteArray / parquet.FixedLenByteArray which are []byte
   638  		if n.LogicalType().Equals(StringLogicalType{}) || n.ConvertedType() == ConvertedTypes.UTF8 {
   639  			typ = reflect.TypeOf(string(""))
   640  		}
   641  
   642  		if n.RepetitionType() == parquet.Repetitions.Optional {
   643  			typ = reflect.PtrTo(typ)
   644  		} else if n.RepetitionType() == parquet.Repetitions.Repeated {
   645  			typ = reflect.SliceOf(typ)
   646  		}
   647  
   648  		return typ
   649  	case Group:
   650  		gnode := n.(*GroupNode)
   651  		switch gnode.ConvertedType() {
   652  		case ConvertedTypes.List:
   653  			// According to the Parquet Spec, a list should always be a 3-level structure
   654  			//
   655  			//	<list-repetition> group <name> (LIST) {
   656  			//		repeated group list {
   657  			//			<element-repetition> <element-type> element;
   658  			//		}
   659  			//	}
   660  			//
   661  			// Outer-most level must be a group annotated with LIST containing a single field named "list".
   662  			// this level must be only optional (if the list is nullable) or required
   663  			// Middle level, named list, must be repeated group with a single field named "element"
   664  			// "element" field is the lists element type and repetition, which should be only required or optional
   665  
   666  			if gnode.fields.Len() != 1 {
   667  				panic("invalid list node, should have exactly 1 child.")
   668  			}
   669  
   670  			if gnode.fields[0].RepetitionType() != parquet.Repetitions.Repeated {
   671  				panic("invalid list node, child should be repeated")
   672  			}
   673  
   674  			// it is required that the repeated group of elements is named "list" and it's element
   675  			// field is named "element", however existing data may not use this so readers shouldn't
   676  			// enforce them as errors
   677  			//
   678  			// Rules for backward compatibility from the parquet spec:
   679  			//
   680  			// 1) if the repeated field is not a group, then it's type is the element type and elements
   681  			//    must be required.
   682  			// 2) if the repeated field is a group with multiple fields, then its type is the element type
   683  			//    and elements must be required.
   684  			// 3) if the repeated field is a group with one field AND is named either "array" or uses the
   685  			//    LIST-annotated group's name with "_tuple" suffix, then the repeated type is the element
   686  			//    type and the elements must be required.
   687  			// 4) otherwise, the repeated field's type is the element type with the repeated field's repetition
   688  
   689  			elemMustBeRequired := false
   690  			addSlice := false
   691  			var elemType reflect.Type
   692  			elemNode := gnode.fields[0]
   693  			switch {
   694  			case elemNode.Type() == Primitive,
   695  				elemNode.(*GroupNode).fields.Len() > 1,
   696  				elemNode.(*GroupNode).fields.Len() == 1 && (elemNode.Name() == "array" || elemNode.Name() == gnode.Name()+"_tuple"):
   697  				elemMustBeRequired = true
   698  				elemType = typeFromNode(elemNode)
   699  			default:
   700  				addSlice = true
   701  				elemType = typeFromNode(elemNode.(*GroupNode).fields[0])
   702  			}
   703  
   704  			if elemMustBeRequired && elemType.Kind() == reflect.Ptr {
   705  				elemType = elemType.Elem()
   706  			}
   707  			if addSlice {
   708  				elemType = reflect.SliceOf(elemType)
   709  			}
   710  			if gnode.RepetitionType() == parquet.Repetitions.Optional {
   711  				elemType = reflect.PtrTo(elemType)
   712  			}
   713  			return elemType
   714  		case ConvertedTypes.Map, ConvertedTypes.MapKeyValue:
   715  			// According to the Parquet Spec, the outer-most level should be
   716  			// a group containing a single field named "key_value" with repetition
   717  			// either optional or required for whether or not the map is nullable.
   718  			//
   719  			// The key_value middle level *must* be a repeated group with a "key" field
   720  			// and *optionally* a "value" field
   721  			//
   722  			// the "key" field *must* be required and must always exist
   723  			//
   724  			// the "value" field can be required or optional or omitted.
   725  			//
   726  			// 	<map-repetition> group <name> (MAP) {
   727  			//		repeated group key_value {
   728  			//			required <key-type> key;
   729  			//			<value-repetition> <value-type> value;
   730  			//		}
   731  			//	}
   732  
   733  			if gnode.fields.Len() != 1 {
   734  				panic("invalid map node, should have exactly 1 child")
   735  			}
   736  
   737  			if gnode.fields[0].Type() != Group {
   738  				panic("invalid map node, child should be a group node")
   739  			}
   740  
   741  			// that said, this may not be used in existing data and should not be
   742  			// enforced as errors when reading.
   743  			//
   744  			// some data may also incorrectly use MAP_KEY_VALUE instead of MAP
   745  			//
   746  			// so any group with MAP_KEY_VALUE that is not contained inside of a "MAP"
   747  			// group, should be considered equivalent to being a MAP group itself.
   748  			//
   749  			// in addition, the fields may not be called "key" and "value" in existing
   750  			// data, and as such should not be enforced as errors when reading.
   751  
   752  			keyval := gnode.fields[0].(*GroupNode)
   753  
   754  			keyIndex := keyval.FieldIndexByName("key")
   755  			if keyIndex == -1 {
   756  				keyIndex = 0 // use first child if there is no child named "key"
   757  			}
   758  
   759  			keyType := typeFromNode(keyval.fields[keyIndex])
   760  			if keyType.Kind() == reflect.Ptr {
   761  				keyType = keyType.Elem()
   762  			}
   763  			// can't use a []byte as a key for a map, so use string
   764  			if keyType == reflect.TypeOf(parquet.ByteArray{}) || keyType == reflect.TypeOf(parquet.FixedLenByteArray{}) {
   765  				keyType = reflect.TypeOf(string(""))
   766  			}
   767  
   768  			// if the value node is omitted, then consider this a "set" and make it a
   769  			// map[key-type]bool
   770  			valType := reflect.TypeOf(true)
   771  			if keyval.fields.Len() > 1 {
   772  				valIndex := keyval.FieldIndexByName("value")
   773  				if valIndex == -1 {
   774  					valIndex = 1 // use second child if there is no child named "value"
   775  				}
   776  
   777  				valType = typeFromNode(keyval.fields[valIndex])
   778  			}
   779  
   780  			mapType := reflect.MapOf(keyType, valType)
   781  			if gnode.RepetitionType() == parquet.Repetitions.Optional {
   782  				mapType = reflect.PtrTo(mapType)
   783  			}
   784  			return mapType
   785  		default:
   786  			fields := []reflect.StructField{}
   787  			for _, f := range gnode.fields {
   788  				fields = append(fields, reflect.StructField{
   789  					Name:    f.Name(),
   790  					Type:    typeFromNode(f),
   791  					PkgPath: "parquet",
   792  				})
   793  			}
   794  
   795  			structType := reflect.StructOf(fields)
   796  			if gnode.RepetitionType() == parquet.Repetitions.Repeated {
   797  				return reflect.SliceOf(structType)
   798  			}
   799  			if gnode.RepetitionType() == parquet.Repetitions.Optional {
   800  				return reflect.PtrTo(structType)
   801  			}
   802  			return structType
   803  		}
   804  	}
   805  	panic("what happened?")
   806  }
   807  
   808  // NewStructFromSchema generates a struct type as a reflect.Type from the schema
   809  // by using the appropriate physical types and making things either pointers or slices
   810  // based on whether they are repeated/optional/required. It does not use the logical
   811  // or converted types to change the physical storage so that it is more efficient to use
   812  // the resulting type for reading without having to do conversions.
   813  //
   814  // It will use maps for map types and slices for list types, but otherwise ignores the
   815  // converted and logical types of the nodes. Group nodes that are not List or Map will
   816  // be nested structs.
   817  func NewStructFromSchema(sc *Schema) (t reflect.Type, err error) {
   818  	defer func() {
   819  		if r := recover(); r != nil {
   820  			t = nil
   821  			switch x := r.(type) {
   822  			case string:
   823  				err = xerrors.New(x)
   824  			case error:
   825  				err = x
   826  			default:
   827  				err = xerrors.New("unknown panic")
   828  			}
   829  		}
   830  	}()
   831  
   832  	t = typeFromNode(sc.root)
   833  	if t.Kind() == reflect.Slice || t.Kind() == reflect.Ptr {
   834  		return t.Elem(), nil
   835  	}
   836  	return
   837  }