github.com/apache/arrow/go/v10@v10.0.1/parquet/schema/reflection.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package schema
    18  
    19  import (
    20  	"fmt"
    21  	"reflect"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"github.com/apache/arrow/go/v10/parquet"
    26  	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
    27  	"golang.org/x/xerrors"
    28  )
    29  
    30  type taggedInfo struct {
    31  	Name string
    32  
    33  	Type      parquet.Type
    34  	KeyType   parquet.Type
    35  	ValueType parquet.Type
    36  
    37  	Length      int32
    38  	KeyLength   int32
    39  	ValueLength int32
    40  
    41  	Scale      int32
    42  	KeyScale   int32
    43  	ValueScale int32
    44  
    45  	Precision      int32
    46  	KeyPrecision   int32
    47  	ValuePrecision int32
    48  
    49  	FieldID      int32
    50  	KeyFieldID   int32
    51  	ValueFieldID int32
    52  
    53  	RepetitionType  parquet.Repetition
    54  	ValueRepetition parquet.Repetition
    55  
    56  	Converted      ConvertedType
    57  	KeyConverted   ConvertedType
    58  	ValueConverted ConvertedType
    59  
    60  	LogicalFields      map[string]string
    61  	KeyLogicalFields   map[string]string
    62  	ValueLogicalFields map[string]string
    63  
    64  	LogicalType      LogicalType
    65  	KeyLogicalType   LogicalType
    66  	ValueLogicalType LogicalType
    67  }
    68  
    69  func (t *taggedInfo) CopyForKey() (ret taggedInfo) {
    70  	ret = *t
    71  	ret.Type = t.KeyType
    72  	ret.Length = t.KeyLength
    73  	ret.Scale = t.KeyScale
    74  	ret.Precision = t.KeyPrecision
    75  	ret.FieldID = t.KeyFieldID
    76  	ret.RepetitionType = parquet.Repetitions.Required
    77  	ret.Converted = t.KeyConverted
    78  	ret.LogicalType = t.KeyLogicalType
    79  	return
    80  }
    81  
    82  func (t *taggedInfo) CopyForValue() (ret taggedInfo) {
    83  	ret = *t
    84  	ret.Type = t.ValueType
    85  	ret.Length = t.ValueLength
    86  	ret.Scale = t.ValueScale
    87  	ret.Precision = t.ValuePrecision
    88  	ret.FieldID = t.ValueFieldID
    89  	ret.RepetitionType = t.ValueRepetition
    90  	ret.Converted = t.ValueConverted
    91  	ret.LogicalType = t.ValueLogicalType
    92  	return
    93  }
    94  
    95  func (t *taggedInfo) UpdateLogicalTypes() {
    96  	processLogicalType := func(fields map[string]string, precision, scale int32) LogicalType {
    97  		t, ok := fields["type"]
    98  		if !ok {
    99  			return NoLogicalType{}
   100  		}
   101  
   102  		switch strings.ToLower(t) {
   103  		case "string":
   104  			return StringLogicalType{}
   105  		case "map":
   106  			return MapLogicalType{}
   107  		case "list":
   108  			return ListLogicalType{}
   109  		case "enum":
   110  			return EnumLogicalType{}
   111  		case "decimal":
   112  			if v, ok := fields["precision"]; ok {
   113  				precision = int32FromType(v)
   114  			}
   115  			if v, ok := fields["scale"]; ok {
   116  				scale = int32FromType(v)
   117  			}
   118  			return NewDecimalLogicalType(precision, scale)
   119  		case "date":
   120  			return DateLogicalType{}
   121  		case "time":
   122  			unit, ok := fields["unit"]
   123  			if !ok {
   124  				panic("must specify unit for time logical type")
   125  			}
   126  			adjustedToUtc, ok := fields["isadjustedutc"]
   127  			if !ok {
   128  				adjustedToUtc = "true"
   129  			}
   130  			return NewTimeLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(strings.ToLower(unit)))
   131  		case "timestamp":
   132  			unit, ok := fields["unit"]
   133  			if !ok {
   134  				panic("must specify unit for time logical type")
   135  			}
   136  			adjustedToUtc, ok := fields["isadjustedutc"]
   137  			if !ok {
   138  				adjustedToUtc = "true"
   139  			}
   140  			return NewTimestampLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(unit))
   141  		case "integer":
   142  			width, ok := fields["bitwidth"]
   143  			if !ok {
   144  				panic("must specify bitwidth if explicitly setting integer logical type")
   145  			}
   146  			signed, ok := fields["signed"]
   147  			if !ok {
   148  				signed = "true"
   149  			}
   150  
   151  			return NewIntLogicalType(int8(int32FromType(width)), boolFromStr(signed))
   152  		case "null":
   153  			return NullLogicalType{}
   154  		case "json":
   155  			return JSONLogicalType{}
   156  		case "bson":
   157  			return BSONLogicalType{}
   158  		case "uuid":
   159  			return UUIDLogicalType{}
   160  		default:
   161  			panic(fmt.Errorf("invalid logical type specified: %s", t))
   162  		}
   163  	}
   164  
   165  	t.LogicalType = processLogicalType(t.LogicalFields, t.Precision, t.Scale)
   166  	t.KeyLogicalType = processLogicalType(t.KeyLogicalFields, t.KeyPrecision, t.KeyScale)
   167  	t.ValueLogicalType = processLogicalType(t.ValueLogicalFields, t.ValuePrecision, t.ValueScale)
   168  }
   169  
   170  func newTaggedInfo() taggedInfo {
   171  	return taggedInfo{
   172  		Type:               parquet.Types.Undefined,
   173  		KeyType:            parquet.Types.Undefined,
   174  		ValueType:          parquet.Types.Undefined,
   175  		RepetitionType:     parquet.Repetitions.Undefined,
   176  		ValueRepetition:    parquet.Repetitions.Undefined,
   177  		Converted:          ConvertedTypes.NA,
   178  		KeyConverted:       ConvertedTypes.NA,
   179  		ValueConverted:     ConvertedTypes.NA,
   180  		FieldID:            -1,
   181  		KeyFieldID:         -1,
   182  		ValueFieldID:       -1,
   183  		LogicalFields:      make(map[string]string),
   184  		KeyLogicalFields:   make(map[string]string),
   185  		ValueLogicalFields: make(map[string]string),
   186  		LogicalType:        NoLogicalType{},
   187  		KeyLogicalType:     NoLogicalType{},
   188  		ValueLogicalType:   NoLogicalType{},
   189  	}
   190  }
   191  
   192  var int32FromType = func(v string) int32 {
   193  	val, err := strconv.Atoi(v)
   194  	if err != nil {
   195  		panic(err)
   196  	}
   197  	return int32(val)
   198  }
   199  
   200  var boolFromStr = func(v string) bool {
   201  	val, err := strconv.ParseBool(v)
   202  	if err != nil {
   203  		panic(err)
   204  	}
   205  	return val
   206  }
   207  
   208  func infoFromTags(f reflect.StructTag) *taggedInfo {
   209  	typeFromStr := func(v string) parquet.Type {
   210  		t, err := format.TypeFromString(strings.ToUpper(v))
   211  		if err != nil {
   212  			panic(fmt.Errorf("invalid type specified: %s", v))
   213  		}
   214  		return parquet.Type(t)
   215  	}
   216  
   217  	repFromStr := func(v string) parquet.Repetition {
   218  		r, err := format.FieldRepetitionTypeFromString(strings.ToUpper(v))
   219  		if err != nil {
   220  			panic(err)
   221  		}
   222  		return parquet.Repetition(r)
   223  	}
   224  
   225  	convertedFromStr := func(v string) ConvertedType {
   226  		c, err := format.ConvertedTypeFromString(strings.ToUpper(v))
   227  		if err != nil {
   228  			panic(err)
   229  		}
   230  		return ConvertedType(c)
   231  	}
   232  
   233  	if ptags, ok := f.Lookup("parquet"); ok {
   234  		info := newTaggedInfo()
   235  		for _, tag := range strings.Split(strings.Replace(ptags, "\t", "", -1), ",") {
   236  			tag = strings.TrimSpace(tag)
   237  			kv := strings.SplitN(tag, "=", 2)
   238  			key := strings.TrimSpace(strings.ToLower(kv[0]))
   239  			value := strings.TrimSpace(kv[1])
   240  
   241  			switch key {
   242  			case "name":
   243  				info.Name = value
   244  			case "type":
   245  				info.Type = typeFromStr(value)
   246  			case "keytype":
   247  				info.KeyType = typeFromStr(value)
   248  			case "valuetype":
   249  				info.ValueType = typeFromStr(value)
   250  			case "length":
   251  				info.Length = int32FromType(value)
   252  			case "keylength":
   253  				info.KeyLength = int32FromType(value)
   254  			case "valuelength":
   255  				info.ValueLength = int32FromType(value)
   256  			case "scale":
   257  				info.Scale = int32FromType(value)
   258  			case "keyscale":
   259  				info.KeyScale = int32FromType(value)
   260  			case "valuescale":
   261  				info.ValueScale = int32FromType(value)
   262  			case "precision":
   263  				info.Precision = int32FromType(value)
   264  			case "keyprecision":
   265  				info.KeyPrecision = int32FromType(value)
   266  			case "valueprecision":
   267  				info.ValuePrecision = int32FromType(value)
   268  			case "fieldid":
   269  				info.FieldID = int32FromType(value)
   270  			case "keyfieldid":
   271  				info.KeyFieldID = int32FromType(value)
   272  			case "valuefieldid":
   273  				info.ValueFieldID = int32FromType(value)
   274  			case "repetition":
   275  				info.RepetitionType = repFromStr(value)
   276  			case "valuerepetition":
   277  				info.ValueRepetition = repFromStr(value)
   278  			case "converted":
   279  				info.Converted = convertedFromStr(value)
   280  			case "keyconverted":
   281  				info.KeyConverted = convertedFromStr(value)
   282  			case "valueconverted":
   283  				info.ValueConverted = convertedFromStr(value)
   284  			case "logical":
   285  				info.LogicalFields["type"] = value
   286  			case "keylogical":
   287  				info.KeyLogicalFields["type"] = value
   288  			case "valuelogical":
   289  				info.ValueLogicalFields["type"] = value
   290  			default:
   291  				switch {
   292  				case strings.HasPrefix(key, "logical."):
   293  					info.LogicalFields[strings.TrimPrefix(key, "logical.")] = value
   294  				case strings.HasPrefix(key, "keylogical."):
   295  					info.KeyLogicalFields[strings.TrimPrefix(key, "keylogical.")] = value
   296  				case strings.HasPrefix(key, "valuelogical."):
   297  					info.ValueLogicalFields[strings.TrimPrefix(key, "valuelogical.")] = value
   298  				}
   299  			}
   300  		}
   301  		info.UpdateLogicalTypes()
   302  		return &info
   303  	}
   304  	return nil
   305  }
   306  
   307  // typeToNode recurseively converts a physical type and the tag info into parquet Nodes
   308  //
   309  // to avoid having to propagate errors up potentially high numbers of recursive calls
   310  // we use panics and then recover in the public function NewSchemaFromStruct so that a
   311  // failure very far down the stack quickly unwinds.
   312  func typeToNode(name string, typ reflect.Type, repType parquet.Repetition, info *taggedInfo) Node {
   313  	// set up our default values for everything
   314  	var (
   315  		converted             = ConvertedTypes.None
   316  		logical   LogicalType = NoLogicalType{}
   317  		fieldID               = int32(-1)
   318  		physical              = parquet.Types.Undefined
   319  		typeLen               = 0
   320  		precision             = 0
   321  		scale                 = 0
   322  	)
   323  	if info != nil { // we have struct tag info to process
   324  		fieldID = info.FieldID
   325  		if info.Converted != ConvertedTypes.NA {
   326  			converted = info.Converted
   327  		}
   328  		logical = info.LogicalType
   329  		physical = info.Type
   330  		typeLen = int(info.Length)
   331  		precision = int(info.Precision)
   332  		scale = int(info.Scale)
   333  
   334  		if info.Name != "" {
   335  			name = info.Name
   336  		}
   337  		if info.RepetitionType != parquet.Repetitions.Undefined {
   338  			repType = info.RepetitionType
   339  		}
   340  	}
   341  
   342  	// simplify the logic by switching based on the reflection Kind
   343  	switch typ.Kind() {
   344  	case reflect.Map:
   345  		// a map must have a logical type of MAP or have no tag for logical type in which case
   346  		// we assume MAP logical type.
   347  		if !logical.IsNone() && !logical.Equals(MapLogicalType{}) {
   348  			panic("cannot set logical type to something other than map for a map")
   349  		}
   350  
   351  		infoCopy := newTaggedInfo()
   352  		if info != nil { // populate any value specific tags to propagate for the value type
   353  			infoCopy = info.CopyForValue()
   354  		}
   355  
   356  		// create the node for the value type of the map
   357  		value := typeToNode("value", typ.Elem(), parquet.Repetitions.Required, &infoCopy)
   358  		if info != nil { // change our copy to now use the key specific tags if they exist
   359  			infoCopy = info.CopyForKey()
   360  		}
   361  
   362  		// create the node for the key type of the map
   363  		key := typeToNode("key", typ.Key(), parquet.Repetitions.Required, &infoCopy)
   364  		if key.RepetitionType() != parquet.Repetitions.Required { // key cannot be optional
   365  			panic("key type of map must be Required")
   366  		}
   367  		return Must(MapOf(name, key, value, repType, fieldID))
   368  	case reflect.Struct:
   369  		// structs are Group nodes
   370  		fields := make(FieldList, 0)
   371  		for i := 0; i < typ.NumField(); i++ {
   372  			f := typ.Field(i)
   373  
   374  			fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, infoFromTags(f.Tag)))
   375  		}
   376  		// group nodes don't have a physical type
   377  		if physical != parquet.Types.Undefined {
   378  			panic("cannot specify custom type on struct")
   379  		}
   380  		// group nodes don't have converted or logical types
   381  		if converted != ConvertedTypes.None {
   382  			panic("cannot specify converted types for a struct")
   383  		}
   384  		if !logical.IsNone() {
   385  			panic("cannot specify logicaltype for a struct")
   386  		}
   387  		return Must(NewGroupNode(name, repType, fields, fieldID))
   388  	case reflect.Ptr: // if we encounter a pointer create a node for the type it points to, but mark it as optional
   389  		return typeToNode(name, typ.Elem(), parquet.Repetitions.Optional, info)
   390  	case reflect.Array:
   391  		// arrays are repeated or fixed size
   392  		if typ == reflect.TypeOf(parquet.Int96{}) {
   393  			return NewInt96Node(name, repType, fieldID)
   394  		}
   395  
   396  		if typ.Elem() == reflect.TypeOf(byte(0)) { // something like [12]byte translates to FixedLenByteArray with length 12
   397  			if physical == parquet.Types.Undefined {
   398  				physical = parquet.Types.FixedLenByteArray
   399  			}
   400  			if typeLen == 0 { // if there was no type length specified in the tag, use the length of the type.
   401  				typeLen = typ.Len()
   402  			}
   403  			if !logical.IsNone() {
   404  				return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID))
   405  			}
   406  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID))
   407  		}
   408  		fallthrough // if it's not a fixed len byte array type, then just treat it like a slice
   409  	case reflect.Slice:
   410  		// for slices, we default to treating them as lists unless the repetition type is set to REPEATED or they are
   411  		// a bytearray/fixedlenbytearray
   412  		switch {
   413  		case repType == parquet.Repetitions.Repeated:
   414  			return typeToNode(name, typ.Elem(), parquet.Repetitions.Repeated, info)
   415  		case physical == parquet.Types.FixedLenByteArray || physical == parquet.Types.ByteArray:
   416  			if typ.Elem() != reflect.TypeOf(byte(0)) {
   417  				panic("slice with physical type ByteArray or FixedLenByteArray must be []byte")
   418  			}
   419  			fallthrough
   420  		case typ.Elem() == reflect.TypeOf(byte(0)):
   421  			if physical == parquet.Types.Undefined {
   422  				physical = parquet.Types.ByteArray
   423  			}
   424  			if !logical.IsNone() {
   425  				return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID))
   426  			}
   427  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID))
   428  		default:
   429  			var elemInfo *taggedInfo
   430  			if info != nil {
   431  				elemInfo = &taggedInfo{}
   432  				*elemInfo = info.CopyForValue()
   433  			}
   434  
   435  			if !logical.IsNone() && !logical.Equals(ListLogicalType{}) {
   436  				panic("slice must either be repeated or a List type")
   437  			}
   438  			if converted != ConvertedTypes.None && converted != ConvertedTypes.List {
   439  				panic("slice must either be repeated or a List type")
   440  			}
   441  			return Must(ListOf(typeToNode(name, typ.Elem(), parquet.Repetitions.Required, elemInfo), repType, fieldID))
   442  		}
   443  	case reflect.String:
   444  		// strings are byte arrays or fixedlen byte array
   445  		t := parquet.Types.ByteArray
   446  		switch physical {
   447  		case parquet.Types.Undefined, parquet.Types.ByteArray:
   448  		case parquet.Types.FixedLenByteArray:
   449  			t = parquet.Types.FixedLenByteArray
   450  		default:
   451  			panic("string fields should be of type bytearray or fixedlenbytearray only")
   452  		}
   453  
   454  		if !logical.IsNone() {
   455  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, t, typeLen, fieldID))
   456  		}
   457  
   458  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, t, converted, typeLen, precision, scale, fieldID))
   459  	case reflect.Int, reflect.Int32, reflect.Int8, reflect.Int16, reflect.Int64:
   460  		// handle integer types, default to setting the corresponding logical type
   461  		ptyp := parquet.Types.Int32
   462  		if typ.Bits() == 64 {
   463  			ptyp = parquet.Types.Int64
   464  		}
   465  
   466  		if physical != parquet.Types.Undefined {
   467  			ptyp = physical
   468  		}
   469  
   470  		if !logical.IsNone() {
   471  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID))
   472  		}
   473  
   474  		bitwidth := int8(typ.Bits())
   475  		if physical != parquet.Types.Undefined {
   476  			if ptyp == parquet.Types.Int32 {
   477  				bitwidth = 32
   478  			} else if ptyp == parquet.Types.Int64 {
   479  				bitwidth = 64
   480  			}
   481  		}
   482  
   483  		if converted != ConvertedTypes.None {
   484  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID))
   485  		}
   486  
   487  		return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, true), ptyp, 0, fieldID))
   488  	case reflect.Uint, reflect.Uint32, reflect.Uint8, reflect.Uint16, reflect.Uint64:
   489  		// handle unsigned integer types and default to the corresponding logical type for it.
   490  		ptyp := parquet.Types.Int32
   491  		if typ.Bits() == 64 {
   492  			ptyp = parquet.Types.Int64
   493  		}
   494  
   495  		if physical != parquet.Types.Undefined {
   496  			ptyp = physical
   497  		}
   498  
   499  		if !logical.IsNone() {
   500  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID))
   501  		}
   502  
   503  		bitwidth := int8(typ.Bits())
   504  		if physical != parquet.Types.Undefined {
   505  			if ptyp == parquet.Types.Int32 {
   506  				bitwidth = 32
   507  			} else if ptyp == parquet.Types.Int64 {
   508  				bitwidth = 64
   509  			}
   510  		}
   511  
   512  		if converted != ConvertedTypes.None {
   513  			return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID))
   514  		}
   515  
   516  		return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, false), ptyp, 0, fieldID))
   517  	case reflect.Bool:
   518  		if !logical.IsNone() {
   519  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Boolean, typeLen, fieldID))
   520  		}
   521  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Boolean, converted, typeLen, precision, scale, fieldID))
   522  	case reflect.Float32:
   523  		if !logical.IsNone() {
   524  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Float, typeLen, fieldID))
   525  		}
   526  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Float, converted, typeLen, precision, scale, fieldID))
   527  	case reflect.Float64:
   528  		if !logical.IsNone() {
   529  			return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Double, typeLen, fieldID))
   530  		}
   531  		return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Double, converted, typeLen, precision, scale, fieldID))
   532  	}
   533  	return nil
   534  }
   535  
   536  // NewSchemaFromStruct generates a schema from an object type via reflection of
   537  // the type and reading struct tags for "parquet".
   538  //
   539  // Rules
   540  //
   541  // Everything defaults to Required repetition, unless otherwise specified.
   542  // Pointer types become Optional repetition.
   543  // Arrays and Slices become logical List types unless using the tag `repetition=repeated`.
   544  //
   545  // A length specified byte field (like [5]byte) becomes a fixed_len_byte_array of that length
   546  // unless otherwise specified by tags.
   547  //
   548  // string and []byte both become ByteArray unless otherwise specified.
   549  //
   550  // Integer types will default to having a logical type of the appropriate bit width
   551  // and signedness rather than having no logical type, ie: an int8 will become an int32
   552  // node with logical type Int(bitWidth=8, signed=true).
   553  //
   554  // Structs will become group nodes with the fields of the struct as the fields of the group,
   555  // recursively creating the nodes.
   556  //
   557  // maps will become appropriate Map structures in the schema of the defined key and values.
   558  //
   559  // Available Tags
   560  //
   561  // name: by default the node will have the same name as the field, this tag let's you specify a name
   562  //
   563  // type: Specify the physical type instead of using the field type
   564  //
   565  // length: specify the type length of the node, only relevant for fixed_len_byte_array
   566  //
   567  // scale: specify the scale for a decimal field
   568  //
   569  // precision: specify the precision for a decimal field
   570  //
   571  // fieldid: specify the field ID for that node, defaults to -1 which means it is not set in the parquet file.
   572  //
   573  // repetition: specify the repetition as something other than what is determined by the type
   574  //
   575  // converted: specify the Converted Type of the field
   576  //
   577  // logical: specify the logical type of the field, if using decimal then the scale and precision
   578  // will be determined by the precision and scale fields, or by the logical.precision / logical.scale fields
   579  // with the logical. prefixed versions taking precedence. For Time or Timestamp logical types,
   580  // use logical.unit=<millis|micros|nanos> and logical.isadjustedutc=<true|false> to set those. Unit is required
   581  // isadjustedutc defaults to true. For Integer logical type, use logical.bitwidth and logical.signed to specify
   582  // those values, with bitwidth being required, and signed defaulting to true.
   583  //
   584  // All tags other than name can use a prefix of "key<tagname>=<value>" to refer to the type of the key for a map
   585  // and "value<tagname>=<value>" to refer to the value type of a map or the element of a list (such as the type of a slice)
   586  func NewSchemaFromStruct(obj interface{}) (sc *Schema, err error) {
   587  	ot := reflect.TypeOf(obj)
   588  	if ot.Kind() == reflect.Ptr {
   589  		ot = ot.Elem()
   590  	}
   591  
   592  	// typeToNode uses panics to fail fast / fail early instead of propagating
   593  	// errors up recursive stacks. so we recover here and return it as an error
   594  	defer func() {
   595  		if r := recover(); r != nil {
   596  			sc = nil
   597  			switch x := r.(type) {
   598  			case string:
   599  				err = xerrors.New(x)
   600  			case error:
   601  				err = x
   602  			default:
   603  				err = xerrors.New("unknown panic")
   604  			}
   605  		}
   606  	}()
   607  
   608  	root := typeToNode(ot.Name(), ot, parquet.Repetitions.Repeated, nil)
   609  	return NewSchema(root.(*GroupNode)), nil
   610  }
   611  
   612  var parquetTypeToReflect = map[parquet.Type]reflect.Type{
   613  	parquet.Types.Boolean:           reflect.TypeOf(true),
   614  	parquet.Types.Int32:             reflect.TypeOf(int32(0)),
   615  	parquet.Types.Int64:             reflect.TypeOf(int64(0)),
   616  	parquet.Types.Float:             reflect.TypeOf(float32(0)),
   617  	parquet.Types.Double:            reflect.TypeOf(float64(0)),
   618  	parquet.Types.Int96:             reflect.TypeOf(parquet.Int96{}),
   619  	parquet.Types.ByteArray:         reflect.TypeOf(parquet.ByteArray{}),
   620  	parquet.Types.FixedLenByteArray: reflect.TypeOf(parquet.FixedLenByteArray{}),
   621  }
   622  
   623  func typeFromNode(n Node) reflect.Type {
   624  	switch n.Type() {
   625  	case Primitive:
   626  		typ := parquetTypeToReflect[n.(*PrimitiveNode).PhysicalType()]
   627  		// if a bytearray field is annoted as a String logical type or a UTF8 converted type
   628  		// then use a string instead of parquet.ByteArray / parquet.FixedLenByteArray which are []byte
   629  		if n.LogicalType().Equals(StringLogicalType{}) || n.ConvertedType() == ConvertedTypes.UTF8 {
   630  			typ = reflect.TypeOf(string(""))
   631  		}
   632  
   633  		if n.RepetitionType() == parquet.Repetitions.Optional {
   634  			typ = reflect.PtrTo(typ)
   635  		} else if n.RepetitionType() == parquet.Repetitions.Repeated {
   636  			typ = reflect.SliceOf(typ)
   637  		}
   638  
   639  		return typ
   640  	case Group:
   641  		gnode := n.(*GroupNode)
   642  		switch gnode.ConvertedType() {
   643  		case ConvertedTypes.List:
   644  			// According to the Parquet Spec, a list should always be a 3-level structure
   645  			//
   646  			//	<list-repetition> group <name> (LIST) {
   647  			//		repeated group list {
   648  			//			<element-repetition> <element-type> element;
   649  			//		}
   650  			//	}
   651  			//
   652  			// Outer-most level must be a group annotated with LIST containing a single field named "list".
   653  			// this level must be only optional (if the list is nullable) or required
   654  			// Middle level, named list, must be repeated group with a single field named "element"
   655  			// "element" field is the lists element type and repetition, which should be only required or optional
   656  
   657  			if gnode.fields.Len() != 1 {
   658  				panic("invalid list node, should have exactly 1 child.")
   659  			}
   660  
   661  			if gnode.fields[0].RepetitionType() != parquet.Repetitions.Repeated {
   662  				panic("invalid list node, child should be repeated")
   663  			}
   664  
   665  			// it is required that the repeated group of elements is named "list" and it's element
   666  			// field is named "element", however existing data may not use this so readers shouldn't
   667  			// enforce them as errors
   668  			//
   669  			// Rules for backward compatibility from the parquet spec:
   670  			//
   671  			// 1) if the repeated field is not a group, then it's type is the element type and elements
   672  			//    must be required.
   673  			// 2) if the repeated field is a group with multiple fields, then its type is the element type
   674  			//    and elements must be required.
   675  			// 3) if the repeated field is a group with one field AND is named either "array" or uses the
   676  			//    LIST-annotated group's name with "_tuple" suffix, then the repeated type is the element
   677  			//    type and the elements must be required.
   678  			// 4) otherwise, the repeated field's type is the element type with the repeated field's repetition
   679  
   680  			elemMustBeRequired := false
   681  			addSlice := false
   682  			var elemType reflect.Type
   683  			elemNode := gnode.fields[0]
   684  			switch {
   685  			case elemNode.Type() == Primitive,
   686  				elemNode.(*GroupNode).fields.Len() > 1,
   687  				elemNode.(*GroupNode).fields.Len() == 1 && (elemNode.Name() == "array" || elemNode.Name() == gnode.Name()+"_tuple"):
   688  				elemMustBeRequired = true
   689  				elemType = typeFromNode(elemNode)
   690  			default:
   691  				addSlice = true
   692  				elemType = typeFromNode(elemNode.(*GroupNode).fields[0])
   693  			}
   694  
   695  			if elemMustBeRequired && elemType.Kind() == reflect.Ptr {
   696  				elemType = elemType.Elem()
   697  			}
   698  			if addSlice {
   699  				elemType = reflect.SliceOf(elemType)
   700  			}
   701  			if gnode.RepetitionType() == parquet.Repetitions.Optional {
   702  				elemType = reflect.PtrTo(elemType)
   703  			}
   704  			return elemType
   705  		case ConvertedTypes.Map, ConvertedTypes.MapKeyValue:
   706  			// According to the Parquet Spec, the outer-most level should be
   707  			// a group containing a single field named "key_value" with repetition
   708  			// either optional or required for whether or not the map is nullable.
   709  			//
   710  			// The key_value middle level *must* be a repeated group with a "key" field
   711  			// and *optionally* a "value" field
   712  			//
   713  			// the "key" field *must* be required and must always exist
   714  			//
   715  			// the "value" field can be required or optional or omitted.
   716  			//
   717  			// 	<map-repetition> group <name> (MAP) {
   718  			//		repeated group key_value {
   719  			//			required <key-type> key;
   720  			//			<value-repetition> <value-type> value;
   721  			//		}
   722  			//	}
   723  
   724  			if gnode.fields.Len() != 1 {
   725  				panic("invalid map node, should have exactly 1 child")
   726  			}
   727  
   728  			if gnode.fields[0].Type() != Group {
   729  				panic("invalid map node, child should be a group node")
   730  			}
   731  
   732  			// that said, this may not be used in existing data and should not be
   733  			// enforced as errors when reading.
   734  			//
   735  			// some data may also incorrectly use MAP_KEY_VALUE instead of MAP
   736  			//
   737  			// so any group with MAP_KEY_VALUE that is not contained inside of a "MAP"
   738  			// group, should be considered equivalent to being a MAP group itself.
   739  			//
   740  			// in addition, the fields may not be called "key" and "value" in existing
   741  			// data, and as such should not be enforced as errors when reading.
   742  
   743  			keyval := gnode.fields[0].(*GroupNode)
   744  
   745  			keyIndex := keyval.FieldIndexByName("key")
   746  			if keyIndex == -1 {
   747  				keyIndex = 0 // use first child if there is no child named "key"
   748  			}
   749  
   750  			keyType := typeFromNode(keyval.fields[keyIndex])
   751  			if keyType.Kind() == reflect.Ptr {
   752  				keyType = keyType.Elem()
   753  			}
   754  			// can't use a []byte as a key for a map, so use string
   755  			if keyType == reflect.TypeOf(parquet.ByteArray{}) || keyType == reflect.TypeOf(parquet.FixedLenByteArray{}) {
   756  				keyType = reflect.TypeOf(string(""))
   757  			}
   758  
   759  			// if the value node is omitted, then consider this a "set" and make it a
   760  			// map[key-type]bool
   761  			valType := reflect.TypeOf(true)
   762  			if keyval.fields.Len() > 1 {
   763  				valIndex := keyval.FieldIndexByName("value")
   764  				if valIndex == -1 {
   765  					valIndex = 1 // use second child if there is no child named "value"
   766  				}
   767  
   768  				valType = typeFromNode(keyval.fields[valIndex])
   769  			}
   770  
   771  			mapType := reflect.MapOf(keyType, valType)
   772  			if gnode.RepetitionType() == parquet.Repetitions.Optional {
   773  				mapType = reflect.PtrTo(mapType)
   774  			}
   775  			return mapType
   776  		default:
   777  			fields := []reflect.StructField{}
   778  			for _, f := range gnode.fields {
   779  				fields = append(fields, reflect.StructField{
   780  					Name:    f.Name(),
   781  					Type:    typeFromNode(f),
   782  					PkgPath: "parquet",
   783  				})
   784  			}
   785  
   786  			structType := reflect.StructOf(fields)
   787  			if gnode.RepetitionType() == parquet.Repetitions.Repeated {
   788  				return reflect.SliceOf(structType)
   789  			}
   790  			if gnode.RepetitionType() == parquet.Repetitions.Optional {
   791  				return reflect.PtrTo(structType)
   792  			}
   793  			return structType
   794  		}
   795  	}
   796  	panic("what happened?")
   797  }
   798  
   799  // NewStructFromSchema generates a struct type as a reflect.Type from the schema
   800  // by using the appropriate physical types and making things either pointers or slices
   801  // based on whether they are repeated/optional/required. It does not use the logical
   802  // or converted types to change the physical storage so that it is more efficient to use
   803  // the resulting type for reading without having to do conversions.
   804  //
   805  // It will use maps for map types and slices for list types, but otherwise ignores the
   806  // converted and logical types of the nodes. Group nodes that are not List or Map will
   807  // be nested structs.
   808  func NewStructFromSchema(sc *Schema) (t reflect.Type, err error) {
   809  	defer func() {
   810  		if r := recover(); r != nil {
   811  			t = nil
   812  			switch x := r.(type) {
   813  			case string:
   814  				err = xerrors.New(x)
   815  			case error:
   816  				err = x
   817  			default:
   818  				err = xerrors.New("unknown panic")
   819  			}
   820  		}
   821  	}()
   822  
   823  	t = typeFromNode(sc.root)
   824  	if t.Kind() == reflect.Slice || t.Kind() == reflect.Ptr {
   825  		return t.Elem(), nil
   826  	}
   827  	return
   828  }