github.com/apache/arrow/go/v16@v16.1.0/parquet/pqarrow/schema.go

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"encoding/base64"
    21  	"fmt"
    22  	"math"
    23  	"strconv"
    24  
    25  	"github.com/apache/arrow/go/v16/arrow"
    26  	"github.com/apache/arrow/go/v16/arrow/decimal128"
    27  	"github.com/apache/arrow/go/v16/arrow/flight"
    28  	"github.com/apache/arrow/go/v16/arrow/ipc"
    29  	"github.com/apache/arrow/go/v16/arrow/memory"
    30  	"github.com/apache/arrow/go/v16/parquet"
    31  	"github.com/apache/arrow/go/v16/parquet/file"
    32  	"github.com/apache/arrow/go/v16/parquet/metadata"
    33  	"github.com/apache/arrow/go/v16/parquet/schema"
    34  	"golang.org/x/xerrors"
    35  )
    36  
    37  // SchemaField is a holder that defines a specific logical field in the schema
    38  // which could potentially refer to multiple physical columns in the underlying
    39  // parquet file if it is a nested type.
    40  //
    41  // ColIndex is only populated (not -1) when it is a leaf column.
    42  type SchemaField struct {
    43  	Field     *arrow.Field
    44  	Children  []SchemaField
    45  	ColIndex  int
    46  	LevelInfo file.LevelInfo
    47  }
    48  
     49  // IsLeaf returns true if the SchemaField is a leaf column, i.e. ColIndex != -1
    50  func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 }
    51  
    52  // SchemaManifest represents a full manifest for mapping a Parquet schema
    53  // to an arrow Schema.
    54  type SchemaManifest struct {
    55  	descr        *schema.Schema
    56  	OriginSchema *arrow.Schema
    57  	SchemaMeta   *arrow.Metadata
    58  
    59  	ColIndexToField map[int]*SchemaField
    60  	ChildToParent   map[*SchemaField]*SchemaField
    61  	Fields          []SchemaField
    62  }
    63  
    64  // GetColumnField returns the corresponding Field for a given column index.
    65  func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) {
    66  	if field, ok := sm.ColIndexToField[index]; ok {
    67  		return field, nil
    68  	}
     69  	return nil, fmt.Errorf("column index %d not found in schema manifest", index)
    70  }
    71  
     72  // GetParent returns the parent field of the given field if it is a nested column,
     73  // or nil if there is no parent.
    74  func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField {
    75  	if p, ok := sm.ChildToParent[field]; ok {
    76  		return p
    77  	}
    78  	return nil
    79  }
    80  
     81  // GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow.Schema)
     82  // which correspond to the column root (first node below the parquet schema's root group) of
     83  // each leaf referenced in indices.
     84  //
     85  // For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (indices=[0,1,3])
     86  // the roots are `a` and `i` (return=[0,2]).
    87  //
    88  // root
    89  // -- a  <------
    90  // -- -- b  |  |
    91  // -- -- -- c  |
    92  // -- -- -- d  |
    93  // -- -- -- -- e
    94  // -- f
    95  // -- -- g
    96  // -- -- -- h
    97  // -- i  <---
    98  // -- -- j  |
    99  // -- -- -- k
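         //
         // In code, for a hypothetical manifest built over the schema sketched above:
         //
         //	roots, err := manifest.GetFieldIndices([]int{0, 1, 3}) // leaves c, e and k
         //	// err == nil, roots == []int{0, 2}                    // roots a and i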
   100  func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) {
   101  	added := make(map[int]bool)
   102  	ret := make([]int, 0)
   103  
   104  	for _, idx := range indices {
   105  		if idx < 0 || idx >= sm.descr.NumColumns() {
   106  			return nil, fmt.Errorf("column index %d is not valid", idx)
   107  		}
   108  
   109  		fieldNode := sm.descr.ColumnRoot(idx)
   110  		fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode)
   111  		if fieldIdx == -1 {
   112  			return nil, fmt.Errorf("column index %d is not valid", idx)
   113  		}
   114  
   115  		if _, ok := added[fieldIdx]; !ok {
   116  			ret = append(ret, fieldIdx)
   117  			added[fieldIdx] = true
   118  		}
   119  	}
   120  	return ret, nil
   121  }
   122  
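         // isDictionaryReadSupported reports whether a column of the given arrow type can
         // be read directly as dictionary-encoded data; currently only the binary-like
         // types (see arrow.IsBinaryLike) qualify.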
   123  func isDictionaryReadSupported(dt arrow.DataType) bool {
   124  	return arrow.IsBinaryLike(dt.ID())
   125  }
   126  
   127  func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType {
   128  	isAdjustedToUTC := typ.TimeZone != ""
   129  
   130  	// for forward compatibility reasons, and because there's no other way
   131  	// to signal to old readers that values are timestamps, we force
    132  	// the ConvertedType field to be set to the corresponding TIMESTAMP_* value.
    133  	// this does cause some ambiguity, as parquet readers have not been consistent
    134  	// about interpreting TIMESTAMP_* values as UTC-normalized
    135  	// (see ARROW-5878)
   136  	var scunit schema.TimeUnitType
   137  	switch unit {
   138  	case arrow.Millisecond:
   139  		scunit = schema.TimeUnitMillis
   140  	case arrow.Microsecond:
   141  		scunit = schema.TimeUnitMicros
   142  	case arrow.Nanosecond:
   143  		scunit = schema.TimeUnitNanos
   144  	case arrow.Second:
   145  		// no equivalent in parquet
   146  		return schema.NoLogicalType{}
   147  	}
   148  
   149  	return schema.NewTimestampLogicalTypeForce(isAdjustedToUTC, scunit)
   150  }
   151  
   152  func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) {
   153  	coerce := arrprops.coerceTimestamps
   154  	target := typ.Unit
   155  	if coerce {
   156  		target = arrprops.coerceTimestampUnit
   157  	}
   158  
   159  	// user is explicitly asking for int96, no logical type
   160  	if arrprops.timestampAsInt96 && target == arrow.Nanosecond {
   161  		return parquet.Types.Int96, schema.NoLogicalType{}, nil
   162  	}
   163  
   164  	physical := parquet.Types.Int64
   165  	logicalType := arrowTimestampToLogical(typ, target)
   166  
   167  	// user is explicitly asking for timestamp data to be converted to the specified
   168  	// units (target) via coercion
   169  	if coerce {
   170  		if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 {
   171  			switch target {
   172  			case arrow.Millisecond, arrow.Microsecond:
   173  			case arrow.Nanosecond, arrow.Second:
   174  				return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version())
   175  			}
   176  		} else if target == arrow.Second {
   177  			return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis, micros or nanos", props.Version())
   178  		}
   179  		return physical, logicalType, nil
   180  	}
   181  
    182  	// the user implicitly wants timestamp data to retain its original time units;
    183  	// however, the ConvertedType field used to indicate logical types for parquet
    184  	// version <= 2.4 files does not allow for nanosecond time units, so nanos
    185  	// must be coerced to micros
   186  	if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond {
   187  		logicalType = arrowTimestampToLogical(typ, arrow.Microsecond)
   188  		return physical, logicalType, nil
   189  	}
   190  
    191  	// the user implicitly wants timestamp data to retain its original time units;
    192  	// however, the arrow seconds time unit cannot be represented in parquet, so it
    193  	// must be coerced to milliseconds
   194  	if typ.Unit == arrow.Second {
   195  		logicalType = arrowTimestampToLogical(typ, arrow.Millisecond)
   196  	}
   197  
   198  	return physical, logicalType, nil
   199  }
   200  
   201  // DecimalSize returns the minimum number of bytes necessary to represent a decimal
   202  // with the requested precision.
   203  //
    204  // Taken from the Apache Impala codebase. Each entry in the lookup table below is
    205  // the smallest byte width whose two's-complement range can represent the maximum
    206  // value expressible with that precision.
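         //
         // For example, DecimalSize(9) == 4 and DecimalSize(18) == 8, i.e. precisions of
         // up to 9 and 18 digits fit in an int32 and int64 respectively (which is why
         // fieldToNode can store such decimals as INT32/INT64 when StoreDecimalAsInteger
         // is enabled), while DecimalSize(38) == 16 (decimal128).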
   207  func DecimalSize(precision int32) int32 {
   208  	if precision < 1 {
   209  		panic("precision must be >= 1")
   210  	}
   211  
   212  	// generated in python with:
   213  	// >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
   214  	// >>> [-1] + [decimal_size(i) for i in range(1, 77)]
   215  	var byteblock = [...]int32{
   216  		-1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
   217  		9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
   218  		17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
   219  		26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32,
   220  	}
   221  
   222  	if precision <= 76 {
   223  		return byteblock[precision]
   224  	}
    225  	return int32(math.Ceil((float64(precision)*math.Log2(10) + 1) / 8.0))
   226  }
   227  
   228  func repFromNullable(isnullable bool) parquet.Repetition {
   229  	if isnullable {
   230  		return parquet.Repetitions.Optional
   231  	}
   232  	return parquet.Repetitions.Required
   233  }
   234  
   235  func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
   236  	if typ.NumFields() == 0 {
    237  		return nil, fmt.Errorf("cannot write struct type '%s' with no child fields to parquet. Consider adding a dummy child", name)
   238  	}
   239  
   240  	children := make(schema.FieldList, 0, typ.NumFields())
   241  	for _, f := range typ.Fields() {
   242  		n, err := fieldToNode(f.Name, f, props, arrprops)
   243  		if err != nil {
   244  			return nil, err
   245  		}
   246  		children = append(children, n)
   247  	}
   248  
   249  	return schema.NewGroupNode(name, repFromNullable(nullable), children, -1)
   250  }
   251  
   252  func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
   253  	var (
   254  		logicalType schema.LogicalType = schema.NoLogicalType{}
   255  		typ         parquet.Type
   256  		repType     = repFromNullable(field.Nullable)
   257  		length      = -1
   258  		precision   = -1
   259  		scale       = -1
   260  		err         error
   261  	)
   262  
   263  	switch field.Type.ID() {
   264  	case arrow.NULL:
   265  		typ = parquet.Types.Int32
   266  		logicalType = &schema.NullLogicalType{}
   267  		if repType != parquet.Repetitions.Optional {
   268  			return nil, xerrors.New("nulltype arrow field must be nullable")
   269  		}
   270  	case arrow.BOOL:
   271  		typ = parquet.Types.Boolean
   272  	case arrow.UINT8:
   273  		typ = parquet.Types.Int32
   274  		logicalType = schema.NewIntLogicalType(8, false)
   275  	case arrow.INT8:
   276  		typ = parquet.Types.Int32
   277  		logicalType = schema.NewIntLogicalType(8, true)
   278  	case arrow.UINT16:
   279  		typ = parquet.Types.Int32
   280  		logicalType = schema.NewIntLogicalType(16, false)
   281  	case arrow.INT16:
   282  		typ = parquet.Types.Int32
   283  		logicalType = schema.NewIntLogicalType(16, true)
   284  	case arrow.UINT32:
   285  		typ = parquet.Types.Int32
   286  		logicalType = schema.NewIntLogicalType(32, false)
   287  	case arrow.INT32:
   288  		typ = parquet.Types.Int32
   289  		logicalType = schema.NewIntLogicalType(32, true)
   290  	case arrow.UINT64:
   291  		typ = parquet.Types.Int64
   292  		logicalType = schema.NewIntLogicalType(64, false)
   293  	case arrow.INT64:
   294  		typ = parquet.Types.Int64
   295  		logicalType = schema.NewIntLogicalType(64, true)
   296  	case arrow.FLOAT32:
   297  		typ = parquet.Types.Float
   298  	case arrow.FLOAT64:
   299  		typ = parquet.Types.Double
   300  	case arrow.STRING, arrow.LARGE_STRING:
   301  		logicalType = schema.StringLogicalType{}
   302  		fallthrough
   303  	case arrow.BINARY, arrow.LARGE_BINARY:
   304  		typ = parquet.Types.ByteArray
   305  	case arrow.FIXED_SIZE_BINARY:
   306  		typ = parquet.Types.FixedLenByteArray
   307  		length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth
   308  	case arrow.DECIMAL, arrow.DECIMAL256:
   309  		dectype := field.Type.(arrow.DecimalType)
   310  		precision = int(dectype.GetPrecision())
   311  		scale = int(dectype.GetScale())
   312  
   313  		if props.StoreDecimalAsInteger() && 1 <= precision && precision <= 18 {
   314  			if precision <= 9 {
   315  				typ = parquet.Types.Int32
   316  			} else {
   317  				typ = parquet.Types.Int64
   318  			}
   319  		} else {
   320  			typ = parquet.Types.FixedLenByteArray
   321  			length = int(DecimalSize(int32(precision)))
   322  		}
   323  
   324  		logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale))
   325  	case arrow.DATE32:
   326  		typ = parquet.Types.Int32
   327  		logicalType = schema.DateLogicalType{}
   328  	case arrow.DATE64:
   329  		typ = parquet.Types.Int32
   330  		logicalType = schema.DateLogicalType{}
   331  	case arrow.TIMESTAMP:
   332  		typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops)
   333  		if err != nil {
   334  			return nil, err
   335  		}
   336  	case arrow.TIME32:
   337  		typ = parquet.Types.Int32
   338  		logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis)
   339  	case arrow.TIME64:
   340  		typ = parquet.Types.Int64
   341  		timeType := field.Type.(*arrow.Time64Type)
   342  		if timeType.Unit == arrow.Nanosecond {
   343  			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos)
   344  		} else {
   345  			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros)
   346  		}
   347  	case arrow.FLOAT16:
   348  		typ = parquet.Types.FixedLenByteArray
   349  		length = arrow.Float16SizeBytes
   350  		logicalType = schema.Float16LogicalType{}
   351  	case arrow.STRUCT:
   352  		return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops)
   353  	case arrow.FIXED_SIZE_LIST, arrow.LIST:
   354  		var elem arrow.DataType
   355  		if lt, ok := field.Type.(*arrow.ListType); ok {
   356  			elem = lt.Elem()
   357  		} else {
   358  			elem = field.Type.(*arrow.FixedSizeListType).Elem()
   359  		}
   360  
   361  		child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops)
   362  		if err != nil {
   363  			return nil, err
   364  		}
   365  
   366  		return schema.ListOf(child, repFromNullable(field.Nullable), -1)
   367  	case arrow.DICTIONARY:
    368  		// parquet has no dictionary type: dictionary is an encoding, not a schema-level concept
   369  		dictType := field.Type.(*arrow.DictionaryType)
   370  		return fieldToNode(name, arrow.Field{Name: name, Type: dictType.ValueType, Nullable: field.Nullable, Metadata: field.Metadata},
   371  			props, arrprops)
   372  	case arrow.EXTENSION:
   373  		return fieldToNode(name, arrow.Field{
   374  			Name:     name,
   375  			Type:     field.Type.(arrow.ExtensionType).StorageType(),
   376  			Nullable: field.Nullable,
   377  			Metadata: arrow.MetadataFrom(map[string]string{
   378  				ipc.ExtensionTypeKeyName:     field.Type.(arrow.ExtensionType).ExtensionName(),
   379  				ipc.ExtensionMetadataKeyName: field.Type.(arrow.ExtensionType).Serialize(),
   380  			}),
   381  		}, props, arrprops)
   382  	case arrow.MAP:
   383  		mapType := field.Type.(*arrow.MapType)
   384  		keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops)
   385  		if err != nil {
   386  			return nil, err
   387  		}
   388  
   389  		valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops)
   390  		if err != nil {
   391  			return nil, err
   392  		}
   393  
   394  		if arrprops.noMapLogicalType {
   395  			keyval := schema.FieldList{keyNode, valueNode}
   396  			keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1)
   397  			if err != nil {
   398  				return nil, err
   399  			}
   400  			return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{
   401  				keyvalNode,
   402  			}, -1)
   403  		}
   404  		return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1)
   405  	default:
   406  		return nil, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, field.Type.ID())
   407  	}
   408  
   409  	return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata))
   410  }
   411  
   412  const fieldIDKey = "PARQUET:field_id"
   413  
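         // fieldIDFromMeta extracts the PARQUET:field_id value from a field's metadata,
         // returning -1 if the key is absent or its value is negative or not a valid int32.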
   414  func fieldIDFromMeta(m arrow.Metadata) int32 {
   415  	if m.Len() == 0 {
   416  		return -1
   417  	}
   418  
   419  	key := m.FindKey(fieldIDKey)
   420  	if key < 0 {
   421  		return -1
   422  	}
   423  
   424  	id, err := strconv.ParseInt(m.Values()[key], 10, 32)
   425  	if err != nil {
   426  		return -1
   427  	}
   428  
   429  	if id < 0 {
   430  		return -1
   431  	}
   432  
   433  	return int32(id)
   434  }
   435  
   436  // ToParquet generates a Parquet Schema from an arrow Schema using the given properties to make
   437  // decisions when determining the logical/physical types of the columns.
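         //
         // A minimal usage sketch (the schema below is illustrative; passing nil for props
         // selects the parquet.NewWriterProperties() defaults):
         //
         //	sc := arrow.NewSchema([]arrow.Field{
         //		{Name: "id", Type: arrow.PrimitiveTypes.Int64},
         //		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: true},
         //	}, nil)
         //	pqSchema, err := pqarrow.ToParquet(sc, nil, pqarrow.DefaultWriterProps())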
   438  func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) {
   439  	if props == nil {
   440  		props = parquet.NewWriterProperties()
   441  	}
   442  
   443  	nodes := make(schema.FieldList, 0, sc.NumFields())
   444  	for _, f := range sc.Fields() {
   445  		n, err := fieldToNode(f.Name, f, props, arrprops)
   446  		if err != nil {
   447  			return nil, err
   448  		}
   449  		nodes = append(nodes, n)
   450  	}
   451  
   452  	root, err := schema.NewGroupNode(props.RootName(), props.RootRepetition(), nodes, -1)
   453  	if err != nil {
   454  		return nil, err
   455  	}
   456  
   457  	return schema.NewSchema(root), err
   458  }
   459  
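         // schemaTree carries the shared state used while recursively converting parquet
         // nodes into SchemaFields: the manifest being built, the parquet schema
         // descriptor, and the arrow read properties.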
   460  type schemaTree struct {
   461  	manifest *SchemaManifest
   462  
   463  	schema *schema.Schema
   464  	props  *ArrowReadProperties
   465  }
   466  
   467  func (s schemaTree) LinkParent(child, parent *SchemaField) {
   468  	s.manifest.ChildToParent[child] = parent
   469  }
   470  
   471  func (s schemaTree) RecordLeaf(leaf *SchemaField) {
   472  	s.manifest.ColIndexToField[leaf.ColIndex] = leaf
   473  }
   474  
   475  func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) {
   476  	switch log.BitWidth() {
   477  	case 8:
   478  		if log.IsSigned() {
   479  			return arrow.PrimitiveTypes.Int8, nil
   480  		}
   481  		return arrow.PrimitiveTypes.Uint8, nil
   482  	case 16:
   483  		if log.IsSigned() {
   484  			return arrow.PrimitiveTypes.Int16, nil
   485  		}
   486  		return arrow.PrimitiveTypes.Uint16, nil
   487  	case 32:
   488  		if log.IsSigned() {
   489  			return arrow.PrimitiveTypes.Int32, nil
   490  		}
   491  		return arrow.PrimitiveTypes.Uint32, nil
   492  	case 64:
   493  		if log.IsSigned() {
   494  			return arrow.PrimitiveTypes.Int64, nil
   495  		}
   496  		return arrow.PrimitiveTypes.Uint64, nil
   497  	default:
    498  		return nil, xerrors.New("invalid bit width for integer logical type")
   499  	}
   500  }
   501  
   502  func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   503  	if logical.TimeUnit() == schema.TimeUnitMillis {
   504  		return arrow.FixedWidthTypes.Time32ms, nil
   505  	}
   506  
   507  	return nil, xerrors.New(logical.String() + " cannot annotate a time32")
   508  }
   509  
   510  func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   511  	switch logical.TimeUnit() {
   512  	case schema.TimeUnitMicros:
   513  		return arrow.FixedWidthTypes.Time64us, nil
   514  	case schema.TimeUnitNanos:
   515  		return arrow.FixedWidthTypes.Time64ns, nil
   516  	default:
   517  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   518  	}
   519  }
   520  
   521  func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) {
   522  	tz := ""
   523  
   524  	// ConvertedTypes are adjusted to UTC per backward compatibility guidelines
   525  	// https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/LogicalTypes.md?plain=1#L480-L485
   526  	if logical.IsAdjustedToUTC() || logical.IsFromConvertedType() {
   527  		tz = "UTC"
   528  	}
   529  
   530  	switch logical.TimeUnit() {
   531  	case schema.TimeUnitMillis:
   532  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil
   533  	case schema.TimeUnitMicros:
   534  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil
   535  	case schema.TimeUnitNanos:
   536  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil
   537  	default:
   538  		return nil, xerrors.New("Unrecognized unit in timestamp logical type " + logical.String())
   539  	}
   540  }
   541  
   542  func arrowDecimal(logical *schema.DecimalLogicalType) arrow.DataType {
   543  	if logical.Precision() <= decimal128.MaxPrecision {
   544  		return &arrow.Decimal128Type{Precision: logical.Precision(), Scale: logical.Scale()}
   545  	}
   546  	return &arrow.Decimal256Type{Precision: logical.Precision(), Scale: logical.Scale()}
   547  }
   548  
   549  func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) {
   550  	switch logtype := logical.(type) {
   551  	case schema.NoLogicalType:
   552  		return arrow.PrimitiveTypes.Int32, nil
   553  	case *schema.TimeLogicalType:
   554  		return arrowTime32(logtype)
   555  	case *schema.DecimalLogicalType:
   556  		return arrowDecimal(logtype), nil
   557  	case *schema.IntLogicalType:
   558  		return arrowInt(logtype)
   559  	case schema.DateLogicalType:
   560  		return arrow.FixedWidthTypes.Date32, nil
   561  	default:
   562  		return nil, xerrors.New(logical.String() + " cannot annotate int32")
   563  	}
   564  }
   565  
   566  func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) {
   567  	if logical.IsNone() {
   568  		return arrow.PrimitiveTypes.Int64, nil
   569  	}
   570  
   571  	switch logtype := logical.(type) {
   572  	case *schema.IntLogicalType:
   573  		return arrowInt(logtype)
   574  	case *schema.DecimalLogicalType:
   575  		return arrowDecimal(logtype), nil
   576  	case *schema.TimeLogicalType:
   577  		return arrowTime64(logtype)
   578  	case *schema.TimestampLogicalType:
   579  		return arrowTimestamp(logtype)
   580  	default:
   581  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   582  	}
   583  }
   584  
   585  func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) {
   586  	switch logtype := logical.(type) {
   587  	case schema.StringLogicalType:
   588  		return arrow.BinaryTypes.String, nil
   589  	case *schema.DecimalLogicalType:
   590  		return arrowDecimal(logtype), nil
   591  	case schema.NoLogicalType,
   592  		schema.EnumLogicalType,
   593  		schema.JSONLogicalType,
   594  		schema.BSONLogicalType:
   595  		return arrow.BinaryTypes.Binary, nil
   596  	default:
   597  		return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array")
   598  	}
   599  }
   600  
   601  func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) {
   602  	switch logtype := logical.(type) {
   603  	case *schema.DecimalLogicalType:
   604  		return arrowDecimal(logtype), nil
   605  	case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
   606  		return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
   607  	case schema.Float16LogicalType:
   608  		return &arrow.Float16Type{}, nil
   609  	default:
   610  		return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array")
   611  	}
   612  }
   613  
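         // getArrowType maps a parquet physical type and its logical type annotation to
         // the corresponding arrow DataType. typeLen is only meaningful for
         // FixedLenByteArray columns; invalid or Null logical types map to arrow.Null.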
   614  func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) {
   615  	if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) {
   616  		return arrow.Null, nil
   617  	}
   618  
   619  	switch physical {
   620  	case parquet.Types.Boolean:
   621  		return arrow.FixedWidthTypes.Boolean, nil
   622  	case parquet.Types.Int32:
   623  		return arrowFromInt32(logical)
   624  	case parquet.Types.Int64:
   625  		return arrowFromInt64(logical)
   626  	case parquet.Types.Int96:
   627  		return arrow.FixedWidthTypes.Timestamp_ns, nil
   628  	case parquet.Types.Float:
   629  		return arrow.PrimitiveTypes.Float32, nil
   630  	case parquet.Types.Double:
   631  		return arrow.PrimitiveTypes.Float64, nil
   632  	case parquet.Types.ByteArray:
   633  		return arrowFromByteArray(logical)
   634  	case parquet.Types.FixedLenByteArray:
   635  		return arrowFromFLBA(logical, typeLen)
   636  	default:
   637  		return nil, xerrors.New("invalid physical column type")
   638  	}
   639  }
   640  
   641  func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) {
   642  	out.Field = field
   643  	out.ColIndex = colIndex
   644  	out.LevelInfo = currentLevels
   645  	ctx.RecordLeaf(out)
   646  	ctx.LinkParent(out, parent)
   647  }
   648  
   649  func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   650  	if n.NumFields() != 1 {
   651  		return xerrors.New("LIST groups must have only 1 child")
   652  	}
   653  
   654  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   655  		return xerrors.New("LIST groups must not be repeated")
   656  	}
   657  
   658  	currentLevels.Increment(n)
   659  
   660  	out.Children = make([]SchemaField, n.NumFields())
   661  	ctx.LinkParent(out, parent)
   662  	ctx.LinkParent(&out.Children[0], out)
   663  
   664  	listNode := n.Field(0)
   665  	if listNode.RepetitionType() != parquet.Repetitions.Repeated {
   666  		return xerrors.New("non-repeated nodes in a list group are not supported")
   667  	}
   668  
   669  	repeatedAncestorDef := currentLevels.IncrementRepeated()
   670  	if listNode.Type() == schema.Group {
   671  		// Resolve 3-level encoding
   672  		//
   673  		// required/optional group name=whatever {
   674  		//   repeated group name=list {
   675  		//     required/optional TYPE item;
   676  		//   }
   677  		// }
   678  		//
   679  		// yields list<item: TYPE ?nullable> ?nullable
   680  		//
   681  		// We distinguish the special case that we have
   682  		//
   683  		// required/optional group name=whatever {
   684  		//   repeated group name=array or $SOMETHING_tuple {
   685  		//     required/optional TYPE item;
   686  		//   }
   687  		// }
   688  		//
   689  		// In this latter case, the inner type of the list should be a struct
   690  		// rather than a primitive value
   691  		//
   692  		// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
   693  		// Special case mentioned in the format spec:
   694  		//   If the name is array or ends in _tuple, this should be a list of struct
   695  		//   even for single child elements.
   696  		listGroup := listNode.(*schema.GroupNode)
   697  		if listGroup.NumFields() == 1 && !(listGroup.Name() == "array" || listGroup.Name() == (n.Name()+"_tuple")) {
   698  			// list of primitive type
   699  			if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil {
   700  				return err
   701  			}
   702  		} else {
   703  			if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil {
   704  				return err
   705  			}
   706  		}
   707  	} else {
   708  		// Two-level list encoding
   709  		//
   710  		// required/optional group LIST {
   711  		//   repeated TYPE;
   712  		// }
   713  		primitiveNode := listNode.(*schema.PrimitiveNode)
   714  		colIndex := ctx.schema.ColumnIndexByNode(primitiveNode)
   715  		arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength())
   716  		if err != nil {
   717  			return err
   718  		}
   719  
   720  		if ctx.props.ReadDict(colIndex) && isDictionaryReadSupported(arrowType) {
   721  			arrowType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrowType}
   722  		}
   723  
   724  		itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))}
   725  		populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0])
   726  	}
   727  
   728  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
   729  		arrow.Field{Name: listNode.Name(), Type: out.Children[0].Field.Type, Nullable: true}),
   730  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   731  
   732  	out.LevelInfo = currentLevels
    733  	// At this point currentLevels contains the def level for this list;
    734  	// we need to reset to the prior parent.
   735  	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   736  	return nil
   737  }
   738  
   739  func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   740  	arrowFields := make([]arrow.Field, 0, n.NumFields())
   741  	out.Children = make([]SchemaField, n.NumFields())
   742  
   743  	for i := 0; i < n.NumFields(); i++ {
   744  		if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil {
   745  			return err
   746  		}
   747  		arrowFields = append(arrowFields, *out.Children[i].Field)
   748  	}
   749  
   750  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...),
   751  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   752  	out.LevelInfo = currentLevels
   753  	return nil
   754  }
   755  
   756  func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   757  	if n.NumFields() != 1 {
   758  		return xerrors.New("MAP group must have exactly 1 child")
   759  	}
   760  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   761  		return xerrors.New("MAP groups must not be repeated")
   762  	}
   763  
   764  	keyvalueNode := n.Field(0)
   765  	if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated {
   766  		return xerrors.New("Non-repeated keyvalue group in MAP group is not supported")
   767  	}
   768  
   769  	if keyvalueNode.Type() != schema.Group {
   770  		return xerrors.New("keyvalue node must be a group")
   771  	}
   772  
   773  	kvgroup := keyvalueNode.(*schema.GroupNode)
   774  	if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 {
    775  		return fmt.Errorf("keyvalue node group must have exactly 1 or 2 child elements, found %d", kvgroup.NumFields())
   776  	}
   777  
   778  	keyNode := kvgroup.Field(0)
   779  	if keyNode.RepetitionType() != parquet.Repetitions.Required {
   780  		return xerrors.New("MAP keys must be required")
   781  	}
   782  
   783  	// Arrow doesn't support 1 column maps (i.e. Sets).  The options are to either
   784  	// make the values column nullable, or process the map as a list.  We choose the latter
   785  	// as it is simpler.
   786  	if kvgroup.NumFields() == 1 {
   787  		return listToSchemaField(n, currentLevels, ctx, parent, out)
   788  	}
   789  
   790  	currentLevels.Increment(n)
   791  	repeatedAncestorDef := currentLevels.IncrementRepeated()
   792  	out.Children = make([]SchemaField, 1)
   793  
   794  	kvfield := &out.Children[0]
   795  	kvfield.Children = make([]SchemaField, 2)
   796  
   797  	keyField := &kvfield.Children[0]
   798  	valueField := &kvfield.Children[1]
   799  
   800  	ctx.LinkParent(out, parent)
   801  	ctx.LinkParent(kvfield, out)
   802  	ctx.LinkParent(keyField, kvfield)
   803  	ctx.LinkParent(valueField, kvfield)
   804  
   805  	// required/optional group name=whatever {
    806  	//   repeated group name=key_values {
    807  	//     required TYPE key;
    808  	//     required/optional TYPE value;
   809  	//   }
   810  	// }
   811  	//
   812  
   813  	if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil {
   814  		return err
   815  	}
   816  	if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil {
   817  		return err
   818  	}
   819  
   820  	kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field),
   821  		Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))}
   822  
   823  	kvfield.LevelInfo = currentLevels
   824  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
   825  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
   826  		Metadata: createFieldMeta(int(n.FieldID()))}
   827  	out.LevelInfo = currentLevels
    828  	// At this point currentLevels contains the def level for this map;
    829  	// we need to reset to the prior parent.
   830  	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   831  	return nil
   832  }
   833  
   834  func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   835  	if n.LogicalType().Equals(schema.NewListLogicalType()) {
   836  		return listToSchemaField(n, currentLevels, ctx, parent, out)
   837  	} else if n.LogicalType().Equals(schema.MapLogicalType{}) {
   838  		return mapToSchemaField(n, currentLevels, ctx, parent, out)
   839  	}
   840  
   841  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   842  		// Simple repeated struct
   843  		//
   844  		// repeated group $NAME {
   845  		//   r/o TYPE[0] f0
   846  		//   r/o TYPE[1] f1
   847  		// }
   848  		out.Children = make([]SchemaField, 1)
   849  		repeatedAncestorDef := currentLevels.IncrementRepeated()
   850  		if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil {
   851  			return err
   852  		}
   853  
   854  		out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false,
   855  			Metadata: createFieldMeta(int(n.FieldID()))}
   856  		ctx.LinkParent(&out.Children[0], out)
   857  		out.LevelInfo = currentLevels
   858  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   859  		return nil
   860  	}
   861  
   862  	currentLevels.Increment(n)
   863  	return groupToStructField(n, currentLevels, ctx, parent, out)
   864  }
   865  
   866  func createFieldMeta(fieldID int) arrow.Metadata {
   867  	return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)})
   868  }
   869  
   870  func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   871  	ctx.LinkParent(out, parent)
   872  
   873  	if n.Type() == schema.Group {
   874  		return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out)
   875  	}
   876  
   877  	// Either a normal flat primitive type, or a list type encoded with 1-level
   878  	// list encoding. Note that the 3-level encoding is the form recommended by
   879  	// the parquet specification, but technically we can have either
   880  	//
   881  	// required/optional $TYPE $FIELD_NAME
   882  	//
   883  	// or
   884  	//
   885  	// repeated $TYPE $FIELD_NAME
   886  
   887  	primitive := n.(*schema.PrimitiveNode)
   888  	colIndex := ctx.schema.ColumnIndexByNode(primitive)
   889  	arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength())
   890  	if err != nil {
   891  		return err
   892  	}
   893  
   894  	if ctx.props.ReadDict(colIndex) && isDictionaryReadSupported(arrowType) {
   895  		arrowType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrowType}
   896  	}
   897  
   898  	if primitive.RepetitionType() == parquet.Repetitions.Repeated {
   899  		// one-level list encoding e.g. a: repeated int32;
   900  		repeatedAncestorDefLevel := currentLevels.IncrementRepeated()
   901  		out.Children = make([]SchemaField, 1)
   902  		child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false}
   903  		populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0])
   904  		out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false,
   905  			Metadata: createFieldMeta(int(primitive.FieldID()))}
   906  		out.LevelInfo = currentLevels
   907  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel
   908  		return nil
   909  	}
   910  
   911  	currentLevels.Increment(n)
   912  	populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType,
   913  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
   914  		Metadata: createFieldMeta(int(n.FieldID()))},
   915  		currentLevels, ctx, parent, out)
   916  	return nil
   917  }
   918  
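         // getOriginSchema reconstructs the Arrow schema stored under the "ARROW:schema"
         // key of the file metadata (IPC-serialized, then base64 encoded with or without
         // padding), returning nil if the key is not present.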
   919  func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) {
   920  	if meta == nil {
   921  		return nil, nil
   922  	}
   923  
   924  	const arrowSchemaKey = "ARROW:schema"
   925  	serialized := meta.FindValue(arrowSchemaKey)
   926  	if serialized == nil {
   927  		return nil, nil
   928  	}
   929  
   930  	var (
   931  		decoded []byte
   932  		err     error
   933  	)
   934  
    935  	// if the length of serialized is not a multiple of 4, it cannot have been
    936  	// produced by the padded std encoding.
   937  	if len(*serialized)%4 == 0 {
   938  		decoded, err = base64.StdEncoding.DecodeString(*serialized)
   939  	}
    940  	// if we failed to decode it with the std encoding, or the length wasn't
    941  	// a multiple of 4, try using the raw unpadded encoding
   942  	if len(decoded) == 0 || err != nil {
   943  		decoded, err = base64.RawStdEncoding.DecodeString(*serialized)
   944  	}
   945  
   946  	if err != nil {
   947  		return nil, err
   948  	}
   949  
   950  	return flight.DeserializeSchema(decoded, mem)
   951  }
   952  
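         // getNestedFactory returns a constructor that rebuilds the inferred nested type
         // from updated child fields when the origin and inferred types are compatible
         // nested kinds (e.g. struct for struct, list or fixed-size list for list, map
         // for map); otherwise it returns nil.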
   953  func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Field) arrow.DataType {
   954  	switch inferred.ID() {
   955  	case arrow.STRUCT:
   956  		if origin.ID() == arrow.STRUCT {
   957  			return func(list []arrow.Field) arrow.DataType {
   958  				return arrow.StructOf(list...)
   959  			}
   960  		}
   961  	case arrow.LIST:
   962  		switch origin.ID() {
   963  		case arrow.LIST:
   964  			return func(list []arrow.Field) arrow.DataType {
   965  				return arrow.ListOf(list[0].Type)
   966  			}
   967  		case arrow.FIXED_SIZE_LIST:
   968  			sz := origin.(*arrow.FixedSizeListType).Len()
   969  			return func(list []arrow.Field) arrow.DataType {
   970  				return arrow.FixedSizeListOf(sz, list[0].Type)
   971  			}
   972  		}
   973  	case arrow.MAP:
   974  		if origin.ID() == arrow.MAP {
   975  			return func(list []arrow.Field) arrow.DataType {
   976  				valType := list[0].Type.(*arrow.StructType)
   977  				return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
   978  			}
   979  		}
   980  	}
   981  	return nil
   982  }
   983  
   984  func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
   985  	nchildren := len(inferred.Children)
   986  	switch origin.Type.ID() {
   987  	case arrow.EXTENSION:
   988  		extType := origin.Type.(arrow.ExtensionType)
   989  		modified, err = applyOriginalStorageMetadata(arrow.Field{
   990  			Type:     extType.StorageType(),
   991  			Metadata: origin.Metadata,
   992  		}, inferred)
   993  		if err != nil {
   994  			return
   995  		}
   996  
   997  		if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
   998  			return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
   999  				arrow.ErrInvalid, inferred.Field.Type, extType)
  1000  		}
  1001  
  1002  		inferred.Field.Type = extType
  1003  		modified = true
  1004  	case arrow.SPARSE_UNION, arrow.DENSE_UNION:
  1005  		err = xerrors.New("unimplemented type")
  1006  	case arrow.STRUCT:
  1007  		typ := origin.Type.(*arrow.StructType)
  1008  		if nchildren != typ.NumFields() {
  1009  			return
  1010  		}
  1011  
  1012  		factory := getNestedFactory(typ, inferred.Field.Type)
  1013  		if factory == nil {
  1014  			return
  1015  		}
  1016  
  1017  		modified = typ.ID() != inferred.Field.Type.ID()
  1018  		for idx := range inferred.Children {
  1019  			childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
  1020  			if err != nil {
  1021  				return false, err
  1022  			}
  1023  			modified = modified || childMod
  1024  		}
  1025  		if modified {
  1026  			modifiedChildren := make([]arrow.Field, len(inferred.Children))
  1027  			for idx, child := range inferred.Children {
  1028  				modifiedChildren[idx] = *child.Field
  1029  			}
  1030  			inferred.Field.Type = factory(modifiedChildren)
  1031  		}
  1032  	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.LARGE_LIST, arrow.MAP: // arrow.ListLike
  1033  		if nchildren != 1 {
  1034  			return
  1035  		}
  1036  		factory := getNestedFactory(origin.Type, inferred.Field.Type)
  1037  		if factory == nil {
  1038  			return
  1039  		}
  1040  
  1041  		modified = origin.Type.ID() != inferred.Field.Type.ID()
  1042  		childModified, err := applyOriginalMetadata(arrow.Field{Type: origin.Type.(arrow.ListLikeType).Elem()}, &inferred.Children[0])
  1043  		if err != nil {
  1044  			return modified, err
  1045  		}
  1046  		modified = modified || childModified
  1047  		if modified {
  1048  			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
  1049  		}
  1050  	case arrow.TIMESTAMP:
  1051  		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
  1052  			return
  1053  		}
  1054  
  1055  		tsOtype := origin.Type.(*arrow.TimestampType)
  1056  		tsInfType := inferred.Field.Type.(*arrow.TimestampType)
  1057  
  1058  		// if the unit is the same and the data is tz-aware, then set the original time zone
  1059  		// since parquet has no native storage of timezones
  1060  		if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
  1061  			inferred.Field.Type = origin.Type
  1062  		}
  1063  		modified = true
  1064  	case arrow.LARGE_STRING, arrow.LARGE_BINARY:
  1065  		inferred.Field.Type = origin.Type
  1066  		modified = true
  1067  	case arrow.DICTIONARY:
  1068  		if origin.Type.ID() != arrow.DICTIONARY || (inferred.Field.Type.ID() == arrow.DICTIONARY || !isDictionaryReadSupported(inferred.Field.Type)) {
  1069  			return
  1070  		}
  1071  
  1072  		// direct dictionary reads are only supported for a few primitive types
  1073  		// so no need to recurse on value types
  1074  		dictOriginType := origin.Type.(*arrow.DictionaryType)
  1075  		inferred.Field.Type = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32,
  1076  			ValueType: inferred.Field.Type, Ordered: dictOriginType.Ordered}
  1077  		modified = true
  1078  	case arrow.DECIMAL256:
  1079  		if inferred.Field.Type.ID() == arrow.DECIMAL128 {
  1080  			inferred.Field.Type = origin.Type
  1081  			modified = true
  1082  		}
  1083  	}
  1084  
  1085  	if origin.HasMetadata() {
  1086  		meta := origin.Metadata
  1087  		if inferred.Field.HasMetadata() {
  1088  			final := make(map[string]string)
  1089  			for idx, k := range meta.Keys() {
  1090  				final[k] = meta.Values()[idx]
  1091  			}
  1092  			for idx, k := range inferred.Field.Metadata.Keys() {
  1093  				final[k] = inferred.Field.Metadata.Values()[idx]
  1094  			}
  1095  			inferred.Field.Metadata = arrow.MetadataFrom(final)
  1096  		} else {
  1097  			inferred.Field.Metadata = meta
  1098  		}
  1099  		modified = true
  1100  	}
  1101  
  1102  	return
  1103  }
  1104  
  1105  func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) {
  1106  	return applyOriginalStorageMetadata(origin, inferred)
  1107  }
  1108  
  1109  // NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema.
  1110  //
   1111  // The metadata passed in should be the file-level key/value metadata from the parquet file, or nil.
   1112  // If an ARROW:schema key is present in the metadata, it is used to determine the types.
  1113  func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) {
  1114  	var ctx schemaTree
  1115  	ctx.manifest = &SchemaManifest{
  1116  		ColIndexToField: make(map[int]*SchemaField),
  1117  		ChildToParent:   make(map[*SchemaField]*SchemaField),
  1118  		descr:           sc,
  1119  		Fields:          make([]SchemaField, sc.Root().NumFields()),
  1120  	}
  1121  	ctx.props = props
  1122  	if ctx.props == nil {
  1123  		ctx.props = &ArrowReadProperties{}
  1124  	}
  1125  	ctx.schema = sc
  1126  
  1127  	var err error
  1128  	ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator)
  1129  	if err != nil {
  1130  		return nil, err
  1131  	}
  1132  
  1133  	// if original schema is not compatible with the parquet schema, ignore it
  1134  	if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() {
  1135  		ctx.manifest.OriginSchema = nil
  1136  	}
  1137  
  1138  	for idx := range ctx.manifest.Fields {
  1139  		field := &ctx.manifest.Fields[idx]
  1140  		if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil {
  1141  			return nil, err
  1142  		}
  1143  
  1144  		if ctx.manifest.OriginSchema != nil {
  1145  			if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil {
  1146  				return nil, err
  1147  			}
  1148  		}
  1149  	}
  1150  	return ctx.manifest, nil
  1151  }
  1152  
  1153  // FromParquet generates an arrow Schema from a provided Parquet Schema
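         //
         // A rough sketch of typical use, assuming rdr is an already-opened *file.Reader:
         //
         //	md := rdr.MetaData()
         //	arrowSchema, err := pqarrow.FromParquet(md.Schema, nil, md.KeyValueMetadata())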
  1154  func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) {
  1155  	manifest, err := NewSchemaManifest(sc, kv, props)
  1156  	if err != nil {
  1157  		return nil, err
  1158  	}
  1159  
  1160  	fields := make([]arrow.Field, len(manifest.Fields))
  1161  	for idx, field := range manifest.Fields {
  1162  		fields[idx] = *field.Field
  1163  	}
  1164  
  1165  	if manifest.OriginSchema != nil {
  1166  		meta := manifest.OriginSchema.Metadata()
  1167  		return arrow.NewSchema(fields, &meta), nil
  1168  	}
  1169  	return arrow.NewSchema(fields, manifest.SchemaMeta), nil
  1170  }