github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/schema.go

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"encoding/base64"
    21  	"fmt"
    22  	"math"
    23  	"strconv"
    24  
    25  	"github.com/apache/arrow/go/v14/arrow"
    26  	"github.com/apache/arrow/go/v14/arrow/decimal128"
    27  	"github.com/apache/arrow/go/v14/arrow/flight"
    28  	"github.com/apache/arrow/go/v14/arrow/ipc"
    29  	"github.com/apache/arrow/go/v14/arrow/memory"
    30  	"github.com/apache/arrow/go/v14/parquet"
    31  	"github.com/apache/arrow/go/v14/parquet/file"
    32  	"github.com/apache/arrow/go/v14/parquet/metadata"
    33  	"github.com/apache/arrow/go/v14/parquet/schema"
    34  	"golang.org/x/xerrors"
    35  )
    36  
    37  // SchemaField is a holder that defines a specific logical field in the schema
    38  // which could potentially refer to multiple physical columns in the underlying
    39  // parquet file if it is a nested type.
    40  //
    41  // ColIndex is only populated (not -1) when it is a leaf column.
    42  type SchemaField struct {
    43  	Field     *arrow.Field
    44  	Children  []SchemaField
    45  	ColIndex  int
    46  	LevelInfo file.LevelInfo
    47  }
    48  
     49  // IsLeaf returns true if the SchemaField is a leaf column, i.e. ColIndex != -1.
    50  func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 }
    51  
    52  // SchemaManifest represents a full manifest for mapping a Parquet schema
    53  // to an arrow Schema.
    54  type SchemaManifest struct {
    55  	descr        *schema.Schema
    56  	OriginSchema *arrow.Schema
    57  	SchemaMeta   *arrow.Metadata
    58  
    59  	ColIndexToField map[int]*SchemaField
    60  	ChildToParent   map[*SchemaField]*SchemaField
    61  	Fields          []SchemaField
    62  }
    63  
    64  // GetColumnField returns the corresponding Field for a given column index.
    65  func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) {
    66  	if field, ok := sm.ColIndexToField[index]; ok {
    67  		return field, nil
    68  	}
     69  	return nil, fmt.Errorf("column index %d not found in schema manifest", index)
    70  }
    71  
     72  // GetParent returns the parent field for a given field if it is a nested column,
     73  // or nil if there is no parent field.
    74  func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField {
    75  	if p, ok := sm.ChildToParent[field]; ok {
    76  		return p
    77  	}
    78  	return nil
    79  }
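
         // Illustrative usage (editor's addition, not part of the original source), assuming a
         // *SchemaManifest named manifest obtained from NewSchemaManifest below:
         //
         //	leaf, err := manifest.GetColumnField(0)
         //	if err != nil {
         //		// the column index is out of range for this schema
         //	}
         //	if parent := manifest.GetParent(leaf); parent != nil {
         //		// leaf is nested inside the logical field parent.Field.Name
         //	}
         //	_ = leaf.IsLeaf() // always true for fields returned by GetColumnField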
    80  
     81  // GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow.Schema)
     82  // which correspond to the column roots (first node below the parquet schema's root group) of
     83  // each leaf referenced in indices, returning each root index only once.
     84  //
     85  // For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (indices=[0,1,3])
     86  // the roots are `a` and `i` (return=[0,2]).
    87  //
    88  // root
    89  // -- a  <------
    90  // -- -- b  |  |
    91  // -- -- -- c  |
    92  // -- -- -- d  |
    93  // -- -- -- -- e
    94  // -- f
    95  // -- -- g
    96  // -- -- -- h
    97  // -- i  <---
    98  // -- -- j  |
    99  // -- -- -- k
   100  func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) {
   101  	added := make(map[int]bool)
   102  	ret := make([]int, 0)
   103  
   104  	for _, idx := range indices {
   105  		if idx < 0 || idx >= sm.descr.NumColumns() {
   106  			return nil, fmt.Errorf("column index %d is not valid", idx)
   107  		}
   108  
   109  		fieldNode := sm.descr.ColumnRoot(idx)
   110  		fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode)
   111  		if fieldIdx == -1 {
   112  			return nil, fmt.Errorf("column index %d is not valid", idx)
   113  		}
   114  
   115  		if _, ok := added[fieldIdx]; !ok {
   116  			ret = append(ret, fieldIdx)
   117  			added[fieldIdx] = true
   118  		}
   119  	}
   120  	return ret, nil
   121  }
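
         // Illustrative usage (editor's addition, not part of the original source), mirroring the
         // tree in the doc comment above. With leaves a.b.c, a.b.d.e, f.g.h and i.j.k, the leaf
         // column indices 0, 1 and 3 resolve to the root fields a and i:
         //
         //	roots, err := manifest.GetFieldIndices([]int{0, 1, 3})
         //	// roots == []int{0, 2}, err == nil (assuming manifest was built for that schema)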
   122  
   123  func isDictionaryReadSupported(dt arrow.DataType) bool {
   124  	return arrow.IsBinaryLike(dt.ID())
   125  }
   126  
   127  func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType {
   128  	utc := typ.TimeZone == "" || typ.TimeZone == "UTC"
   129  
    130  	// for forward compatibility reasons, and because there's no other way
    131  	// to signal to old readers that values are timestamps, we force
    132  	// the ConvertedType field to be set to the corresponding TIMESTAMP_* value.
    133  	// this does cause some ambiguity, as parquet readers have not been consistent
    134  	// about interpreting TIMESTAMP_* values as being UTC-normalized
    135  	// (see ARROW-5878)
   136  	var scunit schema.TimeUnitType
   137  	switch unit {
   138  	case arrow.Millisecond:
   139  		scunit = schema.TimeUnitMillis
   140  	case arrow.Microsecond:
   141  		scunit = schema.TimeUnitMicros
   142  	case arrow.Nanosecond:
   143  		scunit = schema.TimeUnitNanos
   144  	case arrow.Second:
   145  		// no equivalent in parquet
   146  		return schema.NoLogicalType{}
   147  	}
   148  
   149  	return schema.NewTimestampLogicalTypeForce(utc, scunit)
   150  }
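
         // For example (editor's note, not part of the original source), a UTC nanosecond arrow
         // timestamp maps to a forced TIMESTAMP(isAdjustedToUTC=true, NANOS) logical type, while
         // arrow's second resolution has no parquet equivalent and yields NoLogicalType:
         //
         //	lt := arrowTimestampToLogical(&arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "UTC"}, arrow.Nanosecond)
         //	// lt is equivalent to schema.NewTimestampLogicalTypeForce(true, schema.TimeUnitNanos)
         //	none := arrowTimestampToLogical(&arrow.TimestampType{Unit: arrow.Second}, arrow.Second)
         //	// none is schema.NoLogicalType{}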
   151  
   152  func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) {
   153  	coerce := arrprops.coerceTimestamps
   154  	target := typ.Unit
   155  	if coerce {
   156  		target = arrprops.coerceTimestampUnit
   157  	}
   158  
   159  	// user is explicitly asking for int96, no logical type
   160  	if arrprops.timestampAsInt96 && target == arrow.Nanosecond {
   161  		return parquet.Types.Int96, schema.NoLogicalType{}, nil
   162  	}
   163  
   164  	physical := parquet.Types.Int64
   165  	logicalType := arrowTimestampToLogical(typ, target)
   166  
   167  	// user is explicitly asking for timestamp data to be converted to the specified
   168  	// units (target) via coercion
   169  	if coerce {
   170  		if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 {
   171  			switch target {
   172  			case arrow.Millisecond, arrow.Microsecond:
   173  			case arrow.Nanosecond, arrow.Second:
   174  				return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version())
   175  			}
   176  		} else if target == arrow.Second {
    177  			return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis, micros or nanos", props.Version())
   178  		}
   179  		return physical, logicalType, nil
   180  	}
   181  
    182  	// the user implicitly wants timestamp data to retain its original time units,
    183  	// however the ConvertedType field used to indicate logical types for parquet
    184  	// version <= 2.4 files does not allow for nanosecond time units, so nanos
    185  	// must be coerced to micros
   186  	if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond {
   187  		logicalType = arrowTimestampToLogical(typ, arrow.Microsecond)
   188  		return physical, logicalType, nil
   189  	}
   190  
    191  	// the user implicitly wants timestamp data to retain its original time units,
   192  	// however the arrow seconds time unit cannot be represented in parquet, so must
   193  	// be coerced to milliseconds
   194  	if typ.Unit == arrow.Second {
   195  		logicalType = arrowTimestampToLogical(typ, arrow.Millisecond)
   196  	}
   197  
   198  	return physical, logicalType, nil
   199  }
   200  
   201  // DecimalSize returns the minimum number of bytes necessary to represent a decimal
   202  // with the requested precision.
   203  //
    204  // Taken from the Apache Impala codebase. Each returned size is the smallest
    205  // number of bytes whose two's-complement range can represent the maximum value
    206  // expressible with the requested precision.
   207  func DecimalSize(precision int32) int32 {
   208  	if precision < 1 {
   209  		panic("precision must be >= 1")
   210  	}
   211  
   212  	// generated in python with:
   213  	// >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
   214  	// >>> [-1] + [decimal_size(i) for i in range(1, 77)]
   215  	var byteblock = [...]int32{
   216  		-1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
   217  		9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
   218  		17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
   219  		26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32,
   220  	}
   221  
   222  	if precision <= 76 {
   223  		return byteblock[precision]
   224  	}
    225  	return int32(math.Ceil((float64(precision)*math.Log2(10) + 1) / 8.0))
   226  }
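
         // For example (editor's note, not part of the original source): DecimalSize(9) == 4,
         // DecimalSize(18) == 8 and DecimalSize(38) == 16, the widths used for INT32-, INT64-
         // and 16-byte fixed-length-byte-array backed decimals respectively.
         //
         //	width := DecimalSize(38)
         //	// width == 16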
   227  
   228  func repFromNullable(isnullable bool) parquet.Repetition {
   229  	if isnullable {
   230  		return parquet.Repetitions.Optional
   231  	}
   232  	return parquet.Repetitions.Required
   233  }
   234  
   235  func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
   236  	if len(typ.Fields()) == 0 {
    237  		return nil, fmt.Errorf("cannot write struct type '%s' with no child fields to parquet. Consider adding a dummy child", name)
   238  	}
   239  
   240  	children := make(schema.FieldList, 0, len(typ.Fields()))
   241  	for _, f := range typ.Fields() {
   242  		n, err := fieldToNode(f.Name, f, props, arrprops)
   243  		if err != nil {
   244  			return nil, err
   245  		}
   246  		children = append(children, n)
   247  	}
   248  
   249  	return schema.NewGroupNode(name, repFromNullable(nullable), children, -1)
   250  }
   251  
   252  func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
   253  	var (
   254  		logicalType schema.LogicalType = schema.NoLogicalType{}
   255  		typ         parquet.Type
   256  		repType     = repFromNullable(field.Nullable)
   257  		length      = -1
   258  		precision   = -1
   259  		scale       = -1
   260  		err         error
   261  	)
   262  
   263  	switch field.Type.ID() {
   264  	case arrow.NULL:
   265  		typ = parquet.Types.Int32
   266  		logicalType = &schema.NullLogicalType{}
   267  		if repType != parquet.Repetitions.Optional {
   268  			return nil, xerrors.New("nulltype arrow field must be nullable")
   269  		}
   270  	case arrow.BOOL:
   271  		typ = parquet.Types.Boolean
   272  	case arrow.UINT8:
   273  		typ = parquet.Types.Int32
   274  		logicalType = schema.NewIntLogicalType(8, false)
   275  	case arrow.INT8:
   276  		typ = parquet.Types.Int32
   277  		logicalType = schema.NewIntLogicalType(8, true)
   278  	case arrow.UINT16:
   279  		typ = parquet.Types.Int32
   280  		logicalType = schema.NewIntLogicalType(16, false)
   281  	case arrow.INT16:
   282  		typ = parquet.Types.Int32
   283  		logicalType = schema.NewIntLogicalType(16, true)
   284  	case arrow.UINT32:
   285  		typ = parquet.Types.Int32
   286  		logicalType = schema.NewIntLogicalType(32, false)
   287  	case arrow.INT32:
   288  		typ = parquet.Types.Int32
   289  		logicalType = schema.NewIntLogicalType(32, true)
   290  	case arrow.UINT64:
   291  		typ = parquet.Types.Int64
   292  		logicalType = schema.NewIntLogicalType(64, false)
   293  	case arrow.INT64:
   294  		typ = parquet.Types.Int64
   295  		logicalType = schema.NewIntLogicalType(64, true)
   296  	case arrow.FLOAT32:
   297  		typ = parquet.Types.Float
   298  	case arrow.FLOAT64:
   299  		typ = parquet.Types.Double
   300  	case arrow.STRING, arrow.LARGE_STRING:
   301  		logicalType = schema.StringLogicalType{}
   302  		fallthrough
   303  	case arrow.BINARY, arrow.LARGE_BINARY:
   304  		typ = parquet.Types.ByteArray
   305  	case arrow.FIXED_SIZE_BINARY:
   306  		typ = parquet.Types.FixedLenByteArray
   307  		length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth
   308  	case arrow.DECIMAL, arrow.DECIMAL256:
   309  		dectype := field.Type.(arrow.DecimalType)
   310  		precision = int(dectype.GetPrecision())
   311  		scale = int(dectype.GetScale())
   312  
   313  		if props.StoreDecimalAsInteger() && 1 <= precision && precision <= 18 {
   314  			if precision <= 9 {
   315  				typ = parquet.Types.Int32
   316  			} else {
   317  				typ = parquet.Types.Int64
   318  			}
   319  		} else {
   320  			typ = parquet.Types.FixedLenByteArray
   321  			length = int(DecimalSize(int32(precision)))
   322  		}
   323  
   324  		logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale))
   325  	case arrow.DATE32:
   326  		typ = parquet.Types.Int32
   327  		logicalType = schema.DateLogicalType{}
   328  	case arrow.DATE64:
   329  		typ = parquet.Types.Int64
   330  		logicalType = schema.NewTimestampLogicalType(true, schema.TimeUnitMillis)
   331  	case arrow.TIMESTAMP:
   332  		typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops)
   333  		if err != nil {
   334  			return nil, err
   335  		}
   336  	case arrow.TIME32:
   337  		typ = parquet.Types.Int32
   338  		logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis)
   339  	case arrow.TIME64:
   340  		typ = parquet.Types.Int64
   341  		timeType := field.Type.(*arrow.Time64Type)
   342  		if timeType.Unit == arrow.Nanosecond {
   343  			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos)
   344  		} else {
   345  			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros)
   346  		}
   347  	case arrow.STRUCT:
   348  		return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops)
   349  	case arrow.FIXED_SIZE_LIST, arrow.LIST:
   350  		var elem arrow.DataType
   351  		if lt, ok := field.Type.(*arrow.ListType); ok {
   352  			elem = lt.Elem()
   353  		} else {
   354  			elem = field.Type.(*arrow.FixedSizeListType).Elem()
   355  		}
   356  
   357  		child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops)
   358  		if err != nil {
   359  			return nil, err
   360  		}
   361  
   362  		return schema.ListOf(child, repFromNullable(field.Nullable), -1)
   363  	case arrow.DICTIONARY:
    364  		// parquet has no dictionary type: dictionary is an encoding concern, not a schema-level one
   365  		dictType := field.Type.(*arrow.DictionaryType)
   366  		return fieldToNode(name, arrow.Field{Name: name, Type: dictType.ValueType, Nullable: field.Nullable, Metadata: field.Metadata},
   367  			props, arrprops)
   368  	case arrow.EXTENSION:
   369  		return fieldToNode(name, arrow.Field{
   370  			Name:     name,
   371  			Type:     field.Type.(arrow.ExtensionType).StorageType(),
   372  			Nullable: field.Nullable,
   373  			Metadata: arrow.MetadataFrom(map[string]string{
   374  				ipc.ExtensionTypeKeyName:     field.Type.(arrow.ExtensionType).ExtensionName(),
   375  				ipc.ExtensionMetadataKeyName: field.Type.(arrow.ExtensionType).Serialize(),
   376  			}),
   377  		}, props, arrprops)
   378  	case arrow.MAP:
   379  		mapType := field.Type.(*arrow.MapType)
   380  		keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops)
   381  		if err != nil {
   382  			return nil, err
   383  		}
   384  
   385  		valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops)
   386  		if err != nil {
   387  			return nil, err
   388  		}
   389  
   390  		if arrprops.noMapLogicalType {
   391  			keyval := schema.FieldList{keyNode, valueNode}
   392  			keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1)
   393  			if err != nil {
   394  				return nil, err
   395  			}
   396  			return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{
   397  				keyvalNode,
   398  			}, -1)
   399  		}
   400  		return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1)
   401  	default:
   402  		return nil, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, field.Type.ID())
   403  	}
   404  
   405  	return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata))
   406  }
   407  
   408  const fieldIDKey = "PARQUET:field_id"
   409  
   410  func fieldIDFromMeta(m arrow.Metadata) int32 {
   411  	if m.Len() == 0 {
   412  		return -1
   413  	}
   414  
   415  	key := m.FindKey(fieldIDKey)
   416  	if key < 0 {
   417  		return -1
   418  	}
   419  
   420  	id, err := strconv.ParseInt(m.Values()[key], 10, 32)
   421  	if err != nil {
   422  		return -1
   423  	}
   424  
   425  	if id < 0 {
   426  		return -1
   427  	}
   428  
   429  	return int32(id)
   430  }
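
         // Illustrative behavior (editor's note, not part of the original source): a field annotated
         // with {"PARQUET:field_id": "7"} yields 7, while a missing, non-numeric or negative value
         // falls back to -1:
         //
         //	id := fieldIDFromMeta(arrow.MetadataFrom(map[string]string{"PARQUET:field_id": "7"}))
         //	// id == 7
         //	id = fieldIDFromMeta(arrow.Metadata{})
         //	// id == -1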
   431  
   432  // ToParquet generates a Parquet Schema from an arrow Schema using the given properties to make
   433  // decisions when determining the logical/physical types of the columns.
   434  func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) {
   435  	if props == nil {
   436  		props = parquet.NewWriterProperties()
   437  	}
   438  
   439  	nodes := make(schema.FieldList, 0, len(sc.Fields()))
   440  	for _, f := range sc.Fields() {
   441  		n, err := fieldToNode(f.Name, f, props, arrprops)
   442  		if err != nil {
   443  			return nil, err
   444  		}
   445  		nodes = append(nodes, n)
   446  	}
   447  
   448  	root, err := schema.NewGroupNode(props.RootName(), props.RootRepetition(), nodes, -1)
   449  	if err != nil {
   450  		return nil, err
   451  	}
   452  
   453  	return schema.NewSchema(root), err
   454  }
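
         // Illustrative usage (editor's sketch, not part of the original source; it assumes this
         // package's NewArrowWriterProperties constructor for default Arrow writer properties):
         //
         //	sc := arrow.NewSchema([]arrow.Field{
         //		{Name: "id", Type: arrow.PrimitiveTypes.Int64},
         //		{Name: "name", Type: arrow.BinaryTypes.String, Nullable: true},
         //	}, nil)
         //	psc, err := ToParquet(sc, nil, NewArrowWriterProperties())
         //	if err != nil {
         //		// handle the conversion error
         //	}
         //	_ = psc.NumColumns() // 2 leaf columns: "id" as required INT64, "name" as optional BYTE_ARRAY (String)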
   455  
   456  type schemaTree struct {
   457  	manifest *SchemaManifest
   458  
   459  	schema *schema.Schema
   460  	props  *ArrowReadProperties
   461  }
   462  
   463  func (s schemaTree) LinkParent(child, parent *SchemaField) {
   464  	s.manifest.ChildToParent[child] = parent
   465  }
   466  
   467  func (s schemaTree) RecordLeaf(leaf *SchemaField) {
   468  	s.manifest.ColIndexToField[leaf.ColIndex] = leaf
   469  }
   470  
   471  func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) {
   472  	switch log.BitWidth() {
   473  	case 8:
   474  		if log.IsSigned() {
   475  			return arrow.PrimitiveTypes.Int8, nil
   476  		}
   477  		return arrow.PrimitiveTypes.Uint8, nil
   478  	case 16:
   479  		if log.IsSigned() {
   480  			return arrow.PrimitiveTypes.Int16, nil
   481  		}
   482  		return arrow.PrimitiveTypes.Uint16, nil
   483  	case 32:
   484  		if log.IsSigned() {
   485  			return arrow.PrimitiveTypes.Int32, nil
   486  		}
   487  		return arrow.PrimitiveTypes.Uint32, nil
   488  	case 64:
   489  		if log.IsSigned() {
   490  			return arrow.PrimitiveTypes.Int64, nil
   491  		}
   492  		return arrow.PrimitiveTypes.Uint64, nil
   493  	default:
    494  		return nil, xerrors.New("invalid bit width for integer logical type")
   495  	}
   496  }
   497  
   498  func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   499  	if logical.TimeUnit() == schema.TimeUnitMillis {
   500  		return arrow.FixedWidthTypes.Time32ms, nil
   501  	}
   502  
   503  	return nil, xerrors.New(logical.String() + " cannot annotate a time32")
   504  }
   505  
   506  func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   507  	switch logical.TimeUnit() {
   508  	case schema.TimeUnitMicros:
   509  		return arrow.FixedWidthTypes.Time64us, nil
   510  	case schema.TimeUnitNanos:
   511  		return arrow.FixedWidthTypes.Time64ns, nil
   512  	default:
   513  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   514  	}
   515  }
   516  
   517  func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) {
   518  	tz := "UTC"
   519  	if logical.IsFromConvertedType() {
   520  		tz = ""
   521  	}
   522  
   523  	switch logical.TimeUnit() {
   524  	case schema.TimeUnitMillis:
   525  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil
   526  	case schema.TimeUnitMicros:
   527  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil
   528  	case schema.TimeUnitNanos:
   529  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil
   530  	default:
    531  		return nil, xerrors.New("unrecognized unit in timestamp logical type " + logical.String())
   532  	}
   533  }
   534  
   535  func arrowDecimal(logical *schema.DecimalLogicalType) arrow.DataType {
   536  	if logical.Precision() <= decimal128.MaxPrecision {
   537  		return &arrow.Decimal128Type{Precision: logical.Precision(), Scale: logical.Scale()}
   538  	}
   539  	return &arrow.Decimal256Type{Precision: logical.Precision(), Scale: logical.Scale()}
   540  }
   541  
   542  func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) {
   543  	switch logtype := logical.(type) {
   544  	case schema.NoLogicalType:
   545  		return arrow.PrimitiveTypes.Int32, nil
   546  	case *schema.TimeLogicalType:
   547  		return arrowTime32(logtype)
   548  	case *schema.DecimalLogicalType:
   549  		return arrowDecimal(logtype), nil
   550  	case *schema.IntLogicalType:
   551  		return arrowInt(logtype)
   552  	case schema.DateLogicalType:
   553  		return arrow.FixedWidthTypes.Date32, nil
   554  	default:
   555  		return nil, xerrors.New(logical.String() + " cannot annotate int32")
   556  	}
   557  }
   558  
   559  func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) {
   560  	if logical.IsNone() {
   561  		return arrow.PrimitiveTypes.Int64, nil
   562  	}
   563  
   564  	switch logtype := logical.(type) {
   565  	case *schema.IntLogicalType:
   566  		return arrowInt(logtype)
   567  	case *schema.DecimalLogicalType:
   568  		return arrowDecimal(logtype), nil
   569  	case *schema.TimeLogicalType:
   570  		return arrowTime64(logtype)
   571  	case *schema.TimestampLogicalType:
   572  		return arrowTimestamp(logtype)
   573  	default:
   574  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   575  	}
   576  }
   577  
   578  func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) {
   579  	switch logtype := logical.(type) {
   580  	case schema.StringLogicalType:
   581  		return arrow.BinaryTypes.String, nil
   582  	case *schema.DecimalLogicalType:
   583  		return arrowDecimal(logtype), nil
   584  	case schema.NoLogicalType,
   585  		schema.EnumLogicalType,
   586  		schema.JSONLogicalType,
   587  		schema.BSONLogicalType:
   588  		return arrow.BinaryTypes.Binary, nil
   589  	default:
   590  		return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array")
   591  	}
   592  }
   593  
   594  func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) {
   595  	switch logtype := logical.(type) {
   596  	case *schema.DecimalLogicalType:
   597  		return arrowDecimal(logtype), nil
   598  	case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
   599  		return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
   600  	default:
   601  		return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array")
   602  	}
   603  }
   604  
   605  func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) {
   606  	if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) {
   607  		return arrow.Null, nil
   608  	}
   609  
   610  	switch physical {
   611  	case parquet.Types.Boolean:
   612  		return arrow.FixedWidthTypes.Boolean, nil
   613  	case parquet.Types.Int32:
   614  		return arrowFromInt32(logical)
   615  	case parquet.Types.Int64:
   616  		return arrowFromInt64(logical)
   617  	case parquet.Types.Int96:
   618  		return arrow.FixedWidthTypes.Timestamp_ns, nil
   619  	case parquet.Types.Float:
   620  		return arrow.PrimitiveTypes.Float32, nil
   621  	case parquet.Types.Double:
   622  		return arrow.PrimitiveTypes.Float64, nil
   623  	case parquet.Types.ByteArray:
   624  		return arrowFromByteArray(logical)
   625  	case parquet.Types.FixedLenByteArray:
   626  		return arrowFromFLBA(logical, typeLen)
   627  	default:
   628  		return nil, xerrors.New("invalid physical column type")
   629  	}
   630  }
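
         // Illustrative mapping (editor's note, not part of the original source): an INT32 column
         // annotated with a signed 16-bit integer logical type reads back as arrow int16:
         //
         //	dt, err := getArrowType(parquet.Types.Int32, schema.NewIntLogicalType(16, true), -1)
         //	// dt == arrow.PrimitiveTypes.Int16, err == nil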
   631  
   632  func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) {
   633  	out.Field = field
   634  	out.ColIndex = colIndex
   635  	out.LevelInfo = currentLevels
   636  	ctx.RecordLeaf(out)
   637  	ctx.LinkParent(out, parent)
   638  }
   639  
   640  func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   641  	if n.NumFields() != 1 {
   642  		return xerrors.New("LIST groups must have only 1 child")
   643  	}
   644  
   645  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   646  		return xerrors.New("LIST groups must not be repeated")
   647  	}
   648  
   649  	currentLevels.Increment(n)
   650  
   651  	out.Children = make([]SchemaField, n.NumFields())
   652  	ctx.LinkParent(out, parent)
   653  	ctx.LinkParent(&out.Children[0], out)
   654  
   655  	listNode := n.Field(0)
   656  	if listNode.RepetitionType() != parquet.Repetitions.Repeated {
   657  		return xerrors.New("non-repeated nodes in a list group are not supported")
   658  	}
   659  
   660  	repeatedAncestorDef := currentLevels.IncrementRepeated()
   661  	if listNode.Type() == schema.Group {
   662  		// Resolve 3-level encoding
   663  		//
   664  		// required/optional group name=whatever {
   665  		//   repeated group name=list {
   666  		//     required/optional TYPE item;
   667  		//   }
   668  		// }
   669  		//
   670  		// yields list<item: TYPE ?nullable> ?nullable
   671  		//
   672  		// We distinguish the special case that we have
   673  		//
   674  		// required/optional group name=whatever {
   675  		//   repeated group name=array or $SOMETHING_tuple {
   676  		//     required/optional TYPE item;
   677  		//   }
   678  		// }
   679  		//
   680  		// In this latter case, the inner type of the list should be a struct
   681  		// rather than a primitive value
   682  		//
   683  		// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
   684  		// Special case mentioned in the format spec:
   685  		//   If the name is array or ends in _tuple, this should be a list of struct
   686  		//   even for single child elements.
   687  		listGroup := listNode.(*schema.GroupNode)
   688  		if listGroup.NumFields() == 1 && !(listGroup.Name() == "array" || listGroup.Name() == (n.Name()+"_tuple")) {
   689  			// list of primitive type
   690  			if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil {
   691  				return err
   692  			}
   693  		} else {
   694  			if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil {
   695  				return err
   696  			}
   697  		}
   698  	} else {
   699  		// Two-level list encoding
   700  		//
   701  		// required/optional group LIST {
   702  		//   repeated TYPE;
   703  		// }
   704  		primitiveNode := listNode.(*schema.PrimitiveNode)
   705  		colIndex := ctx.schema.ColumnIndexByNode(primitiveNode)
   706  		arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength())
   707  		if err != nil {
   708  			return err
   709  		}
   710  
   711  		if ctx.props.ReadDict(colIndex) && isDictionaryReadSupported(arrowType) {
   712  			arrowType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrowType}
   713  		}
   714  
   715  		itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))}
   716  		populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0])
   717  	}
   718  
   719  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
   720  		arrow.Field{Name: listNode.Name(), Type: out.Children[0].Field.Type, Nullable: true}),
   721  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   722  
   723  	out.LevelInfo = currentLevels
   724  	// At this point current levels contains the def level for this list,
   725  	// we need to reset to the prior parent.
   726  	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   727  	return nil
   728  }
   729  
   730  func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   731  	arrowFields := make([]arrow.Field, 0, n.NumFields())
   732  	out.Children = make([]SchemaField, n.NumFields())
   733  
   734  	for i := 0; i < n.NumFields(); i++ {
   735  		if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil {
   736  			return err
   737  		}
   738  		arrowFields = append(arrowFields, *out.Children[i].Field)
   739  	}
   740  
   741  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...),
   742  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   743  	out.LevelInfo = currentLevels
   744  	return nil
   745  }
   746  
   747  func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   748  	if n.NumFields() != 1 {
   749  		return xerrors.New("MAP group must have exactly 1 child")
   750  	}
   751  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   752  		return xerrors.New("MAP groups must not be repeated")
   753  	}
   754  
   755  	keyvalueNode := n.Field(0)
   756  	if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated {
   757  		return xerrors.New("Non-repeated keyvalue group in MAP group is not supported")
   758  	}
   759  
   760  	if keyvalueNode.Type() != schema.Group {
   761  		return xerrors.New("keyvalue node must be a group")
   762  	}
   763  
   764  	kvgroup := keyvalueNode.(*schema.GroupNode)
   765  	if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 {
    766  		return fmt.Errorf("keyvalue node group must have exactly 1 or 2 child elements, found %d", kvgroup.NumFields())
   767  	}
   768  
   769  	keyNode := kvgroup.Field(0)
   770  	if keyNode.RepetitionType() != parquet.Repetitions.Required {
   771  		return xerrors.New("MAP keys must be required")
   772  	}
   773  
   774  	// Arrow doesn't support 1 column maps (i.e. Sets).  The options are to either
   775  	// make the values column nullable, or process the map as a list.  We choose the latter
   776  	// as it is simpler.
   777  	if kvgroup.NumFields() == 1 {
   778  		return listToSchemaField(n, currentLevels, ctx, parent, out)
   779  	}
   780  
   781  	currentLevels.Increment(n)
   782  	repeatedAncestorDef := currentLevels.IncrementRepeated()
   783  	out.Children = make([]SchemaField, 1)
   784  
   785  	kvfield := &out.Children[0]
   786  	kvfield.Children = make([]SchemaField, 2)
   787  
   788  	keyField := &kvfield.Children[0]
   789  	valueField := &kvfield.Children[1]
   790  
   791  	ctx.LinkParent(out, parent)
   792  	ctx.LinkParent(kvfield, out)
   793  	ctx.LinkParent(keyField, kvfield)
   794  	ctx.LinkParent(valueField, kvfield)
   795  
   796  	// required/optional group name=whatever {
    797  	//   repeated group name=key_values {
    798  	//     required TYPE key;
    799  	//     required/optional TYPE value;
   800  	//   }
   801  	// }
   802  	//
   803  
   804  	if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil {
   805  		return err
   806  	}
   807  	if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil {
   808  		return err
   809  	}
   810  
   811  	kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field),
   812  		Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))}
   813  
   814  	kvfield.LevelInfo = currentLevels
   815  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
   816  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
   817  		Metadata: createFieldMeta(int(n.FieldID()))}
   818  	out.LevelInfo = currentLevels
   819  	// At this point current levels contains the def level for this map,
   820  	// we need to reset to the prior parent.
   821  	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   822  	return nil
   823  }
   824  
   825  func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   826  	if n.LogicalType().Equals(schema.NewListLogicalType()) {
   827  		return listToSchemaField(n, currentLevels, ctx, parent, out)
   828  	} else if n.LogicalType().Equals(schema.MapLogicalType{}) {
   829  		return mapToSchemaField(n, currentLevels, ctx, parent, out)
   830  	}
   831  
   832  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   833  		// Simple repeated struct
   834  		//
   835  		// repeated group $NAME {
   836  		//   r/o TYPE[0] f0
   837  		//   r/o TYPE[1] f1
   838  		// }
   839  		out.Children = make([]SchemaField, 1)
   840  		repeatedAncestorDef := currentLevels.IncrementRepeated()
   841  		if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil {
   842  			return err
   843  		}
   844  
   845  		out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false,
   846  			Metadata: createFieldMeta(int(n.FieldID()))}
   847  		ctx.LinkParent(&out.Children[0], out)
   848  		out.LevelInfo = currentLevels
   849  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   850  		return nil
   851  	}
   852  
   853  	currentLevels.Increment(n)
   854  	return groupToStructField(n, currentLevels, ctx, parent, out)
   855  }
   856  
   857  func createFieldMeta(fieldID int) arrow.Metadata {
   858  	return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)})
   859  }
   860  
   861  func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   862  	ctx.LinkParent(out, parent)
   863  
   864  	if n.Type() == schema.Group {
   865  		return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out)
   866  	}
   867  
   868  	// Either a normal flat primitive type, or a list type encoded with 1-level
   869  	// list encoding. Note that the 3-level encoding is the form recommended by
   870  	// the parquet specification, but technically we can have either
   871  	//
   872  	// required/optional $TYPE $FIELD_NAME
   873  	//
   874  	// or
   875  	//
   876  	// repeated $TYPE $FIELD_NAME
   877  
   878  	primitive := n.(*schema.PrimitiveNode)
   879  	colIndex := ctx.schema.ColumnIndexByNode(primitive)
   880  	arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength())
   881  	if err != nil {
   882  		return err
   883  	}
   884  
   885  	if ctx.props.ReadDict(colIndex) && isDictionaryReadSupported(arrowType) {
   886  		arrowType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrowType}
   887  	}
   888  
   889  	if primitive.RepetitionType() == parquet.Repetitions.Repeated {
   890  		// one-level list encoding e.g. a: repeated int32;
   891  		repeatedAncestorDefLevel := currentLevels.IncrementRepeated()
   892  		out.Children = make([]SchemaField, 1)
   893  		child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false}
   894  		populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0])
   895  		out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false,
   896  			Metadata: createFieldMeta(int(primitive.FieldID()))}
   897  		out.LevelInfo = currentLevels
   898  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel
   899  		return nil
   900  	}
   901  
   902  	currentLevels.Increment(n)
   903  	populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType,
   904  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
   905  		Metadata: createFieldMeta(int(n.FieldID()))},
   906  		currentLevels, ctx, parent, out)
   907  	return nil
   908  }
   909  
   910  func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) {
   911  	if meta == nil {
   912  		return nil, nil
   913  	}
   914  
   915  	const arrowSchemaKey = "ARROW:schema"
   916  	serialized := meta.FindValue(arrowSchemaKey)
   917  	if serialized == nil {
   918  		return nil, nil
   919  	}
   920  
   921  	var (
   922  		decoded []byte
   923  		err     error
   924  	)
   925  
    926  	// if the length of serialized is not a multiple of 4, it cannot be valid
    927  	// padded (StdEncoding) base64, so only attempt StdEncoding when it is.
   928  	if len(*serialized)%4 == 0 {
   929  		decoded, err = base64.StdEncoding.DecodeString(*serialized)
   930  	}
   931  	// if we failed to decode it with stdencoding or the length wasn't
   932  	// a multiple of 4, try using the Raw unpadded encoding
   933  	if len(decoded) == 0 || err != nil {
   934  		decoded, err = base64.RawStdEncoding.DecodeString(*serialized)
   935  	}
   936  
   937  	if err != nil {
   938  		return nil, err
   939  	}
   940  
   941  	return flight.DeserializeSchema(decoded, mem)
   942  }
   943  
   944  func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Field) arrow.DataType {
   945  	switch inferred.ID() {
   946  	case arrow.STRUCT:
   947  		if origin.ID() == arrow.STRUCT {
   948  			return func(list []arrow.Field) arrow.DataType {
   949  				return arrow.StructOf(list...)
   950  			}
   951  		}
   952  	case arrow.LIST:
   953  		switch origin.ID() {
   954  		case arrow.LIST:
   955  			return func(list []arrow.Field) arrow.DataType {
   956  				return arrow.ListOf(list[0].Type)
   957  			}
   958  		case arrow.FIXED_SIZE_LIST:
   959  			sz := origin.(*arrow.FixedSizeListType).Len()
   960  			return func(list []arrow.Field) arrow.DataType {
   961  				return arrow.FixedSizeListOf(sz, list[0].Type)
   962  			}
   963  		}
   964  	case arrow.MAP:
   965  		if origin.ID() == arrow.MAP {
   966  			return func(list []arrow.Field) arrow.DataType {
   967  				valType := list[0].Type.(*arrow.StructType)
   968  				return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
   969  			}
   970  		}
   971  	}
   972  	return nil
   973  }
   974  
   975  func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
   976  	nchildren := len(inferred.Children)
   977  	switch origin.Type.ID() {
   978  	case arrow.EXTENSION:
   979  		extType := origin.Type.(arrow.ExtensionType)
   980  		modified, err = applyOriginalStorageMetadata(arrow.Field{
   981  			Type:     extType.StorageType(),
   982  			Metadata: origin.Metadata,
   983  		}, inferred)
   984  		if err != nil {
   985  			return
   986  		}
   987  
   988  		if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
   989  			return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
   990  				arrow.ErrInvalid, inferred.Field.Type, extType)
   991  		}
   992  
   993  		inferred.Field.Type = extType
   994  		modified = true
   995  	case arrow.SPARSE_UNION, arrow.DENSE_UNION:
   996  		err = xerrors.New("unimplemented type")
   997  	case arrow.STRUCT:
   998  		typ := origin.Type.(*arrow.StructType)
   999  		if nchildren != len(typ.Fields()) {
  1000  			return
  1001  		}
  1002  
  1003  		factory := getNestedFactory(typ, inferred.Field.Type)
  1004  		if factory == nil {
  1005  			return
  1006  		}
  1007  
  1008  		modified = typ.ID() != inferred.Field.Type.ID()
  1009  		for idx := range inferred.Children {
  1010  			childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
  1011  			if err != nil {
  1012  				return false, err
  1013  			}
  1014  			modified = modified || childMod
  1015  		}
  1016  		if modified {
  1017  			modifiedChildren := make([]arrow.Field, len(inferred.Children))
  1018  			for idx, child := range inferred.Children {
  1019  				modifiedChildren[idx] = *child.Field
  1020  			}
  1021  			inferred.Field.Type = factory(modifiedChildren)
  1022  		}
  1023  	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.LARGE_LIST, arrow.MAP: // arrow.ListLike
  1024  		if nchildren != 1 {
  1025  			return
  1026  		}
  1027  		factory := getNestedFactory(origin.Type, inferred.Field.Type)
  1028  		if factory == nil {
  1029  			return
  1030  		}
  1031  
  1032  		modified = origin.Type.ID() != inferred.Field.Type.ID()
  1033  		childModified, err := applyOriginalMetadata(arrow.Field{Type: origin.Type.(arrow.ListLikeType).Elem()}, &inferred.Children[0])
  1034  		if err != nil {
  1035  			return modified, err
  1036  		}
  1037  		modified = modified || childModified
  1038  		if modified {
  1039  			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
  1040  		}
  1041  	case arrow.TIMESTAMP:
  1042  		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
  1043  			return
  1044  		}
  1045  
  1046  		tsOtype := origin.Type.(*arrow.TimestampType)
  1047  		tsInfType := inferred.Field.Type.(*arrow.TimestampType)
  1048  
  1049  		// if the unit is the same and the data is tz-aware, then set the original time zone
  1050  		// since parquet has no native storage of timezones
  1051  		if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
  1052  			inferred.Field.Type = origin.Type
  1053  		}
  1054  		modified = true
  1055  	case arrow.LARGE_STRING, arrow.LARGE_BINARY:
  1056  		inferred.Field.Type = origin.Type
  1057  		modified = true
  1058  	case arrow.DICTIONARY:
  1059  		if origin.Type.ID() != arrow.DICTIONARY || (inferred.Field.Type.ID() == arrow.DICTIONARY || !isDictionaryReadSupported(inferred.Field.Type)) {
  1060  			return
  1061  		}
  1062  
  1063  		// direct dictionary reads are only supported for a few primitive types
  1064  		// so no need to recurse on value types
  1065  		dictOriginType := origin.Type.(*arrow.DictionaryType)
  1066  		inferred.Field.Type = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32,
  1067  			ValueType: inferred.Field.Type, Ordered: dictOriginType.Ordered}
  1068  		modified = true
  1069  	case arrow.DECIMAL256:
  1070  		if inferred.Field.Type.ID() == arrow.DECIMAL128 {
  1071  			inferred.Field.Type = origin.Type
  1072  			modified = true
  1073  		}
  1074  	}
  1075  
  1076  	if origin.HasMetadata() {
  1077  		meta := origin.Metadata
  1078  		if inferred.Field.HasMetadata() {
  1079  			final := make(map[string]string)
  1080  			for idx, k := range meta.Keys() {
  1081  				final[k] = meta.Values()[idx]
  1082  			}
  1083  			for idx, k := range inferred.Field.Metadata.Keys() {
  1084  				final[k] = inferred.Field.Metadata.Values()[idx]
  1085  			}
  1086  			inferred.Field.Metadata = arrow.MetadataFrom(final)
  1087  		} else {
  1088  			inferred.Field.Metadata = meta
  1089  		}
  1090  		modified = true
  1091  	}
  1092  
  1093  	return
  1094  }
  1095  
  1096  func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) {
  1097  	return applyOriginalStorageMetadata(origin, inferred)
  1098  }
  1099  
  1100  // NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema.
  1101  //
   1102  // The metadata passed in should be the file-level key-value metadata from the parquet file, or nil.
   1103  // If an ARROW:schema entry is present in the metadata, it is used to determine the Arrow types.
  1104  func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) {
  1105  	var ctx schemaTree
  1106  	ctx.manifest = &SchemaManifest{
  1107  		ColIndexToField: make(map[int]*SchemaField),
  1108  		ChildToParent:   make(map[*SchemaField]*SchemaField),
  1109  		descr:           sc,
  1110  		Fields:          make([]SchemaField, sc.Root().NumFields()),
  1111  	}
  1112  	ctx.props = props
  1113  	if ctx.props == nil {
  1114  		ctx.props = &ArrowReadProperties{}
  1115  	}
  1116  	ctx.schema = sc
  1117  
  1118  	var err error
  1119  	ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator)
  1120  	if err != nil {
  1121  		return nil, err
  1122  	}
  1123  
  1124  	// if original schema is not compatible with the parquet schema, ignore it
  1125  	if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() {
  1126  		ctx.manifest.OriginSchema = nil
  1127  	}
  1128  
  1129  	for idx := range ctx.manifest.Fields {
  1130  		field := &ctx.manifest.Fields[idx]
  1131  		if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil {
  1132  			return nil, err
  1133  		}
  1134  
  1135  		if ctx.manifest.OriginSchema != nil {
  1136  			if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil {
  1137  				return nil, err
  1138  			}
  1139  		}
  1140  	}
  1141  	return ctx.manifest, nil
  1142  }
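
         // Illustrative usage (editor's sketch, not part of the original source), assuming psc is a
         // *schema.Schema such as one produced by ToParquet above:
         //
         //	manifest, err := NewSchemaManifest(psc, nil, &ArrowReadProperties{})
         //	if err != nil {
         //		// handle the error
         //	}
         //	for i := 0; i < psc.NumColumns(); i++ {
         //		leaf, _ := manifest.GetColumnField(i)
         //		fmt.Println(i, leaf.Field.Name, leaf.IsLeaf())
         //	}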
  1143  
  1144  // FromParquet generates an arrow Schema from a provided Parquet Schema
  1145  func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) {
  1146  	manifest, err := NewSchemaManifest(sc, kv, props)
  1147  	if err != nil {
  1148  		return nil, err
  1149  	}
  1150  
  1151  	fields := make([]arrow.Field, len(manifest.Fields))
  1152  	for idx, field := range manifest.Fields {
  1153  		fields[idx] = *field.Field
  1154  	}
  1155  
  1156  	if manifest.OriginSchema != nil {
  1157  		meta := manifest.OriginSchema.Metadata()
  1158  		return arrow.NewSchema(fields, &meta), nil
  1159  	}
  1160  	return arrow.NewSchema(fields, manifest.SchemaMeta), nil
  1161  }
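
         // Illustrative round trip (editor's sketch, not part of the original source; it assumes this
         // package's NewArrowWriterProperties constructor): converting an arrow schema to parquet and
         // back preserves simple primitive fields, though field metadata such as PARQUET:field_id may
         // be added on the way back.
         //
         //	orig := arrow.NewSchema([]arrow.Field{{Name: "x", Type: arrow.PrimitiveTypes.Float64, Nullable: true}}, nil)
         //	psc, _ := ToParquet(orig, nil, NewArrowWriterProperties())
         //	back, _ := FromParquet(psc, nil, nil)
         //	// back has a single nullable float64 field named "x"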