github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/schema.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"encoding/base64"
    21  	"math"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"github.com/apache/arrow/go/v7/arrow"
    26  	"github.com/apache/arrow/go/v7/arrow/flight"
    27  	"github.com/apache/arrow/go/v7/arrow/memory"
    28  	"github.com/apache/arrow/go/v7/parquet"
    29  	"github.com/apache/arrow/go/v7/parquet/file"
    30  	"github.com/apache/arrow/go/v7/parquet/metadata"
    31  	"github.com/apache/arrow/go/v7/parquet/schema"
    32  	"golang.org/x/xerrors"
    33  )
    34  
// SchemaField is a holder that defines a specific logical field in the schema
// which could potentially refer to multiple physical columns in the underlying
// parquet file if it is a nested type.
//
// Field is the arrow field described by this entry and Children holds the
// nested entries below it (list elements, struct members, map key/value).
//
// ColIndex is only populated (not -1) when it is a leaf column.
//
// LevelInfo carries the level information that was current when this entry
// was produced while walking the parquet schema.
type SchemaField struct {
	Field     *arrow.Field
	Children  []SchemaField
	ColIndex  int
	LevelInfo file.LevelInfo
}
    46  
    47  // IsLeaf returns true if the SchemaField is a leaf column, ie: ColIndex != -1
    48  func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 }
    49  
// SchemaManifest represents a full manifest for mapping a Parquet schema
// to an arrow Schema.
//
// descr is the parquet schema the manifest was built for; GetFieldIndices
// consults it to resolve column roots.
//
// ColIndexToField maps a leaf column index to its SchemaField and
// ChildToParent maps a SchemaField to the SchemaField that contains it.
// Fields holds the SchemaField entries themselves.
//
// NOTE(review): OriginSchema and SchemaMeta presumably hold the arrow schema
// and metadata recovered from the file; confirm against the code that
// populates the manifest.
type SchemaManifest struct {
	descr        *schema.Schema
	OriginSchema *arrow.Schema
	SchemaMeta   *arrow.Metadata

	ColIndexToField map[int]*SchemaField
	ChildToParent   map[*SchemaField]*SchemaField
	Fields          []SchemaField
}
    61  
    62  // GetColumnField returns the corresponding Field for a given column index.
    63  func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) {
    64  	if field, ok := sm.ColIndexToField[index]; ok {
    65  		return field, nil
    66  	}
    67  	return nil, xerrors.Errorf("Column Index %d not found in schema manifest", index)
    68  }
    69  
    70  // GetParent gets the parent field for a given field if it is a nested column, otherwise
    71  // returns nil if there is no parent field.
    72  func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField {
    73  	if p, ok := sm.ChildToParent[field]; ok {
    74  		return p
    75  	}
    76  	return nil
    77  }
    78  
    79  // GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow::Schema) which
    80  // correspond to the column root (first node below the parquet schema's root group) of
    81  // each leaf referenced in column_indices.
    82  //
    83  // For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
    84  // the roots are `a` and `i` (return=[0,2]).
    85  //
    86  // root
    87  // -- a  <------
    88  // -- -- b  |  |
    89  // -- -- -- c  |
    90  // -- -- -- d  |
    91  // -- -- -- -- e
    92  // -- f
    93  // -- -- g
    94  // -- -- -- h
    95  // -- i  <---
    96  // -- -- j  |
    97  // -- -- -- k
    98  func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) {
    99  	added := make(map[int]bool)
   100  	ret := make([]int, 0)
   101  
   102  	for _, idx := range indices {
   103  		if idx < 0 || idx >= sm.descr.NumColumns() {
   104  			return nil, xerrors.Errorf("column index %d is not valid", idx)
   105  		}
   106  
   107  		fieldNode := sm.descr.ColumnRoot(idx)
   108  		fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode)
   109  		if fieldIdx == -1 {
   110  			return nil, xerrors.Errorf("column index %d is not valid", idx)
   111  		}
   112  
   113  		if _, ok := added[fieldIdx]; !ok {
   114  			ret = append(ret, fieldIdx)
   115  			added[fieldIdx] = true
   116  		}
   117  	}
   118  	return ret, nil
   119  }
   120  
   121  func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType {
   122  	utc := typ.TimeZone == "" || typ.TimeZone == "UTC"
   123  
   124  	// for forward compatibility reasons, and because there's no other way
   125  	// to signal to old readers that values are timestamps, we force
   126  	// the convertedtype field to be set to the corresponding TIMESTAMP_* value.
   127  	// this does cause some ambiguity as parquet readers have not been consistent
   128  	// about the interpretation of TIMESTAMP_* values as being utc-normalized
   129  	// see ARROW-5878
   130  	var scunit schema.TimeUnitType
   131  	switch unit {
   132  	case arrow.Millisecond:
   133  		scunit = schema.TimeUnitMillis
   134  	case arrow.Microsecond:
   135  		scunit = schema.TimeUnitMicros
   136  	case arrow.Nanosecond:
   137  		scunit = schema.TimeUnitNanos
   138  	case arrow.Second:
   139  		// no equivalent in parquet
   140  		return schema.NoLogicalType{}
   141  	}
   142  
   143  	return schema.NewTimestampLogicalTypeForce(utc, scunit)
   144  }
   145  
   146  func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) {
   147  	coerce := arrprops.coerceTimestamps
   148  	target := typ.Unit
   149  	if coerce {
   150  		target = arrprops.coerceTimestampUnit
   151  	}
   152  
   153  	// user is explicitly asking for int96, no logical type
   154  	if arrprops.timestampAsInt96 && target == arrow.Nanosecond {
   155  		return parquet.Types.Int96, schema.NoLogicalType{}, nil
   156  	}
   157  
   158  	physical := parquet.Types.Int64
   159  	logicalType := arrowTimestampToLogical(typ, target)
   160  
   161  	// user is explicitly asking for timestamp data to be converted to the specified
   162  	// units (target) via coercion
   163  	if coerce {
   164  		if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 {
   165  			switch target {
   166  			case arrow.Millisecond, arrow.Microsecond:
   167  			case arrow.Nanosecond, arrow.Second:
   168  				return physical, nil, xerrors.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version())
   169  			}
   170  		} else if target == arrow.Second {
   171  			return physical, nil, xerrors.Errorf("parquet version %s files can only coerce arrow timestampts to millis, micros or nanos", props.Version())
   172  		}
   173  		return physical, logicalType, nil
   174  	}
   175  
   176  	// the user implicitly wants timestamp data to retain its original time units
   177  	// however the converted type field used to indicate logical types for parquet
   178  	// version <=2.4 fields, does not allow for nanosecond time units and so nanos
   179  	// must be coerced to micros
   180  	if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond {
   181  		logicalType = arrowTimestampToLogical(typ, arrow.Microsecond)
   182  		return physical, logicalType, nil
   183  	}
   184  
   185  	// the user implicitly wants timestamp data to retain it's original time units,
   186  	// however the arrow seconds time unit cannot be represented in parquet, so must
   187  	// be coerced to milliseconds
   188  	if typ.Unit == arrow.Second {
   189  		logicalType = arrowTimestampToLogical(typ, arrow.Millisecond)
   190  	}
   191  
   192  	return physical, logicalType, nil
   193  }
   194  
   195  // DecimalSize returns the minimum number of bytes necessary to represent a decimal
   196  // with the requested precision.
   197  //
   198  // Taken from the Apache Impala codebase. The comments next to the return values
   199  // are the maximum value that can be represented in 2's complement with the returned
   200  // number of bytes
   201  func DecimalSize(precision int32) int32 {
   202  	if precision < 1 {
   203  		panic("precision must be >= 1")
   204  	}
   205  
   206  	// generated in python with:
   207  	// >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
   208  	// >>> [-1] + [decimal_size(i) for i in range(1, 77)]
   209  	var byteblock = [...]int32{
   210  		-1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
   211  		9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
   212  		17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
   213  		26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32,
   214  	}
   215  
   216  	if precision <= 76 {
   217  		return byteblock[precision]
   218  	}
   219  	return int32(math.Ceil(float64(precision)/8.0)*math.Log2(10) + 1)
   220  }
   221  
   222  func repFromNullable(isnullable bool) parquet.Repetition {
   223  	if isnullable {
   224  		return parquet.Repetitions.Optional
   225  	}
   226  	return parquet.Repetitions.Required
   227  }
   228  
   229  func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
   230  	if len(typ.Fields()) == 0 {
   231  		return nil, xerrors.Errorf("cannot write struct type '%s' with no children field to parquet. Consider adding a dummy child", name)
   232  	}
   233  
   234  	children := make(schema.FieldList, 0, len(typ.Fields()))
   235  	for _, f := range typ.Fields() {
   236  		n, err := fieldToNode(f.Name, f, props, arrprops)
   237  		if err != nil {
   238  			return nil, err
   239  		}
   240  		children = append(children, n)
   241  	}
   242  
   243  	return schema.NewGroupNode(name, repFromNullable(nullable), children, -1)
   244  }
   245  
// fieldToNode converts a single arrow field into the parquet schema node that
// will be named name.
//
// Primitive arrow types pick a physical type (and, where applicable, a logical
// type and fixed byte length) in the switch below and are turned into a
// primitive node at the end of the function, carrying the field id found in
// the field's metadata. Struct, list and map types return early from inside
// the switch with a group node built from their children; those group nodes
// are created with a field id of -1.
//
// An error is returned for a non-nullable null field, for dictionary and
// extension types, for any type not listed in the switch, and for any error
// produced while converting a timestamp or a nested child.
func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
	var (
		logicalType schema.LogicalType = schema.NoLogicalType{}
		typ         parquet.Type
		repType     = repFromNullable(field.Nullable)
		length      = -1
		precision   = -1
		scale       = -1
		err         error
	)

	switch field.Type.ID() {
	case arrow.NULL:
		// null columns are written as an int32 column annotated as Null
		typ = parquet.Types.Int32
		logicalType = &schema.NullLogicalType{}
		if repType != parquet.Repetitions.Optional {
			return nil, xerrors.New("nulltype arrow field must be nullable")
		}
	case arrow.BOOL:
		typ = parquet.Types.Boolean
	// integers of 32 bits or fewer are stored as int32, 64 bit integers as
	// int64; the logical type records the bit width and signedness
	case arrow.UINT8:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(8, false)
	case arrow.INT8:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(8, true)
	case arrow.UINT16:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(16, false)
	case arrow.INT16:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(16, true)
	case arrow.UINT32:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(32, false)
	case arrow.INT32:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(32, true)
	case arrow.UINT64:
		typ = parquet.Types.Int64
		logicalType = schema.NewIntLogicalType(64, false)
	case arrow.INT64:
		typ = parquet.Types.Int64
		logicalType = schema.NewIntLogicalType(64, true)
	case arrow.FLOAT32:
		typ = parquet.Types.Float
	case arrow.FLOAT64:
		typ = parquet.Types.Double
	case arrow.STRING:
		// strings are byte arrays with a String annotation, so share the
		// physical type assignment with BINARY below
		logicalType = schema.StringLogicalType{}
		fallthrough
	case arrow.BINARY:
		typ = parquet.Types.ByteArray
	case arrow.FIXED_SIZE_BINARY:
		typ = parquet.Types.FixedLenByteArray
		length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth
	case arrow.DECIMAL:
		// decimals are written as fixed length byte arrays sized by
		// DecimalSize for their precision
		typ = parquet.Types.FixedLenByteArray
		dectype := field.Type.(*arrow.Decimal128Type)
		precision = int(dectype.Precision)
		scale = int(dectype.Scale)
		length = int(DecimalSize(int32(precision)))
		logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale))
	case arrow.DATE32:
		typ = parquet.Types.Int32
		logicalType = schema.DateLogicalType{}
	case arrow.DATE64:
		// date64 is annotated as a UTC millisecond timestamp on an int64
		typ = parquet.Types.Int64
		logicalType = schema.NewTimestampLogicalType(true, schema.TimeUnitMillis)
	case arrow.TIMESTAMP:
		typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops)
		if err != nil {
			return nil, err
		}
	case arrow.TIME32:
		// NOTE(review): the Time32 unit is not inspected here; every time32
		// field is annotated as milliseconds. Confirm how second-unit time32
		// values are expected to be handled by the writer.
		typ = parquet.Types.Int32
		logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis)
	case arrow.TIME64:
		typ = parquet.Types.Int64
		timeType := field.Type.(*arrow.Time64Type)
		if timeType.Unit == arrow.Nanosecond {
			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos)
		} else {
			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros)
		}
	case arrow.STRUCT:
		return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops)
	case arrow.FIXED_SIZE_LIST, arrow.LIST:
		// both list flavours are written the same way; only the element type
		// is taken from the arrow type
		var elem arrow.DataType
		if lt, ok := field.Type.(*arrow.ListType); ok {
			elem = lt.Elem()
		} else {
			elem = field.Type.(*arrow.FixedSizeListType).Elem()
		}

		// the element node reuses the list's name and is always nullable
		child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops)
		if err != nil {
			return nil, err
		}

		return schema.ListOf(child, repFromNullable(field.Nullable), -1)
	case arrow.DICTIONARY:
		// parquet has no dictionary type, dictionary is encoding, not schema level
		return nil, xerrors.New("not implemented yet")
	case arrow.EXTENSION:
		return nil, xerrors.New("not implemented yet")
	case arrow.MAP:
		mapType := field.Type.(*arrow.MapType)
		keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops)
		if err != nil {
			return nil, err
		}

		valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops)
		if err != nil {
			return nil, err
		}

		if arrprops.noMapLogicalType {
			// build the same group layout by hand (an outer group holding a
			// repeated "key_value" group) without going through schema.MapOf
			keyval := schema.FieldList{keyNode, valueNode}
			keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1)
			if err != nil {
				return nil, err
			}
			return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{
				keyvalNode,
			}, -1)
		}
		return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1)
	default:
		return nil, xerrors.New("not implemented yet")
	}

	// only primitive types reach this point; nested types returned above
	return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata))
}
   381  
   382  const fieldIDKey = "PARQUET:field_id"
   383  
   384  func fieldIDFromMeta(m arrow.Metadata) int32 {
   385  	if m.Len() == 0 {
   386  		return -1
   387  	}
   388  
   389  	key := m.FindKey(fieldIDKey)
   390  	if key < 0 {
   391  		return -1
   392  	}
   393  
   394  	id, err := strconv.ParseInt(m.Values()[key], 10, 32)
   395  	if err != nil {
   396  		return -1
   397  	}
   398  
   399  	if id < 0 {
   400  		return -1
   401  	}
   402  
   403  	return int32(id)
   404  }
   405  
   406  // ToParquet generates a Parquet Schema from an arrow Schema using the given properties to make
   407  // decisions when determining the logical/physical types of the columns.
   408  func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) {
   409  	if props == nil {
   410  		props = parquet.NewWriterProperties()
   411  	}
   412  
   413  	nodes := make(schema.FieldList, 0, len(sc.Fields()))
   414  	for _, f := range sc.Fields() {
   415  		n, err := fieldToNode(f.Name, f, props, arrprops)
   416  		if err != nil {
   417  			return nil, err
   418  		}
   419  		nodes = append(nodes, n)
   420  	}
   421  
   422  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, nodes, -1)
   423  	return schema.NewSchema(root), err
   424  }
   425  
// schemaTree is the context passed around while converting a parquet schema
// into SchemaFields. Its methods record parent links and leaf columns in
// manifest; schema is the parquet schema used to resolve column indices.
//
// NOTE(review): props is presumably consulted by conversion code outside this
// view; confirm where the read properties are used.
type schemaTree struct {
	manifest *SchemaManifest

	schema *schema.Schema
	props  *ArrowReadProperties
}
   432  
   433  func (s schemaTree) LinkParent(child, parent *SchemaField) {
   434  	s.manifest.ChildToParent[child] = parent
   435  }
   436  
   437  func (s schemaTree) RecordLeaf(leaf *SchemaField) {
   438  	s.manifest.ColIndexToField[leaf.ColIndex] = leaf
   439  }
   440  
   441  func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) {
   442  	switch log.BitWidth() {
   443  	case 8:
   444  		if log.IsSigned() {
   445  			return arrow.PrimitiveTypes.Int8, nil
   446  		}
   447  		return arrow.PrimitiveTypes.Uint8, nil
   448  	case 16:
   449  		if log.IsSigned() {
   450  			return arrow.PrimitiveTypes.Int16, nil
   451  		}
   452  		return arrow.PrimitiveTypes.Uint16, nil
   453  	case 32:
   454  		if log.IsSigned() {
   455  			return arrow.PrimitiveTypes.Int32, nil
   456  		}
   457  		return arrow.PrimitiveTypes.Uint32, nil
   458  	case 64:
   459  		if log.IsSigned() {
   460  			return arrow.PrimitiveTypes.Int64, nil
   461  		}
   462  		return arrow.PrimitiveTypes.Uint64, nil
   463  	default:
   464  		return nil, xerrors.New("invalid logical type for int32")
   465  	}
   466  }
   467  
   468  func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   469  	if logical.TimeUnit() == schema.TimeUnitMillis {
   470  		return arrow.FixedWidthTypes.Time32ms, nil
   471  	}
   472  
   473  	return nil, xerrors.New(logical.String() + " cannot annotate a time32")
   474  }
   475  
   476  func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   477  	switch logical.TimeUnit() {
   478  	case schema.TimeUnitMicros:
   479  		return arrow.FixedWidthTypes.Time64us, nil
   480  	case schema.TimeUnitNanos:
   481  		return arrow.FixedWidthTypes.Time64ns, nil
   482  	default:
   483  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   484  	}
   485  }
   486  
   487  func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) {
   488  	tz := "UTC"
   489  	if logical.IsFromConvertedType() {
   490  		tz = ""
   491  	}
   492  
   493  	switch logical.TimeUnit() {
   494  	case schema.TimeUnitMillis:
   495  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil
   496  	case schema.TimeUnitMicros:
   497  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil
   498  	case schema.TimeUnitNanos:
   499  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil
   500  	default:
   501  		return nil, xerrors.New("Unrecognized unit in timestamp logical type " + logical.String())
   502  	}
   503  }
   504  
   505  func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) {
   506  	switch logtype := logical.(type) {
   507  	case schema.NoLogicalType:
   508  		return arrow.PrimitiveTypes.Int32, nil
   509  	case *schema.TimeLogicalType:
   510  		return arrowTime32(logtype)
   511  	case *schema.DecimalLogicalType:
   512  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   513  	case *schema.IntLogicalType:
   514  		return arrowInt(logtype)
   515  	case schema.DateLogicalType:
   516  		return arrow.FixedWidthTypes.Date32, nil
   517  	default:
   518  		return nil, xerrors.New(logical.String() + " cannot annotate int32")
   519  	}
   520  }
   521  
   522  func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) {
   523  	if logical.IsNone() {
   524  		return arrow.PrimitiveTypes.Int64, nil
   525  	}
   526  
   527  	switch logtype := logical.(type) {
   528  	case *schema.IntLogicalType:
   529  		return arrowInt(logtype)
   530  	case *schema.DecimalLogicalType:
   531  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   532  	case *schema.TimeLogicalType:
   533  		return arrowTime64(logtype)
   534  	case *schema.TimestampLogicalType:
   535  		return arrowTimestamp(logtype)
   536  	default:
   537  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   538  	}
   539  }
   540  
   541  func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) {
   542  	switch logtype := logical.(type) {
   543  	case schema.StringLogicalType:
   544  		return arrow.BinaryTypes.String, nil
   545  	case *schema.DecimalLogicalType:
   546  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   547  	case schema.NoLogicalType,
   548  		schema.EnumLogicalType,
   549  		schema.JSONLogicalType,
   550  		schema.BSONLogicalType:
   551  		return arrow.BinaryTypes.Binary, nil
   552  	default:
   553  		return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array")
   554  	}
   555  }
   556  
   557  func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) {
   558  	switch logtype := logical.(type) {
   559  	case *schema.DecimalLogicalType:
   560  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   561  	case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
   562  		return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
   563  	default:
   564  		return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array")
   565  	}
   566  }
   567  
   568  func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) {
   569  	if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) {
   570  		return arrow.Null, nil
   571  	}
   572  
   573  	switch physical {
   574  	case parquet.Types.Boolean:
   575  		return arrow.FixedWidthTypes.Boolean, nil
   576  	case parquet.Types.Int32:
   577  		return arrowFromInt32(logical)
   578  	case parquet.Types.Int64:
   579  		return arrowFromInt64(logical)
   580  	case parquet.Types.Int96:
   581  		return arrow.FixedWidthTypes.Timestamp_ns, nil
   582  	case parquet.Types.Float:
   583  		return arrow.PrimitiveTypes.Float32, nil
   584  	case parquet.Types.Double:
   585  		return arrow.PrimitiveTypes.Float64, nil
   586  	case parquet.Types.ByteArray:
   587  		return arrowFromByteArray(logical)
   588  	case parquet.Types.FixedLenByteArray:
   589  		return arrowFromFLBA(logical, typeLen)
   590  	default:
   591  		return nil, xerrors.New("invalid physical column type")
   592  	}
   593  }
   594  
   595  func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) {
   596  	out.Field = field
   597  	out.ColIndex = colIndex
   598  	out.LevelInfo = currentLevels
   599  	ctx.RecordLeaf(out)
   600  	ctx.LinkParent(out, parent)
   601  }
   602  
   603  func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   604  	if n.NumFields() != 1 {
   605  		return xerrors.New("LIST groups must have only 1 child")
   606  	}
   607  
   608  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   609  		return xerrors.New("LIST groups must not be repeated")
   610  	}
   611  
   612  	currentLevels.Increment(n)
   613  
   614  	out.Children = make([]SchemaField, n.NumFields())
   615  	ctx.LinkParent(out, parent)
   616  	ctx.LinkParent(&out.Children[0], out)
   617  
   618  	listNode := n.Field(0)
   619  	if listNode.RepetitionType() != parquet.Repetitions.Repeated {
   620  		return xerrors.New("non-repeated nodes in a list group are not supported")
   621  	}
   622  
   623  	repeatedAncestorDef := currentLevels.IncrementRepeated()
   624  	if listNode.Type() == schema.Group {
   625  		// Resolve 3-level encoding
   626  		//
   627  		// required/optional group name=whatever {
   628  		//   repeated group name=list {
   629  		//     required/optional TYPE item;
   630  		//   }
   631  		// }
   632  		//
   633  		// yields list<item: TYPE ?nullable> ?nullable
   634  		//
   635  		// We distinguish the special case that we have
   636  		//
   637  		// required/optional group name=whatever {
   638  		//   repeated group name=array or $SOMETHING_tuple {
   639  		//     required/optional TYPE item;
   640  		//   }
   641  		// }
   642  		//
   643  		// In this latter case, the inner type of the list should be a struct
   644  		// rather than a primitive value
   645  		//
   646  		// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
   647  		// Special case mentioned in the format spec:
   648  		//   If the name is array or ends in _tuple, this should be a list of struct
   649  		//   even for single child elements.
   650  		listGroup := listNode.(*schema.GroupNode)
   651  		if listGroup.NumFields() == 1 && (listGroup.Name() == "array" || strings.HasSuffix(listGroup.Name(), "_tuple")) {
   652  			// list of primitive type
   653  			if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil {
   654  				return err
   655  			}
   656  		} else {
   657  			if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil {
   658  				return err
   659  			}
   660  		}
   661  	} else {
   662  		// Two-level list encoding
   663  		//
   664  		// required/optional group LIST {
   665  		//   repeated TYPE;
   666  		// }
   667  		primitiveNode := listNode.(*schema.PrimitiveNode)
   668  		colIndex := ctx.schema.ColumnIndexByNode(primitiveNode)
   669  		arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength())
   670  		if err != nil {
   671  			return err
   672  		}
   673  
   674  		itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))}
   675  		populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0])
   676  	}
   677  
   678  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type),
   679  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   680  	out.LevelInfo = currentLevels
   681  	// At this point current levels contains the def level for this list,
   682  	// we need to reset to the prior parent.
   683  	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   684  	return nil
   685  }
   686  
   687  func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   688  	arrowFields := make([]arrow.Field, 0, n.NumFields())
   689  	out.Children = make([]SchemaField, n.NumFields())
   690  
   691  	for i := 0; i < n.NumFields(); i++ {
   692  		if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil {
   693  			return err
   694  		}
   695  		arrowFields = append(arrowFields, *out.Children[i].Field)
   696  	}
   697  
   698  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...),
   699  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   700  	out.LevelInfo = currentLevels
   701  	return nil
   702  }
   703  
// mapToSchemaField converts a parquet group annotated as MAP into a
// SchemaField describing an arrow map, filling out and recording parent links
// for out, its key/value struct child and the key and value fields.
//
// n must have exactly one child, must not be repeated, and its child must be
// a repeated group with one or two children whose first child (the key) is
// required; otherwise an error is returned. When the repeated group only has
// a key, the node is handed to listToSchemaField and processed as a list.
func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
	if n.NumFields() != 1 {
		return xerrors.New("MAP group must have exactly 1 child")
	}
	if n.RepetitionType() == parquet.Repetitions.Repeated {
		return xerrors.New("MAP groups must not be repeated")
	}

	keyvalueNode := n.Field(0)
	if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated {
		return xerrors.New("Non-repeated keyvalue group in MAP group is not supported")
	}

	if keyvalueNode.Type() != schema.Group {
		return xerrors.New("keyvalue node must be a group")
	}

	kvgroup := keyvalueNode.(*schema.GroupNode)
	if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 {
		return xerrors.Errorf("keyvalue node group must have exactly 1 or 2 child elements, Found %d", kvgroup.NumFields())
	}

	keyNode := kvgroup.Field(0)
	if keyNode.RepetitionType() != parquet.Repetitions.Required {
		return xerrors.New("MAP keys must be required")
	}

	// Arrow doesn't support 1 column maps (i.e. Sets).  The options are to either
	// make the values column nullable, or process the map as a list.  We choose the latter
	// as it is simpler.
	if kvgroup.NumFields() == 1 {
		return listToSchemaField(n, currentLevels, ctx, parent, out)
	}

	currentLevels.Increment(n)
	repeatedAncestorDef := currentLevels.IncrementRepeated()
	out.Children = make([]SchemaField, 1)

	// the map has one child (the key/value struct), which in turn has the key
	// and the value as its two children
	kvfield := &out.Children[0]
	kvfield.Children = make([]SchemaField, 2)

	keyField := &kvfield.Children[0]
	valueField := &kvfield.Children[1]

	ctx.LinkParent(out, parent)
	ctx.LinkParent(kvfield, out)
	ctx.LinkParent(keyField, kvfield)
	ctx.LinkParent(valueField, kvfield)

	// required/optional group name=whatever {
	//   repeated group name=key_values {
	//     required TYPE key;
	//     required/optional TYPE value;
	//   }
	// }
	//

	if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil {
		return err
	}
	if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil {
		return err
	}

	// the key/value struct is named after the map group and is never nullable;
	// its field id comes from the repeated key/value group
	kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field),
		Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))}

	kvfield.LevelInfo = currentLevels
	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
		Metadata: createFieldMeta(int(n.FieldID()))}
	out.LevelInfo = currentLevels
	// At this point current levels contains the def level for this map,
	// we need to reset to the prior parent.
	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
	return nil
}
   781  
   782  func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   783  	if n.LogicalType().Equals(schema.NewListLogicalType()) {
   784  		return listToSchemaField(n, currentLevels, ctx, parent, out)
   785  	} else if n.LogicalType().Equals(schema.MapLogicalType{}) {
   786  		return mapToSchemaField(n, currentLevels, ctx, parent, out)
   787  	}
   788  
   789  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   790  		// Simple repeated struct
   791  		//
   792  		// repeated group $NAME {
   793  		//   r/o TYPE[0] f0
   794  		//   r/o TYPE[1] f1
   795  		// }
   796  		out.Children = make([]SchemaField, 1)
   797  		repeatedAncestorDef := currentLevels.IncrementRepeated()
   798  		if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil {
   799  			return err
   800  		}
   801  
   802  		out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false,
   803  			Metadata: createFieldMeta(int(n.FieldID()))}
   804  		ctx.LinkParent(&out.Children[0], out)
   805  		out.LevelInfo = currentLevels
   806  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   807  		return nil
   808  	}
   809  
   810  	currentLevels.Increment(n)
   811  	return groupToStructField(n, currentLevels, ctx, parent, out)
   812  }
   813  
   814  func createFieldMeta(fieldID int) arrow.Metadata {
   815  	return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)})
   816  }
   817  
   818  func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   819  	ctx.LinkParent(out, parent)
   820  
   821  	if n.Type() == schema.Group {
   822  		return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out)
   823  	}
   824  
   825  	// Either a normal flat primitive type, or a list type encoded with 1-level
   826  	// list encoding. Note that the 3-level encoding is the form recommended by
   827  	// the parquet specification, but technically we can have either
   828  	//
   829  	// required/optional $TYPE $FIELD_NAME
   830  	//
   831  	// or
   832  	//
   833  	// repeated $TYPE $FIELD_NAME
   834  
   835  	primitive := n.(*schema.PrimitiveNode)
   836  	colIndex := ctx.schema.ColumnIndexByNode(primitive)
   837  	arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength())
   838  	if err != nil {
   839  		return err
   840  	}
   841  
   842  	if primitive.RepetitionType() == parquet.Repetitions.Repeated {
   843  		// one-level list encoding e.g. a: repeated int32;
   844  		repeatedAncestorDefLevel := currentLevels.IncrementRepeated()
   845  		out.Children = make([]SchemaField, 1)
   846  		child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false}
   847  		populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0])
   848  		out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false,
   849  			Metadata: createFieldMeta(int(primitive.FieldID()))}
   850  		out.LevelInfo = currentLevels
   851  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel
   852  		return nil
   853  	}
   854  
   855  	currentLevels.Increment(n)
   856  	populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType,
   857  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
   858  		Metadata: createFieldMeta(int(n.FieldID()))},
   859  		currentLevels, ctx, parent, out)
   860  	return nil
   861  }
   862  
   863  func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) {
   864  	if meta == nil {
   865  		return nil, nil
   866  	}
   867  
   868  	const arrowSchemaKey = "ARROW:schema"
   869  	serialized := meta.FindValue(arrowSchemaKey)
   870  	if serialized == nil {
   871  		return nil, nil
   872  	}
   873  
   874  	decoded, err := base64.RawStdEncoding.DecodeString(*serialized)
   875  	if err != nil {
   876  		return nil, err
   877  	}
   878  
   879  	return flight.DeserializeSchema(decoded, mem)
   880  }
   881  
   882  func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Field) arrow.DataType {
   883  	switch inferred.ID() {
   884  	case arrow.STRUCT:
   885  		if origin.ID() == arrow.STRUCT {
   886  			return func(list []arrow.Field) arrow.DataType {
   887  				return arrow.StructOf(list...)
   888  			}
   889  		}
   890  	case arrow.LIST:
   891  		switch origin.ID() {
   892  		case arrow.LIST:
   893  			return func(list []arrow.Field) arrow.DataType {
   894  				return arrow.ListOf(list[0].Type)
   895  			}
   896  		case arrow.FIXED_SIZE_LIST:
   897  			sz := origin.(*arrow.FixedSizeListType).Len()
   898  			return func(list []arrow.Field) arrow.DataType {
   899  				return arrow.FixedSizeListOf(sz, list[0].Type)
   900  			}
   901  		}
   902  	case arrow.MAP:
   903  		if origin.ID() == arrow.MAP {
   904  			return func(list []arrow.Field) arrow.DataType {
   905  				valType := list[0].Type.(*arrow.StructType)
   906  				return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
   907  			}
   908  		}
   909  	}
   910  	return nil
   911  }
   912  
   913  func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
   914  	nchildren := len(inferred.Children)
   915  	switch origin.Type.ID() {
   916  	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION, arrow.DICTIONARY:
   917  		err = xerrors.New("unimplemented type")
   918  	case arrow.STRUCT:
   919  		typ := origin.Type.(*arrow.StructType)
   920  		if nchildren != len(typ.Fields()) {
   921  			return
   922  		}
   923  
   924  		factory := getNestedFactory(typ, inferred.Field.Type)
   925  		if factory == nil {
   926  			return
   927  		}
   928  
   929  		modified = typ.ID() != inferred.Field.Type.ID()
   930  		for idx := range inferred.Children {
   931  			childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
   932  			if err != nil {
   933  				return false, err
   934  			}
   935  			modified = modified || childMod
   936  		}
   937  		if modified {
   938  			modifiedChildren := make([]arrow.Field, len(inferred.Children))
   939  			for idx, child := range inferred.Children {
   940  				modifiedChildren[idx] = *child.Field
   941  			}
   942  			inferred.Field.Type = factory(modifiedChildren)
   943  		}
   944  	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.MAP:
   945  		if nchildren != 1 {
   946  			return
   947  		}
   948  		factory := getNestedFactory(origin.Type, inferred.Field.Type)
   949  		if factory == nil {
   950  			return
   951  		}
   952  
   953  		modified = origin.Type.ID() != inferred.Field.Type.ID()
   954  		var childModified bool
   955  		switch typ := origin.Type.(type) {
   956  		case *arrow.FixedSizeListType:
   957  			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0])
   958  		case *arrow.ListType:
   959  			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0])
   960  		case *arrow.MapType:
   961  			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.ValueType()}, &inferred.Children[0])
   962  		}
   963  		if err != nil {
   964  			return
   965  		}
   966  		modified = modified || childModified
   967  		if modified {
   968  			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
   969  		}
   970  	case arrow.TIMESTAMP:
   971  		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
   972  			return
   973  		}
   974  
   975  		tsOtype := origin.Type.(*arrow.TimestampType)
   976  		tsInfType := inferred.Field.Type.(*arrow.TimestampType)
   977  
   978  		// if the unit is the same and the data is tz-aware, then set the original time zone
   979  		// since parquet has no native storage of timezones
   980  		if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
   981  			inferred.Field.Type = origin.Type
   982  		}
   983  		modified = true
   984  	}
   985  
   986  	if origin.HasMetadata() {
   987  		meta := origin.Metadata
   988  		if inferred.Field.HasMetadata() {
   989  			final := make(map[string]string)
   990  			for idx, k := range meta.Keys() {
   991  				final[k] = meta.Values()[idx]
   992  			}
   993  			for idx, k := range inferred.Field.Metadata.Keys() {
   994  				final[k] = inferred.Field.Metadata.Values()[idx]
   995  			}
   996  			inferred.Field.Metadata = arrow.MetadataFrom(final)
   997  		} else {
   998  			inferred.Field.Metadata = meta
   999  		}
  1000  		modified = true
  1001  	}
  1002  
  1003  	return
  1004  }
  1005  
  1006  func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) {
  1007  	if origin.Type.ID() == arrow.EXTENSION {
  1008  		return false, xerrors.New("extension types not implemented yet")
  1009  	}
  1010  
  1011  	return applyOriginalStorageMetadata(origin, inferred)
  1012  }
  1013  
  1014  // NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema.
  1015  //
  1016  // The metadata passed in should be the file level key value metadata from the parquet file or nil.
  1017  // If the ARROW:schema was in the metadata, then it is utilized to determine types.
  1018  func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) {
  1019  	var ctx schemaTree
  1020  	ctx.manifest = &SchemaManifest{
  1021  		ColIndexToField: make(map[int]*SchemaField),
  1022  		ChildToParent:   make(map[*SchemaField]*SchemaField),
  1023  		descr:           sc,
  1024  		Fields:          make([]SchemaField, sc.Root().NumFields()),
  1025  	}
  1026  	ctx.props = props
  1027  	ctx.schema = sc
  1028  
  1029  	var err error
  1030  	ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator)
  1031  	if err != nil {
  1032  		return nil, err
  1033  	}
  1034  
  1035  	// if original schema is not compatible with the parquet schema, ignore it
  1036  	if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() {
  1037  		ctx.manifest.OriginSchema = nil
  1038  	}
  1039  
  1040  	for idx := range ctx.manifest.Fields {
  1041  		field := &ctx.manifest.Fields[idx]
  1042  		if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil {
  1043  			return nil, err
  1044  		}
  1045  
  1046  		if ctx.manifest.OriginSchema != nil {
  1047  			if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil {
  1048  				return nil, err
  1049  			}
  1050  		}
  1051  	}
  1052  	return ctx.manifest, nil
  1053  }
  1054  
  1055  // FromParquet generates an arrow Schema from a provided Parquet Schema
  1056  func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) {
  1057  	manifest, err := NewSchemaManifest(sc, kv, props)
  1058  	if err != nil {
  1059  		return nil, err
  1060  	}
  1061  
  1062  	fields := make([]arrow.Field, len(manifest.Fields))
  1063  	for idx, field := range manifest.Fields {
  1064  		fields[idx] = *field.Field
  1065  	}
  1066  
  1067  	if manifest.OriginSchema != nil {
  1068  		meta := manifest.OriginSchema.Metadata()
  1069  		return arrow.NewSchema(fields, &meta), nil
  1070  	}
  1071  	return arrow.NewSchema(fields, manifest.SchemaMeta), nil
  1072  }