github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/schema.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"encoding/base64"
    21  	"fmt"
    22  	"math"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"github.com/apache/arrow/go/v10/arrow"
    27  	"github.com/apache/arrow/go/v10/arrow/flight"
    28  	"github.com/apache/arrow/go/v10/arrow/memory"
    29  	"github.com/apache/arrow/go/v10/parquet"
    30  	"github.com/apache/arrow/go/v10/parquet/file"
    31  	"github.com/apache/arrow/go/v10/parquet/metadata"
    32  	"github.com/apache/arrow/go/v10/parquet/schema"
    33  	"golang.org/x/xerrors"
    34  )
    35  
// SchemaField is a holder that defines a specific logical field in the schema
// which could potentially refer to multiple physical columns in the underlying
// parquet file if it is a nested type.
//
// Field is the arrow field this node was mapped to and Children holds the
// SchemaFields of any nested nodes (element of a list, members of a struct,
// key/value of a map).
//
// ColIndex is only populated (not -1) when it is a leaf column.
//
// LevelInfo is the file.LevelInfo that was current for this node while the
// parquet schema was being walked.
type SchemaField struct {
	Field     *arrow.Field
	Children  []SchemaField
	ColIndex  int
	LevelInfo file.LevelInfo
}
    47  
    48  // IsLeaf returns true if the SchemaField is a leaf column, ie: ColIndex != -1
    49  func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 }
    50  
// SchemaManifest represents a full manifest for mapping a Parquet schema
// to an arrow Schema.
//
// descr is the parquet schema being described; GetFieldIndices uses it to
// resolve the top-level (column root) node of a leaf column.
//
// ColIndexToField maps a leaf column index to its SchemaField and backs
// GetColumnField. ChildToParent maps a SchemaField to the SchemaField that
// contains it and backs GetParent.
//
// NOTE(review): OriginSchema, SchemaMeta and Fields are populated outside of
// the code visible here; presumably Fields holds one entry per top-level
// field of the parquet schema -- confirm against the manifest builder.
type SchemaManifest struct {
	descr        *schema.Schema
	OriginSchema *arrow.Schema
	SchemaMeta   *arrow.Metadata

	ColIndexToField map[int]*SchemaField
	ChildToParent   map[*SchemaField]*SchemaField
	Fields          []SchemaField
}
    62  
    63  // GetColumnField returns the corresponding Field for a given column index.
    64  func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) {
    65  	if field, ok := sm.ColIndexToField[index]; ok {
    66  		return field, nil
    67  	}
    68  	return nil, fmt.Errorf("Column Index %d not found in schema manifest", index)
    69  }
    70  
    71  // GetParent gets the parent field for a given field if it is a nested column, otherwise
    72  // returns nil if there is no parent field.
    73  func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField {
    74  	if p, ok := sm.ChildToParent[field]; ok {
    75  		return p
    76  	}
    77  	return nil
    78  }
    79  
    80  // GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow::Schema) which
    81  // correspond to the column root (first node below the parquet schema's root group) of
    82  // each leaf referenced in column_indices.
    83  //
    84  // For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
    85  // the roots are `a` and `i` (return=[0,2]).
    86  //
    87  // root
    88  // -- a  <------
    89  // -- -- b  |  |
    90  // -- -- -- c  |
    91  // -- -- -- d  |
    92  // -- -- -- -- e
    93  // -- f
    94  // -- -- g
    95  // -- -- -- h
    96  // -- i  <---
    97  // -- -- j  |
    98  // -- -- -- k
    99  func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) {
   100  	added := make(map[int]bool)
   101  	ret := make([]int, 0)
   102  
   103  	for _, idx := range indices {
   104  		if idx < 0 || idx >= sm.descr.NumColumns() {
   105  			return nil, fmt.Errorf("column index %d is not valid", idx)
   106  		}
   107  
   108  		fieldNode := sm.descr.ColumnRoot(idx)
   109  		fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode)
   110  		if fieldIdx == -1 {
   111  			return nil, fmt.Errorf("column index %d is not valid", idx)
   112  		}
   113  
   114  		if _, ok := added[fieldIdx]; !ok {
   115  			ret = append(ret, fieldIdx)
   116  			added[fieldIdx] = true
   117  		}
   118  	}
   119  	return ret, nil
   120  }
   121  
   122  func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType {
   123  	utc := typ.TimeZone == "" || typ.TimeZone == "UTC"
   124  
   125  	// for forward compatibility reasons, and because there's no other way
   126  	// to signal to old readers that values are timestamps, we force
   127  	// the convertedtype field to be set to the corresponding TIMESTAMP_* value.
   128  	// this does cause some ambiguity as parquet readers have not been consistent
   129  	// about the interpretation of TIMESTAMP_* values as being utc-normalized
   130  	// see ARROW-5878
   131  	var scunit schema.TimeUnitType
   132  	switch unit {
   133  	case arrow.Millisecond:
   134  		scunit = schema.TimeUnitMillis
   135  	case arrow.Microsecond:
   136  		scunit = schema.TimeUnitMicros
   137  	case arrow.Nanosecond:
   138  		scunit = schema.TimeUnitNanos
   139  	case arrow.Second:
   140  		// no equivalent in parquet
   141  		return schema.NoLogicalType{}
   142  	}
   143  
   144  	return schema.NewTimestampLogicalTypeForce(utc, scunit)
   145  }
   146  
   147  func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) {
   148  	coerce := arrprops.coerceTimestamps
   149  	target := typ.Unit
   150  	if coerce {
   151  		target = arrprops.coerceTimestampUnit
   152  	}
   153  
   154  	// user is explicitly asking for int96, no logical type
   155  	if arrprops.timestampAsInt96 && target == arrow.Nanosecond {
   156  		return parquet.Types.Int96, schema.NoLogicalType{}, nil
   157  	}
   158  
   159  	physical := parquet.Types.Int64
   160  	logicalType := arrowTimestampToLogical(typ, target)
   161  
   162  	// user is explicitly asking for timestamp data to be converted to the specified
   163  	// units (target) via coercion
   164  	if coerce {
   165  		if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 {
   166  			switch target {
   167  			case arrow.Millisecond, arrow.Microsecond:
   168  			case arrow.Nanosecond, arrow.Second:
   169  				return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version())
   170  			}
   171  		} else if target == arrow.Second {
   172  			return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestampts to millis, micros or nanos", props.Version())
   173  		}
   174  		return physical, logicalType, nil
   175  	}
   176  
   177  	// the user implicitly wants timestamp data to retain its original time units
   178  	// however the converted type field used to indicate logical types for parquet
   179  	// version <=2.4 fields, does not allow for nanosecond time units and so nanos
   180  	// must be coerced to micros
   181  	if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond {
   182  		logicalType = arrowTimestampToLogical(typ, arrow.Microsecond)
   183  		return physical, logicalType, nil
   184  	}
   185  
   186  	// the user implicitly wants timestamp data to retain it's original time units,
   187  	// however the arrow seconds time unit cannot be represented in parquet, so must
   188  	// be coerced to milliseconds
   189  	if typ.Unit == arrow.Second {
   190  		logicalType = arrowTimestampToLogical(typ, arrow.Millisecond)
   191  	}
   192  
   193  	return physical, logicalType, nil
   194  }
   195  
   196  // DecimalSize returns the minimum number of bytes necessary to represent a decimal
   197  // with the requested precision.
   198  //
   199  // Taken from the Apache Impala codebase. The comments next to the return values
   200  // are the maximum value that can be represented in 2's complement with the returned
   201  // number of bytes
   202  func DecimalSize(precision int32) int32 {
   203  	if precision < 1 {
   204  		panic("precision must be >= 1")
   205  	}
   206  
   207  	// generated in python with:
   208  	// >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
   209  	// >>> [-1] + [decimal_size(i) for i in range(1, 77)]
   210  	var byteblock = [...]int32{
   211  		-1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
   212  		9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
   213  		17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
   214  		26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32,
   215  	}
   216  
   217  	if precision <= 76 {
   218  		return byteblock[precision]
   219  	}
   220  	return int32(math.Ceil(float64(precision)/8.0)*math.Log2(10) + 1)
   221  }
   222  
   223  func repFromNullable(isnullable bool) parquet.Repetition {
   224  	if isnullable {
   225  		return parquet.Repetitions.Optional
   226  	}
   227  	return parquet.Repetitions.Required
   228  }
   229  
   230  func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
   231  	if len(typ.Fields()) == 0 {
   232  		return nil, fmt.Errorf("cannot write struct type '%s' with no children field to parquet. Consider adding a dummy child", name)
   233  	}
   234  
   235  	children := make(schema.FieldList, 0, len(typ.Fields()))
   236  	for _, f := range typ.Fields() {
   237  		n, err := fieldToNode(f.Name, f, props, arrprops)
   238  		if err != nil {
   239  			return nil, err
   240  		}
   241  		children = append(children, n)
   242  	}
   243  
   244  	return schema.NewGroupNode(name, repFromNullable(nullable), children, -1)
   245  }
   246  
// fieldToNode converts a single arrow field into the parquet schema node
// called name.
//
// Primitive arrow types fall through to the bottom of the function, where a
// primitive node is built from the physical type, logical type and type length
// selected by the switch; its field id is read from the arrow field metadata.
// Struct, list, fixed size list and map types recurse and return group nodes
// directly, which are created with field id -1.
//
// Dictionary, extension and any type not listed in the switch are rejected
// with a "not implemented yet" error. A null typed field that is not nullable
// is rejected as well.
func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
	var (
		logicalType schema.LogicalType = schema.NoLogicalType{}
		typ         parquet.Type
		repType     = repFromNullable(field.Nullable)
		// length is only meaningful for fixed length byte arrays
		length = -1
		// precision and scale are only set for decimals
		precision = -1
		scale     = -1
		err       error
	)

	switch field.Type.ID() {
	case arrow.NULL:
		// stored as an int32 column annotated with the null logical type
		typ = parquet.Types.Int32
		logicalType = &schema.NullLogicalType{}
		if repType != parquet.Repetitions.Optional {
			return nil, xerrors.New("nulltype arrow field must be nullable")
		}
	case arrow.BOOL:
		typ = parquet.Types.Boolean
	// integers of up to 32 bits are stored as int32, 64 bit integers as int64;
	// the int logical type records the real width and signedness
	case arrow.UINT8:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(8, false)
	case arrow.INT8:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(8, true)
	case arrow.UINT16:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(16, false)
	case arrow.INT16:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(16, true)
	case arrow.UINT32:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(32, false)
	case arrow.INT32:
		typ = parquet.Types.Int32
		logicalType = schema.NewIntLogicalType(32, true)
	case arrow.UINT64:
		typ = parquet.Types.Int64
		logicalType = schema.NewIntLogicalType(64, false)
	case arrow.INT64:
		typ = parquet.Types.Int64
		logicalType = schema.NewIntLogicalType(64, true)
	case arrow.FLOAT32:
		typ = parquet.Types.Float
	case arrow.FLOAT64:
		typ = parquet.Types.Double
	case arrow.STRING:
		// strings are byte arrays carrying the string logical type
		logicalType = schema.StringLogicalType{}
		fallthrough
	case arrow.BINARY:
		typ = parquet.Types.ByteArray
	case arrow.FIXED_SIZE_BINARY:
		typ = parquet.Types.FixedLenByteArray
		length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth
	case arrow.DECIMAL:
		// decimals are written as fixed length byte arrays sized by DecimalSize
		typ = parquet.Types.FixedLenByteArray
		dectype := field.Type.(*arrow.Decimal128Type)
		precision = int(dectype.Precision)
		scale = int(dectype.Scale)
		length = int(DecimalSize(int32(precision)))
		logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale))
	case arrow.DATE32:
		typ = parquet.Types.Int32
		logicalType = schema.DateLogicalType{}
	case arrow.DATE64:
		// date64 is written as an int64 annotated as a millisecond timestamp
		typ = parquet.Types.Int64
		logicalType = schema.NewTimestampLogicalType(true, schema.TimeUnitMillis)
	case arrow.TIMESTAMP:
		typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops)
		if err != nil {
			return nil, err
		}
	case arrow.TIME32:
		// time32 is always annotated as milliseconds; the unit of the arrow
		// type is not inspected here
		typ = parquet.Types.Int32
		logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis)
	case arrow.TIME64:
		typ = parquet.Types.Int64
		timeType := field.Type.(*arrow.Time64Type)
		if timeType.Unit == arrow.Nanosecond {
			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos)
		} else {
			logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros)
		}
	case arrow.STRUCT:
		return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops)
	case arrow.FIXED_SIZE_LIST, arrow.LIST:
		var elem arrow.DataType
		if lt, ok := field.Type.(*arrow.ListType); ok {
			elem = lt.Elem()
		} else {
			elem = field.Type.(*arrow.FixedSizeListType).Elem()
		}

		// the element node reuses the name of the list and is always nullable
		child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops)
		if err != nil {
			return nil, err
		}

		return schema.ListOf(child, repFromNullable(field.Nullable), -1)
	case arrow.DICTIONARY:
		// parquet has no dictionary type, dictionary is encoding, not schema level
		return nil, xerrors.New("not implemented yet")
	case arrow.EXTENSION:
		return nil, xerrors.New("not implemented yet")
	case arrow.MAP:
		mapType := field.Type.(*arrow.MapType)
		keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops)
		if err != nil {
			return nil, err
		}

		valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops)
		if err != nil {
			return nil, err
		}

		if arrprops.noMapLogicalType {
			// build the same key_value layout by hand, without annotating the
			// outer group as a MAP
			keyval := schema.FieldList{keyNode, valueNode}
			keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1)
			if err != nil {
				return nil, err
			}
			return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{
				keyvalNode,
			}, -1)
		}
		return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1)
	default:
		return nil, xerrors.New("not implemented yet")
	}

	return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata))
}
   382  
// fieldIDKey is the arrow field metadata key under which the parquet field id
// is looked up by fieldIDFromMeta.
const fieldIDKey = "PARQUET:field_id"
   384  
   385  func fieldIDFromMeta(m arrow.Metadata) int32 {
   386  	if m.Len() == 0 {
   387  		return -1
   388  	}
   389  
   390  	key := m.FindKey(fieldIDKey)
   391  	if key < 0 {
   392  		return -1
   393  	}
   394  
   395  	id, err := strconv.ParseInt(m.Values()[key], 10, 32)
   396  	if err != nil {
   397  		return -1
   398  	}
   399  
   400  	if id < 0 {
   401  		return -1
   402  	}
   403  
   404  	return int32(id)
   405  }
   406  
   407  // ToParquet generates a Parquet Schema from an arrow Schema using the given properties to make
   408  // decisions when determining the logical/physical types of the columns.
   409  func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) {
   410  	if props == nil {
   411  		props = parquet.NewWriterProperties()
   412  	}
   413  
   414  	nodes := make(schema.FieldList, 0, len(sc.Fields()))
   415  	for _, f := range sc.Fields() {
   416  		n, err := fieldToNode(f.Name, f, props, arrprops)
   417  		if err != nil {
   418  			return nil, err
   419  		}
   420  		nodes = append(nodes, n)
   421  	}
   422  
   423  	root, err := schema.NewGroupNode(props.RootName(), props.RootRepetition(), nodes, -1)
   424  	if err != nil {
   425  		return nil, err
   426  	}
   427  
   428  	return schema.NewSchema(root), err
   429  }
   430  
// schemaTree is the context threaded through the parquet to arrow schema
// conversion helpers. Its methods record parent links and leaf columns in
// manifest, and schema is consulted to resolve the column index of a
// primitive node.
//
// NOTE(review): props is not read by the helpers visible in this part of the
// file; presumably it is used by nodeToSchemaField -- confirm.
type schemaTree struct {
	manifest *SchemaManifest

	schema *schema.Schema
	props  *ArrowReadProperties
}
   437  
   438  func (s schemaTree) LinkParent(child, parent *SchemaField) {
   439  	s.manifest.ChildToParent[child] = parent
   440  }
   441  
   442  func (s schemaTree) RecordLeaf(leaf *SchemaField) {
   443  	s.manifest.ColIndexToField[leaf.ColIndex] = leaf
   444  }
   445  
   446  func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) {
   447  	switch log.BitWidth() {
   448  	case 8:
   449  		if log.IsSigned() {
   450  			return arrow.PrimitiveTypes.Int8, nil
   451  		}
   452  		return arrow.PrimitiveTypes.Uint8, nil
   453  	case 16:
   454  		if log.IsSigned() {
   455  			return arrow.PrimitiveTypes.Int16, nil
   456  		}
   457  		return arrow.PrimitiveTypes.Uint16, nil
   458  	case 32:
   459  		if log.IsSigned() {
   460  			return arrow.PrimitiveTypes.Int32, nil
   461  		}
   462  		return arrow.PrimitiveTypes.Uint32, nil
   463  	case 64:
   464  		if log.IsSigned() {
   465  			return arrow.PrimitiveTypes.Int64, nil
   466  		}
   467  		return arrow.PrimitiveTypes.Uint64, nil
   468  	default:
   469  		return nil, xerrors.New("invalid logical type for int32")
   470  	}
   471  }
   472  
   473  func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   474  	if logical.TimeUnit() == schema.TimeUnitMillis {
   475  		return arrow.FixedWidthTypes.Time32ms, nil
   476  	}
   477  
   478  	return nil, xerrors.New(logical.String() + " cannot annotate a time32")
   479  }
   480  
   481  func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) {
   482  	switch logical.TimeUnit() {
   483  	case schema.TimeUnitMicros:
   484  		return arrow.FixedWidthTypes.Time64us, nil
   485  	case schema.TimeUnitNanos:
   486  		return arrow.FixedWidthTypes.Time64ns, nil
   487  	default:
   488  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   489  	}
   490  }
   491  
   492  func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) {
   493  	tz := "UTC"
   494  	if logical.IsFromConvertedType() {
   495  		tz = ""
   496  	}
   497  
   498  	switch logical.TimeUnit() {
   499  	case schema.TimeUnitMillis:
   500  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil
   501  	case schema.TimeUnitMicros:
   502  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil
   503  	case schema.TimeUnitNanos:
   504  		return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil
   505  	default:
   506  		return nil, xerrors.New("Unrecognized unit in timestamp logical type " + logical.String())
   507  	}
   508  }
   509  
   510  func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) {
   511  	switch logtype := logical.(type) {
   512  	case schema.NoLogicalType:
   513  		return arrow.PrimitiveTypes.Int32, nil
   514  	case *schema.TimeLogicalType:
   515  		return arrowTime32(logtype)
   516  	case *schema.DecimalLogicalType:
   517  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   518  	case *schema.IntLogicalType:
   519  		return arrowInt(logtype)
   520  	case schema.DateLogicalType:
   521  		return arrow.FixedWidthTypes.Date32, nil
   522  	default:
   523  		return nil, xerrors.New(logical.String() + " cannot annotate int32")
   524  	}
   525  }
   526  
   527  func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) {
   528  	if logical.IsNone() {
   529  		return arrow.PrimitiveTypes.Int64, nil
   530  	}
   531  
   532  	switch logtype := logical.(type) {
   533  	case *schema.IntLogicalType:
   534  		return arrowInt(logtype)
   535  	case *schema.DecimalLogicalType:
   536  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   537  	case *schema.TimeLogicalType:
   538  		return arrowTime64(logtype)
   539  	case *schema.TimestampLogicalType:
   540  		return arrowTimestamp(logtype)
   541  	default:
   542  		return nil, xerrors.New(logical.String() + " cannot annotate int64")
   543  	}
   544  }
   545  
   546  func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) {
   547  	switch logtype := logical.(type) {
   548  	case schema.StringLogicalType:
   549  		return arrow.BinaryTypes.String, nil
   550  	case *schema.DecimalLogicalType:
   551  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   552  	case schema.NoLogicalType,
   553  		schema.EnumLogicalType,
   554  		schema.JSONLogicalType,
   555  		schema.BSONLogicalType:
   556  		return arrow.BinaryTypes.Binary, nil
   557  	default:
   558  		return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array")
   559  	}
   560  }
   561  
   562  func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) {
   563  	switch logtype := logical.(type) {
   564  	case *schema.DecimalLogicalType:
   565  		return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil
   566  	case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
   567  		return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
   568  	default:
   569  		return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array")
   570  	}
   571  }
   572  
   573  func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) {
   574  	if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) {
   575  		return arrow.Null, nil
   576  	}
   577  
   578  	switch physical {
   579  	case parquet.Types.Boolean:
   580  		return arrow.FixedWidthTypes.Boolean, nil
   581  	case parquet.Types.Int32:
   582  		return arrowFromInt32(logical)
   583  	case parquet.Types.Int64:
   584  		return arrowFromInt64(logical)
   585  	case parquet.Types.Int96:
   586  		return arrow.FixedWidthTypes.Timestamp_ns, nil
   587  	case parquet.Types.Float:
   588  		return arrow.PrimitiveTypes.Float32, nil
   589  	case parquet.Types.Double:
   590  		return arrow.PrimitiveTypes.Float64, nil
   591  	case parquet.Types.ByteArray:
   592  		return arrowFromByteArray(logical)
   593  	case parquet.Types.FixedLenByteArray:
   594  		return arrowFromFLBA(logical, typeLen)
   595  	default:
   596  		return nil, xerrors.New("invalid physical column type")
   597  	}
   598  }
   599  
   600  func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) {
   601  	out.Field = field
   602  	out.ColIndex = colIndex
   603  	out.LevelInfo = currentLevels
   604  	ctx.RecordLeaf(out)
   605  	ctx.LinkParent(out, parent)
   606  }
   607  
   608  func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   609  	if n.NumFields() != 1 {
   610  		return xerrors.New("LIST groups must have only 1 child")
   611  	}
   612  
   613  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   614  		return xerrors.New("LIST groups must not be repeated")
   615  	}
   616  
   617  	currentLevels.Increment(n)
   618  
   619  	out.Children = make([]SchemaField, n.NumFields())
   620  	ctx.LinkParent(out, parent)
   621  	ctx.LinkParent(&out.Children[0], out)
   622  
   623  	listNode := n.Field(0)
   624  	if listNode.RepetitionType() != parquet.Repetitions.Repeated {
   625  		return xerrors.New("non-repeated nodes in a list group are not supported")
   626  	}
   627  
   628  	repeatedAncestorDef := currentLevels.IncrementRepeated()
   629  	if listNode.Type() == schema.Group {
   630  		// Resolve 3-level encoding
   631  		//
   632  		// required/optional group name=whatever {
   633  		//   repeated group name=list {
   634  		//     required/optional TYPE item;
   635  		//   }
   636  		// }
   637  		//
   638  		// yields list<item: TYPE ?nullable> ?nullable
   639  		//
   640  		// We distinguish the special case that we have
   641  		//
   642  		// required/optional group name=whatever {
   643  		//   repeated group name=array or $SOMETHING_tuple {
   644  		//     required/optional TYPE item;
   645  		//   }
   646  		// }
   647  		//
   648  		// In this latter case, the inner type of the list should be a struct
   649  		// rather than a primitive value
   650  		//
   651  		// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
   652  		// Special case mentioned in the format spec:
   653  		//   If the name is array or ends in _tuple, this should be a list of struct
   654  		//   even for single child elements.
   655  		listGroup := listNode.(*schema.GroupNode)
   656  		if listGroup.NumFields() == 1 && (listGroup.Name() == "array" || strings.HasSuffix(listGroup.Name(), "_tuple")) {
   657  			// list of primitive type
   658  			if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil {
   659  				return err
   660  			}
   661  		} else {
   662  			if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil {
   663  				return err
   664  			}
   665  		}
   666  	} else {
   667  		// Two-level list encoding
   668  		//
   669  		// required/optional group LIST {
   670  		//   repeated TYPE;
   671  		// }
   672  		primitiveNode := listNode.(*schema.PrimitiveNode)
   673  		colIndex := ctx.schema.ColumnIndexByNode(primitiveNode)
   674  		arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength())
   675  		if err != nil {
   676  			return err
   677  		}
   678  
   679  		itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))}
   680  		populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0])
   681  	}
   682  
   683  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type),
   684  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   685  	out.LevelInfo = currentLevels
   686  	// At this point current levels contains the def level for this list,
   687  	// we need to reset to the prior parent.
   688  	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   689  	return nil
   690  }
   691  
   692  func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   693  	arrowFields := make([]arrow.Field, 0, n.NumFields())
   694  	out.Children = make([]SchemaField, n.NumFields())
   695  
   696  	for i := 0; i < n.NumFields(); i++ {
   697  		if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil {
   698  			return err
   699  		}
   700  		arrowFields = append(arrowFields, *out.Children[i].Field)
   701  	}
   702  
   703  	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...),
   704  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
   705  	out.LevelInfo = currentLevels
   706  	return nil
   707  }
   708  
// mapToSchemaField converts the parquet group n, which is annotated as a MAP,
// into an arrow map field stored in out, and records out, its key/value
// struct and the key and value fields in the manifest held by ctx.
//
// n must have exactly one child, must not itself be repeated, and its child
// must be a repeated group holding a required key and, optionally, a value.
// A key/value group that only holds a key is handed to listToSchemaField and
// converted as a list instead.
//
// An error is returned when the structure of n violates these requirements or
// when the key or value cannot be converted.
func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
	if n.NumFields() != 1 {
		return xerrors.New("MAP group must have exactly 1 child")
	}
	if n.RepetitionType() == parquet.Repetitions.Repeated {
		return xerrors.New("MAP groups must not be repeated")
	}

	keyvalueNode := n.Field(0)
	if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated {
		return xerrors.New("Non-repeated keyvalue group in MAP group is not supported")
	}

	if keyvalueNode.Type() != schema.Group {
		return xerrors.New("keyvalue node must be a group")
	}

	kvgroup := keyvalueNode.(*schema.GroupNode)
	if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 {
		return fmt.Errorf("keyvalue node group must have exactly 1 or 2 child elements, Found %d", kvgroup.NumFields())
	}

	keyNode := kvgroup.Field(0)
	if keyNode.RepetitionType() != parquet.Repetitions.Required {
		return xerrors.New("MAP keys must be required")
	}

	// Arrow doesn't support 1 column maps (i.e. Sets).  The options are to either
	// make the values column nullable, or process the map as a list.  We choose the latter
	// as it is simpler.
	if kvgroup.NumFields() == 1 {
		return listToSchemaField(n, currentLevels, ctx, parent, out)
	}

	// account for the map group itself and then for its repeated key/value group
	currentLevels.Increment(n)
	repeatedAncestorDef := currentLevels.IncrementRepeated()
	out.Children = make([]SchemaField, 1)

	// the single child of the map is the key/value struct, which in turn
	// holds the key and the value
	kvfield := &out.Children[0]
	kvfield.Children = make([]SchemaField, 2)

	keyField := &kvfield.Children[0]
	valueField := &kvfield.Children[1]

	ctx.LinkParent(out, parent)
	ctx.LinkParent(kvfield, out)
	ctx.LinkParent(keyField, kvfield)
	ctx.LinkParent(valueField, kvfield)

	// required/optional group name=whatever {
	//   repeated group name=key_values {
	//     required TYPE key;
	//     required/optional TYPE value;
	//   }
	// }
	//

	if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil {
		return err
	}
	if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil {
		return err
	}

	// the key/value struct takes the name of the map group and is never null
	kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field),
		Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))}

	kvfield.LevelInfo = currentLevels
	out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
		Metadata: createFieldMeta(int(n.FieldID()))}
	out.LevelInfo = currentLevels
	// At this point current levels contains the def level for this map,
	// we need to reset to the prior parent.
	out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
	return nil
}
   786  
   787  func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   788  	if n.LogicalType().Equals(schema.NewListLogicalType()) {
   789  		return listToSchemaField(n, currentLevels, ctx, parent, out)
   790  	} else if n.LogicalType().Equals(schema.MapLogicalType{}) {
   791  		return mapToSchemaField(n, currentLevels, ctx, parent, out)
   792  	}
   793  
   794  	if n.RepetitionType() == parquet.Repetitions.Repeated {
   795  		// Simple repeated struct
   796  		//
   797  		// repeated group $NAME {
   798  		//   r/o TYPE[0] f0
   799  		//   r/o TYPE[1] f1
   800  		// }
   801  		out.Children = make([]SchemaField, 1)
   802  		repeatedAncestorDef := currentLevels.IncrementRepeated()
   803  		if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil {
   804  			return err
   805  		}
   806  
   807  		out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false,
   808  			Metadata: createFieldMeta(int(n.FieldID()))}
   809  		ctx.LinkParent(&out.Children[0], out)
   810  		out.LevelInfo = currentLevels
   811  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
   812  		return nil
   813  	}
   814  
   815  	currentLevels.Increment(n)
   816  	return groupToStructField(n, currentLevels, ctx, parent, out)
   817  }
   818  
   819  func createFieldMeta(fieldID int) arrow.Metadata {
   820  	return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)})
   821  }
   822  
   823  func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
   824  	ctx.LinkParent(out, parent)
   825  
   826  	if n.Type() == schema.Group {
   827  		return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out)
   828  	}
   829  
   830  	// Either a normal flat primitive type, or a list type encoded with 1-level
   831  	// list encoding. Note that the 3-level encoding is the form recommended by
   832  	// the parquet specification, but technically we can have either
   833  	//
   834  	// required/optional $TYPE $FIELD_NAME
   835  	//
   836  	// or
   837  	//
   838  	// repeated $TYPE $FIELD_NAME
   839  
   840  	primitive := n.(*schema.PrimitiveNode)
   841  	colIndex := ctx.schema.ColumnIndexByNode(primitive)
   842  	arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength())
   843  	if err != nil {
   844  		return err
   845  	}
   846  
   847  	if primitive.RepetitionType() == parquet.Repetitions.Repeated {
   848  		// one-level list encoding e.g. a: repeated int32;
   849  		repeatedAncestorDefLevel := currentLevels.IncrementRepeated()
   850  		out.Children = make([]SchemaField, 1)
   851  		child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false}
   852  		populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0])
   853  		out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false,
   854  			Metadata: createFieldMeta(int(primitive.FieldID()))}
   855  		out.LevelInfo = currentLevels
   856  		out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel
   857  		return nil
   858  	}
   859  
   860  	currentLevels.Increment(n)
   861  	populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType,
   862  		Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
   863  		Metadata: createFieldMeta(int(n.FieldID()))},
   864  		currentLevels, ctx, parent, out)
   865  	return nil
   866  }
   867  
   868  func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) {
   869  	if meta == nil {
   870  		return nil, nil
   871  	}
   872  
   873  	const arrowSchemaKey = "ARROW:schema"
   874  	serialized := meta.FindValue(arrowSchemaKey)
   875  	if serialized == nil {
   876  		return nil, nil
   877  	}
   878  
   879  	var (
   880  		decoded []byte
   881  		err     error
   882  	)
   883  
   884  	// if the length of serialized is not a multiple of 4, it cannot be
   885  	// padded with std encoding.
   886  	if len(*serialized)%4 == 0 {
   887  		decoded, err = base64.StdEncoding.DecodeString(*serialized)
   888  	}
   889  	// if we failed to decode it with stdencoding or the length wasn't
   890  	// a multiple of 4, try using the Raw unpadded encoding
   891  	if len(decoded) == 0 || err != nil {
   892  		decoded, err = base64.RawStdEncoding.DecodeString(*serialized)
   893  	}
   894  
   895  	if err != nil {
   896  		return nil, err
   897  	}
   898  
   899  	return flight.DeserializeSchema(decoded, mem)
   900  }
   901  
   902  func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Field) arrow.DataType {
   903  	switch inferred.ID() {
   904  	case arrow.STRUCT:
   905  		if origin.ID() == arrow.STRUCT {
   906  			return func(list []arrow.Field) arrow.DataType {
   907  				return arrow.StructOf(list...)
   908  			}
   909  		}
   910  	case arrow.LIST:
   911  		switch origin.ID() {
   912  		case arrow.LIST:
   913  			return func(list []arrow.Field) arrow.DataType {
   914  				return arrow.ListOf(list[0].Type)
   915  			}
   916  		case arrow.FIXED_SIZE_LIST:
   917  			sz := origin.(*arrow.FixedSizeListType).Len()
   918  			return func(list []arrow.Field) arrow.DataType {
   919  				return arrow.FixedSizeListOf(sz, list[0].Type)
   920  			}
   921  		}
   922  	case arrow.MAP:
   923  		if origin.ID() == arrow.MAP {
   924  			return func(list []arrow.Field) arrow.DataType {
   925  				valType := list[0].Type.(*arrow.StructType)
   926  				return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
   927  			}
   928  		}
   929  	}
   930  	return nil
   931  }
   932  
   933  func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
   934  	nchildren := len(inferred.Children)
   935  	switch origin.Type.ID() {
   936  	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION, arrow.DICTIONARY:
   937  		err = xerrors.New("unimplemented type")
   938  	case arrow.STRUCT:
   939  		typ := origin.Type.(*arrow.StructType)
   940  		if nchildren != len(typ.Fields()) {
   941  			return
   942  		}
   943  
   944  		factory := getNestedFactory(typ, inferred.Field.Type)
   945  		if factory == nil {
   946  			return
   947  		}
   948  
   949  		modified = typ.ID() != inferred.Field.Type.ID()
   950  		for idx := range inferred.Children {
   951  			childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
   952  			if err != nil {
   953  				return false, err
   954  			}
   955  			modified = modified || childMod
   956  		}
   957  		if modified {
   958  			modifiedChildren := make([]arrow.Field, len(inferred.Children))
   959  			for idx, child := range inferred.Children {
   960  				modifiedChildren[idx] = *child.Field
   961  			}
   962  			inferred.Field.Type = factory(modifiedChildren)
   963  		}
   964  	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.MAP:
   965  		if nchildren != 1 {
   966  			return
   967  		}
   968  		factory := getNestedFactory(origin.Type, inferred.Field.Type)
   969  		if factory == nil {
   970  			return
   971  		}
   972  
   973  		modified = origin.Type.ID() != inferred.Field.Type.ID()
   974  		var childModified bool
   975  		switch typ := origin.Type.(type) {
   976  		case *arrow.FixedSizeListType:
   977  			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0])
   978  		case *arrow.ListType:
   979  			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0])
   980  		case *arrow.MapType:
   981  			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.ValueType()}, &inferred.Children[0])
   982  		}
   983  		if err != nil {
   984  			return
   985  		}
   986  		modified = modified || childModified
   987  		if modified {
   988  			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
   989  		}
   990  	case arrow.TIMESTAMP:
   991  		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
   992  			return
   993  		}
   994  
   995  		tsOtype := origin.Type.(*arrow.TimestampType)
   996  		tsInfType := inferred.Field.Type.(*arrow.TimestampType)
   997  
   998  		// if the unit is the same and the data is tz-aware, then set the original time zone
   999  		// since parquet has no native storage of timezones
  1000  		if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
  1001  			inferred.Field.Type = origin.Type
  1002  		}
  1003  		modified = true
  1004  	}
  1005  
  1006  	if origin.HasMetadata() {
  1007  		meta := origin.Metadata
  1008  		if inferred.Field.HasMetadata() {
  1009  			final := make(map[string]string)
  1010  			for idx, k := range meta.Keys() {
  1011  				final[k] = meta.Values()[idx]
  1012  			}
  1013  			for idx, k := range inferred.Field.Metadata.Keys() {
  1014  				final[k] = inferred.Field.Metadata.Values()[idx]
  1015  			}
  1016  			inferred.Field.Metadata = arrow.MetadataFrom(final)
  1017  		} else {
  1018  			inferred.Field.Metadata = meta
  1019  		}
  1020  		modified = true
  1021  	}
  1022  
  1023  	return
  1024  }
  1025  
  1026  func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) {
  1027  	if origin.Type.ID() == arrow.EXTENSION {
  1028  		return false, xerrors.New("extension types not implemented yet")
  1029  	}
  1030  
  1031  	return applyOriginalStorageMetadata(origin, inferred)
  1032  }
  1033  
  1034  // NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema.
  1035  //
  1036  // The metadata passed in should be the file level key value metadata from the parquet file or nil.
  1037  // If the ARROW:schema was in the metadata, then it is utilized to determine types.
  1038  func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) {
  1039  	var ctx schemaTree
  1040  	ctx.manifest = &SchemaManifest{
  1041  		ColIndexToField: make(map[int]*SchemaField),
  1042  		ChildToParent:   make(map[*SchemaField]*SchemaField),
  1043  		descr:           sc,
  1044  		Fields:          make([]SchemaField, sc.Root().NumFields()),
  1045  	}
  1046  	ctx.props = props
  1047  	ctx.schema = sc
  1048  
  1049  	var err error
  1050  	ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator)
  1051  	if err != nil {
  1052  		return nil, err
  1053  	}
  1054  
  1055  	// if original schema is not compatible with the parquet schema, ignore it
  1056  	if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() {
  1057  		ctx.manifest.OriginSchema = nil
  1058  	}
  1059  
  1060  	for idx := range ctx.manifest.Fields {
  1061  		field := &ctx.manifest.Fields[idx]
  1062  		if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil {
  1063  			return nil, err
  1064  		}
  1065  
  1066  		if ctx.manifest.OriginSchema != nil {
  1067  			if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil {
  1068  				return nil, err
  1069  			}
  1070  		}
  1071  	}
  1072  	return ctx.manifest, nil
  1073  }
  1074  
  1075  // FromParquet generates an arrow Schema from a provided Parquet Schema
  1076  func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) {
  1077  	manifest, err := NewSchemaManifest(sc, kv, props)
  1078  	if err != nil {
  1079  		return nil, err
  1080  	}
  1081  
  1082  	fields := make([]arrow.Field, len(manifest.Fields))
  1083  	for idx, field := range manifest.Fields {
  1084  		fields[idx] = *field.Field
  1085  	}
  1086  
  1087  	if manifest.OriginSchema != nil {
  1088  		meta := manifest.OriginSchema.Metadata()
  1089  		return arrow.NewSchema(fields, &meta), nil
  1090  	}
  1091  	return arrow.NewSchema(fields, manifest.SchemaMeta), nil
  1092  }