github.com/apache/arrow/go/v16@v16.1.0/parquet/pqarrow/schema.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
    "encoding/base64"
    "fmt"
    "math"
    "strconv"

    "github.com/apache/arrow/go/v16/arrow"
    "github.com/apache/arrow/go/v16/arrow/decimal128"
    "github.com/apache/arrow/go/v16/arrow/flight"
    "github.com/apache/arrow/go/v16/arrow/ipc"
    "github.com/apache/arrow/go/v16/arrow/memory"
    "github.com/apache/arrow/go/v16/parquet"
    "github.com/apache/arrow/go/v16/parquet/file"
    "github.com/apache/arrow/go/v16/parquet/metadata"
    "github.com/apache/arrow/go/v16/parquet/schema"
    "golang.org/x/xerrors"
)

// SchemaField is a holder that defines a specific logical field in the schema
// which could potentially refer to multiple physical columns in the underlying
// parquet file if it is a nested type.
//
// ColIndex is only populated (not -1) when it is a leaf column.
type SchemaField struct {
    Field     *arrow.Field
    Children  []SchemaField
    ColIndex  int
    LevelInfo file.LevelInfo
}

// IsLeaf returns true if the SchemaField is a leaf column, ie: ColIndex != -1
func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 }

// SchemaManifest represents a full manifest for mapping a Parquet schema
// to an arrow Schema.
type SchemaManifest struct {
    descr        *schema.Schema
    OriginSchema *arrow.Schema
    SchemaMeta   *arrow.Metadata

    ColIndexToField map[int]*SchemaField
    ChildToParent   map[*SchemaField]*SchemaField
    Fields          []SchemaField
}

// GetColumnField returns the corresponding Field for a given column index.
func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) {
    if field, ok := sm.ColIndexToField[index]; ok {
        return field, nil
    }
    return nil, fmt.Errorf("Column Index %d not found in schema manifest", index)
}

// GetParent gets the parent field for a given field if it is a nested column, otherwise
// returns nil if there is no parent field.
func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField {
    if p, ok := sm.ChildToParent[field]; ok {
        return p
    }
    return nil
}

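// The following helper is an illustrative sketch added for this listing (it is
// not part of the upstream file): it shows how GetColumnField and GetParent can
// be combined to walk from a leaf parquet column back up to the root arrow
// field that contains it. The function name is hypothetical.
func exampleLeafToRootField(sm *SchemaManifest, colIndex int) (*SchemaField, error) {
    // resolve the leaf SchemaField for the physical column index
    field, err := sm.GetColumnField(colIndex)
    if err != nil {
        return nil, err
    }
    // follow ChildToParent links until no parent remains; the result is the
    // column root (a field directly below the schema's root group)
    for parent := sm.GetParent(field); parent != nil; parent = sm.GetParent(parent) {
        field = parent
    }
    return field, nil
}
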
// GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow::Schema) which
// correspond to the column root (first node below the parquet schema's root group) of
// each leaf referenced in column_indices.
//
// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
// the roots are `a` and `i` (return=[0,2]).
//
// root
// -- a  <------
// -- -- b    |  |
// -- -- -- c    |
// -- -- -- d    |
// -- -- -- -- e
// -- f
// -- -- g
// -- -- -- h
// -- i  <---
// -- -- j    |
// -- -- -- k
func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) {
    added := make(map[int]bool)
    ret := make([]int, 0)

    for _, idx := range indices {
        if idx < 0 || idx >= sm.descr.NumColumns() {
            return nil, fmt.Errorf("column index %d is not valid", idx)
        }

        fieldNode := sm.descr.ColumnRoot(idx)
        fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode)
        if fieldIdx == -1 {
            return nil, fmt.Errorf("column index %d is not valid", idx)
        }

        if _, ok := added[fieldIdx]; !ok {
            ret = append(ret, fieldIdx)
            added[fieldIdx] = true
        }
    }
    return ret, nil
}

func isDictionaryReadSupported(dt arrow.DataType) bool {
    return arrow.IsBinaryLike(dt.ID())
}

func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType {
    isAdjustedToUTC := typ.TimeZone != ""

    // for forward compatibility reasons, and because there's no other way
    // to signal to old readers that values are timestamps, we force
    // the convertedtype field to be set to the corresponding TIMESTAMP_* value.
    // this does cause some ambiguity as parquet readers have not been consistent
    // about the interpretation of TIMESTAMP_* values as being utc-normalized
    // see ARROW-5878
    var scunit schema.TimeUnitType
    switch unit {
    case arrow.Millisecond:
        scunit = schema.TimeUnitMillis
    case arrow.Microsecond:
        scunit = schema.TimeUnitMicros
    case arrow.Nanosecond:
        scunit = schema.TimeUnitNanos
    case arrow.Second:
        // no equivalent in parquet
        return schema.NoLogicalType{}
    }

    return schema.NewTimestampLogicalTypeForce(isAdjustedToUTC, scunit)
}

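// Illustrative sketch (not part of the upstream file): what
// arrowTimestampToLogical yields for each arrow time unit. A non-empty time
// zone marks the logical type as adjusted-to-UTC; arrow's second unit has no
// parquet equivalent and falls back to NoLogicalType. The function name is
// hypothetical.
func exampleTimestampLogicalMapping() {
    typ := &arrow.TimestampType{Unit: arrow.Microsecond, TimeZone: "UTC"}
    for _, unit := range []arrow.TimeUnit{arrow.Second, arrow.Millisecond, arrow.Microsecond, arrow.Nanosecond} {
        fmt.Printf("%s -> %s\n", unit, arrowTimestampToLogical(typ, unit))
    }
}
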
func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) {
    coerce := arrprops.coerceTimestamps
    target := typ.Unit
    if coerce {
        target = arrprops.coerceTimestampUnit
    }

    // user is explicitly asking for int96, no logical type
    if arrprops.timestampAsInt96 && target == arrow.Nanosecond {
        return parquet.Types.Int96, schema.NoLogicalType{}, nil
    }

    physical := parquet.Types.Int64
    logicalType := arrowTimestampToLogical(typ, target)

    // user is explicitly asking for timestamp data to be converted to the specified
    // units (target) via coercion
    if coerce {
        if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 {
            switch target {
            case arrow.Millisecond, arrow.Microsecond:
            case arrow.Nanosecond, arrow.Second:
                return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version())
            }
        } else if target == arrow.Second {
            return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis, micros or nanos", props.Version())
        }
        return physical, logicalType, nil
    }

    // the user implicitly wants timestamp data to retain its original time units,
    // however the ConvertedType field used to indicate logical types for parquet
    // version <= 2.4 files does not allow for nanosecond time units, so nanos
    // must be coerced to micros
    if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond {
        logicalType = arrowTimestampToLogical(typ, arrow.Microsecond)
        return physical, logicalType, nil
    }

    // the user implicitly wants timestamp data to retain its original time units,
    // however the arrow seconds time unit cannot be represented in parquet, so it must
    // be coerced to milliseconds
    if typ.Unit == arrow.Second {
        logicalType = arrowTimestampToLogical(typ, arrow.Millisecond)
    }

    return physical, logicalType, nil
}

// DecimalSize returns the minimum number of bytes necessary to represent a decimal
// with the requested precision.
//
// Taken from the Apache Impala codebase. The comments next to the return values
// are the maximum value that can be represented in 2's complement with the returned
// number of bytes
func DecimalSize(precision int32) int32 {
    if precision < 1 {
        panic("precision must be >= 1")
    }

    // generated in python with:
    // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
    // >>> [-1] + [decimal_size(i) for i in range(1, 77)]
    var byteblock = [...]int32{
        -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
        9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
        17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
        26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32,
    }

    if precision <= 76 {
        return byteblock[precision]
    }
    return int32(math.Ceil(float64(precision)/8.0)*math.Log2(10) + 1)
}

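// Illustrative sketch (not part of the upstream file): DecimalSize gives the
// fixed-length byte array width used when a decimal is not stored as
// int32/int64, e.g. precision 9 still fits in 4 bytes while the decimal128
// maximum of 38 digits needs 16. The function name is hypothetical.
func exampleDecimalWidths() {
    for _, prec := range []int32{1, 9, 18, 38, 76} {
        fmt.Printf("decimal(precision=%d) -> %d bytes\n", prec, DecimalSize(prec))
    }
}
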
func repFromNullable(isnullable bool) parquet.Repetition {
    if isnullable {
        return parquet.Repetitions.Optional
    }
    return parquet.Repetitions.Required
}

func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
    if typ.NumFields() == 0 {
        return nil, fmt.Errorf("cannot write struct type '%s' with no children field to parquet. Consider adding a dummy child", name)
    }

    children := make(schema.FieldList, 0, typ.NumFields())
    for _, f := range typ.Fields() {
        n, err := fieldToNode(f.Name, f, props, arrprops)
        if err != nil {
            return nil, err
        }
        children = append(children, n)
    }

    return schema.NewGroupNode(name, repFromNullable(nullable), children, -1)
}

func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) {
    var (
        logicalType schema.LogicalType = schema.NoLogicalType{}
        typ         parquet.Type
        repType     = repFromNullable(field.Nullable)
        length      = -1
        precision   = -1
        scale       = -1
        err         error
    )

    switch field.Type.ID() {
    case arrow.NULL:
        typ = parquet.Types.Int32
        logicalType = &schema.NullLogicalType{}
        if repType != parquet.Repetitions.Optional {
            return nil, xerrors.New("nulltype arrow field must be nullable")
        }
    case arrow.BOOL:
        typ = parquet.Types.Boolean
    case arrow.UINT8:
        typ = parquet.Types.Int32
        logicalType = schema.NewIntLogicalType(8, false)
    case arrow.INT8:
        typ = parquet.Types.Int32
        logicalType = schema.NewIntLogicalType(8, true)
    case arrow.UINT16:
        typ = parquet.Types.Int32
        logicalType = schema.NewIntLogicalType(16, false)
    case arrow.INT16:
        typ = parquet.Types.Int32
        logicalType = schema.NewIntLogicalType(16, true)
    case arrow.UINT32:
        typ = parquet.Types.Int32
        logicalType = schema.NewIntLogicalType(32, false)
    case arrow.INT32:
        typ = parquet.Types.Int32
        logicalType = schema.NewIntLogicalType(32, true)
    case arrow.UINT64:
        typ = parquet.Types.Int64
        logicalType = schema.NewIntLogicalType(64, false)
    case arrow.INT64:
        typ = parquet.Types.Int64
        logicalType = schema.NewIntLogicalType(64, true)
    case arrow.FLOAT32:
        typ = parquet.Types.Float
    case arrow.FLOAT64:
        typ = parquet.Types.Double
    case arrow.STRING, arrow.LARGE_STRING:
        logicalType = schema.StringLogicalType{}
        fallthrough
    case arrow.BINARY, arrow.LARGE_BINARY:
        typ = parquet.Types.ByteArray
    case arrow.FIXED_SIZE_BINARY:
        typ = parquet.Types.FixedLenByteArray
        length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth
    case arrow.DECIMAL, arrow.DECIMAL256:
        dectype := field.Type.(arrow.DecimalType)
        precision = int(dectype.GetPrecision())
        scale = int(dectype.GetScale())

        if props.StoreDecimalAsInteger() && 1 <= precision && precision <= 18 {
            if precision <= 9 {
                typ = parquet.Types.Int32
            } else {
                typ = parquet.Types.Int64
            }
        } else {
            typ = parquet.Types.FixedLenByteArray
            length = int(DecimalSize(int32(precision)))
        }

        logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale))
    case arrow.DATE32:
        typ = parquet.Types.Int32
        logicalType = schema.DateLogicalType{}
    case arrow.DATE64:
        typ = parquet.Types.Int32
        logicalType = schema.DateLogicalType{}
    case arrow.TIMESTAMP:
        typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops)
        if err != nil {
            return nil, err
        }
    case arrow.TIME32:
        typ = parquet.Types.Int32
        logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis)
    case arrow.TIME64:
        typ = parquet.Types.Int64
        timeType := field.Type.(*arrow.Time64Type)
        if timeType.Unit == arrow.Nanosecond {
            logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos)
        } else {
            logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros)
        }
    case arrow.FLOAT16:
        typ = parquet.Types.FixedLenByteArray
        length = arrow.Float16SizeBytes
        logicalType = schema.Float16LogicalType{}
    case arrow.STRUCT:
        return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops)
    case arrow.FIXED_SIZE_LIST, arrow.LIST:
        var elem arrow.DataType
        if lt, ok := field.Type.(*arrow.ListType); ok {
            elem = lt.Elem()
        } else {
            elem = field.Type.(*arrow.FixedSizeListType).Elem()
        }

        child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops)
        if err != nil {
            return nil, err
        }

        return schema.ListOf(child, repFromNullable(field.Nullable), -1)
    case arrow.DICTIONARY:
        // parquet has no dictionary type, dictionary is encoding, not schema level
        dictType := field.Type.(*arrow.DictionaryType)
        return fieldToNode(name, arrow.Field{Name: name, Type: dictType.ValueType, Nullable: field.Nullable, Metadata: field.Metadata},
            props, arrprops)
    case arrow.EXTENSION:
        return fieldToNode(name, arrow.Field{
            Name:     name,
            Type:     field.Type.(arrow.ExtensionType).StorageType(),
            Nullable: field.Nullable,
            Metadata: arrow.MetadataFrom(map[string]string{
                ipc.ExtensionTypeKeyName:     field.Type.(arrow.ExtensionType).ExtensionName(),
                ipc.ExtensionMetadataKeyName: field.Type.(arrow.ExtensionType).Serialize(),
            }),
        }, props, arrprops)
    case arrow.MAP:
        mapType := field.Type.(*arrow.MapType)
        keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops)
        if err != nil {
            return nil, err
        }

        valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops)
        if err != nil {
            return nil, err
        }

        if arrprops.noMapLogicalType {
            keyval := schema.FieldList{keyNode, valueNode}
            keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1)
            if err != nil {
                return nil, err
            }
            return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{
                keyvalNode,
            }, -1)
        }
        return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1)
    default:
        return nil, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, field.Type.ID())
    }

    return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata))
}

const fieldIDKey = "PARQUET:field_id"

func fieldIDFromMeta(m arrow.Metadata) int32 {
    if m.Len() == 0 {
        return -1
    }

    key := m.FindKey(fieldIDKey)
    if key < 0 {
        return -1
    }

    id, err := strconv.ParseInt(m.Values()[key], 10, 32)
    if err != nil {
        return -1
    }

    if id < 0 {
        return -1
    }

    return int32(id)
}

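// Illustrative sketch (not part of the upstream file): parquet field IDs are
// carried on arrow fields through the "PARQUET:field_id" metadata key, which
// is what fieldIDFromMeta parses above. The function name is hypothetical.
func exampleFieldWithParquetID() arrow.Field {
    return arrow.Field{
        Name:     "user_id",
        Type:     arrow.PrimitiveTypes.Int64,
        Nullable: false,
        // fieldIDFromMeta will read this back as int32(7)
        Metadata: arrow.NewMetadata([]string{fieldIDKey}, []string{"7"}),
    }
}
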
// ToParquet generates a Parquet Schema from an arrow Schema using the given properties to make
// decisions when determining the logical/physical types of the columns.
func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) {
    if props == nil {
        props = parquet.NewWriterProperties()
    }

    nodes := make(schema.FieldList, 0, sc.NumFields())
    for _, f := range sc.Fields() {
        n, err := fieldToNode(f.Name, f, props, arrprops)
        if err != nil {
            return nil, err
        }
        nodes = append(nodes, n)
    }

    root, err := schema.NewGroupNode(props.RootName(), props.RootRepetition(), nodes, -1)
    if err != nil {
        return nil, err
    }

    return schema.NewSchema(root), err
}

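// Illustrative sketch (not part of the upstream file): converting a small
// arrow schema into its parquet equivalent. Passing nil writer properties
// falls back to parquet.NewWriterProperties(); the arrow-specific properties
// here come from NewArrowWriterProperties with no options, which is assumed
// to yield defaults. The function name is hypothetical.
func exampleToParquetConversion() (*schema.Schema, error) {
    arrowSchema := arrow.NewSchema([]arrow.Field{
        {Name: "id", Type: arrow.PrimitiveTypes.Int64},
        {Name: "name", Type: arrow.BinaryTypes.String, Nullable: true},
        {Name: "created", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: true},
    }, nil)

    return ToParquet(arrowSchema, nil, NewArrowWriterProperties())
}
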
type schemaTree struct {
    manifest *SchemaManifest

    schema *schema.Schema
    props  *ArrowReadProperties
}

func (s schemaTree) LinkParent(child, parent *SchemaField) {
    s.manifest.ChildToParent[child] = parent
}

func (s schemaTree) RecordLeaf(leaf *SchemaField) {
    s.manifest.ColIndexToField[leaf.ColIndex] = leaf
}

func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) {
    switch log.BitWidth() {
    case 8:
        if log.IsSigned() {
            return arrow.PrimitiveTypes.Int8, nil
        }
        return arrow.PrimitiveTypes.Uint8, nil
    case 16:
        if log.IsSigned() {
            return arrow.PrimitiveTypes.Int16, nil
        }
        return arrow.PrimitiveTypes.Uint16, nil
    case 32:
        if log.IsSigned() {
            return arrow.PrimitiveTypes.Int32, nil
        }
        return arrow.PrimitiveTypes.Uint32, nil
    case 64:
        if log.IsSigned() {
            return arrow.PrimitiveTypes.Int64, nil
        }
        return arrow.PrimitiveTypes.Uint64, nil
    default:
        return nil, xerrors.New("invalid logical type for int32")
    }
}

func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) {
    if logical.TimeUnit() == schema.TimeUnitMillis {
        return arrow.FixedWidthTypes.Time32ms, nil
    }

    return nil, xerrors.New(logical.String() + " cannot annotate a time32")
}

func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) {
    switch logical.TimeUnit() {
    case schema.TimeUnitMicros:
        return arrow.FixedWidthTypes.Time64us, nil
    case schema.TimeUnitNanos:
        return arrow.FixedWidthTypes.Time64ns, nil
    default:
        return nil, xerrors.New(logical.String() + " cannot annotate int64")
    }
}

func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) {
    tz := ""

    // ConvertedTypes are adjusted to UTC per backward compatibility guidelines
    // https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/LogicalTypes.md?plain=1#L480-L485
    if logical.IsAdjustedToUTC() || logical.IsFromConvertedType() {
        tz = "UTC"
    }

    switch logical.TimeUnit() {
    case schema.TimeUnitMillis:
        return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil
    case schema.TimeUnitMicros:
        return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil
    case schema.TimeUnitNanos:
        return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil
    default:
        return nil, xerrors.New("Unrecognized unit in timestamp logical type " + logical.String())
    }
}

func arrowDecimal(logical *schema.DecimalLogicalType) arrow.DataType {
    if logical.Precision() <= decimal128.MaxPrecision {
        return &arrow.Decimal128Type{Precision: logical.Precision(), Scale: logical.Scale()}
    }
    return &arrow.Decimal256Type{Precision: logical.Precision(), Scale: logical.Scale()}
}

func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) {
    switch logtype := logical.(type) {
    case schema.NoLogicalType:
        return arrow.PrimitiveTypes.Int32, nil
    case *schema.TimeLogicalType:
        return arrowTime32(logtype)
    case *schema.DecimalLogicalType:
        return arrowDecimal(logtype), nil
    case *schema.IntLogicalType:
        return arrowInt(logtype)
    case schema.DateLogicalType:
        return arrow.FixedWidthTypes.Date32, nil
    default:
        return nil, xerrors.New(logical.String() + " cannot annotate int32")
    }
}

func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) {
    if logical.IsNone() {
        return arrow.PrimitiveTypes.Int64, nil
    }

    switch logtype := logical.(type) {
    case *schema.IntLogicalType:
        return arrowInt(logtype)
    case *schema.DecimalLogicalType:
        return arrowDecimal(logtype), nil
    case *schema.TimeLogicalType:
        return arrowTime64(logtype)
    case *schema.TimestampLogicalType:
        return arrowTimestamp(logtype)
    default:
        return nil, xerrors.New(logical.String() + " cannot annotate int64")
    }
}

func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) {
    switch logtype := logical.(type) {
    case schema.StringLogicalType:
        return arrow.BinaryTypes.String, nil
    case *schema.DecimalLogicalType:
        return arrowDecimal(logtype), nil
    case schema.NoLogicalType,
        schema.EnumLogicalType,
        schema.JSONLogicalType,
        schema.BSONLogicalType:
        return arrow.BinaryTypes.Binary, nil
    default:
        return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array")
    }
}

func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) {
    switch logtype := logical.(type) {
    case *schema.DecimalLogicalType:
        return arrowDecimal(logtype), nil
    case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
        return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
    case schema.Float16LogicalType:
        return &arrow.Float16Type{}, nil
    default:
        return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array")
    }
}

func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) {
    if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) {
        return arrow.Null, nil
    }

    switch physical {
    case parquet.Types.Boolean:
        return arrow.FixedWidthTypes.Boolean, nil
    case parquet.Types.Int32:
        return arrowFromInt32(logical)
    case parquet.Types.Int64:
        return arrowFromInt64(logical)
    case parquet.Types.Int96:
        return arrow.FixedWidthTypes.Timestamp_ns, nil
    case parquet.Types.Float:
        return arrow.PrimitiveTypes.Float32, nil
    case parquet.Types.Double:
        return arrow.PrimitiveTypes.Float64, nil
    case parquet.Types.ByteArray:
        return arrowFromByteArray(logical)
    case parquet.Types.FixedLenByteArray:
        return arrowFromFLBA(logical, typeLen)
    default:
        return nil, xerrors.New("invalid physical column type")
    }
}

func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) {
    out.Field = field
    out.ColIndex = colIndex
    out.LevelInfo = currentLevels
    ctx.RecordLeaf(out)
    ctx.LinkParent(out, parent)
}

func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
    if n.NumFields() != 1 {
        return xerrors.New("LIST groups must have only 1 child")
    }

    if n.RepetitionType() == parquet.Repetitions.Repeated {
        return xerrors.New("LIST groups must not be repeated")
    }

    currentLevels.Increment(n)

    out.Children = make([]SchemaField, n.NumFields())
    ctx.LinkParent(out, parent)
    ctx.LinkParent(&out.Children[0], out)

    listNode := n.Field(0)
    if listNode.RepetitionType() != parquet.Repetitions.Repeated {
        return xerrors.New("non-repeated nodes in a list group are not supported")
    }

    repeatedAncestorDef := currentLevels.IncrementRepeated()
    if listNode.Type() == schema.Group {
        // Resolve 3-level encoding
        //
        // required/optional group name=whatever {
        //   repeated group name=list {
        //     required/optional TYPE item;
        //   }
        // }
        //
        // yields list<item: TYPE ?nullable> ?nullable
        //
        // We distinguish the special case that we have
        //
        // required/optional group name=whatever {
        //   repeated group name=array or $SOMETHING_tuple {
        //     required/optional TYPE item;
        //   }
        // }
        //
        // In this latter case, the inner type of the list should be a struct
        // rather than a primitive value
        //
        // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
        // Special case mentioned in the format spec:
        //   If the name is array or ends in _tuple, this should be a list of struct
        //   even for single child elements.
        listGroup := listNode.(*schema.GroupNode)
        if listGroup.NumFields() == 1 && !(listGroup.Name() == "array" || listGroup.Name() == (n.Name()+"_tuple")) {
            // list of primitive type
            if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil {
                return err
            }
        } else {
            if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil {
                return err
            }
        }
    } else {
        // Two-level list encoding
        //
        // required/optional group LIST {
        //   repeated TYPE;
        // }
        primitiveNode := listNode.(*schema.PrimitiveNode)
        colIndex := ctx.schema.ColumnIndexByNode(primitiveNode)
        arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength())
        if err != nil {
            return err
        }

        if ctx.props.ReadDict(colIndex) && isDictionaryReadSupported(arrowType) {
            arrowType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrowType}
        }

        itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))}
        populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0])
    }

    out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
        arrow.Field{Name: listNode.Name(), Type: out.Children[0].Field.Type, Nullable: true}),
        Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}

    out.LevelInfo = currentLevels
    // At this point current levels contains the def level for this list,
    // we need to reset to the prior parent.
    out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
    return nil
}

func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
    arrowFields := make([]arrow.Field, 0, n.NumFields())
    out.Children = make([]SchemaField, n.NumFields())

    for i := 0; i < n.NumFields(); i++ {
        if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil {
            return err
        }
        arrowFields = append(arrowFields, *out.Children[i].Field)
    }

    out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...),
        Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))}
    out.LevelInfo = currentLevels
    return nil
}

func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
    if n.NumFields() != 1 {
        return xerrors.New("MAP group must have exactly 1 child")
    }
    if n.RepetitionType() == parquet.Repetitions.Repeated {
        return xerrors.New("MAP groups must not be repeated")
    }

    keyvalueNode := n.Field(0)
    if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated {
        return xerrors.New("Non-repeated keyvalue group in MAP group is not supported")
    }

    if keyvalueNode.Type() != schema.Group {
        return xerrors.New("keyvalue node must be a group")
    }

    kvgroup := keyvalueNode.(*schema.GroupNode)
    if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 {
        return fmt.Errorf("keyvalue node group must have exactly 1 or 2 child elements, Found %d", kvgroup.NumFields())
    }

    keyNode := kvgroup.Field(0)
    if keyNode.RepetitionType() != parquet.Repetitions.Required {
        return xerrors.New("MAP keys must be required")
    }

    // Arrow doesn't support 1 column maps (i.e. Sets). The options are to either
    // make the values column nullable, or process the map as a list. We choose the latter
    // as it is simpler.
    if kvgroup.NumFields() == 1 {
        return listToSchemaField(n, currentLevels, ctx, parent, out)
    }

    currentLevels.Increment(n)
    repeatedAncestorDef := currentLevels.IncrementRepeated()
    out.Children = make([]SchemaField, 1)

    kvfield := &out.Children[0]
    kvfield.Children = make([]SchemaField, 2)

    keyField := &kvfield.Children[0]
    valueField := &kvfield.Children[1]

    ctx.LinkParent(out, parent)
    ctx.LinkParent(kvfield, out)
    ctx.LinkParent(keyField, kvfield)
    ctx.LinkParent(valueField, kvfield)

    // required/optional group name=whatever {
    //   repeated group name=key_values{
    //     required TYPE key;
    //     required/optional TYPE value;
    //   }
    // }
    //

    if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil {
        return err
    }
    if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil {
        return err
    }

    kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field),
        Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))}

    kvfield.LevelInfo = currentLevels
    out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type),
        Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
        Metadata: createFieldMeta(int(n.FieldID()))}
    out.LevelInfo = currentLevels
    // At this point current levels contains the def level for this map,
    // we need to reset to the prior parent.
    out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
    return nil
}

func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
    if n.LogicalType().Equals(schema.NewListLogicalType()) {
        return listToSchemaField(n, currentLevels, ctx, parent, out)
    } else if n.LogicalType().Equals(schema.MapLogicalType{}) {
        return mapToSchemaField(n, currentLevels, ctx, parent, out)
    }

    if n.RepetitionType() == parquet.Repetitions.Repeated {
        // Simple repeated struct
        //
        // repeated group $NAME {
        //   r/o TYPE[0] f0
        //   r/o TYPE[1] f1
        // }
        out.Children = make([]SchemaField, 1)
        repeatedAncestorDef := currentLevels.IncrementRepeated()
        if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil {
            return err
        }

        out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false,
            Metadata: createFieldMeta(int(n.FieldID()))}
        ctx.LinkParent(&out.Children[0], out)
        out.LevelInfo = currentLevels
        out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef
        return nil
    }

    currentLevels.Increment(n)
    return groupToStructField(n, currentLevels, ctx, parent, out)
}

func createFieldMeta(fieldID int) arrow.Metadata {
    return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)})
}

func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error {
    ctx.LinkParent(out, parent)

    if n.Type() == schema.Group {
        return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out)
    }

    // Either a normal flat primitive type, or a list type encoded with 1-level
    // list encoding. Note that the 3-level encoding is the form recommended by
    // the parquet specification, but technically we can have either
    //
    // required/optional $TYPE $FIELD_NAME
    //
    // or
    //
    // repeated $TYPE $FIELD_NAME

    primitive := n.(*schema.PrimitiveNode)
    colIndex := ctx.schema.ColumnIndexByNode(primitive)
    arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength())
    if err != nil {
        return err
    }

    if ctx.props.ReadDict(colIndex) && isDictionaryReadSupported(arrowType) {
        arrowType = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: arrowType}
    }

    if primitive.RepetitionType() == parquet.Repetitions.Repeated {
        // one-level list encoding, e.g. a: repeated int32;
        repeatedAncestorDefLevel := currentLevels.IncrementRepeated()
        out.Children = make([]SchemaField, 1)
        child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false}
        populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0])
        out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false,
            Metadata: createFieldMeta(int(primitive.FieldID()))}
        out.LevelInfo = currentLevels
        out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel
        return nil
    }

    currentLevels.Increment(n)
    populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType,
        Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
        Metadata: createFieldMeta(int(n.FieldID()))},
        currentLevels, ctx, parent, out)
    return nil
}

func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) {
    if meta == nil {
        return nil, nil
    }

    const arrowSchemaKey = "ARROW:schema"
    serialized := meta.FindValue(arrowSchemaKey)
    if serialized == nil {
        return nil, nil
    }

    var (
        decoded []byte
        err     error
    )

    // if the length of the serialized schema is not a multiple of 4, it cannot
    // be valid base64 in the padded std encoding, so don't even attempt it.
    if len(*serialized)%4 == 0 {
        decoded, err = base64.StdEncoding.DecodeString(*serialized)
    }
    // if we failed to decode it with the std encoding, or the length wasn't
    // a multiple of 4, try using the Raw unpadded encoding
    if len(decoded) == 0 || err != nil {
        decoded, err = base64.RawStdEncoding.DecodeString(*serialized)
    }

    if err != nil {
        return nil, err
    }

    return flight.DeserializeSchema(decoded, mem)
}

func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Field) arrow.DataType {
    switch inferred.ID() {
    case arrow.STRUCT:
        if origin.ID() == arrow.STRUCT {
            return func(list []arrow.Field) arrow.DataType {
                return arrow.StructOf(list...)
            }
        }
    case arrow.LIST:
        switch origin.ID() {
        case arrow.LIST:
            return func(list []arrow.Field) arrow.DataType {
                return arrow.ListOf(list[0].Type)
            }
        case arrow.FIXED_SIZE_LIST:
            sz := origin.(*arrow.FixedSizeListType).Len()
            return func(list []arrow.Field) arrow.DataType {
                return arrow.FixedSizeListOf(sz, list[0].Type)
            }
        }
    case arrow.MAP:
        if origin.ID() == arrow.MAP {
            return func(list []arrow.Field) arrow.DataType {
                valType := list[0].Type.(*arrow.StructType)
                return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
            }
        }
    }
    return nil
}

func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
    nchildren := len(inferred.Children)
    switch origin.Type.ID() {
    case arrow.EXTENSION:
        extType := origin.Type.(arrow.ExtensionType)
        modified, err = applyOriginalStorageMetadata(arrow.Field{
            Type:     extType.StorageType(),
            Metadata: origin.Metadata,
        }, inferred)
        if err != nil {
            return
        }

        if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
            return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
                arrow.ErrInvalid, inferred.Field.Type, extType)
        }

        inferred.Field.Type = extType
        modified = true
    case arrow.SPARSE_UNION, arrow.DENSE_UNION:
        err = xerrors.New("unimplemented type")
    case arrow.STRUCT:
        typ := origin.Type.(*arrow.StructType)
        if nchildren != typ.NumFields() {
            return
        }

        factory := getNestedFactory(typ, inferred.Field.Type)
        if factory == nil {
            return
        }

        modified = typ.ID() != inferred.Field.Type.ID()
        for idx := range inferred.Children {
            childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
            if err != nil {
                return false, err
            }
            modified = modified || childMod
        }
        if modified {
            modifiedChildren := make([]arrow.Field, len(inferred.Children))
            for idx, child := range inferred.Children {
                modifiedChildren[idx] = *child.Field
            }
            inferred.Field.Type = factory(modifiedChildren)
        }
    case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.LARGE_LIST, arrow.MAP: // arrow.ListLike
        if nchildren != 1 {
            return
        }
        factory := getNestedFactory(origin.Type, inferred.Field.Type)
        if factory == nil {
            return
        }

        modified = origin.Type.ID() != inferred.Field.Type.ID()
        childModified, err := applyOriginalMetadata(arrow.Field{Type: origin.Type.(arrow.ListLikeType).Elem()}, &inferred.Children[0])
        if err != nil {
            return modified, err
        }
        modified = modified || childModified
        if modified {
            inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
        }
    case arrow.TIMESTAMP:
        if inferred.Field.Type.ID() != arrow.TIMESTAMP {
            return
        }

        tsOtype := origin.Type.(*arrow.TimestampType)
        tsInfType := inferred.Field.Type.(*arrow.TimestampType)

        // if the unit is the same and the data is tz-aware, then set the original time zone
        // since parquet has no native storage of timezones
        if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
            inferred.Field.Type = origin.Type
        }
        modified = true
    case arrow.LARGE_STRING, arrow.LARGE_BINARY:
        inferred.Field.Type = origin.Type
        modified = true
    case arrow.DICTIONARY:
        if origin.Type.ID() != arrow.DICTIONARY || (inferred.Field.Type.ID() == arrow.DICTIONARY || !isDictionaryReadSupported(inferred.Field.Type)) {
            return
        }

        // direct dictionary reads are only supported for a few primitive types
        // so no need to recurse on value types
        dictOriginType := origin.Type.(*arrow.DictionaryType)
        inferred.Field.Type = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32,
            ValueType: inferred.Field.Type, Ordered: dictOriginType.Ordered}
        modified = true
    case arrow.DECIMAL256:
        if inferred.Field.Type.ID() == arrow.DECIMAL128 {
            inferred.Field.Type = origin.Type
            modified = true
        }
    }

    if origin.HasMetadata() {
        meta := origin.Metadata
        if inferred.Field.HasMetadata() {
            final := make(map[string]string)
            for idx, k := range meta.Keys() {
                final[k] = meta.Values()[idx]
            }
            for idx, k := range inferred.Field.Metadata.Keys() {
                final[k] = inferred.Field.Metadata.Values()[idx]
            }
            inferred.Field.Metadata = arrow.MetadataFrom(final)
        } else {
            inferred.Field.Metadata = meta
        }
        modified = true
    }

    return
}

func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) {
    return applyOriginalStorageMetadata(origin, inferred)
}

// NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema.
//
// The metadata passed in should be the file level key value metadata from the parquet file or nil.
// If the ARROW:schema was in the metadata, then it is utilized to determine types.
func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) {
    var ctx schemaTree
    ctx.manifest = &SchemaManifest{
        ColIndexToField: make(map[int]*SchemaField),
        ChildToParent:   make(map[*SchemaField]*SchemaField),
        descr:           sc,
        Fields:          make([]SchemaField, sc.Root().NumFields()),
    }
    ctx.props = props
    if ctx.props == nil {
        ctx.props = &ArrowReadProperties{}
    }
    ctx.schema = sc

    var err error
    ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator)
    if err != nil {
        return nil, err
    }

    // if original schema is not compatible with the parquet schema, ignore it
    if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() {
        ctx.manifest.OriginSchema = nil
    }

    for idx := range ctx.manifest.Fields {
        field := &ctx.manifest.Fields[idx]
        if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil {
            return nil, err
        }

        if ctx.manifest.OriginSchema != nil {
            if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil {
                return nil, err
            }
        }
    }
    return ctx.manifest, nil
}

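// Illustrative sketch (not part of the upstream file): listing the leaf-column
// to arrow-field mapping produced by NewSchemaManifest. Passing nil metadata
// means no ARROW:schema is consulted, and nil properties fall back to the
// defaults above. The function name is hypothetical.
func exampleManifestLeaves(sc *schema.Schema) error {
    manifest, err := NewSchemaManifest(sc, nil, nil)
    if err != nil {
        return err
    }

    for colIdx := 0; colIdx < sc.NumColumns(); colIdx++ {
        leaf, err := manifest.GetColumnField(colIdx)
        if err != nil {
            return err
        }
        fmt.Printf("column %d -> %s (%s)\n", colIdx, leaf.Field.Name, leaf.Field.Type)
    }
    return nil
}
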
// FromParquet generates an arrow Schema from a provided Parquet Schema
func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) {
    manifest, err := NewSchemaManifest(sc, kv, props)
    if err != nil {
        return nil, err
    }

    fields := make([]arrow.Field, len(manifest.Fields))
    for idx, field := range manifest.Fields {
        fields[idx] = *field.Field
    }

    if manifest.OriginSchema != nil {
        meta := manifest.OriginSchema.Metadata()
        return arrow.NewSchema(fields, &meta), nil
    }
    return arrow.NewSchema(fields, manifest.SchemaMeta), nil
}

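// Illustrative sketch (not part of the upstream file): a schema round trip
// through ToParquet and FromParquet. Without a stored ARROW:schema the
// reconstructed arrow schema is inferred purely from the parquet types, so
// details such as original time zones or large vs. regular string types may
// differ from the input. The function name is hypothetical.
func exampleSchemaRoundTrip(arrowSchema *arrow.Schema) (*arrow.Schema, error) {
    pqSchema, err := ToParquet(arrowSchema, nil, NewArrowWriterProperties())
    if err != nil {
        return nil, err
    }
    // no key/value metadata is supplied, so FromParquet works from the parquet schema alone
    return FromParquet(pqSchema, nil, nil)
}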