github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/schema.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow 18 19 import ( 20 "encoding/base64" 21 "fmt" 22 "math" 23 "strconv" 24 "strings" 25 26 "github.com/apache/arrow/go/v10/arrow" 27 "github.com/apache/arrow/go/v10/arrow/flight" 28 "github.com/apache/arrow/go/v10/arrow/memory" 29 "github.com/apache/arrow/go/v10/parquet" 30 "github.com/apache/arrow/go/v10/parquet/file" 31 "github.com/apache/arrow/go/v10/parquet/metadata" 32 "github.com/apache/arrow/go/v10/parquet/schema" 33 "golang.org/x/xerrors" 34 ) 35 36 // SchemaField is a holder that defines a specific logical field in the schema 37 // which could potentially refer to multiple physical columns in the underlying 38 // parquet file if it is a nested type. 39 // 40 // ColIndex is only populated (not -1) when it is a leaf column. 41 type SchemaField struct { 42 Field *arrow.Field 43 Children []SchemaField 44 ColIndex int 45 LevelInfo file.LevelInfo 46 } 47 48 // IsLeaf returns true if the SchemaField is a leaf column, ie: ColIndex != -1 49 func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 } 50 51 // SchemaManifest represents a full manifest for mapping a Parquet schema 52 // to an arrow Schema. 53 type SchemaManifest struct { 54 descr *schema.Schema 55 OriginSchema *arrow.Schema 56 SchemaMeta *arrow.Metadata 57 58 ColIndexToField map[int]*SchemaField 59 ChildToParent map[*SchemaField]*SchemaField 60 Fields []SchemaField 61 } 62 63 // GetColumnField returns the corresponding Field for a given column index. 64 func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) { 65 if field, ok := sm.ColIndexToField[index]; ok { 66 return field, nil 67 } 68 return nil, fmt.Errorf("Column Index %d not found in schema manifest", index) 69 } 70 71 // GetParent gets the parent field for a given field if it is a nested column, otherwise 72 // returns nil if there is no parent field. 73 func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField { 74 if p, ok := sm.ChildToParent[field]; ok { 75 return p 76 } 77 return nil 78 } 79 80 // GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow::Schema) which 81 // correspond to the column root (first node below the parquet schema's root group) of 82 // each leaf referenced in column_indices. 83 // 84 // For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3]) 85 // the roots are `a` and `i` (return=[0,2]). 86 // 87 // root 88 // -- a <------ 89 // -- -- b | | 90 // -- -- -- c | 91 // -- -- -- d | 92 // -- -- -- -- e 93 // -- f 94 // -- -- g 95 // -- -- -- h 96 // -- i <--- 97 // -- -- j | 98 // -- -- -- k 99 func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) { 100 added := make(map[int]bool) 101 ret := make([]int, 0) 102 103 for _, idx := range indices { 104 if idx < 0 || idx >= sm.descr.NumColumns() { 105 return nil, fmt.Errorf("column index %d is not valid", idx) 106 } 107 108 fieldNode := sm.descr.ColumnRoot(idx) 109 fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode) 110 if fieldIdx == -1 { 111 return nil, fmt.Errorf("column index %d is not valid", idx) 112 } 113 114 if _, ok := added[fieldIdx]; !ok { 115 ret = append(ret, fieldIdx) 116 added[fieldIdx] = true 117 } 118 } 119 return ret, nil 120 } 121 122 func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType { 123 utc := typ.TimeZone == "" || typ.TimeZone == "UTC" 124 125 // for forward compatibility reasons, and because there's no other way 126 // to signal to old readers that values are timestamps, we force 127 // the convertedtype field to be set to the corresponding TIMESTAMP_* value. 128 // this does cause some ambiguity as parquet readers have not been consistent 129 // about the interpretation of TIMESTAMP_* values as being utc-normalized 130 // see ARROW-5878 131 var scunit schema.TimeUnitType 132 switch unit { 133 case arrow.Millisecond: 134 scunit = schema.TimeUnitMillis 135 case arrow.Microsecond: 136 scunit = schema.TimeUnitMicros 137 case arrow.Nanosecond: 138 scunit = schema.TimeUnitNanos 139 case arrow.Second: 140 // no equivalent in parquet 141 return schema.NoLogicalType{} 142 } 143 144 return schema.NewTimestampLogicalTypeForce(utc, scunit) 145 } 146 147 func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) { 148 coerce := arrprops.coerceTimestamps 149 target := typ.Unit 150 if coerce { 151 target = arrprops.coerceTimestampUnit 152 } 153 154 // user is explicitly asking for int96, no logical type 155 if arrprops.timestampAsInt96 && target == arrow.Nanosecond { 156 return parquet.Types.Int96, schema.NoLogicalType{}, nil 157 } 158 159 physical := parquet.Types.Int64 160 logicalType := arrowTimestampToLogical(typ, target) 161 162 // user is explicitly asking for timestamp data to be converted to the specified 163 // units (target) via coercion 164 if coerce { 165 if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 { 166 switch target { 167 case arrow.Millisecond, arrow.Microsecond: 168 case arrow.Nanosecond, arrow.Second: 169 return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version()) 170 } 171 } else if target == arrow.Second { 172 return physical, nil, fmt.Errorf("parquet version %s files can only coerce arrow timestampts to millis, micros or nanos", props.Version()) 173 } 174 return physical, logicalType, nil 175 } 176 177 // the user implicitly wants timestamp data to retain its original time units 178 // however the converted type field used to indicate logical types for parquet 179 // version <=2.4 fields, does not allow for nanosecond time units and so nanos 180 // must be coerced to micros 181 if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond { 182 logicalType = arrowTimestampToLogical(typ, arrow.Microsecond) 183 return physical, logicalType, nil 184 } 185 186 // the user implicitly wants timestamp data to retain it's original time units, 187 // however the arrow seconds time unit cannot be represented in parquet, so must 188 // be coerced to milliseconds 189 if typ.Unit == arrow.Second { 190 logicalType = arrowTimestampToLogical(typ, arrow.Millisecond) 191 } 192 193 return physical, logicalType, nil 194 } 195 196 // DecimalSize returns the minimum number of bytes necessary to represent a decimal 197 // with the requested precision. 198 // 199 // Taken from the Apache Impala codebase. The comments next to the return values 200 // are the maximum value that can be represented in 2's complement with the returned 201 // number of bytes 202 func DecimalSize(precision int32) int32 { 203 if precision < 1 { 204 panic("precision must be >= 1") 205 } 206 207 // generated in python with: 208 // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8)) 209 // >>> [-1] + [decimal_size(i) for i in range(1, 77)] 210 var byteblock = [...]int32{ 211 -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 212 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17, 213 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, 214 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 215 } 216 217 if precision <= 76 { 218 return byteblock[precision] 219 } 220 return int32(math.Ceil(float64(precision)/8.0)*math.Log2(10) + 1) 221 } 222 223 func repFromNullable(isnullable bool) parquet.Repetition { 224 if isnullable { 225 return parquet.Repetitions.Optional 226 } 227 return parquet.Repetitions.Required 228 } 229 230 func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { 231 if len(typ.Fields()) == 0 { 232 return nil, fmt.Errorf("cannot write struct type '%s' with no children field to parquet. Consider adding a dummy child", name) 233 } 234 235 children := make(schema.FieldList, 0, len(typ.Fields())) 236 for _, f := range typ.Fields() { 237 n, err := fieldToNode(f.Name, f, props, arrprops) 238 if err != nil { 239 return nil, err 240 } 241 children = append(children, n) 242 } 243 244 return schema.NewGroupNode(name, repFromNullable(nullable), children, -1) 245 } 246 247 func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { 248 var ( 249 logicalType schema.LogicalType = schema.NoLogicalType{} 250 typ parquet.Type 251 repType = repFromNullable(field.Nullable) 252 length = -1 253 precision = -1 254 scale = -1 255 err error 256 ) 257 258 switch field.Type.ID() { 259 case arrow.NULL: 260 typ = parquet.Types.Int32 261 logicalType = &schema.NullLogicalType{} 262 if repType != parquet.Repetitions.Optional { 263 return nil, xerrors.New("nulltype arrow field must be nullable") 264 } 265 case arrow.BOOL: 266 typ = parquet.Types.Boolean 267 case arrow.UINT8: 268 typ = parquet.Types.Int32 269 logicalType = schema.NewIntLogicalType(8, false) 270 case arrow.INT8: 271 typ = parquet.Types.Int32 272 logicalType = schema.NewIntLogicalType(8, true) 273 case arrow.UINT16: 274 typ = parquet.Types.Int32 275 logicalType = schema.NewIntLogicalType(16, false) 276 case arrow.INT16: 277 typ = parquet.Types.Int32 278 logicalType = schema.NewIntLogicalType(16, true) 279 case arrow.UINT32: 280 typ = parquet.Types.Int32 281 logicalType = schema.NewIntLogicalType(32, false) 282 case arrow.INT32: 283 typ = parquet.Types.Int32 284 logicalType = schema.NewIntLogicalType(32, true) 285 case arrow.UINT64: 286 typ = parquet.Types.Int64 287 logicalType = schema.NewIntLogicalType(64, false) 288 case arrow.INT64: 289 typ = parquet.Types.Int64 290 logicalType = schema.NewIntLogicalType(64, true) 291 case arrow.FLOAT32: 292 typ = parquet.Types.Float 293 case arrow.FLOAT64: 294 typ = parquet.Types.Double 295 case arrow.STRING: 296 logicalType = schema.StringLogicalType{} 297 fallthrough 298 case arrow.BINARY: 299 typ = parquet.Types.ByteArray 300 case arrow.FIXED_SIZE_BINARY: 301 typ = parquet.Types.FixedLenByteArray 302 length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth 303 case arrow.DECIMAL: 304 typ = parquet.Types.FixedLenByteArray 305 dectype := field.Type.(*arrow.Decimal128Type) 306 precision = int(dectype.Precision) 307 scale = int(dectype.Scale) 308 length = int(DecimalSize(int32(precision))) 309 logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale)) 310 case arrow.DATE32: 311 typ = parquet.Types.Int32 312 logicalType = schema.DateLogicalType{} 313 case arrow.DATE64: 314 typ = parquet.Types.Int64 315 logicalType = schema.NewTimestampLogicalType(true, schema.TimeUnitMillis) 316 case arrow.TIMESTAMP: 317 typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops) 318 if err != nil { 319 return nil, err 320 } 321 case arrow.TIME32: 322 typ = parquet.Types.Int32 323 logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis) 324 case arrow.TIME64: 325 typ = parquet.Types.Int64 326 timeType := field.Type.(*arrow.Time64Type) 327 if timeType.Unit == arrow.Nanosecond { 328 logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos) 329 } else { 330 logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros) 331 } 332 case arrow.STRUCT: 333 return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops) 334 case arrow.FIXED_SIZE_LIST, arrow.LIST: 335 var elem arrow.DataType 336 if lt, ok := field.Type.(*arrow.ListType); ok { 337 elem = lt.Elem() 338 } else { 339 elem = field.Type.(*arrow.FixedSizeListType).Elem() 340 } 341 342 child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops) 343 if err != nil { 344 return nil, err 345 } 346 347 return schema.ListOf(child, repFromNullable(field.Nullable), -1) 348 case arrow.DICTIONARY: 349 // parquet has no dictionary type, dictionary is encoding, not schema level 350 return nil, xerrors.New("not implemented yet") 351 case arrow.EXTENSION: 352 return nil, xerrors.New("not implemented yet") 353 case arrow.MAP: 354 mapType := field.Type.(*arrow.MapType) 355 keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops) 356 if err != nil { 357 return nil, err 358 } 359 360 valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops) 361 if err != nil { 362 return nil, err 363 } 364 365 if arrprops.noMapLogicalType { 366 keyval := schema.FieldList{keyNode, valueNode} 367 keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1) 368 if err != nil { 369 return nil, err 370 } 371 return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{ 372 keyvalNode, 373 }, -1) 374 } 375 return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1) 376 default: 377 return nil, xerrors.New("not implemented yet") 378 } 379 380 return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata)) 381 } 382 383 const fieldIDKey = "PARQUET:field_id" 384 385 func fieldIDFromMeta(m arrow.Metadata) int32 { 386 if m.Len() == 0 { 387 return -1 388 } 389 390 key := m.FindKey(fieldIDKey) 391 if key < 0 { 392 return -1 393 } 394 395 id, err := strconv.ParseInt(m.Values()[key], 10, 32) 396 if err != nil { 397 return -1 398 } 399 400 if id < 0 { 401 return -1 402 } 403 404 return int32(id) 405 } 406 407 // ToParquet generates a Parquet Schema from an arrow Schema using the given properties to make 408 // decisions when determining the logical/physical types of the columns. 409 func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) { 410 if props == nil { 411 props = parquet.NewWriterProperties() 412 } 413 414 nodes := make(schema.FieldList, 0, len(sc.Fields())) 415 for _, f := range sc.Fields() { 416 n, err := fieldToNode(f.Name, f, props, arrprops) 417 if err != nil { 418 return nil, err 419 } 420 nodes = append(nodes, n) 421 } 422 423 root, err := schema.NewGroupNode(props.RootName(), props.RootRepetition(), nodes, -1) 424 if err != nil { 425 return nil, err 426 } 427 428 return schema.NewSchema(root), err 429 } 430 431 type schemaTree struct { 432 manifest *SchemaManifest 433 434 schema *schema.Schema 435 props *ArrowReadProperties 436 } 437 438 func (s schemaTree) LinkParent(child, parent *SchemaField) { 439 s.manifest.ChildToParent[child] = parent 440 } 441 442 func (s schemaTree) RecordLeaf(leaf *SchemaField) { 443 s.manifest.ColIndexToField[leaf.ColIndex] = leaf 444 } 445 446 func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) { 447 switch log.BitWidth() { 448 case 8: 449 if log.IsSigned() { 450 return arrow.PrimitiveTypes.Int8, nil 451 } 452 return arrow.PrimitiveTypes.Uint8, nil 453 case 16: 454 if log.IsSigned() { 455 return arrow.PrimitiveTypes.Int16, nil 456 } 457 return arrow.PrimitiveTypes.Uint16, nil 458 case 32: 459 if log.IsSigned() { 460 return arrow.PrimitiveTypes.Int32, nil 461 } 462 return arrow.PrimitiveTypes.Uint32, nil 463 case 64: 464 if log.IsSigned() { 465 return arrow.PrimitiveTypes.Int64, nil 466 } 467 return arrow.PrimitiveTypes.Uint64, nil 468 default: 469 return nil, xerrors.New("invalid logical type for int32") 470 } 471 } 472 473 func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) { 474 if logical.TimeUnit() == schema.TimeUnitMillis { 475 return arrow.FixedWidthTypes.Time32ms, nil 476 } 477 478 return nil, xerrors.New(logical.String() + " cannot annotate a time32") 479 } 480 481 func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) { 482 switch logical.TimeUnit() { 483 case schema.TimeUnitMicros: 484 return arrow.FixedWidthTypes.Time64us, nil 485 case schema.TimeUnitNanos: 486 return arrow.FixedWidthTypes.Time64ns, nil 487 default: 488 return nil, xerrors.New(logical.String() + " cannot annotate int64") 489 } 490 } 491 492 func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) { 493 tz := "UTC" 494 if logical.IsFromConvertedType() { 495 tz = "" 496 } 497 498 switch logical.TimeUnit() { 499 case schema.TimeUnitMillis: 500 return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil 501 case schema.TimeUnitMicros: 502 return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil 503 case schema.TimeUnitNanos: 504 return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil 505 default: 506 return nil, xerrors.New("Unrecognized unit in timestamp logical type " + logical.String()) 507 } 508 } 509 510 func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) { 511 switch logtype := logical.(type) { 512 case schema.NoLogicalType: 513 return arrow.PrimitiveTypes.Int32, nil 514 case *schema.TimeLogicalType: 515 return arrowTime32(logtype) 516 case *schema.DecimalLogicalType: 517 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 518 case *schema.IntLogicalType: 519 return arrowInt(logtype) 520 case schema.DateLogicalType: 521 return arrow.FixedWidthTypes.Date32, nil 522 default: 523 return nil, xerrors.New(logical.String() + " cannot annotate int32") 524 } 525 } 526 527 func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) { 528 if logical.IsNone() { 529 return arrow.PrimitiveTypes.Int64, nil 530 } 531 532 switch logtype := logical.(type) { 533 case *schema.IntLogicalType: 534 return arrowInt(logtype) 535 case *schema.DecimalLogicalType: 536 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 537 case *schema.TimeLogicalType: 538 return arrowTime64(logtype) 539 case *schema.TimestampLogicalType: 540 return arrowTimestamp(logtype) 541 default: 542 return nil, xerrors.New(logical.String() + " cannot annotate int64") 543 } 544 } 545 546 func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) { 547 switch logtype := logical.(type) { 548 case schema.StringLogicalType: 549 return arrow.BinaryTypes.String, nil 550 case *schema.DecimalLogicalType: 551 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 552 case schema.NoLogicalType, 553 schema.EnumLogicalType, 554 schema.JSONLogicalType, 555 schema.BSONLogicalType: 556 return arrow.BinaryTypes.Binary, nil 557 default: 558 return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array") 559 } 560 } 561 562 func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) { 563 switch logtype := logical.(type) { 564 case *schema.DecimalLogicalType: 565 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 566 case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType: 567 return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil 568 default: 569 return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array") 570 } 571 } 572 573 func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) { 574 if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) { 575 return arrow.Null, nil 576 } 577 578 switch physical { 579 case parquet.Types.Boolean: 580 return arrow.FixedWidthTypes.Boolean, nil 581 case parquet.Types.Int32: 582 return arrowFromInt32(logical) 583 case parquet.Types.Int64: 584 return arrowFromInt64(logical) 585 case parquet.Types.Int96: 586 return arrow.FixedWidthTypes.Timestamp_ns, nil 587 case parquet.Types.Float: 588 return arrow.PrimitiveTypes.Float32, nil 589 case parquet.Types.Double: 590 return arrow.PrimitiveTypes.Float64, nil 591 case parquet.Types.ByteArray: 592 return arrowFromByteArray(logical) 593 case parquet.Types.FixedLenByteArray: 594 return arrowFromFLBA(logical, typeLen) 595 default: 596 return nil, xerrors.New("invalid physical column type") 597 } 598 } 599 600 func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) { 601 out.Field = field 602 out.ColIndex = colIndex 603 out.LevelInfo = currentLevels 604 ctx.RecordLeaf(out) 605 ctx.LinkParent(out, parent) 606 } 607 608 func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 609 if n.NumFields() != 1 { 610 return xerrors.New("LIST groups must have only 1 child") 611 } 612 613 if n.RepetitionType() == parquet.Repetitions.Repeated { 614 return xerrors.New("LIST groups must not be repeated") 615 } 616 617 currentLevels.Increment(n) 618 619 out.Children = make([]SchemaField, n.NumFields()) 620 ctx.LinkParent(out, parent) 621 ctx.LinkParent(&out.Children[0], out) 622 623 listNode := n.Field(0) 624 if listNode.RepetitionType() != parquet.Repetitions.Repeated { 625 return xerrors.New("non-repeated nodes in a list group are not supported") 626 } 627 628 repeatedAncestorDef := currentLevels.IncrementRepeated() 629 if listNode.Type() == schema.Group { 630 // Resolve 3-level encoding 631 // 632 // required/optional group name=whatever { 633 // repeated group name=list { 634 // required/optional TYPE item; 635 // } 636 // } 637 // 638 // yields list<item: TYPE ?nullable> ?nullable 639 // 640 // We distinguish the special case that we have 641 // 642 // required/optional group name=whatever { 643 // repeated group name=array or $SOMETHING_tuple { 644 // required/optional TYPE item; 645 // } 646 // } 647 // 648 // In this latter case, the inner type of the list should be a struct 649 // rather than a primitive value 650 // 651 // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable 652 // Special case mentioned in the format spec: 653 // If the name is array or ends in _tuple, this should be a list of struct 654 // even for single child elements. 655 listGroup := listNode.(*schema.GroupNode) 656 if listGroup.NumFields() == 1 && (listGroup.Name() == "array" || strings.HasSuffix(listGroup.Name(), "_tuple")) { 657 // list of primitive type 658 if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil { 659 return err 660 } 661 } else { 662 if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil { 663 return err 664 } 665 } 666 } else { 667 // Two-level list encoding 668 // 669 // required/optional group LIST { 670 // repeated TYPE; 671 // } 672 primitiveNode := listNode.(*schema.PrimitiveNode) 673 colIndex := ctx.schema.ColumnIndexByNode(primitiveNode) 674 arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength()) 675 if err != nil { 676 return err 677 } 678 679 itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))} 680 populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0]) 681 } 682 683 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), 684 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))} 685 out.LevelInfo = currentLevels 686 // At this point current levels contains the def level for this list, 687 // we need to reset to the prior parent. 688 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef 689 return nil 690 } 691 692 func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 693 arrowFields := make([]arrow.Field, 0, n.NumFields()) 694 out.Children = make([]SchemaField, n.NumFields()) 695 696 for i := 0; i < n.NumFields(); i++ { 697 if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil { 698 return err 699 } 700 arrowFields = append(arrowFields, *out.Children[i].Field) 701 } 702 703 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...), 704 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))} 705 out.LevelInfo = currentLevels 706 return nil 707 } 708 709 func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 710 if n.NumFields() != 1 { 711 return xerrors.New("MAP group must have exactly 1 child") 712 } 713 if n.RepetitionType() == parquet.Repetitions.Repeated { 714 return xerrors.New("MAP groups must not be repeated") 715 } 716 717 keyvalueNode := n.Field(0) 718 if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated { 719 return xerrors.New("Non-repeated keyvalue group in MAP group is not supported") 720 } 721 722 if keyvalueNode.Type() != schema.Group { 723 return xerrors.New("keyvalue node must be a group") 724 } 725 726 kvgroup := keyvalueNode.(*schema.GroupNode) 727 if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 { 728 return fmt.Errorf("keyvalue node group must have exactly 1 or 2 child elements, Found %d", kvgroup.NumFields()) 729 } 730 731 keyNode := kvgroup.Field(0) 732 if keyNode.RepetitionType() != parquet.Repetitions.Required { 733 return xerrors.New("MAP keys must be required") 734 } 735 736 // Arrow doesn't support 1 column maps (i.e. Sets). The options are to either 737 // make the values column nullable, or process the map as a list. We choose the latter 738 // as it is simpler. 739 if kvgroup.NumFields() == 1 { 740 return listToSchemaField(n, currentLevels, ctx, parent, out) 741 } 742 743 currentLevels.Increment(n) 744 repeatedAncestorDef := currentLevels.IncrementRepeated() 745 out.Children = make([]SchemaField, 1) 746 747 kvfield := &out.Children[0] 748 kvfield.Children = make([]SchemaField, 2) 749 750 keyField := &kvfield.Children[0] 751 valueField := &kvfield.Children[1] 752 753 ctx.LinkParent(out, parent) 754 ctx.LinkParent(kvfield, out) 755 ctx.LinkParent(keyField, kvfield) 756 ctx.LinkParent(valueField, kvfield) 757 758 // required/optional group name=whatever { 759 // repeated group name=key_values{ 760 // required TYPE key; 761 // required/optional TYPE value; 762 // } 763 // } 764 // 765 766 if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil { 767 return err 768 } 769 if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil { 770 return err 771 } 772 773 kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field), 774 Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))} 775 776 kvfield.LevelInfo = currentLevels 777 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type), 778 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 779 Metadata: createFieldMeta(int(n.FieldID()))} 780 out.LevelInfo = currentLevels 781 // At this point current levels contains the def level for this map, 782 // we need to reset to the prior parent. 783 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef 784 return nil 785 } 786 787 func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 788 if n.LogicalType().Equals(schema.NewListLogicalType()) { 789 return listToSchemaField(n, currentLevels, ctx, parent, out) 790 } else if n.LogicalType().Equals(schema.MapLogicalType{}) { 791 return mapToSchemaField(n, currentLevels, ctx, parent, out) 792 } 793 794 if n.RepetitionType() == parquet.Repetitions.Repeated { 795 // Simple repeated struct 796 // 797 // repeated group $NAME { 798 // r/o TYPE[0] f0 799 // r/o TYPE[1] f1 800 // } 801 out.Children = make([]SchemaField, 1) 802 repeatedAncestorDef := currentLevels.IncrementRepeated() 803 if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil { 804 return err 805 } 806 807 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false, 808 Metadata: createFieldMeta(int(n.FieldID()))} 809 ctx.LinkParent(&out.Children[0], out) 810 out.LevelInfo = currentLevels 811 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef 812 return nil 813 } 814 815 currentLevels.Increment(n) 816 return groupToStructField(n, currentLevels, ctx, parent, out) 817 } 818 819 func createFieldMeta(fieldID int) arrow.Metadata { 820 return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)}) 821 } 822 823 func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 824 ctx.LinkParent(out, parent) 825 826 if n.Type() == schema.Group { 827 return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out) 828 } 829 830 // Either a normal flat primitive type, or a list type encoded with 1-level 831 // list encoding. Note that the 3-level encoding is the form recommended by 832 // the parquet specification, but technically we can have either 833 // 834 // required/optional $TYPE $FIELD_NAME 835 // 836 // or 837 // 838 // repeated $TYPE $FIELD_NAME 839 840 primitive := n.(*schema.PrimitiveNode) 841 colIndex := ctx.schema.ColumnIndexByNode(primitive) 842 arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength()) 843 if err != nil { 844 return err 845 } 846 847 if primitive.RepetitionType() == parquet.Repetitions.Repeated { 848 // one-level list encoding e.g. a: repeated int32; 849 repeatedAncestorDefLevel := currentLevels.IncrementRepeated() 850 out.Children = make([]SchemaField, 1) 851 child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false} 852 populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0]) 853 out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false, 854 Metadata: createFieldMeta(int(primitive.FieldID()))} 855 out.LevelInfo = currentLevels 856 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel 857 return nil 858 } 859 860 currentLevels.Increment(n) 861 populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType, 862 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 863 Metadata: createFieldMeta(int(n.FieldID()))}, 864 currentLevels, ctx, parent, out) 865 return nil 866 } 867 868 func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) { 869 if meta == nil { 870 return nil, nil 871 } 872 873 const arrowSchemaKey = "ARROW:schema" 874 serialized := meta.FindValue(arrowSchemaKey) 875 if serialized == nil { 876 return nil, nil 877 } 878 879 var ( 880 decoded []byte 881 err error 882 ) 883 884 // if the length of serialized is not a multiple of 4, it cannot be 885 // padded with std encoding. 886 if len(*serialized)%4 == 0 { 887 decoded, err = base64.StdEncoding.DecodeString(*serialized) 888 } 889 // if we failed to decode it with stdencoding or the length wasn't 890 // a multiple of 4, try using the Raw unpadded encoding 891 if len(decoded) == 0 || err != nil { 892 decoded, err = base64.RawStdEncoding.DecodeString(*serialized) 893 } 894 895 if err != nil { 896 return nil, err 897 } 898 899 return flight.DeserializeSchema(decoded, mem) 900 } 901 902 func getNestedFactory(origin, inferred arrow.DataType) func(fieldList []arrow.Field) arrow.DataType { 903 switch inferred.ID() { 904 case arrow.STRUCT: 905 if origin.ID() == arrow.STRUCT { 906 return func(list []arrow.Field) arrow.DataType { 907 return arrow.StructOf(list...) 908 } 909 } 910 case arrow.LIST: 911 switch origin.ID() { 912 case arrow.LIST: 913 return func(list []arrow.Field) arrow.DataType { 914 return arrow.ListOf(list[0].Type) 915 } 916 case arrow.FIXED_SIZE_LIST: 917 sz := origin.(*arrow.FixedSizeListType).Len() 918 return func(list []arrow.Field) arrow.DataType { 919 return arrow.FixedSizeListOf(sz, list[0].Type) 920 } 921 } 922 case arrow.MAP: 923 if origin.ID() == arrow.MAP { 924 return func(list []arrow.Field) arrow.DataType { 925 valType := list[0].Type.(*arrow.StructType) 926 return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type) 927 } 928 } 929 } 930 return nil 931 } 932 933 func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) { 934 nchildren := len(inferred.Children) 935 switch origin.Type.ID() { 936 case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION, arrow.DICTIONARY: 937 err = xerrors.New("unimplemented type") 938 case arrow.STRUCT: 939 typ := origin.Type.(*arrow.StructType) 940 if nchildren != len(typ.Fields()) { 941 return 942 } 943 944 factory := getNestedFactory(typ, inferred.Field.Type) 945 if factory == nil { 946 return 947 } 948 949 modified = typ.ID() != inferred.Field.Type.ID() 950 for idx := range inferred.Children { 951 childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx]) 952 if err != nil { 953 return false, err 954 } 955 modified = modified || childMod 956 } 957 if modified { 958 modifiedChildren := make([]arrow.Field, len(inferred.Children)) 959 for idx, child := range inferred.Children { 960 modifiedChildren[idx] = *child.Field 961 } 962 inferred.Field.Type = factory(modifiedChildren) 963 } 964 case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.MAP: 965 if nchildren != 1 { 966 return 967 } 968 factory := getNestedFactory(origin.Type, inferred.Field.Type) 969 if factory == nil { 970 return 971 } 972 973 modified = origin.Type.ID() != inferred.Field.Type.ID() 974 var childModified bool 975 switch typ := origin.Type.(type) { 976 case *arrow.FixedSizeListType: 977 childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0]) 978 case *arrow.ListType: 979 childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0]) 980 case *arrow.MapType: 981 childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.ValueType()}, &inferred.Children[0]) 982 } 983 if err != nil { 984 return 985 } 986 modified = modified || childModified 987 if modified { 988 inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field}) 989 } 990 case arrow.TIMESTAMP: 991 if inferred.Field.Type.ID() != arrow.TIMESTAMP { 992 return 993 } 994 995 tsOtype := origin.Type.(*arrow.TimestampType) 996 tsInfType := inferred.Field.Type.(*arrow.TimestampType) 997 998 // if the unit is the same and the data is tz-aware, then set the original time zone 999 // since parquet has no native storage of timezones 1000 if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" { 1001 inferred.Field.Type = origin.Type 1002 } 1003 modified = true 1004 } 1005 1006 if origin.HasMetadata() { 1007 meta := origin.Metadata 1008 if inferred.Field.HasMetadata() { 1009 final := make(map[string]string) 1010 for idx, k := range meta.Keys() { 1011 final[k] = meta.Values()[idx] 1012 } 1013 for idx, k := range inferred.Field.Metadata.Keys() { 1014 final[k] = inferred.Field.Metadata.Values()[idx] 1015 } 1016 inferred.Field.Metadata = arrow.MetadataFrom(final) 1017 } else { 1018 inferred.Field.Metadata = meta 1019 } 1020 modified = true 1021 } 1022 1023 return 1024 } 1025 1026 func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) { 1027 if origin.Type.ID() == arrow.EXTENSION { 1028 return false, xerrors.New("extension types not implemented yet") 1029 } 1030 1031 return applyOriginalStorageMetadata(origin, inferred) 1032 } 1033 1034 // NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema. 1035 // 1036 // The metadata passed in should be the file level key value metadata from the parquet file or nil. 1037 // If the ARROW:schema was in the metadata, then it is utilized to determine types. 1038 func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) { 1039 var ctx schemaTree 1040 ctx.manifest = &SchemaManifest{ 1041 ColIndexToField: make(map[int]*SchemaField), 1042 ChildToParent: make(map[*SchemaField]*SchemaField), 1043 descr: sc, 1044 Fields: make([]SchemaField, sc.Root().NumFields()), 1045 } 1046 ctx.props = props 1047 ctx.schema = sc 1048 1049 var err error 1050 ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator) 1051 if err != nil { 1052 return nil, err 1053 } 1054 1055 // if original schema is not compatible with the parquet schema, ignore it 1056 if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() { 1057 ctx.manifest.OriginSchema = nil 1058 } 1059 1060 for idx := range ctx.manifest.Fields { 1061 field := &ctx.manifest.Fields[idx] 1062 if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil { 1063 return nil, err 1064 } 1065 1066 if ctx.manifest.OriginSchema != nil { 1067 if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil { 1068 return nil, err 1069 } 1070 } 1071 } 1072 return ctx.manifest, nil 1073 } 1074 1075 // FromParquet generates an arrow Schema from a provided Parquet Schema 1076 func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) { 1077 manifest, err := NewSchemaManifest(sc, kv, props) 1078 if err != nil { 1079 return nil, err 1080 } 1081 1082 fields := make([]arrow.Field, len(manifest.Fields)) 1083 for idx, field := range manifest.Fields { 1084 fields[idx] = *field.Field 1085 } 1086 1087 if manifest.OriginSchema != nil { 1088 meta := manifest.OriginSchema.Metadata() 1089 return arrow.NewSchema(fields, &meta), nil 1090 } 1091 return arrow.NewSchema(fields, manifest.SchemaMeta), nil 1092 }