github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/schema.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow 18 19 import ( 20 "encoding/base64" 21 "math" 22 "strconv" 23 "strings" 24 25 "github.com/apache/arrow/go/v7/arrow" 26 "github.com/apache/arrow/go/v7/arrow/flight" 27 "github.com/apache/arrow/go/v7/arrow/memory" 28 "github.com/apache/arrow/go/v7/parquet" 29 "github.com/apache/arrow/go/v7/parquet/file" 30 "github.com/apache/arrow/go/v7/parquet/metadata" 31 "github.com/apache/arrow/go/v7/parquet/schema" 32 "golang.org/x/xerrors" 33 ) 34 35 // SchemaField is a holder that defines a specific logical field in the schema 36 // which could potentially refer to multiple physical columns in the underlying 37 // parquet file if it is a nested type. 38 // 39 // ColIndex is only populated (not -1) when it is a leaf column. 
40 type SchemaField struct { 41 Field *arrow.Field 42 Children []SchemaField 43 ColIndex int 44 LevelInfo file.LevelInfo 45 } 46 47 // IsLeaf returns true if the SchemaField is a leaf column, ie: ColIndex != -1 48 func (s *SchemaField) IsLeaf() bool { return s.ColIndex != -1 } 49 50 // SchemaManifest represents a full manifest for mapping a Parquet schema 51 // to an arrow Schema. 52 type SchemaManifest struct { 53 descr *schema.Schema 54 OriginSchema *arrow.Schema 55 SchemaMeta *arrow.Metadata 56 57 ColIndexToField map[int]*SchemaField 58 ChildToParent map[*SchemaField]*SchemaField 59 Fields []SchemaField 60 } 61 62 // GetColumnField returns the corresponding Field for a given column index. 63 func (sm *SchemaManifest) GetColumnField(index int) (*SchemaField, error) { 64 if field, ok := sm.ColIndexToField[index]; ok { 65 return field, nil 66 } 67 return nil, xerrors.Errorf("Column Index %d not found in schema manifest", index) 68 } 69 70 // GetParent gets the parent field for a given field if it is a nested column, otherwise 71 // returns nil if there is no parent field. 72 func (sm *SchemaManifest) GetParent(field *SchemaField) *SchemaField { 73 if p, ok := sm.ChildToParent[field]; ok { 74 return p 75 } 76 return nil 77 } 78 79 // GetFieldIndices coalesces a list of field indices (relative to the equivalent arrow::Schema) which 80 // correspond to the column root (first node below the parquet schema's root group) of 81 // each leaf referenced in column_indices. 82 // 83 // For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3]) 84 // the roots are `a` and `i` (return=[0,2]). 
85 // 86 // root 87 // -- a <------ 88 // -- -- b | | 89 // -- -- -- c | 90 // -- -- -- d | 91 // -- -- -- -- e 92 // -- f 93 // -- -- g 94 // -- -- -- h 95 // -- i <--- 96 // -- -- j | 97 // -- -- -- k 98 func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) { 99 added := make(map[int]bool) 100 ret := make([]int, 0) 101 102 for _, idx := range indices { 103 if idx < 0 || idx >= sm.descr.NumColumns() { 104 return nil, xerrors.Errorf("column index %d is not valid", idx) 105 } 106 107 fieldNode := sm.descr.ColumnRoot(idx) 108 fieldIdx := sm.descr.Root().FieldIndexByField(fieldNode) 109 if fieldIdx == -1 { 110 return nil, xerrors.Errorf("column index %d is not valid", idx) 111 } 112 113 if _, ok := added[fieldIdx]; !ok { 114 ret = append(ret, fieldIdx) 115 added[fieldIdx] = true 116 } 117 } 118 return ret, nil 119 } 120 121 func arrowTimestampToLogical(typ *arrow.TimestampType, unit arrow.TimeUnit) schema.LogicalType { 122 utc := typ.TimeZone == "" || typ.TimeZone == "UTC" 123 124 // for forward compatibility reasons, and because there's no other way 125 // to signal to old readers that values are timestamps, we force 126 // the convertedtype field to be set to the corresponding TIMESTAMP_* value. 
127 // this does cause some ambiguity as parquet readers have not been consistent 128 // about the interpretation of TIMESTAMP_* values as being utc-normalized 129 // see ARROW-5878 130 var scunit schema.TimeUnitType 131 switch unit { 132 case arrow.Millisecond: 133 scunit = schema.TimeUnitMillis 134 case arrow.Microsecond: 135 scunit = schema.TimeUnitMicros 136 case arrow.Nanosecond: 137 scunit = schema.TimeUnitNanos 138 case arrow.Second: 139 // no equivalent in parquet 140 return schema.NoLogicalType{} 141 } 142 143 return schema.NewTimestampLogicalTypeForce(utc, scunit) 144 } 145 146 func getTimestampMeta(typ *arrow.TimestampType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, error) { 147 coerce := arrprops.coerceTimestamps 148 target := typ.Unit 149 if coerce { 150 target = arrprops.coerceTimestampUnit 151 } 152 153 // user is explicitly asking for int96, no logical type 154 if arrprops.timestampAsInt96 && target == arrow.Nanosecond { 155 return parquet.Types.Int96, schema.NoLogicalType{}, nil 156 } 157 158 physical := parquet.Types.Int64 159 logicalType := arrowTimestampToLogical(typ, target) 160 161 // user is explicitly asking for timestamp data to be converted to the specified 162 // units (target) via coercion 163 if coerce { 164 if props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4 { 165 switch target { 166 case arrow.Millisecond, arrow.Microsecond: 167 case arrow.Nanosecond, arrow.Second: 168 return physical, nil, xerrors.Errorf("parquet version %s files can only coerce arrow timestamps to millis or micros", props.Version()) 169 } 170 } else if target == arrow.Second { 171 return physical, nil, xerrors.Errorf("parquet version %s files can only coerce arrow timestampts to millis, micros or nanos", props.Version()) 172 } 173 return physical, logicalType, nil 174 } 175 176 // the user implicitly wants timestamp data to retain its original time units 177 // however the converted type 
field used to indicate logical types for parquet 178 // version <=2.4 fields, does not allow for nanosecond time units and so nanos 179 // must be coerced to micros 180 if (props.Version() == parquet.V1_0 || props.Version() == parquet.V2_4) && typ.Unit == arrow.Nanosecond { 181 logicalType = arrowTimestampToLogical(typ, arrow.Microsecond) 182 return physical, logicalType, nil 183 } 184 185 // the user implicitly wants timestamp data to retain it's original time units, 186 // however the arrow seconds time unit cannot be represented in parquet, so must 187 // be coerced to milliseconds 188 if typ.Unit == arrow.Second { 189 logicalType = arrowTimestampToLogical(typ, arrow.Millisecond) 190 } 191 192 return physical, logicalType, nil 193 } 194 195 // DecimalSize returns the minimum number of bytes necessary to represent a decimal 196 // with the requested precision. 197 // 198 // Taken from the Apache Impala codebase. The comments next to the return values 199 // are the maximum value that can be represented in 2's complement with the returned 200 // number of bytes 201 func DecimalSize(precision int32) int32 { 202 if precision < 1 { 203 panic("precision must be >= 1") 204 } 205 206 // generated in python with: 207 // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8)) 208 // >>> [-1] + [decimal_size(i) for i in range(1, 77)] 209 var byteblock = [...]int32{ 210 -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9, 211 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17, 212 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, 213 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 214 } 215 216 if precision <= 76 { 217 return byteblock[precision] 218 } 219 return int32(math.Ceil(float64(precision)/8.0)*math.Log2(10) + 1) 220 } 221 222 func repFromNullable(isnullable bool) parquet.Repetition { 223 if isnullable { 224 return parquet.Repetitions.Optional 225 } 226 return 
parquet.Repetitions.Required 227 } 228 229 func structToNode(typ *arrow.StructType, name string, nullable bool, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { 230 if len(typ.Fields()) == 0 { 231 return nil, xerrors.Errorf("cannot write struct type '%s' with no children field to parquet. Consider adding a dummy child", name) 232 } 233 234 children := make(schema.FieldList, 0, len(typ.Fields())) 235 for _, f := range typ.Fields() { 236 n, err := fieldToNode(f.Name, f, props, arrprops) 237 if err != nil { 238 return nil, err 239 } 240 children = append(children, n) 241 } 242 243 return schema.NewGroupNode(name, repFromNullable(nullable), children, -1) 244 } 245 246 func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { 247 var ( 248 logicalType schema.LogicalType = schema.NoLogicalType{} 249 typ parquet.Type 250 repType = repFromNullable(field.Nullable) 251 length = -1 252 precision = -1 253 scale = -1 254 err error 255 ) 256 257 switch field.Type.ID() { 258 case arrow.NULL: 259 typ = parquet.Types.Int32 260 logicalType = &schema.NullLogicalType{} 261 if repType != parquet.Repetitions.Optional { 262 return nil, xerrors.New("nulltype arrow field must be nullable") 263 } 264 case arrow.BOOL: 265 typ = parquet.Types.Boolean 266 case arrow.UINT8: 267 typ = parquet.Types.Int32 268 logicalType = schema.NewIntLogicalType(8, false) 269 case arrow.INT8: 270 typ = parquet.Types.Int32 271 logicalType = schema.NewIntLogicalType(8, true) 272 case arrow.UINT16: 273 typ = parquet.Types.Int32 274 logicalType = schema.NewIntLogicalType(16, false) 275 case arrow.INT16: 276 typ = parquet.Types.Int32 277 logicalType = schema.NewIntLogicalType(16, true) 278 case arrow.UINT32: 279 typ = parquet.Types.Int32 280 logicalType = schema.NewIntLogicalType(32, false) 281 case arrow.INT32: 282 typ = parquet.Types.Int32 283 logicalType = schema.NewIntLogicalType(32, true) 284 
case arrow.UINT64: 285 typ = parquet.Types.Int64 286 logicalType = schema.NewIntLogicalType(64, false) 287 case arrow.INT64: 288 typ = parquet.Types.Int64 289 logicalType = schema.NewIntLogicalType(64, true) 290 case arrow.FLOAT32: 291 typ = parquet.Types.Float 292 case arrow.FLOAT64: 293 typ = parquet.Types.Double 294 case arrow.STRING: 295 logicalType = schema.StringLogicalType{} 296 fallthrough 297 case arrow.BINARY: 298 typ = parquet.Types.ByteArray 299 case arrow.FIXED_SIZE_BINARY: 300 typ = parquet.Types.FixedLenByteArray 301 length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth 302 case arrow.DECIMAL: 303 typ = parquet.Types.FixedLenByteArray 304 dectype := field.Type.(*arrow.Decimal128Type) 305 precision = int(dectype.Precision) 306 scale = int(dectype.Scale) 307 length = int(DecimalSize(int32(precision))) 308 logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale)) 309 case arrow.DATE32: 310 typ = parquet.Types.Int32 311 logicalType = schema.DateLogicalType{} 312 case arrow.DATE64: 313 typ = parquet.Types.Int64 314 logicalType = schema.NewTimestampLogicalType(true, schema.TimeUnitMillis) 315 case arrow.TIMESTAMP: 316 typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops) 317 if err != nil { 318 return nil, err 319 } 320 case arrow.TIME32: 321 typ = parquet.Types.Int32 322 logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis) 323 case arrow.TIME64: 324 typ = parquet.Types.Int64 325 timeType := field.Type.(*arrow.Time64Type) 326 if timeType.Unit == arrow.Nanosecond { 327 logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos) 328 } else { 329 logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros) 330 } 331 case arrow.STRUCT: 332 return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops) 333 case arrow.FIXED_SIZE_LIST, arrow.LIST: 334 var elem arrow.DataType 335 if lt, ok := field.Type.(*arrow.ListType); ok { 336 elem = 
lt.Elem() 337 } else { 338 elem = field.Type.(*arrow.FixedSizeListType).Elem() 339 } 340 341 child, err := fieldToNode(name, arrow.Field{Name: name, Type: elem, Nullable: true}, props, arrprops) 342 if err != nil { 343 return nil, err 344 } 345 346 return schema.ListOf(child, repFromNullable(field.Nullable), -1) 347 case arrow.DICTIONARY: 348 // parquet has no dictionary type, dictionary is encoding, not schema level 349 return nil, xerrors.New("not implemented yet") 350 case arrow.EXTENSION: 351 return nil, xerrors.New("not implemented yet") 352 case arrow.MAP: 353 mapType := field.Type.(*arrow.MapType) 354 keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops) 355 if err != nil { 356 return nil, err 357 } 358 359 valueNode, err := fieldToNode("value", mapType.ItemField(), props, arrprops) 360 if err != nil { 361 return nil, err 362 } 363 364 if arrprops.noMapLogicalType { 365 keyval := schema.FieldList{keyNode, valueNode} 366 keyvalNode, err := schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, keyval, -1) 367 if err != nil { 368 return nil, err 369 } 370 return schema.NewGroupNode(field.Name, repFromNullable(field.Nullable), schema.FieldList{ 371 keyvalNode, 372 }, -1) 373 } 374 return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1) 375 default: 376 return nil, xerrors.New("not implemented yet") 377 } 378 379 return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata)) 380 } 381 382 const fieldIDKey = "PARQUET:field_id" 383 384 func fieldIDFromMeta(m arrow.Metadata) int32 { 385 if m.Len() == 0 { 386 return -1 387 } 388 389 key := m.FindKey(fieldIDKey) 390 if key < 0 { 391 return -1 392 } 393 394 id, err := strconv.ParseInt(m.Values()[key], 10, 32) 395 if err != nil { 396 return -1 397 } 398 399 if id < 0 { 400 return -1 401 } 402 403 return int32(id) 404 } 405 406 // ToParquet generates a Parquet Schema from an arrow Schema using the given properties 
to make 407 // decisions when determining the logical/physical types of the columns. 408 func ToParquet(sc *arrow.Schema, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*schema.Schema, error) { 409 if props == nil { 410 props = parquet.NewWriterProperties() 411 } 412 413 nodes := make(schema.FieldList, 0, len(sc.Fields())) 414 for _, f := range sc.Fields() { 415 n, err := fieldToNode(f.Name, f, props, arrprops) 416 if err != nil { 417 return nil, err 418 } 419 nodes = append(nodes, n) 420 } 421 422 root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, nodes, -1) 423 return schema.NewSchema(root), err 424 } 425 426 type schemaTree struct { 427 manifest *SchemaManifest 428 429 schema *schema.Schema 430 props *ArrowReadProperties 431 } 432 433 func (s schemaTree) LinkParent(child, parent *SchemaField) { 434 s.manifest.ChildToParent[child] = parent 435 } 436 437 func (s schemaTree) RecordLeaf(leaf *SchemaField) { 438 s.manifest.ColIndexToField[leaf.ColIndex] = leaf 439 } 440 441 func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) { 442 switch log.BitWidth() { 443 case 8: 444 if log.IsSigned() { 445 return arrow.PrimitiveTypes.Int8, nil 446 } 447 return arrow.PrimitiveTypes.Uint8, nil 448 case 16: 449 if log.IsSigned() { 450 return arrow.PrimitiveTypes.Int16, nil 451 } 452 return arrow.PrimitiveTypes.Uint16, nil 453 case 32: 454 if log.IsSigned() { 455 return arrow.PrimitiveTypes.Int32, nil 456 } 457 return arrow.PrimitiveTypes.Uint32, nil 458 case 64: 459 if log.IsSigned() { 460 return arrow.PrimitiveTypes.Int64, nil 461 } 462 return arrow.PrimitiveTypes.Uint64, nil 463 default: 464 return nil, xerrors.New("invalid logical type for int32") 465 } 466 } 467 468 func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) { 469 if logical.TimeUnit() == schema.TimeUnitMillis { 470 return arrow.FixedWidthTypes.Time32ms, nil 471 } 472 473 return nil, xerrors.New(logical.String() + " cannot annotate a time32") 
474 } 475 476 func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) { 477 switch logical.TimeUnit() { 478 case schema.TimeUnitMicros: 479 return arrow.FixedWidthTypes.Time64us, nil 480 case schema.TimeUnitNanos: 481 return arrow.FixedWidthTypes.Time64ns, nil 482 default: 483 return nil, xerrors.New(logical.String() + " cannot annotate int64") 484 } 485 } 486 487 func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) { 488 tz := "UTC" 489 if logical.IsFromConvertedType() { 490 tz = "" 491 } 492 493 switch logical.TimeUnit() { 494 case schema.TimeUnitMillis: 495 return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Millisecond}, nil 496 case schema.TimeUnitMicros: 497 return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Microsecond}, nil 498 case schema.TimeUnitNanos: 499 return &arrow.TimestampType{TimeZone: tz, Unit: arrow.Nanosecond}, nil 500 default: 501 return nil, xerrors.New("Unrecognized unit in timestamp logical type " + logical.String()) 502 } 503 } 504 505 func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) { 506 switch logtype := logical.(type) { 507 case schema.NoLogicalType: 508 return arrow.PrimitiveTypes.Int32, nil 509 case *schema.TimeLogicalType: 510 return arrowTime32(logtype) 511 case *schema.DecimalLogicalType: 512 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 513 case *schema.IntLogicalType: 514 return arrowInt(logtype) 515 case schema.DateLogicalType: 516 return arrow.FixedWidthTypes.Date32, nil 517 default: 518 return nil, xerrors.New(logical.String() + " cannot annotate int32") 519 } 520 } 521 522 func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) { 523 if logical.IsNone() { 524 return arrow.PrimitiveTypes.Int64, nil 525 } 526 527 switch logtype := logical.(type) { 528 case *schema.IntLogicalType: 529 return arrowInt(logtype) 530 case *schema.DecimalLogicalType: 531 return &arrow.Decimal128Type{Precision: 
logtype.Precision(), Scale: logtype.Scale()}, nil 532 case *schema.TimeLogicalType: 533 return arrowTime64(logtype) 534 case *schema.TimestampLogicalType: 535 return arrowTimestamp(logtype) 536 default: 537 return nil, xerrors.New(logical.String() + " cannot annotate int64") 538 } 539 } 540 541 func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) { 542 switch logtype := logical.(type) { 543 case schema.StringLogicalType: 544 return arrow.BinaryTypes.String, nil 545 case *schema.DecimalLogicalType: 546 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 547 case schema.NoLogicalType, 548 schema.EnumLogicalType, 549 schema.JSONLogicalType, 550 schema.BSONLogicalType: 551 return arrow.BinaryTypes.Binary, nil 552 default: 553 return nil, xerrors.New("unhandled logicaltype " + logical.String() + " for byte_array") 554 } 555 } 556 557 func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) { 558 switch logtype := logical.(type) { 559 case *schema.DecimalLogicalType: 560 return &arrow.Decimal128Type{Precision: logtype.Precision(), Scale: logtype.Scale()}, nil 561 case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType: 562 return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil 563 default: 564 return nil, xerrors.New("unhandled logical type " + logical.String() + " for fixed-length byte array") 565 } 566 } 567 568 func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) { 569 if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) { 570 return arrow.Null, nil 571 } 572 573 switch physical { 574 case parquet.Types.Boolean: 575 return arrow.FixedWidthTypes.Boolean, nil 576 case parquet.Types.Int32: 577 return arrowFromInt32(logical) 578 case parquet.Types.Int64: 579 return arrowFromInt64(logical) 580 case parquet.Types.Int96: 581 return arrow.FixedWidthTypes.Timestamp_ns, nil 582 case 
parquet.Types.Float: 583 return arrow.PrimitiveTypes.Float32, nil 584 case parquet.Types.Double: 585 return arrow.PrimitiveTypes.Float64, nil 586 case parquet.Types.ByteArray: 587 return arrowFromByteArray(logical) 588 case parquet.Types.FixedLenByteArray: 589 return arrowFromFLBA(logical, typeLen) 590 default: 591 return nil, xerrors.New("invalid physical column type") 592 } 593 } 594 595 func populateLeaf(colIndex int, field *arrow.Field, currentLevels file.LevelInfo, ctx *schemaTree, parent *SchemaField, out *SchemaField) { 596 out.Field = field 597 out.ColIndex = colIndex 598 out.LevelInfo = currentLevels 599 ctx.RecordLeaf(out) 600 ctx.LinkParent(out, parent) 601 } 602 603 func listToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 604 if n.NumFields() != 1 { 605 return xerrors.New("LIST groups must have only 1 child") 606 } 607 608 if n.RepetitionType() == parquet.Repetitions.Repeated { 609 return xerrors.New("LIST groups must not be repeated") 610 } 611 612 currentLevels.Increment(n) 613 614 out.Children = make([]SchemaField, n.NumFields()) 615 ctx.LinkParent(out, parent) 616 ctx.LinkParent(&out.Children[0], out) 617 618 listNode := n.Field(0) 619 if listNode.RepetitionType() != parquet.Repetitions.Repeated { 620 return xerrors.New("non-repeated nodes in a list group are not supported") 621 } 622 623 repeatedAncestorDef := currentLevels.IncrementRepeated() 624 if listNode.Type() == schema.Group { 625 // Resolve 3-level encoding 626 // 627 // required/optional group name=whatever { 628 // repeated group name=list { 629 // required/optional TYPE item; 630 // } 631 // } 632 // 633 // yields list<item: TYPE ?nullable> ?nullable 634 // 635 // We distinguish the special case that we have 636 // 637 // required/optional group name=whatever { 638 // repeated group name=array or $SOMETHING_tuple { 639 // required/optional TYPE item; 640 // } 641 // } 642 // 643 // In this latter case, the inner type of 
the list should be a struct 644 // rather than a primitive value 645 // 646 // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable 647 // Special case mentioned in the format spec: 648 // If the name is array or ends in _tuple, this should be a list of struct 649 // even for single child elements. 650 listGroup := listNode.(*schema.GroupNode) 651 if listGroup.NumFields() == 1 && (listGroup.Name() == "array" || strings.HasSuffix(listGroup.Name(), "_tuple")) { 652 // list of primitive type 653 if err := groupToStructField(listGroup, currentLevels, ctx, out, &out.Children[0]); err != nil { 654 return err 655 } 656 } else { 657 if err := nodeToSchemaField(listGroup.Field(0), currentLevels, ctx, out, &out.Children[0]); err != nil { 658 return err 659 } 660 } 661 } else { 662 // Two-level list encoding 663 // 664 // required/optional group LIST { 665 // repeated TYPE; 666 // } 667 primitiveNode := listNode.(*schema.PrimitiveNode) 668 colIndex := ctx.schema.ColumnIndexByNode(primitiveNode) 669 arrowType, err := getArrowType(primitiveNode.PhysicalType(), primitiveNode.LogicalType(), primitiveNode.TypeLength()) 670 if err != nil { 671 return err 672 } 673 674 itemField := arrow.Field{Name: listNode.Name(), Type: arrowType, Nullable: false, Metadata: createFieldMeta(int(listNode.FieldID()))} 675 populateLeaf(colIndex, &itemField, currentLevels, ctx, out, &out.Children[0]) 676 } 677 678 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), 679 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))} 680 out.LevelInfo = currentLevels 681 // At this point current levels contains the def level for this list, 682 // we need to reset to the prior parent. 
683 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef 684 return nil 685 } 686 687 func groupToStructField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 688 arrowFields := make([]arrow.Field, 0, n.NumFields()) 689 out.Children = make([]SchemaField, n.NumFields()) 690 691 for i := 0; i < n.NumFields(); i++ { 692 if err := nodeToSchemaField(n.Field(i), currentLevels, ctx, out, &out.Children[i]); err != nil { 693 return err 694 } 695 arrowFields = append(arrowFields, *out.Children[i].Field) 696 } 697 698 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(arrowFields...), 699 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, Metadata: createFieldMeta(int(n.FieldID()))} 700 out.LevelInfo = currentLevels 701 return nil 702 } 703 704 func mapToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 705 if n.NumFields() != 1 { 706 return xerrors.New("MAP group must have exactly 1 child") 707 } 708 if n.RepetitionType() == parquet.Repetitions.Repeated { 709 return xerrors.New("MAP groups must not be repeated") 710 } 711 712 keyvalueNode := n.Field(0) 713 if keyvalueNode.RepetitionType() != parquet.Repetitions.Repeated { 714 return xerrors.New("Non-repeated keyvalue group in MAP group is not supported") 715 } 716 717 if keyvalueNode.Type() != schema.Group { 718 return xerrors.New("keyvalue node must be a group") 719 } 720 721 kvgroup := keyvalueNode.(*schema.GroupNode) 722 if kvgroup.NumFields() != 1 && kvgroup.NumFields() != 2 { 723 return xerrors.Errorf("keyvalue node group must have exactly 1 or 2 child elements, Found %d", kvgroup.NumFields()) 724 } 725 726 keyNode := kvgroup.Field(0) 727 if keyNode.RepetitionType() != parquet.Repetitions.Required { 728 return xerrors.New("MAP keys must be required") 729 } 730 731 // Arrow doesn't support 1 column maps (i.e. Sets). 
The options are to either 732 // make the values column nullable, or process the map as a list. We choose the latter 733 // as it is simpler. 734 if kvgroup.NumFields() == 1 { 735 return listToSchemaField(n, currentLevels, ctx, parent, out) 736 } 737 738 currentLevels.Increment(n) 739 repeatedAncestorDef := currentLevels.IncrementRepeated() 740 out.Children = make([]SchemaField, 1) 741 742 kvfield := &out.Children[0] 743 kvfield.Children = make([]SchemaField, 2) 744 745 keyField := &kvfield.Children[0] 746 valueField := &kvfield.Children[1] 747 748 ctx.LinkParent(out, parent) 749 ctx.LinkParent(kvfield, out) 750 ctx.LinkParent(keyField, kvfield) 751 ctx.LinkParent(valueField, kvfield) 752 753 // required/optional group name=whatever { 754 // repeated group name=key_values{ 755 // required TYPE key; 756 // required/optional TYPE value; 757 // } 758 // } 759 // 760 761 if err := nodeToSchemaField(keyNode, currentLevels, ctx, kvfield, keyField); err != nil { 762 return err 763 } 764 if err := nodeToSchemaField(kvgroup.Field(1), currentLevels, ctx, kvfield, valueField); err != nil { 765 return err 766 } 767 768 kvfield.Field = &arrow.Field{Name: n.Name(), Type: arrow.StructOf(*keyField.Field, *valueField.Field), 769 Nullable: false, Metadata: createFieldMeta(int(kvgroup.FieldID()))} 770 771 kvfield.LevelInfo = currentLevels 772 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.MapOf(keyField.Field.Type, valueField.Field.Type), 773 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 774 Metadata: createFieldMeta(int(n.FieldID()))} 775 out.LevelInfo = currentLevels 776 // At this point current levels contains the def level for this map, 777 // we need to reset to the prior parent. 
778 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef 779 return nil 780 } 781 782 func groupToSchemaField(n *schema.GroupNode, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 783 if n.LogicalType().Equals(schema.NewListLogicalType()) { 784 return listToSchemaField(n, currentLevels, ctx, parent, out) 785 } else if n.LogicalType().Equals(schema.MapLogicalType{}) { 786 return mapToSchemaField(n, currentLevels, ctx, parent, out) 787 } 788 789 if n.RepetitionType() == parquet.Repetitions.Repeated { 790 // Simple repeated struct 791 // 792 // repeated group $NAME { 793 // r/o TYPE[0] f0 794 // r/o TYPE[1] f1 795 // } 796 out.Children = make([]SchemaField, 1) 797 repeatedAncestorDef := currentLevels.IncrementRepeated() 798 if err := groupToStructField(n, currentLevels, ctx, out, &out.Children[0]); err != nil { 799 return err 800 } 801 802 out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOf(out.Children[0].Field.Type), Nullable: false, 803 Metadata: createFieldMeta(int(n.FieldID()))} 804 ctx.LinkParent(&out.Children[0], out) 805 out.LevelInfo = currentLevels 806 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDef 807 return nil 808 } 809 810 currentLevels.Increment(n) 811 return groupToStructField(n, currentLevels, ctx, parent, out) 812 } 813 814 func createFieldMeta(fieldID int) arrow.Metadata { 815 return arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{strconv.Itoa(fieldID)}) 816 } 817 818 func nodeToSchemaField(n schema.Node, currentLevels file.LevelInfo, ctx *schemaTree, parent, out *SchemaField) error { 819 ctx.LinkParent(out, parent) 820 821 if n.Type() == schema.Group { 822 return groupToSchemaField(n.(*schema.GroupNode), currentLevels, ctx, parent, out) 823 } 824 825 // Either a normal flat primitive type, or a list type encoded with 1-level 826 // list encoding. 
Note that the 3-level encoding is the form recommended by 827 // the parquet specification, but technically we can have either 828 // 829 // required/optional $TYPE $FIELD_NAME 830 // 831 // or 832 // 833 // repeated $TYPE $FIELD_NAME 834 835 primitive := n.(*schema.PrimitiveNode) 836 colIndex := ctx.schema.ColumnIndexByNode(primitive) 837 arrowType, err := getArrowType(primitive.PhysicalType(), primitive.LogicalType(), primitive.TypeLength()) 838 if err != nil { 839 return err 840 } 841 842 if primitive.RepetitionType() == parquet.Repetitions.Repeated { 843 // one-level list encoding e.g. a: repeated int32; 844 repeatedAncestorDefLevel := currentLevels.IncrementRepeated() 845 out.Children = make([]SchemaField, 1) 846 child := arrow.Field{Name: primitive.Name(), Type: arrowType, Nullable: false} 847 populateLeaf(colIndex, &child, currentLevels, ctx, out, &out.Children[0]) 848 out.Field = &arrow.Field{Name: primitive.Name(), Type: arrow.ListOf(child.Type), Nullable: false, 849 Metadata: createFieldMeta(int(primitive.FieldID()))} 850 out.LevelInfo = currentLevels 851 out.LevelInfo.RepeatedAncestorDefLevel = repeatedAncestorDefLevel 852 return nil 853 } 854 855 currentLevels.Increment(n) 856 populateLeaf(colIndex, &arrow.Field{Name: n.Name(), Type: arrowType, 857 Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 858 Metadata: createFieldMeta(int(n.FieldID()))}, 859 currentLevels, ctx, parent, out) 860 return nil 861 } 862 863 func getOriginSchema(meta metadata.KeyValueMetadata, mem memory.Allocator) (*arrow.Schema, error) { 864 if meta == nil { 865 return nil, nil 866 } 867 868 const arrowSchemaKey = "ARROW:schema" 869 serialized := meta.FindValue(arrowSchemaKey) 870 if serialized == nil { 871 return nil, nil 872 } 873 874 decoded, err := base64.RawStdEncoding.DecodeString(*serialized) 875 if err != nil { 876 return nil, err 877 } 878 879 return flight.DeserializeSchema(decoded, mem) 880 } 881 882 func getNestedFactory(origin, inferred arrow.DataType) 
func(fieldList []arrow.Field) arrow.DataType {
	switch inferred.ID() {
	case arrow.STRUCT:
		if origin.ID() == arrow.STRUCT {
			return func(list []arrow.Field) arrow.DataType {
				return arrow.StructOf(list...)
			}
		}
	case arrow.LIST:
		switch origin.ID() {
		case arrow.LIST:
			return func(list []arrow.Field) arrow.DataType {
				return arrow.ListOf(list[0].Type)
			}
		case arrow.FIXED_SIZE_LIST:
			sz := origin.(*arrow.FixedSizeListType).Len()
			return func(list []arrow.Field) arrow.DataType {
				return arrow.FixedSizeListOf(sz, list[0].Type)
			}
		}
	case arrow.MAP:
		if origin.ID() == arrow.MAP {
			return func(list []arrow.Field) arrow.DataType {
				valType := list[0].Type.(*arrow.StructType)
				return arrow.MapOf(valType.Field(0).Type, valType.Field(1).Type)
			}
		}
	}
	return nil
}

// applyOriginalStorageMetadata reconciles inferred (the field derived from the
// parquet schema) with origin (the matching field of the original arrow schema).
//
// Depending on origin's type it:
//   - struct: when the child counts match and getNestedFactory yields a factory,
//     recurses into every child and rebuilds the struct type if anything changed;
//   - list / fixed-size list / map: when inferred has exactly one child and a
//     factory exists, recurses into that child and rebuilds the type if anything
//     changed;
//   - timestamp: when inferred is also a timestamp, adopts origin's type if the
//     units match, the inferred time zone is "UTC" and origin carries a time zone.
//
// Afterwards, if origin has metadata it is merged into inferred.Field.Metadata;
// keys already present on the inferred field take precedence.
//
// It returns whether inferred was (or is considered) modified. Extension, union
// and dictionary origin types are rejected with an error before inferred is
// touched; on any error the returned bool is false.
func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
	nchildren := len(inferred.Children)
	switch origin.Type.ID() {
	case arrow.EXTENSION, arrow.SPARSE_UNION, arrow.DENSE_UNION, arrow.DICTIONARY:
		// Return right away so the metadata merge below cannot mutate inferred
		// (and report modified == true) on a failing call.
		return false, xerrors.New("unimplemented type")
	case arrow.STRUCT:
		typ := origin.Type.(*arrow.StructType)
		if nchildren != len(typ.Fields()) {
			return false, nil
		}

		factory := getNestedFactory(typ, inferred.Field.Type)
		if factory == nil {
			return false, nil
		}

		modified = typ.ID() != inferred.Field.Type.ID()
		for idx := range inferred.Children {
			childMod, childErr := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
			if childErr != nil {
				return false, childErr
			}
			modified = modified || childMod
		}
		if modified {
			// Rebuild the struct type from the (possibly updated) child fields.
			children := make([]arrow.Field, len(inferred.Children))
			for idx := range inferred.Children {
				children[idx] = *inferred.Children[idx].Field
			}
			inferred.Field.Type = factory(children)
		}
	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.MAP:
		if nchildren != 1 {
			return false, nil
		}
		factory := getNestedFactory(origin.Type, inferred.Field.Type)
		if factory == nil {
			return false, nil
		}

		modified = origin.Type.ID() != inferred.Field.Type.ID()
		var childModified bool
		switch typ := origin.Type.(type) {
		case *arrow.FixedSizeListType:
			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0])
		case *arrow.ListType:
			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.Elem()}, &inferred.Children[0])
		case *arrow.MapType:
			childModified, err = applyOriginalMetadata(arrow.Field{Type: typ.ValueType()}, &inferred.Children[0])
		}
		if err != nil {
			// Do not leak a partially computed "modified" alongside an error.
			return false, err
		}
		modified = modified || childModified
		if modified {
			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
		}
	case arrow.TIMESTAMP:
		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
			return false, nil
		}

		tsOtype := origin.Type.(*arrow.TimestampType)
		tsInfType := inferred.Field.Type.(*arrow.TimestampType)

		// if the unit is the same and the data is tz-aware, then set the original time zone
		// since parquet has no native storage of timezones
		if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
			inferred.Field.Type = origin.Type
		}
		// Reported as modified whether or not the type was swapped above.
		modified = true
	}

	if origin.HasMetadata() {
		meta := origin.Metadata
		if inferred.Field.HasMetadata() {
			originKeys, originVals := meta.Keys(), meta.Values()
			inferredKeys, inferredVals := inferred.Field.Metadata.Keys(), inferred.Field.Metadata.Values()

			// Origin entries go in first so that the inferred field's own
			// entries overwrite them on key collisions.
			final := make(map[string]string, len(originKeys)+len(inferredKeys))
			for idx, k := range originKeys {
				final[k] = originVals[idx]
			}
			for idx, k := range inferredKeys {
				final[k] = inferredVals[idx]
			}
			inferred.Field.Metadata = arrow.MetadataFrom(final)
		} else {
			inferred.Field.Metadata = meta
		}
		modified = true
	}

	return modified, nil
}

// applyOriginalMetadata applies the original arrow field's type information and
// metadata to the inferred schema field. Extension types are rejected with an
// error; every other type is delegated to applyOriginalStorageMetadata.
func applyOriginalMetadata(origin arrow.Field, inferred *SchemaField) (bool, error) {
	if origin.Type.ID() == arrow.EXTENSION {
		return false, xerrors.New("extension types not implemented yet")
	}

	return applyOriginalStorageMetadata(origin, inferred)
}

// NewSchemaManifest creates a manifest for mapping a parquet schema to a given arrow schema.
//
// The metadata passed in should be the file level key value metadata from the parquet file or nil.
// If the ARROW:schema was in the metadata, then it is utilized to determine types.
//
// An origin schema whose number of top-level fields differs from the parquet
// root's field count is discarded. An error is returned if the origin schema
// cannot be obtained, if a parquet node cannot be converted to a schema field,
// or if applying the origin field's metadata fails.
func NewSchemaManifest(sc *schema.Schema, meta metadata.KeyValueMetadata, props *ArrowReadProperties) (*SchemaManifest, error) {
	var ctx schemaTree
	ctx.manifest = &SchemaManifest{
		ColIndexToField: make(map[int]*SchemaField),
		ChildToParent:   make(map[*SchemaField]*SchemaField),
		descr:           sc,
		Fields:          make([]SchemaField, sc.Root().NumFields()),
	}
	ctx.props = props
	ctx.schema = sc

	var err error
	ctx.manifest.OriginSchema, err = getOriginSchema(meta, memory.DefaultAllocator)
	if err != nil {
		return nil, err
	}

	// if original schema is not compatible with the parquet schema, ignore it
	if ctx.manifest.OriginSchema != nil && len(ctx.manifest.OriginSchema.Fields()) != sc.Root().NumFields() {
		ctx.manifest.OriginSchema = nil
	}

	for idx := range ctx.manifest.Fields {
		// Fill the manifest entry in place so pointers to it stay valid.
		field := &ctx.manifest.Fields[idx]
		if err := nodeToSchemaField(sc.Root().Field(idx), file.LevelInfo{NullSlotUsage: 1}, &ctx, nil, field); err != nil {
			return nil, err
		}

		if ctx.manifest.OriginSchema != nil {
			if _, err := applyOriginalMetadata(ctx.manifest.OriginSchema.Field(idx), field); err != nil {
				return nil, err
			}
		}
	}
	return ctx.manifest, nil
}

// FromParquet generates an arrow Schema from a provided Parquet Schema
//
// The schema's fields are the top-level fields of the manifest built by
// NewSchemaManifest. When an origin arrow schema was recovered, its schema-level
// metadata is attached to the result; otherwise the manifest's SchemaMeta is used.
func FromParquet(sc *schema.Schema, props *ArrowReadProperties, kv metadata.KeyValueMetadata) (*arrow.Schema, error) {
	manifest, err := NewSchemaManifest(sc, kv, props)
	if err != nil {
		return nil, err
	}

	fields := make([]arrow.Field, len(manifest.Fields))
	for idx := range manifest.Fields {
		fields[idx] = *manifest.Fields[idx].Field
	}

	if manifest.OriginSchema != nil {
		meta := manifest.OriginSchema.Metadata()
		return arrow.NewSchema(fields, &meta), nil
	}
	return arrow.NewSchema(fields, manifest.SchemaMeta), nil
}