github.com/apache/arrow/go/v14@v14.0.1/parquet/schema/reflection.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package schema 18 19 import ( 20 "fmt" 21 "reflect" 22 "strconv" 23 "strings" 24 25 "github.com/apache/arrow/go/v14/parquet" 26 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 27 "golang.org/x/xerrors" 28 ) 29 30 type taggedInfo struct { 31 Name string 32 33 Type parquet.Type 34 KeyType parquet.Type 35 ValueType parquet.Type 36 37 Length int32 38 KeyLength int32 39 ValueLength int32 40 41 Scale int32 42 KeyScale int32 43 ValueScale int32 44 45 Precision int32 46 KeyPrecision int32 47 ValuePrecision int32 48 49 FieldID int32 50 KeyFieldID int32 51 ValueFieldID int32 52 53 RepetitionType parquet.Repetition 54 ValueRepetition parquet.Repetition 55 56 Converted ConvertedType 57 KeyConverted ConvertedType 58 ValueConverted ConvertedType 59 60 LogicalFields map[string]string 61 KeyLogicalFields map[string]string 62 ValueLogicalFields map[string]string 63 64 LogicalType LogicalType 65 KeyLogicalType LogicalType 66 ValueLogicalType LogicalType 67 68 Exclude bool 69 } 70 71 func (t *taggedInfo) CopyForKey() (ret taggedInfo) { 72 ret = *t 73 ret.Type = t.KeyType 74 ret.Length = t.KeyLength 75 ret.Scale = t.KeyScale 76 ret.Precision = t.KeyPrecision 77 ret.FieldID = t.KeyFieldID 78 ret.RepetitionType = parquet.Repetitions.Required 79 ret.Converted = t.KeyConverted 80 ret.LogicalType = t.KeyLogicalType 81 return 82 } 83 84 func (t *taggedInfo) CopyForValue() (ret taggedInfo) { 85 ret = *t 86 ret.Type = t.ValueType 87 ret.Length = t.ValueLength 88 ret.Scale = t.ValueScale 89 ret.Precision = t.ValuePrecision 90 ret.FieldID = t.ValueFieldID 91 ret.RepetitionType = t.ValueRepetition 92 ret.Converted = t.ValueConverted 93 ret.LogicalType = t.ValueLogicalType 94 return 95 } 96 97 func (t *taggedInfo) UpdateLogicalTypes() { 98 processLogicalType := func(fields map[string]string, precision, scale int32) LogicalType { 99 t, ok := fields["type"] 100 if !ok { 101 return NoLogicalType{} 102 } 103 104 switch strings.ToLower(t) { 105 case "string": 106 return StringLogicalType{} 107 case "map": 108 return MapLogicalType{} 109 case "list": 110 return ListLogicalType{} 111 case "enum": 112 return EnumLogicalType{} 113 case "decimal": 114 if v, ok := fields["precision"]; ok { 115 precision = int32FromType(v) 116 } 117 if v, ok := fields["scale"]; ok { 118 scale = int32FromType(v) 119 } 120 return NewDecimalLogicalType(precision, scale) 121 case "date": 122 return DateLogicalType{} 123 case "time": 124 unit, ok := fields["unit"] 125 if !ok { 126 panic("must specify unit for time logical type") 127 } 128 adjustedToUtc, ok := fields["isadjustedutc"] 129 if !ok { 130 adjustedToUtc = "true" 131 } 132 return NewTimeLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(strings.ToLower(unit))) 133 case "timestamp": 134 unit, ok := fields["unit"] 135 if !ok { 136 panic("must specify unit for time logical type") 137 } 138 adjustedToUtc, ok := fields["isadjustedutc"] 139 if !ok { 140 adjustedToUtc = "true" 141 } 142 return NewTimestampLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(unit)) 143 case "integer": 144 width, ok := fields["bitwidth"] 145 if !ok { 146 panic("must specify bitwidth if explicitly setting integer logical type") 147 } 148 signed, ok := fields["signed"] 149 if !ok { 150 signed = "true" 151 } 152 153 return NewIntLogicalType(int8(int32FromType(width)), boolFromStr(signed)) 154 case "null": 155 return NullLogicalType{} 156 case "json": 157 return JSONLogicalType{} 158 case "bson": 159 return BSONLogicalType{} 160 case "uuid": 161 return UUIDLogicalType{} 162 default: 163 panic(fmt.Errorf("invalid logical type specified: %s", t)) 164 } 165 } 166 167 t.LogicalType = processLogicalType(t.LogicalFields, t.Precision, t.Scale) 168 t.KeyLogicalType = processLogicalType(t.KeyLogicalFields, t.KeyPrecision, t.KeyScale) 169 t.ValueLogicalType = processLogicalType(t.ValueLogicalFields, t.ValuePrecision, t.ValueScale) 170 } 171 172 func newTaggedInfo() taggedInfo { 173 return taggedInfo{ 174 Type: parquet.Types.Undefined, 175 KeyType: parquet.Types.Undefined, 176 ValueType: parquet.Types.Undefined, 177 RepetitionType: parquet.Repetitions.Undefined, 178 ValueRepetition: parquet.Repetitions.Undefined, 179 Converted: ConvertedTypes.NA, 180 KeyConverted: ConvertedTypes.NA, 181 ValueConverted: ConvertedTypes.NA, 182 FieldID: -1, 183 KeyFieldID: -1, 184 ValueFieldID: -1, 185 LogicalFields: make(map[string]string), 186 KeyLogicalFields: make(map[string]string), 187 ValueLogicalFields: make(map[string]string), 188 LogicalType: NoLogicalType{}, 189 KeyLogicalType: NoLogicalType{}, 190 ValueLogicalType: NoLogicalType{}, 191 Exclude: false, 192 } 193 } 194 195 var int32FromType = func(v string) int32 { 196 val, err := strconv.Atoi(v) 197 if err != nil { 198 panic(err) 199 } 200 return int32(val) 201 } 202 203 var boolFromStr = func(v string) bool { 204 val, err := strconv.ParseBool(v) 205 if err != nil { 206 panic(err) 207 } 208 return val 209 } 210 211 func infoFromTags(f reflect.StructTag) *taggedInfo { 212 typeFromStr := func(v string) parquet.Type { 213 t, err := format.TypeFromString(strings.ToUpper(v)) 214 if err != nil { 215 panic(fmt.Errorf("invalid type specified: %s", v)) 216 } 217 return parquet.Type(t) 218 } 219 220 repFromStr := func(v string) parquet.Repetition { 221 r, err := format.FieldRepetitionTypeFromString(strings.ToUpper(v)) 222 if err != nil { 223 panic(err) 224 } 225 return parquet.Repetition(r) 226 } 227 228 convertedFromStr := func(v string) ConvertedType { 229 c, err := format.ConvertedTypeFromString(strings.ToUpper(v)) 230 if err != nil { 231 panic(err) 232 } 233 return ConvertedType(c) 234 } 235 236 if ptags, ok := f.Lookup("parquet"); ok { 237 info := newTaggedInfo() 238 if ptags == "-" { 239 info.Exclude = true 240 return &info 241 } 242 for _, tag := range strings.Split(strings.Replace(ptags, "\t", "", -1), ",") { 243 tag = strings.TrimSpace(tag) 244 kv := strings.SplitN(tag, "=", 2) 245 key := strings.TrimSpace(strings.ToLower(kv[0])) 246 value := strings.TrimSpace(kv[1]) 247 248 switch key { 249 case "name": 250 info.Name = value 251 case "type": 252 info.Type = typeFromStr(value) 253 case "keytype": 254 info.KeyType = typeFromStr(value) 255 case "valuetype": 256 info.ValueType = typeFromStr(value) 257 case "length": 258 info.Length = int32FromType(value) 259 case "keylength": 260 info.KeyLength = int32FromType(value) 261 case "valuelength": 262 info.ValueLength = int32FromType(value) 263 case "scale": 264 info.Scale = int32FromType(value) 265 case "keyscale": 266 info.KeyScale = int32FromType(value) 267 case "valuescale": 268 info.ValueScale = int32FromType(value) 269 case "precision": 270 info.Precision = int32FromType(value) 271 case "keyprecision": 272 info.KeyPrecision = int32FromType(value) 273 case "valueprecision": 274 info.ValuePrecision = int32FromType(value) 275 case "fieldid": 276 info.FieldID = int32FromType(value) 277 case "keyfieldid": 278 info.KeyFieldID = int32FromType(value) 279 case "valuefieldid": 280 info.ValueFieldID = int32FromType(value) 281 case "repetition": 282 info.RepetitionType = repFromStr(value) 283 case "valuerepetition": 284 info.ValueRepetition = repFromStr(value) 285 case "converted": 286 info.Converted = convertedFromStr(value) 287 case "keyconverted": 288 info.KeyConverted = convertedFromStr(value) 289 case "valueconverted": 290 info.ValueConverted = convertedFromStr(value) 291 case "logical": 292 info.LogicalFields["type"] = value 293 case "keylogical": 294 info.KeyLogicalFields["type"] = value 295 case "valuelogical": 296 info.ValueLogicalFields["type"] = value 297 default: 298 switch { 299 case strings.HasPrefix(key, "logical."): 300 info.LogicalFields[strings.TrimPrefix(key, "logical.")] = value 301 case strings.HasPrefix(key, "keylogical."): 302 info.KeyLogicalFields[strings.TrimPrefix(key, "keylogical.")] = value 303 case strings.HasPrefix(key, "valuelogical."): 304 info.ValueLogicalFields[strings.TrimPrefix(key, "valuelogical.")] = value 305 } 306 } 307 } 308 info.UpdateLogicalTypes() 309 return &info 310 } 311 return nil 312 } 313 314 // typeToNode recurseively converts a physical type and the tag info into parquet Nodes 315 // 316 // to avoid having to propagate errors up potentially high numbers of recursive calls 317 // we use panics and then recover in the public function NewSchemaFromStruct so that a 318 // failure very far down the stack quickly unwinds. 319 func typeToNode(name string, typ reflect.Type, repType parquet.Repetition, info *taggedInfo) Node { 320 // set up our default values for everything 321 var ( 322 converted = ConvertedTypes.None 323 logical LogicalType = NoLogicalType{} 324 fieldID = int32(-1) 325 physical = parquet.Types.Undefined 326 typeLen = 0 327 precision = 0 328 scale = 0 329 ) 330 if info != nil { // we have struct tag info to process 331 fieldID = info.FieldID 332 if info.Converted != ConvertedTypes.NA { 333 converted = info.Converted 334 } 335 logical = info.LogicalType 336 physical = info.Type 337 typeLen = int(info.Length) 338 precision = int(info.Precision) 339 scale = int(info.Scale) 340 341 if info.Name != "" { 342 name = info.Name 343 } 344 if info.RepetitionType != parquet.Repetitions.Undefined { 345 repType = info.RepetitionType 346 } 347 } 348 349 // simplify the logic by switching based on the reflection Kind 350 switch typ.Kind() { 351 case reflect.Map: 352 // a map must have a logical type of MAP or have no tag for logical type in which case 353 // we assume MAP logical type. 354 if !logical.IsNone() && !logical.Equals(MapLogicalType{}) { 355 panic("cannot set logical type to something other than map for a map") 356 } 357 358 infoCopy := newTaggedInfo() 359 if info != nil { // populate any value specific tags to propagate for the value type 360 infoCopy = info.CopyForValue() 361 } 362 363 // create the node for the value type of the map 364 value := typeToNode("value", typ.Elem(), parquet.Repetitions.Required, &infoCopy) 365 if info != nil { // change our copy to now use the key specific tags if they exist 366 infoCopy = info.CopyForKey() 367 } 368 369 // create the node for the key type of the map 370 key := typeToNode("key", typ.Key(), parquet.Repetitions.Required, &infoCopy) 371 if key.RepetitionType() != parquet.Repetitions.Required { // key cannot be optional 372 panic("key type of map must be Required") 373 } 374 return Must(MapOf(name, key, value, repType, fieldID)) 375 case reflect.Struct: 376 // structs are Group nodes 377 fields := make(FieldList, 0) 378 for i := 0; i < typ.NumField(); i++ { 379 f := typ.Field(i) 380 tags := infoFromTags(f.Tag) 381 if tags == nil || !tags.Exclude { 382 fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, tags)) 383 } 384 } 385 // group nodes don't have a physical type 386 if physical != parquet.Types.Undefined { 387 panic("cannot specify custom type on struct") 388 } 389 // group nodes don't have converted or logical types 390 if converted != ConvertedTypes.None { 391 panic("cannot specify converted types for a struct") 392 } 393 if !logical.IsNone() { 394 panic("cannot specify logicaltype for a struct") 395 } 396 return Must(NewGroupNode(name, repType, fields, fieldID)) 397 case reflect.Ptr: // if we encounter a pointer create a node for the type it points to, but mark it as optional 398 return typeToNode(name, typ.Elem(), parquet.Repetitions.Optional, info) 399 case reflect.Array: 400 // arrays are repeated or fixed size 401 if typ == reflect.TypeOf(parquet.Int96{}) { 402 return NewInt96Node(name, repType, fieldID) 403 } 404 405 if typ.Elem() == reflect.TypeOf(byte(0)) { // something like [12]byte translates to FixedLenByteArray with length 12 406 if physical == parquet.Types.Undefined { 407 physical = parquet.Types.FixedLenByteArray 408 } 409 if typeLen == 0 { // if there was no type length specified in the tag, use the length of the type. 410 typeLen = typ.Len() 411 } 412 if !logical.IsNone() { 413 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID)) 414 } 415 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID)) 416 } 417 fallthrough // if it's not a fixed len byte array type, then just treat it like a slice 418 case reflect.Slice: 419 // for slices, we default to treating them as lists unless the repetition type is set to REPEATED or they are 420 // a bytearray/fixedlenbytearray 421 switch { 422 case repType == parquet.Repetitions.Repeated: 423 return typeToNode(name, typ.Elem(), parquet.Repetitions.Repeated, info) 424 case physical == parquet.Types.FixedLenByteArray || physical == parquet.Types.ByteArray: 425 if typ.Elem() != reflect.TypeOf(byte(0)) { 426 panic("slice with physical type ByteArray or FixedLenByteArray must be []byte") 427 } 428 fallthrough 429 case typ.Elem() == reflect.TypeOf(byte(0)): 430 if physical == parquet.Types.Undefined { 431 physical = parquet.Types.ByteArray 432 } 433 if !logical.IsNone() { 434 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID)) 435 } 436 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID)) 437 default: 438 var elemInfo *taggedInfo 439 if info != nil { 440 elemInfo = &taggedInfo{} 441 *elemInfo = info.CopyForValue() 442 } 443 444 if !logical.IsNone() && !logical.Equals(ListLogicalType{}) { 445 panic("slice must either be repeated or a List type") 446 } 447 if converted != ConvertedTypes.None && converted != ConvertedTypes.List { 448 panic("slice must either be repeated or a List type") 449 } 450 return Must(ListOf(typeToNode(name, typ.Elem(), parquet.Repetitions.Required, elemInfo), repType, fieldID)) 451 } 452 case reflect.String: 453 // strings are byte arrays or fixedlen byte array 454 t := parquet.Types.ByteArray 455 switch physical { 456 case parquet.Types.Undefined, parquet.Types.ByteArray: 457 case parquet.Types.FixedLenByteArray: 458 t = parquet.Types.FixedLenByteArray 459 default: 460 panic("string fields should be of type bytearray or fixedlenbytearray only") 461 } 462 463 if !logical.IsNone() { 464 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, t, typeLen, fieldID)) 465 } 466 467 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, t, converted, typeLen, precision, scale, fieldID)) 468 case reflect.Int, reflect.Int32, reflect.Int8, reflect.Int16, reflect.Int64: 469 // handle integer types, default to setting the corresponding logical type 470 ptyp := parquet.Types.Int32 471 if typ.Bits() == 64 { 472 ptyp = parquet.Types.Int64 473 } 474 475 if physical != parquet.Types.Undefined { 476 ptyp = physical 477 } 478 479 if !logical.IsNone() { 480 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID)) 481 } 482 483 bitwidth := int8(typ.Bits()) 484 if physical != parquet.Types.Undefined { 485 if ptyp == parquet.Types.Int32 { 486 bitwidth = 32 487 } else if ptyp == parquet.Types.Int64 { 488 bitwidth = 64 489 } 490 } 491 492 if converted != ConvertedTypes.None { 493 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID)) 494 } 495 496 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, true), ptyp, 0, fieldID)) 497 case reflect.Uint, reflect.Uint32, reflect.Uint8, reflect.Uint16, reflect.Uint64: 498 // handle unsigned integer types and default to the corresponding logical type for it. 499 ptyp := parquet.Types.Int32 500 if typ.Bits() == 64 { 501 ptyp = parquet.Types.Int64 502 } 503 504 if physical != parquet.Types.Undefined { 505 ptyp = physical 506 } 507 508 if !logical.IsNone() { 509 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID)) 510 } 511 512 bitwidth := int8(typ.Bits()) 513 if physical != parquet.Types.Undefined { 514 if ptyp == parquet.Types.Int32 { 515 bitwidth = 32 516 } else if ptyp == parquet.Types.Int64 { 517 bitwidth = 64 518 } 519 } 520 521 if converted != ConvertedTypes.None { 522 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID)) 523 } 524 525 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, false), ptyp, 0, fieldID)) 526 case reflect.Bool: 527 if !logical.IsNone() { 528 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Boolean, typeLen, fieldID)) 529 } 530 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Boolean, converted, typeLen, precision, scale, fieldID)) 531 case reflect.Float32: 532 if !logical.IsNone() { 533 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Float, typeLen, fieldID)) 534 } 535 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Float, converted, typeLen, precision, scale, fieldID)) 536 case reflect.Float64: 537 if !logical.IsNone() { 538 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Double, typeLen, fieldID)) 539 } 540 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Double, converted, typeLen, precision, scale, fieldID)) 541 } 542 return nil 543 } 544 545 // NewSchemaFromStruct generates a schema from an object type via reflection of 546 // the type and reading struct tags for "parquet". 547 // 548 // Rules 549 // 550 // Everything defaults to Required repetition, unless otherwise specified. 551 // Pointer types become Optional repetition. 552 // Arrays and Slices become logical List types unless using the tag `repetition=repeated`. 553 // 554 // A length specified byte field (like [5]byte) becomes a fixed_len_byte_array of that length 555 // unless otherwise specified by tags. 556 // 557 // string and []byte both become ByteArray unless otherwise specified. 558 // 559 // Integer types will default to having a logical type of the appropriate bit width 560 // and signedness rather than having no logical type, ie: an int8 will become an int32 561 // node with logical type Int(bitWidth=8, signed=true). 562 // 563 // Structs will become group nodes with the fields of the struct as the fields of the group, 564 // recursively creating the nodes. 565 // 566 // maps will become appropriate Map structures in the schema of the defined key and values. 567 // 568 // Available Tags 569 // 570 // name: by default the node will have the same name as the field, this tag let's you specify a name 571 // 572 // type: Specify the physical type instead of using the field type 573 // 574 // length: specify the type length of the node, only relevant for fixed_len_byte_array 575 // 576 // scale: specify the scale for a decimal field 577 // 578 // precision: specify the precision for a decimal field 579 // 580 // fieldid: specify the field ID for that node, defaults to -1 which means it is not set in the parquet file. 581 // 582 // repetition: specify the repetition as something other than what is determined by the type 583 // 584 // converted: specify the Converted Type of the field 585 // 586 // logical: specify the logical type of the field, if using decimal then the scale and precision 587 // will be determined by the precision and scale fields, or by the logical.precision / logical.scale fields 588 // with the logical. prefixed versions taking precedence. For Time or Timestamp logical types, 589 // use logical.unit=<millis|micros|nanos> and logical.isadjustedutc=<true|false> to set those. Unit is required 590 // isadjustedutc defaults to true. For Integer logical type, use logical.bitwidth and logical.signed to specify 591 // those values, with bitwidth being required, and signed defaulting to true. 592 // 593 // All tags other than name can use a prefix of "key<tagname>=<value>" to refer to the type of the key for a map 594 // and "value<tagname>=<value>" to refer to the value type of a map or the element of a list (such as the type of a slice) 595 func NewSchemaFromStruct(obj interface{}) (sc *Schema, err error) { 596 ot := reflect.TypeOf(obj) 597 if ot.Kind() == reflect.Ptr { 598 ot = ot.Elem() 599 } 600 601 // typeToNode uses panics to fail fast / fail early instead of propagating 602 // errors up recursive stacks. so we recover here and return it as an error 603 defer func() { 604 if r := recover(); r != nil { 605 sc = nil 606 switch x := r.(type) { 607 case string: 608 err = xerrors.New(x) 609 case error: 610 err = x 611 default: 612 err = xerrors.New("unknown panic") 613 } 614 } 615 }() 616 617 root := typeToNode(ot.Name(), ot, parquet.Repetitions.Repeated, nil) 618 return NewSchema(root.(*GroupNode)), nil 619 } 620 621 var parquetTypeToReflect = map[parquet.Type]reflect.Type{ 622 parquet.Types.Boolean: reflect.TypeOf(true), 623 parquet.Types.Int32: reflect.TypeOf(int32(0)), 624 parquet.Types.Int64: reflect.TypeOf(int64(0)), 625 parquet.Types.Float: reflect.TypeOf(float32(0)), 626 parquet.Types.Double: reflect.TypeOf(float64(0)), 627 parquet.Types.Int96: reflect.TypeOf(parquet.Int96{}), 628 parquet.Types.ByteArray: reflect.TypeOf(parquet.ByteArray{}), 629 parquet.Types.FixedLenByteArray: reflect.TypeOf(parquet.FixedLenByteArray{}), 630 } 631 632 func typeFromNode(n Node) reflect.Type { 633 switch n.Type() { 634 case Primitive: 635 typ := parquetTypeToReflect[n.(*PrimitiveNode).PhysicalType()] 636 // if a bytearray field is annoted as a String logical type or a UTF8 converted type 637 // then use a string instead of parquet.ByteArray / parquet.FixedLenByteArray which are []byte 638 if n.LogicalType().Equals(StringLogicalType{}) || n.ConvertedType() == ConvertedTypes.UTF8 { 639 typ = reflect.TypeOf(string("")) 640 } 641 642 if n.RepetitionType() == parquet.Repetitions.Optional { 643 typ = reflect.PtrTo(typ) 644 } else if n.RepetitionType() == parquet.Repetitions.Repeated { 645 typ = reflect.SliceOf(typ) 646 } 647 648 return typ 649 case Group: 650 gnode := n.(*GroupNode) 651 switch gnode.ConvertedType() { 652 case ConvertedTypes.List: 653 // According to the Parquet Spec, a list should always be a 3-level structure 654 // 655 // <list-repetition> group <name> (LIST) { 656 // repeated group list { 657 // <element-repetition> <element-type> element; 658 // } 659 // } 660 // 661 // Outer-most level must be a group annotated with LIST containing a single field named "list". 662 // this level must be only optional (if the list is nullable) or required 663 // Middle level, named list, must be repeated group with a single field named "element" 664 // "element" field is the lists element type and repetition, which should be only required or optional 665 666 if gnode.fields.Len() != 1 { 667 panic("invalid list node, should have exactly 1 child.") 668 } 669 670 if gnode.fields[0].RepetitionType() != parquet.Repetitions.Repeated { 671 panic("invalid list node, child should be repeated") 672 } 673 674 // it is required that the repeated group of elements is named "list" and it's element 675 // field is named "element", however existing data may not use this so readers shouldn't 676 // enforce them as errors 677 // 678 // Rules for backward compatibility from the parquet spec: 679 // 680 // 1) if the repeated field is not a group, then it's type is the element type and elements 681 // must be required. 682 // 2) if the repeated field is a group with multiple fields, then its type is the element type 683 // and elements must be required. 684 // 3) if the repeated field is a group with one field AND is named either "array" or uses the 685 // LIST-annotated group's name with "_tuple" suffix, then the repeated type is the element 686 // type and the elements must be required. 687 // 4) otherwise, the repeated field's type is the element type with the repeated field's repetition 688 689 elemMustBeRequired := false 690 addSlice := false 691 var elemType reflect.Type 692 elemNode := gnode.fields[0] 693 switch { 694 case elemNode.Type() == Primitive, 695 elemNode.(*GroupNode).fields.Len() > 1, 696 elemNode.(*GroupNode).fields.Len() == 1 && (elemNode.Name() == "array" || elemNode.Name() == gnode.Name()+"_tuple"): 697 elemMustBeRequired = true 698 elemType = typeFromNode(elemNode) 699 default: 700 addSlice = true 701 elemType = typeFromNode(elemNode.(*GroupNode).fields[0]) 702 } 703 704 if elemMustBeRequired && elemType.Kind() == reflect.Ptr { 705 elemType = elemType.Elem() 706 } 707 if addSlice { 708 elemType = reflect.SliceOf(elemType) 709 } 710 if gnode.RepetitionType() == parquet.Repetitions.Optional { 711 elemType = reflect.PtrTo(elemType) 712 } 713 return elemType 714 case ConvertedTypes.Map, ConvertedTypes.MapKeyValue: 715 // According to the Parquet Spec, the outer-most level should be 716 // a group containing a single field named "key_value" with repetition 717 // either optional or required for whether or not the map is nullable. 718 // 719 // The key_value middle level *must* be a repeated group with a "key" field 720 // and *optionally* a "value" field 721 // 722 // the "key" field *must* be required and must always exist 723 // 724 // the "value" field can be required or optional or omitted. 725 // 726 // <map-repetition> group <name> (MAP) { 727 // repeated group key_value { 728 // required <key-type> key; 729 // <value-repetition> <value-type> value; 730 // } 731 // } 732 733 if gnode.fields.Len() != 1 { 734 panic("invalid map node, should have exactly 1 child") 735 } 736 737 if gnode.fields[0].Type() != Group { 738 panic("invalid map node, child should be a group node") 739 } 740 741 // that said, this may not be used in existing data and should not be 742 // enforced as errors when reading. 743 // 744 // some data may also incorrectly use MAP_KEY_VALUE instead of MAP 745 // 746 // so any group with MAP_KEY_VALUE that is not contained inside of a "MAP" 747 // group, should be considered equivalent to being a MAP group itself. 748 // 749 // in addition, the fields may not be called "key" and "value" in existing 750 // data, and as such should not be enforced as errors when reading. 751 752 keyval := gnode.fields[0].(*GroupNode) 753 754 keyIndex := keyval.FieldIndexByName("key") 755 if keyIndex == -1 { 756 keyIndex = 0 // use first child if there is no child named "key" 757 } 758 759 keyType := typeFromNode(keyval.fields[keyIndex]) 760 if keyType.Kind() == reflect.Ptr { 761 keyType = keyType.Elem() 762 } 763 // can't use a []byte as a key for a map, so use string 764 if keyType == reflect.TypeOf(parquet.ByteArray{}) || keyType == reflect.TypeOf(parquet.FixedLenByteArray{}) { 765 keyType = reflect.TypeOf(string("")) 766 } 767 768 // if the value node is omitted, then consider this a "set" and make it a 769 // map[key-type]bool 770 valType := reflect.TypeOf(true) 771 if keyval.fields.Len() > 1 { 772 valIndex := keyval.FieldIndexByName("value") 773 if valIndex == -1 { 774 valIndex = 1 // use second child if there is no child named "value" 775 } 776 777 valType = typeFromNode(keyval.fields[valIndex]) 778 } 779 780 mapType := reflect.MapOf(keyType, valType) 781 if gnode.RepetitionType() == parquet.Repetitions.Optional { 782 mapType = reflect.PtrTo(mapType) 783 } 784 return mapType 785 default: 786 fields := []reflect.StructField{} 787 for _, f := range gnode.fields { 788 fields = append(fields, reflect.StructField{ 789 Name: f.Name(), 790 Type: typeFromNode(f), 791 PkgPath: "parquet", 792 }) 793 } 794 795 structType := reflect.StructOf(fields) 796 if gnode.RepetitionType() == parquet.Repetitions.Repeated { 797 return reflect.SliceOf(structType) 798 } 799 if gnode.RepetitionType() == parquet.Repetitions.Optional { 800 return reflect.PtrTo(structType) 801 } 802 return structType 803 } 804 } 805 panic("what happened?") 806 } 807 808 // NewStructFromSchema generates a struct type as a reflect.Type from the schema 809 // by using the appropriate physical types and making things either pointers or slices 810 // based on whether they are repeated/optional/required. It does not use the logical 811 // or converted types to change the physical storage so that it is more efficient to use 812 // the resulting type for reading without having to do conversions. 813 // 814 // It will use maps for map types and slices for list types, but otherwise ignores the 815 // converted and logical types of the nodes. Group nodes that are not List or Map will 816 // be nested structs. 817 func NewStructFromSchema(sc *Schema) (t reflect.Type, err error) { 818 defer func() { 819 if r := recover(); r != nil { 820 t = nil 821 switch x := r.(type) { 822 case string: 823 err = xerrors.New(x) 824 case error: 825 err = x 826 default: 827 err = xerrors.New("unknown panic") 828 } 829 } 830 }() 831 832 t = typeFromNode(sc.root) 833 if t.Kind() == reflect.Slice || t.Kind() == reflect.Ptr { 834 return t.Elem(), nil 835 } 836 return 837 }