github.com/apache/arrow/go/v10@v10.0.1/parquet/schema/reflection.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package schema 18 19 import ( 20 "fmt" 21 "reflect" 22 "strconv" 23 "strings" 24 25 "github.com/apache/arrow/go/v10/parquet" 26 format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet" 27 "golang.org/x/xerrors" 28 ) 29 30 type taggedInfo struct { 31 Name string 32 33 Type parquet.Type 34 KeyType parquet.Type 35 ValueType parquet.Type 36 37 Length int32 38 KeyLength int32 39 ValueLength int32 40 41 Scale int32 42 KeyScale int32 43 ValueScale int32 44 45 Precision int32 46 KeyPrecision int32 47 ValuePrecision int32 48 49 FieldID int32 50 KeyFieldID int32 51 ValueFieldID int32 52 53 RepetitionType parquet.Repetition 54 ValueRepetition parquet.Repetition 55 56 Converted ConvertedType 57 KeyConverted ConvertedType 58 ValueConverted ConvertedType 59 60 LogicalFields map[string]string 61 KeyLogicalFields map[string]string 62 ValueLogicalFields map[string]string 63 64 LogicalType LogicalType 65 KeyLogicalType LogicalType 66 ValueLogicalType LogicalType 67 } 68 69 func (t *taggedInfo) CopyForKey() (ret taggedInfo) { 70 ret = *t 71 ret.Type = t.KeyType 72 ret.Length = t.KeyLength 73 ret.Scale = t.KeyScale 74 ret.Precision = t.KeyPrecision 75 ret.FieldID = t.KeyFieldID 76 ret.RepetitionType = parquet.Repetitions.Required 77 ret.Converted = t.KeyConverted 78 ret.LogicalType = t.KeyLogicalType 79 return 80 } 81 82 func (t *taggedInfo) CopyForValue() (ret taggedInfo) { 83 ret = *t 84 ret.Type = t.ValueType 85 ret.Length = t.ValueLength 86 ret.Scale = t.ValueScale 87 ret.Precision = t.ValuePrecision 88 ret.FieldID = t.ValueFieldID 89 ret.RepetitionType = t.ValueRepetition 90 ret.Converted = t.ValueConverted 91 ret.LogicalType = t.ValueLogicalType 92 return 93 } 94 95 func (t *taggedInfo) UpdateLogicalTypes() { 96 processLogicalType := func(fields map[string]string, precision, scale int32) LogicalType { 97 t, ok := fields["type"] 98 if !ok { 99 return NoLogicalType{} 100 } 101 102 switch strings.ToLower(t) { 103 case "string": 104 return StringLogicalType{} 105 case "map": 106 return MapLogicalType{} 107 case "list": 108 return ListLogicalType{} 109 case "enum": 110 return EnumLogicalType{} 111 case "decimal": 112 if v, ok := fields["precision"]; ok { 113 precision = int32FromType(v) 114 } 115 if v, ok := fields["scale"]; ok { 116 scale = int32FromType(v) 117 } 118 return NewDecimalLogicalType(precision, scale) 119 case "date": 120 return DateLogicalType{} 121 case "time": 122 unit, ok := fields["unit"] 123 if !ok { 124 panic("must specify unit for time logical type") 125 } 126 adjustedToUtc, ok := fields["isadjustedutc"] 127 if !ok { 128 adjustedToUtc = "true" 129 } 130 return NewTimeLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(strings.ToLower(unit))) 131 case "timestamp": 132 unit, ok := fields["unit"] 133 if !ok { 134 panic("must specify unit for time logical type") 135 } 136 adjustedToUtc, ok := fields["isadjustedutc"] 137 if !ok { 138 adjustedToUtc = "true" 139 } 140 return NewTimestampLogicalType(boolFromStr(adjustedToUtc), timeUnitFromString(unit)) 141 case "integer": 142 width, ok := fields["bitwidth"] 143 if !ok { 144 panic("must specify bitwidth if explicitly setting integer logical type") 145 } 146 signed, ok := fields["signed"] 147 if !ok { 148 signed = "true" 149 } 150 151 return NewIntLogicalType(int8(int32FromType(width)), boolFromStr(signed)) 152 case "null": 153 return NullLogicalType{} 154 case "json": 155 return JSONLogicalType{} 156 case "bson": 157 return BSONLogicalType{} 158 case "uuid": 159 return UUIDLogicalType{} 160 default: 161 panic(fmt.Errorf("invalid logical type specified: %s", t)) 162 } 163 } 164 165 t.LogicalType = processLogicalType(t.LogicalFields, t.Precision, t.Scale) 166 t.KeyLogicalType = processLogicalType(t.KeyLogicalFields, t.KeyPrecision, t.KeyScale) 167 t.ValueLogicalType = processLogicalType(t.ValueLogicalFields, t.ValuePrecision, t.ValueScale) 168 } 169 170 func newTaggedInfo() taggedInfo { 171 return taggedInfo{ 172 Type: parquet.Types.Undefined, 173 KeyType: parquet.Types.Undefined, 174 ValueType: parquet.Types.Undefined, 175 RepetitionType: parquet.Repetitions.Undefined, 176 ValueRepetition: parquet.Repetitions.Undefined, 177 Converted: ConvertedTypes.NA, 178 KeyConverted: ConvertedTypes.NA, 179 ValueConverted: ConvertedTypes.NA, 180 FieldID: -1, 181 KeyFieldID: -1, 182 ValueFieldID: -1, 183 LogicalFields: make(map[string]string), 184 KeyLogicalFields: make(map[string]string), 185 ValueLogicalFields: make(map[string]string), 186 LogicalType: NoLogicalType{}, 187 KeyLogicalType: NoLogicalType{}, 188 ValueLogicalType: NoLogicalType{}, 189 } 190 } 191 192 var int32FromType = func(v string) int32 { 193 val, err := strconv.Atoi(v) 194 if err != nil { 195 panic(err) 196 } 197 return int32(val) 198 } 199 200 var boolFromStr = func(v string) bool { 201 val, err := strconv.ParseBool(v) 202 if err != nil { 203 panic(err) 204 } 205 return val 206 } 207 208 func infoFromTags(f reflect.StructTag) *taggedInfo { 209 typeFromStr := func(v string) parquet.Type { 210 t, err := format.TypeFromString(strings.ToUpper(v)) 211 if err != nil { 212 panic(fmt.Errorf("invalid type specified: %s", v)) 213 } 214 return parquet.Type(t) 215 } 216 217 repFromStr := func(v string) parquet.Repetition { 218 r, err := format.FieldRepetitionTypeFromString(strings.ToUpper(v)) 219 if err != nil { 220 panic(err) 221 } 222 return parquet.Repetition(r) 223 } 224 225 convertedFromStr := func(v string) ConvertedType { 226 c, err := format.ConvertedTypeFromString(strings.ToUpper(v)) 227 if err != nil { 228 panic(err) 229 } 230 return ConvertedType(c) 231 } 232 233 if ptags, ok := f.Lookup("parquet"); ok { 234 info := newTaggedInfo() 235 for _, tag := range strings.Split(strings.Replace(ptags, "\t", "", -1), ",") { 236 tag = strings.TrimSpace(tag) 237 kv := strings.SplitN(tag, "=", 2) 238 key := strings.TrimSpace(strings.ToLower(kv[0])) 239 value := strings.TrimSpace(kv[1]) 240 241 switch key { 242 case "name": 243 info.Name = value 244 case "type": 245 info.Type = typeFromStr(value) 246 case "keytype": 247 info.KeyType = typeFromStr(value) 248 case "valuetype": 249 info.ValueType = typeFromStr(value) 250 case "length": 251 info.Length = int32FromType(value) 252 case "keylength": 253 info.KeyLength = int32FromType(value) 254 case "valuelength": 255 info.ValueLength = int32FromType(value) 256 case "scale": 257 info.Scale = int32FromType(value) 258 case "keyscale": 259 info.KeyScale = int32FromType(value) 260 case "valuescale": 261 info.ValueScale = int32FromType(value) 262 case "precision": 263 info.Precision = int32FromType(value) 264 case "keyprecision": 265 info.KeyPrecision = int32FromType(value) 266 case "valueprecision": 267 info.ValuePrecision = int32FromType(value) 268 case "fieldid": 269 info.FieldID = int32FromType(value) 270 case "keyfieldid": 271 info.KeyFieldID = int32FromType(value) 272 case "valuefieldid": 273 info.ValueFieldID = int32FromType(value) 274 case "repetition": 275 info.RepetitionType = repFromStr(value) 276 case "valuerepetition": 277 info.ValueRepetition = repFromStr(value) 278 case "converted": 279 info.Converted = convertedFromStr(value) 280 case "keyconverted": 281 info.KeyConverted = convertedFromStr(value) 282 case "valueconverted": 283 info.ValueConverted = convertedFromStr(value) 284 case "logical": 285 info.LogicalFields["type"] = value 286 case "keylogical": 287 info.KeyLogicalFields["type"] = value 288 case "valuelogical": 289 info.ValueLogicalFields["type"] = value 290 default: 291 switch { 292 case strings.HasPrefix(key, "logical."): 293 info.LogicalFields[strings.TrimPrefix(key, "logical.")] = value 294 case strings.HasPrefix(key, "keylogical."): 295 info.KeyLogicalFields[strings.TrimPrefix(key, "keylogical.")] = value 296 case strings.HasPrefix(key, "valuelogical."): 297 info.ValueLogicalFields[strings.TrimPrefix(key, "valuelogical.")] = value 298 } 299 } 300 } 301 info.UpdateLogicalTypes() 302 return &info 303 } 304 return nil 305 } 306 307 // typeToNode recurseively converts a physical type and the tag info into parquet Nodes 308 // 309 // to avoid having to propagate errors up potentially high numbers of recursive calls 310 // we use panics and then recover in the public function NewSchemaFromStruct so that a 311 // failure very far down the stack quickly unwinds. 312 func typeToNode(name string, typ reflect.Type, repType parquet.Repetition, info *taggedInfo) Node { 313 // set up our default values for everything 314 var ( 315 converted = ConvertedTypes.None 316 logical LogicalType = NoLogicalType{} 317 fieldID = int32(-1) 318 physical = parquet.Types.Undefined 319 typeLen = 0 320 precision = 0 321 scale = 0 322 ) 323 if info != nil { // we have struct tag info to process 324 fieldID = info.FieldID 325 if info.Converted != ConvertedTypes.NA { 326 converted = info.Converted 327 } 328 logical = info.LogicalType 329 physical = info.Type 330 typeLen = int(info.Length) 331 precision = int(info.Precision) 332 scale = int(info.Scale) 333 334 if info.Name != "" { 335 name = info.Name 336 } 337 if info.RepetitionType != parquet.Repetitions.Undefined { 338 repType = info.RepetitionType 339 } 340 } 341 342 // simplify the logic by switching based on the reflection Kind 343 switch typ.Kind() { 344 case reflect.Map: 345 // a map must have a logical type of MAP or have no tag for logical type in which case 346 // we assume MAP logical type. 347 if !logical.IsNone() && !logical.Equals(MapLogicalType{}) { 348 panic("cannot set logical type to something other than map for a map") 349 } 350 351 infoCopy := newTaggedInfo() 352 if info != nil { // populate any value specific tags to propagate for the value type 353 infoCopy = info.CopyForValue() 354 } 355 356 // create the node for the value type of the map 357 value := typeToNode("value", typ.Elem(), parquet.Repetitions.Required, &infoCopy) 358 if info != nil { // change our copy to now use the key specific tags if they exist 359 infoCopy = info.CopyForKey() 360 } 361 362 // create the node for the key type of the map 363 key := typeToNode("key", typ.Key(), parquet.Repetitions.Required, &infoCopy) 364 if key.RepetitionType() != parquet.Repetitions.Required { // key cannot be optional 365 panic("key type of map must be Required") 366 } 367 return Must(MapOf(name, key, value, repType, fieldID)) 368 case reflect.Struct: 369 // structs are Group nodes 370 fields := make(FieldList, 0) 371 for i := 0; i < typ.NumField(); i++ { 372 f := typ.Field(i) 373 374 fields = append(fields, typeToNode(f.Name, f.Type, parquet.Repetitions.Required, infoFromTags(f.Tag))) 375 } 376 // group nodes don't have a physical type 377 if physical != parquet.Types.Undefined { 378 panic("cannot specify custom type on struct") 379 } 380 // group nodes don't have converted or logical types 381 if converted != ConvertedTypes.None { 382 panic("cannot specify converted types for a struct") 383 } 384 if !logical.IsNone() { 385 panic("cannot specify logicaltype for a struct") 386 } 387 return Must(NewGroupNode(name, repType, fields, fieldID)) 388 case reflect.Ptr: // if we encounter a pointer create a node for the type it points to, but mark it as optional 389 return typeToNode(name, typ.Elem(), parquet.Repetitions.Optional, info) 390 case reflect.Array: 391 // arrays are repeated or fixed size 392 if typ == reflect.TypeOf(parquet.Int96{}) { 393 return NewInt96Node(name, repType, fieldID) 394 } 395 396 if typ.Elem() == reflect.TypeOf(byte(0)) { // something like [12]byte translates to FixedLenByteArray with length 12 397 if physical == parquet.Types.Undefined { 398 physical = parquet.Types.FixedLenByteArray 399 } 400 if typeLen == 0 { // if there was no type length specified in the tag, use the length of the type. 401 typeLen = typ.Len() 402 } 403 if !logical.IsNone() { 404 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID)) 405 } 406 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID)) 407 } 408 fallthrough // if it's not a fixed len byte array type, then just treat it like a slice 409 case reflect.Slice: 410 // for slices, we default to treating them as lists unless the repetition type is set to REPEATED or they are 411 // a bytearray/fixedlenbytearray 412 switch { 413 case repType == parquet.Repetitions.Repeated: 414 return typeToNode(name, typ.Elem(), parquet.Repetitions.Repeated, info) 415 case physical == parquet.Types.FixedLenByteArray || physical == parquet.Types.ByteArray: 416 if typ.Elem() != reflect.TypeOf(byte(0)) { 417 panic("slice with physical type ByteArray or FixedLenByteArray must be []byte") 418 } 419 fallthrough 420 case typ.Elem() == reflect.TypeOf(byte(0)): 421 if physical == parquet.Types.Undefined { 422 physical = parquet.Types.ByteArray 423 } 424 if !logical.IsNone() { 425 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, physical, typeLen, fieldID)) 426 } 427 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, physical, converted, typeLen, precision, scale, fieldID)) 428 default: 429 var elemInfo *taggedInfo 430 if info != nil { 431 elemInfo = &taggedInfo{} 432 *elemInfo = info.CopyForValue() 433 } 434 435 if !logical.IsNone() && !logical.Equals(ListLogicalType{}) { 436 panic("slice must either be repeated or a List type") 437 } 438 if converted != ConvertedTypes.None && converted != ConvertedTypes.List { 439 panic("slice must either be repeated or a List type") 440 } 441 return Must(ListOf(typeToNode(name, typ.Elem(), parquet.Repetitions.Required, elemInfo), repType, fieldID)) 442 } 443 case reflect.String: 444 // strings are byte arrays or fixedlen byte array 445 t := parquet.Types.ByteArray 446 switch physical { 447 case parquet.Types.Undefined, parquet.Types.ByteArray: 448 case parquet.Types.FixedLenByteArray: 449 t = parquet.Types.FixedLenByteArray 450 default: 451 panic("string fields should be of type bytearray or fixedlenbytearray only") 452 } 453 454 if !logical.IsNone() { 455 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, t, typeLen, fieldID)) 456 } 457 458 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, t, converted, typeLen, precision, scale, fieldID)) 459 case reflect.Int, reflect.Int32, reflect.Int8, reflect.Int16, reflect.Int64: 460 // handle integer types, default to setting the corresponding logical type 461 ptyp := parquet.Types.Int32 462 if typ.Bits() == 64 { 463 ptyp = parquet.Types.Int64 464 } 465 466 if physical != parquet.Types.Undefined { 467 ptyp = physical 468 } 469 470 if !logical.IsNone() { 471 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID)) 472 } 473 474 bitwidth := int8(typ.Bits()) 475 if physical != parquet.Types.Undefined { 476 if ptyp == parquet.Types.Int32 { 477 bitwidth = 32 478 } else if ptyp == parquet.Types.Int64 { 479 bitwidth = 64 480 } 481 } 482 483 if converted != ConvertedTypes.None { 484 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID)) 485 } 486 487 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, true), ptyp, 0, fieldID)) 488 case reflect.Uint, reflect.Uint32, reflect.Uint8, reflect.Uint16, reflect.Uint64: 489 // handle unsigned integer types and default to the corresponding logical type for it. 490 ptyp := parquet.Types.Int32 491 if typ.Bits() == 64 { 492 ptyp = parquet.Types.Int64 493 } 494 495 if physical != parquet.Types.Undefined { 496 ptyp = physical 497 } 498 499 if !logical.IsNone() { 500 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, ptyp, typeLen, fieldID)) 501 } 502 503 bitwidth := int8(typ.Bits()) 504 if physical != parquet.Types.Undefined { 505 if ptyp == parquet.Types.Int32 { 506 bitwidth = 32 507 } else if ptyp == parquet.Types.Int64 { 508 bitwidth = 64 509 } 510 } 511 512 if converted != ConvertedTypes.None { 513 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, ptyp, converted, 0, precision, scale, fieldID)) 514 } 515 516 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, NewIntLogicalType(bitwidth, false), ptyp, 0, fieldID)) 517 case reflect.Bool: 518 if !logical.IsNone() { 519 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Boolean, typeLen, fieldID)) 520 } 521 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Boolean, converted, typeLen, precision, scale, fieldID)) 522 case reflect.Float32: 523 if !logical.IsNone() { 524 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Float, typeLen, fieldID)) 525 } 526 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Float, converted, typeLen, precision, scale, fieldID)) 527 case reflect.Float64: 528 if !logical.IsNone() { 529 return MustPrimitive(NewPrimitiveNodeLogical(name, repType, logical, parquet.Types.Double, typeLen, fieldID)) 530 } 531 return MustPrimitive(NewPrimitiveNodeConverted(name, repType, parquet.Types.Double, converted, typeLen, precision, scale, fieldID)) 532 } 533 return nil 534 } 535 536 // NewSchemaFromStruct generates a schema from an object type via reflection of 537 // the type and reading struct tags for "parquet". 538 // 539 // Rules 540 // 541 // Everything defaults to Required repetition, unless otherwise specified. 542 // Pointer types become Optional repetition. 543 // Arrays and Slices become logical List types unless using the tag `repetition=repeated`. 544 // 545 // A length specified byte field (like [5]byte) becomes a fixed_len_byte_array of that length 546 // unless otherwise specified by tags. 547 // 548 // string and []byte both become ByteArray unless otherwise specified. 549 // 550 // Integer types will default to having a logical type of the appropriate bit width 551 // and signedness rather than having no logical type, ie: an int8 will become an int32 552 // node with logical type Int(bitWidth=8, signed=true). 553 // 554 // Structs will become group nodes with the fields of the struct as the fields of the group, 555 // recursively creating the nodes. 556 // 557 // maps will become appropriate Map structures in the schema of the defined key and values. 558 // 559 // Available Tags 560 // 561 // name: by default the node will have the same name as the field, this tag let's you specify a name 562 // 563 // type: Specify the physical type instead of using the field type 564 // 565 // length: specify the type length of the node, only relevant for fixed_len_byte_array 566 // 567 // scale: specify the scale for a decimal field 568 // 569 // precision: specify the precision for a decimal field 570 // 571 // fieldid: specify the field ID for that node, defaults to -1 which means it is not set in the parquet file. 572 // 573 // repetition: specify the repetition as something other than what is determined by the type 574 // 575 // converted: specify the Converted Type of the field 576 // 577 // logical: specify the logical type of the field, if using decimal then the scale and precision 578 // will be determined by the precision and scale fields, or by the logical.precision / logical.scale fields 579 // with the logical. prefixed versions taking precedence. For Time or Timestamp logical types, 580 // use logical.unit=<millis|micros|nanos> and logical.isadjustedutc=<true|false> to set those. Unit is required 581 // isadjustedutc defaults to true. For Integer logical type, use logical.bitwidth and logical.signed to specify 582 // those values, with bitwidth being required, and signed defaulting to true. 583 // 584 // All tags other than name can use a prefix of "key<tagname>=<value>" to refer to the type of the key for a map 585 // and "value<tagname>=<value>" to refer to the value type of a map or the element of a list (such as the type of a slice) 586 func NewSchemaFromStruct(obj interface{}) (sc *Schema, err error) { 587 ot := reflect.TypeOf(obj) 588 if ot.Kind() == reflect.Ptr { 589 ot = ot.Elem() 590 } 591 592 // typeToNode uses panics to fail fast / fail early instead of propagating 593 // errors up recursive stacks. so we recover here and return it as an error 594 defer func() { 595 if r := recover(); r != nil { 596 sc = nil 597 switch x := r.(type) { 598 case string: 599 err = xerrors.New(x) 600 case error: 601 err = x 602 default: 603 err = xerrors.New("unknown panic") 604 } 605 } 606 }() 607 608 root := typeToNode(ot.Name(), ot, parquet.Repetitions.Repeated, nil) 609 return NewSchema(root.(*GroupNode)), nil 610 } 611 612 var parquetTypeToReflect = map[parquet.Type]reflect.Type{ 613 parquet.Types.Boolean: reflect.TypeOf(true), 614 parquet.Types.Int32: reflect.TypeOf(int32(0)), 615 parquet.Types.Int64: reflect.TypeOf(int64(0)), 616 parquet.Types.Float: reflect.TypeOf(float32(0)), 617 parquet.Types.Double: reflect.TypeOf(float64(0)), 618 parquet.Types.Int96: reflect.TypeOf(parquet.Int96{}), 619 parquet.Types.ByteArray: reflect.TypeOf(parquet.ByteArray{}), 620 parquet.Types.FixedLenByteArray: reflect.TypeOf(parquet.FixedLenByteArray{}), 621 } 622 623 func typeFromNode(n Node) reflect.Type { 624 switch n.Type() { 625 case Primitive: 626 typ := parquetTypeToReflect[n.(*PrimitiveNode).PhysicalType()] 627 // if a bytearray field is annoted as a String logical type or a UTF8 converted type 628 // then use a string instead of parquet.ByteArray / parquet.FixedLenByteArray which are []byte 629 if n.LogicalType().Equals(StringLogicalType{}) || n.ConvertedType() == ConvertedTypes.UTF8 { 630 typ = reflect.TypeOf(string("")) 631 } 632 633 if n.RepetitionType() == parquet.Repetitions.Optional { 634 typ = reflect.PtrTo(typ) 635 } else if n.RepetitionType() == parquet.Repetitions.Repeated { 636 typ = reflect.SliceOf(typ) 637 } 638 639 return typ 640 case Group: 641 gnode := n.(*GroupNode) 642 switch gnode.ConvertedType() { 643 case ConvertedTypes.List: 644 // According to the Parquet Spec, a list should always be a 3-level structure 645 // 646 // <list-repetition> group <name> (LIST) { 647 // repeated group list { 648 // <element-repetition> <element-type> element; 649 // } 650 // } 651 // 652 // Outer-most level must be a group annotated with LIST containing a single field named "list". 653 // this level must be only optional (if the list is nullable) or required 654 // Middle level, named list, must be repeated group with a single field named "element" 655 // "element" field is the lists element type and repetition, which should be only required or optional 656 657 if gnode.fields.Len() != 1 { 658 panic("invalid list node, should have exactly 1 child.") 659 } 660 661 if gnode.fields[0].RepetitionType() != parquet.Repetitions.Repeated { 662 panic("invalid list node, child should be repeated") 663 } 664 665 // it is required that the repeated group of elements is named "list" and it's element 666 // field is named "element", however existing data may not use this so readers shouldn't 667 // enforce them as errors 668 // 669 // Rules for backward compatibility from the parquet spec: 670 // 671 // 1) if the repeated field is not a group, then it's type is the element type and elements 672 // must be required. 673 // 2) if the repeated field is a group with multiple fields, then its type is the element type 674 // and elements must be required. 675 // 3) if the repeated field is a group with one field AND is named either "array" or uses the 676 // LIST-annotated group's name with "_tuple" suffix, then the repeated type is the element 677 // type and the elements must be required. 678 // 4) otherwise, the repeated field's type is the element type with the repeated field's repetition 679 680 elemMustBeRequired := false 681 addSlice := false 682 var elemType reflect.Type 683 elemNode := gnode.fields[0] 684 switch { 685 case elemNode.Type() == Primitive, 686 elemNode.(*GroupNode).fields.Len() > 1, 687 elemNode.(*GroupNode).fields.Len() == 1 && (elemNode.Name() == "array" || elemNode.Name() == gnode.Name()+"_tuple"): 688 elemMustBeRequired = true 689 elemType = typeFromNode(elemNode) 690 default: 691 addSlice = true 692 elemType = typeFromNode(elemNode.(*GroupNode).fields[0]) 693 } 694 695 if elemMustBeRequired && elemType.Kind() == reflect.Ptr { 696 elemType = elemType.Elem() 697 } 698 if addSlice { 699 elemType = reflect.SliceOf(elemType) 700 } 701 if gnode.RepetitionType() == parquet.Repetitions.Optional { 702 elemType = reflect.PtrTo(elemType) 703 } 704 return elemType 705 case ConvertedTypes.Map, ConvertedTypes.MapKeyValue: 706 // According to the Parquet Spec, the outer-most level should be 707 // a group containing a single field named "key_value" with repetition 708 // either optional or required for whether or not the map is nullable. 709 // 710 // The key_value middle level *must* be a repeated group with a "key" field 711 // and *optionally* a "value" field 712 // 713 // the "key" field *must* be required and must always exist 714 // 715 // the "value" field can be required or optional or omitted. 716 // 717 // <map-repetition> group <name> (MAP) { 718 // repeated group key_value { 719 // required <key-type> key; 720 // <value-repetition> <value-type> value; 721 // } 722 // } 723 724 if gnode.fields.Len() != 1 { 725 panic("invalid map node, should have exactly 1 child") 726 } 727 728 if gnode.fields[0].Type() != Group { 729 panic("invalid map node, child should be a group node") 730 } 731 732 // that said, this may not be used in existing data and should not be 733 // enforced as errors when reading. 734 // 735 // some data may also incorrectly use MAP_KEY_VALUE instead of MAP 736 // 737 // so any group with MAP_KEY_VALUE that is not contained inside of a "MAP" 738 // group, should be considered equivalent to being a MAP group itself. 739 // 740 // in addition, the fields may not be called "key" and "value" in existing 741 // data, and as such should not be enforced as errors when reading. 742 743 keyval := gnode.fields[0].(*GroupNode) 744 745 keyIndex := keyval.FieldIndexByName("key") 746 if keyIndex == -1 { 747 keyIndex = 0 // use first child if there is no child named "key" 748 } 749 750 keyType := typeFromNode(keyval.fields[keyIndex]) 751 if keyType.Kind() == reflect.Ptr { 752 keyType = keyType.Elem() 753 } 754 // can't use a []byte as a key for a map, so use string 755 if keyType == reflect.TypeOf(parquet.ByteArray{}) || keyType == reflect.TypeOf(parquet.FixedLenByteArray{}) { 756 keyType = reflect.TypeOf(string("")) 757 } 758 759 // if the value node is omitted, then consider this a "set" and make it a 760 // map[key-type]bool 761 valType := reflect.TypeOf(true) 762 if keyval.fields.Len() > 1 { 763 valIndex := keyval.FieldIndexByName("value") 764 if valIndex == -1 { 765 valIndex = 1 // use second child if there is no child named "value" 766 } 767 768 valType = typeFromNode(keyval.fields[valIndex]) 769 } 770 771 mapType := reflect.MapOf(keyType, valType) 772 if gnode.RepetitionType() == parquet.Repetitions.Optional { 773 mapType = reflect.PtrTo(mapType) 774 } 775 return mapType 776 default: 777 fields := []reflect.StructField{} 778 for _, f := range gnode.fields { 779 fields = append(fields, reflect.StructField{ 780 Name: f.Name(), 781 Type: typeFromNode(f), 782 PkgPath: "parquet", 783 }) 784 } 785 786 structType := reflect.StructOf(fields) 787 if gnode.RepetitionType() == parquet.Repetitions.Repeated { 788 return reflect.SliceOf(structType) 789 } 790 if gnode.RepetitionType() == parquet.Repetitions.Optional { 791 return reflect.PtrTo(structType) 792 } 793 return structType 794 } 795 } 796 panic("what happened?") 797 } 798 799 // NewStructFromSchema generates a struct type as a reflect.Type from the schema 800 // by using the appropriate physical types and making things either pointers or slices 801 // based on whether they are repeated/optional/required. It does not use the logical 802 // or converted types to change the physical storage so that it is more efficient to use 803 // the resulting type for reading without having to do conversions. 804 // 805 // It will use maps for map types and slices for list types, but otherwise ignores the 806 // converted and logical types of the nodes. Group nodes that are not List or Map will 807 // be nested structs. 808 func NewStructFromSchema(sc *Schema) (t reflect.Type, err error) { 809 defer func() { 810 if r := recover(); r != nil { 811 t = nil 812 switch x := r.(type) { 813 case string: 814 err = xerrors.New(x) 815 case error: 816 err = x 817 default: 818 err = xerrors.New("unknown panic") 819 } 820 } 821 }() 822 823 t = typeFromNode(sc.root) 824 if t.Kind() == reflect.Slice || t.Kind() == reflect.Ptr { 825 return t.Elem(), nil 826 } 827 return 828 }