github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/schema.go (about) 1 package parquet 2 3 import ( 4 "fmt" 5 "math" 6 "reflect" 7 "strconv" 8 "strings" 9 "sync" 10 "time" 11 12 "github.com/google/uuid" 13 "github.com/segmentio/parquet-go/compress" 14 "github.com/segmentio/parquet-go/deprecated" 15 "github.com/segmentio/parquet-go/encoding" 16 ) 17 18 // Schema represents a parquet schema created from a Go value. 19 // 20 // Schema implements the Node interface to represent the root node of a parquet 21 // schema. 22 type Schema struct { 23 name string 24 root Node 25 deconstruct deconstructFunc 26 reconstruct reconstructFunc 27 mapping columnMapping 28 columns [][]string 29 } 30 31 // SchemaOf constructs a parquet schema from a Go value. 32 // 33 // The function can construct parquet schemas from struct or pointer-to-struct 34 // values only. A panic is raised if a Go value of a different type is passed 35 // to this function. 36 // 37 // When creating a parquet Schema from a Go value, the struct fields may contain 38 // a "parquet" tag to describe properties of the parquet node. The "parquet" tag 39 // follows the conventional format of Go struct tags: a comma-separated list of 40 // values describe the options, with the first one defining the name of the 41 // parquet column. 42 // 43 // The following options are also supported in the "parquet" struct tag: 44 // 45 // optional | make the parquet column optional 46 // snappy | sets the parquet column compression codec to snappy 47 // gzip | sets the parquet column compression codec to gzip 48 // brotli | sets the parquet column compression codec to brotli 49 // lz4 | sets the parquet column compression codec to lz4 50 // zstd | sets the parquet column compression codec to zstd 51 // plain | enables the plain encoding (no-op default) 52 // dict | enables dictionary encoding on the parquet column 53 // delta | enables delta encoding on the parquet column 54 // list | for slice types, use the parquet LIST logical type 55 // enum | for string types, use the parquet ENUM logical type 56 // uuid | for string and [16]byte types, use the parquet UUID logical type 57 // decimal | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type 58 // date | for int32 types use the DATE logical type 59 // timestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision 60 // split | for float32/float64, use the BYTE_STREAM_SPLIT encoding 61 // 62 // # The date logical type is an int32 value of the number of days since the unix epoch 63 // 64 // The timestamp precision can be changed by defining which precision to use as an argument. 65 // Supported precisions are: nanosecond, millisecond and microsecond. Example: 66 // 67 // type Message struct { 68 // TimestrampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)" 69 // } 70 // 71 // The decimal tag must be followed by two integer parameters, the first integer 72 // representing the scale and the second the precision; for example: 73 // 74 // type Item struct { 75 // Cost int64 `parquet:"cost,decimal(0:3)"` 76 // } 77 // 78 // Invalid combination of struct tags and Go types, or repeating options will 79 // cause the function to panic. 80 // 81 // As a special case, if the field tag is "-", the field is omitted from the schema 82 // and the data will not be written into the parquet file(s). 83 // Note that a field with name "-" can still be generated using the tag "-,". 84 // 85 // The configuration of Parquet maps are done via two tags: 86 // - The `parquet-key` tag allows to configure the key of a map. 87 // - The parquet-value tag allows users to configure a map's values, for example to declare their native Parquet types. 88 // 89 // When configuring a Parquet map, the `parquet` tag will configure the map itself. 90 // 91 // For example, the following will set the int64 key of the map to be a timestamp: 92 // 93 // type Actions struct { 94 // Action map[int64]string `parquet:"," parquet-key:",timestamp"` 95 // } 96 // 97 // The schema name is the Go type name of the value. 98 func SchemaOf(model interface{}) *Schema { 99 return schemaOf(dereference(reflect.TypeOf(model))) 100 } 101 102 var cachedSchemas sync.Map // map[reflect.Type]*Schema 103 104 func schemaOf(model reflect.Type) *Schema { 105 cached, _ := cachedSchemas.Load(model) 106 schema, _ := cached.(*Schema) 107 if schema != nil { 108 return schema 109 } 110 if model.Kind() != reflect.Struct { 111 panic("cannot construct parquet schema from value of type " + model.String()) 112 } 113 schema = NewSchema(model.Name(), nodeOf(model, nil)) 114 if actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded { 115 schema = actual.(*Schema) 116 } 117 return schema 118 } 119 120 // NewSchema constructs a new Schema object with the given name and root node. 121 // 122 // The function panics if Node contains more leaf columns than supported by the 123 // package (see parquet.MaxColumnIndex). 124 func NewSchema(name string, root Node) *Schema { 125 mapping, columns := columnMappingOf(root) 126 return &Schema{ 127 name: name, 128 root: root, 129 deconstruct: makeDeconstructFunc(root), 130 reconstruct: makeReconstructFunc(root), 131 mapping: mapping, 132 columns: columns, 133 } 134 } 135 136 func dereference(t reflect.Type) reflect.Type { 137 for t.Kind() == reflect.Ptr { 138 t = t.Elem() 139 } 140 return t 141 } 142 143 func makeDeconstructFunc(node Node) (deconstruct deconstructFunc) { 144 if schema, _ := node.(*Schema); schema != nil { 145 return schema.deconstruct 146 } 147 if !node.Leaf() { 148 _, deconstruct = deconstructFuncOf(0, node) 149 } 150 return deconstruct 151 } 152 153 func makeReconstructFunc(node Node) (reconstruct reconstructFunc) { 154 if schema, _ := node.(*Schema); schema != nil { 155 return schema.reconstruct 156 } 157 if !node.Leaf() { 158 _, reconstruct = reconstructFuncOf(0, node) 159 } 160 return reconstruct 161 } 162 163 // ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema 164 // instances to be passed to row group constructors to pre-declare the schema of 165 // the output parquet file. 166 func (s *Schema) ConfigureRowGroup(config *RowGroupConfig) { config.Schema = s } 167 168 // ConfigureReader satisfies the ReaderOption interface, allowing Schema 169 // instances to be passed to NewReader to pre-declare the schema of rows 170 // read from the reader. 171 func (s *Schema) ConfigureReader(config *ReaderConfig) { config.Schema = s } 172 173 // ConfigureWriter satisfies the WriterOption interface, allowing Schema 174 // instances to be passed to NewWriter to pre-declare the schema of the 175 // output parquet file. 176 func (s *Schema) ConfigureWriter(config *WriterConfig) { config.Schema = s } 177 178 // String returns a parquet schema representation of s. 179 func (s *Schema) String() string { return sprint(s.name, s.root) } 180 181 // Name returns the name of s. 182 func (s *Schema) Name() string { return s.name } 183 184 // Type returns the parquet type of s. 185 func (s *Schema) Type() Type { return s.root.Type() } 186 187 // Optional returns false since the root node of a parquet schema is always required. 188 func (s *Schema) Optional() bool { return s.root.Optional() } 189 190 // Repeated returns false since the root node of a parquet schema is always required. 191 func (s *Schema) Repeated() bool { return s.root.Repeated() } 192 193 // Required returns true since the root node of a parquet schema is always required. 194 func (s *Schema) Required() bool { return s.root.Required() } 195 196 // Leaf returns true if the root node of the parquet schema is a leaf column. 197 func (s *Schema) Leaf() bool { return s.root.Leaf() } 198 199 // Fields returns the list of fields on the root node of the parquet schema. 200 func (s *Schema) Fields() []Field { return s.root.Fields() } 201 202 // Encoding returns the encoding set on the root node of the parquet schema. 203 func (s *Schema) Encoding() encoding.Encoding { return s.root.Encoding() } 204 205 // Compression returns the compression codec set on the root node of the parquet 206 // schema. 207 func (s *Schema) Compression() compress.Codec { return s.root.Compression() } 208 209 // GoType returns the Go type that best represents the schema. 210 func (s *Schema) GoType() reflect.Type { return s.root.GoType() } 211 212 // Deconstruct deconstructs a Go value and appends it to a row. 213 // 214 // The method panics is the structure of the go value does not match the 215 // parquet schema. 216 func (s *Schema) Deconstruct(row Row, value interface{}) Row { 217 columns := make([][]Value, len(s.columns)) 218 values := make([]Value, len(s.columns)) 219 220 for i := range columns { 221 columns[i] = values[i : i : i+1] 222 } 223 224 s.deconstructValueToColumns(columns, reflect.ValueOf(value)) 225 return appendRow(row, columns) 226 } 227 228 func (s *Schema) deconstructValueToColumns(columns [][]Value, value reflect.Value) { 229 for value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface { 230 if value.IsNil() { 231 value = reflect.Value{} 232 break 233 } 234 value = value.Elem() 235 } 236 s.deconstruct(columns, levels{}, value) 237 } 238 239 // Reconstruct reconstructs a Go value from a row. 240 // 241 // The go value passed as first argument must be a non-nil pointer for the 242 // row to be decoded into. 243 // 244 // The method panics if the structure of the go value and parquet row do not 245 // match. 246 func (s *Schema) Reconstruct(value interface{}, row Row) error { 247 v := reflect.ValueOf(value) 248 if !v.IsValid() { 249 panic("cannot reconstruct row into go value of type <nil>") 250 } 251 if v.Kind() != reflect.Ptr { 252 panic("cannot reconstruct row into go value of non-pointer type " + v.Type().String()) 253 } 254 if v.IsNil() { 255 panic("cannot reconstruct row into nil pointer of type " + v.Type().String()) 256 } 257 for v.Kind() == reflect.Ptr { 258 if v.IsNil() { 259 v.Set(reflect.New(v.Type().Elem())) 260 } 261 v = v.Elem() 262 } 263 264 columns := make([][]Value, len(s.columns)) 265 row.Range(func(columnIndex int, columnValues []Value) bool { 266 if columnIndex < len(columns) { 267 columns[columnIndex] = columnValues 268 } 269 return true 270 }) 271 272 return s.reconstruct(v, levels{}, columns) 273 } 274 275 // Lookup returns the leaf column at the given path. 276 // 277 // The path is the sequence of column names identifying a leaf column (not 278 // including the root). 279 // 280 // If the path was not found in the mapping, or if it did not represent a 281 // leaf column of the parquet schema, the boolean will be false. 282 func (s *Schema) Lookup(path ...string) (LeafColumn, bool) { 283 leaf := s.mapping.lookup(path) 284 return LeafColumn{ 285 Node: leaf.node, 286 Path: leaf.path, 287 ColumnIndex: int(leaf.columnIndex), 288 MaxRepetitionLevel: int(leaf.maxRepetitionLevel), 289 MaxDefinitionLevel: int(leaf.maxDefinitionLevel), 290 }, leaf.node != nil 291 } 292 293 // Columns returns the list of column paths available in the schema. 294 // 295 // The method always returns the same slice value across calls to ColumnPaths, 296 // applications should treat it as immutable. 297 func (s *Schema) Columns() [][]string { 298 return s.columns 299 } 300 301 // Comparator constructs a comparator function which orders rows according to 302 // the list of sorting columns passed as arguments. 303 func (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int { 304 return compareRowsFuncOf(s, sortingColumns) 305 } 306 307 func (s *Schema) forEachNode(do func(name string, node Node)) { 308 forEachNodeOf(s.Name(), s, do) 309 } 310 311 type structNode struct { 312 gotype reflect.Type 313 fields []structField 314 } 315 316 func structNodeOf(t reflect.Type) *structNode { 317 // Collect struct fields first so we can order them before generating the 318 // column indexes. 319 fields := structFieldsOf(t) 320 321 s := &structNode{ 322 gotype: t, 323 fields: make([]structField, len(fields)), 324 } 325 326 for i := range fields { 327 field := structField{name: fields[i].Name, index: fields[i].Index} 328 field.Node = makeNodeOf(fields[i].Type, fields[i].Name, []string{ 329 fields[i].Tag.Get("parquet"), 330 fields[i].Tag.Get("parquet-key"), 331 fields[i].Tag.Get("parquet-value"), 332 }) 333 s.fields[i] = field 334 } 335 336 return s 337 } 338 339 func structFieldsOf(t reflect.Type) []reflect.StructField { 340 fields := appendStructFields(t, nil, nil, 0) 341 342 for i := range fields { 343 f := &fields[i] 344 345 if tag := f.Tag.Get("parquet"); tag != "" { 346 name, _ := split(tag) 347 if name != "" { 348 f.Name = name 349 } 350 } 351 } 352 353 return fields 354 } 355 356 func appendStructFields(t reflect.Type, fields []reflect.StructField, index []int, offset uintptr) []reflect.StructField { 357 for i, n := 0, t.NumField(); i < n; i++ { 358 f := t.Field(i) 359 if tag := f.Tag.Get("parquet"); tag != "" { 360 name, _ := split(tag) 361 if tag != "-," && name == "-" { 362 continue 363 } 364 } 365 366 fieldIndex := index[:len(index):len(index)] 367 fieldIndex = append(fieldIndex, i) 368 369 f.Offset += offset 370 371 if f.Anonymous { 372 fields = appendStructFields(f.Type, fields, fieldIndex, f.Offset) 373 } else if f.IsExported() { 374 f.Index = fieldIndex 375 fields = append(fields, f) 376 } 377 } 378 return fields 379 } 380 381 func (s *structNode) Optional() bool { return false } 382 383 func (s *structNode) Repeated() bool { return false } 384 385 func (s *structNode) Required() bool { return true } 386 387 func (s *structNode) Leaf() bool { return false } 388 389 func (s *structNode) Encoding() encoding.Encoding { return nil } 390 391 func (s *structNode) Compression() compress.Codec { return nil } 392 393 func (s *structNode) GoType() reflect.Type { return s.gotype } 394 395 func (s *structNode) String() string { return sprint("", s) } 396 397 func (s *structNode) Type() Type { return groupType{} } 398 399 func (s *structNode) Fields() []Field { 400 fields := make([]Field, len(s.fields)) 401 for i := range s.fields { 402 fields[i] = &s.fields[i] 403 } 404 return fields 405 } 406 407 // fieldByIndex is like reflect.Value.FieldByIndex but returns the zero-value of 408 // reflect.Value if one of the fields was a nil pointer instead of panicking. 409 func fieldByIndex(v reflect.Value, index []int) reflect.Value { 410 for _, i := range index { 411 if v = v.Field(i); v.Kind() == reflect.Ptr || v.Kind() == reflect.Interface { 412 if v.IsNil() { 413 v = reflect.Value{} 414 break 415 } else { 416 v = v.Elem() 417 } 418 } 419 } 420 return v 421 } 422 423 type structField struct { 424 Node 425 name string 426 index []int 427 } 428 429 func (f *structField) Name() string { return f.name } 430 431 func (f *structField) Value(base reflect.Value) reflect.Value { 432 switch base.Kind() { 433 case reflect.Map: 434 return base.MapIndex(reflect.ValueOf(&f.name).Elem()) 435 case reflect.Ptr: 436 if base.IsNil() { 437 base.Set(reflect.New(base.Type().Elem())) 438 } 439 return fieldByIndex(base.Elem(), f.index) 440 default: 441 if len(f.index) == 1 { 442 return base.Field(f.index[0]) 443 } else { 444 return fieldByIndex(base, f.index) 445 } 446 } 447 } 448 449 func nodeString(t reflect.Type, name string, tag ...string) string { 450 return fmt.Sprintf("%s %s %v", name, t.String(), tag) 451 } 452 453 func throwInvalidTag(t reflect.Type, name string, tag string) { 454 panic(tag + " is an invalid parquet tag: " + nodeString(t, name, tag)) 455 } 456 457 func throwUnknownTag(t reflect.Type, name string, tag string) { 458 panic(tag + " is an unrecognized parquet tag: " + nodeString(t, name, tag)) 459 } 460 461 func throwInvalidNode(t reflect.Type, msg, name string, tag ...string) { 462 panic(msg + ": " + nodeString(t, name, tag...)) 463 } 464 465 // FixedLenByteArray decimals are sized based on precision 466 // this function calculates the necessary byte array size. 467 func decimalFixedLenByteArraySize(precision int) int { 468 return int(math.Ceil((math.Log10(2) + float64(precision)) / math.Log10(256))) 469 } 470 471 func forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, option, args string)) { 472 if tag := sf.Tag.Get("parquet"); tag != "" { 473 _, tag = split(tag) // skip the field name 474 for tag != "" { 475 option := "" 476 args := "" 477 option, tag = split(tag) 478 option, args = splitOptionArgs(option) 479 ft := sf.Type 480 if ft.Kind() == reflect.Ptr { 481 ft = ft.Elem() 482 } 483 do(ft, option, args) 484 } 485 } 486 } 487 488 func nodeOf(t reflect.Type, tag []string) Node { 489 switch t { 490 case reflect.TypeOf(deprecated.Int96{}): 491 return Leaf(Int96Type) 492 case reflect.TypeOf(uuid.UUID{}): 493 return UUID() 494 case reflect.TypeOf(time.Time{}): 495 return Timestamp(Nanosecond) 496 } 497 498 var n Node 499 switch t.Kind() { 500 case reflect.Bool: 501 n = Leaf(BooleanType) 502 503 case reflect.Int, reflect.Int64: 504 n = Int(64) 505 506 case reflect.Int8, reflect.Int16, reflect.Int32: 507 n = Int(t.Bits()) 508 509 case reflect.Uint, reflect.Uintptr, reflect.Uint64: 510 n = Uint(64) 511 512 case reflect.Uint8, reflect.Uint16, reflect.Uint32: 513 n = Uint(t.Bits()) 514 515 case reflect.Float32: 516 n = Leaf(FloatType) 517 518 case reflect.Float64: 519 n = Leaf(DoubleType) 520 521 case reflect.String: 522 n = String() 523 524 case reflect.Ptr: 525 n = Optional(nodeOf(t.Elem(), nil)) 526 527 case reflect.Slice: 528 if elem := t.Elem(); elem.Kind() == reflect.Uint8 { // []byte? 529 n = Leaf(ByteArrayType) 530 } else { 531 n = Repeated(nodeOf(elem, nil)) 532 } 533 534 case reflect.Array: 535 if t.Elem().Kind() == reflect.Uint8 { 536 n = Leaf(FixedLenByteArrayType(t.Len())) 537 } 538 539 case reflect.Map: 540 var mapTag, valueTag, keyTag string 541 if len(tag) > 0 { 542 mapTag = tag[0] 543 if len(tag) > 1 { 544 keyTag = tag[1] 545 } 546 if len(tag) >= 2 { 547 valueTag = tag[2] 548 } 549 } 550 551 if strings.Contains(mapTag, "json") { 552 n = JSON() 553 } else { 554 n = Map( 555 makeNodeOf(t.Key(), t.Name(), []string{keyTag}), 556 makeNodeOf(t.Elem(), t.Name(), []string{valueTag}), 557 ) 558 } 559 560 forEachTagOption([]string{mapTag}, func(option, args string) { 561 switch option { 562 case "", "json": 563 return 564 case "optional": 565 n = Optional(n) 566 default: 567 throwUnknownTag(t, "map", option) 568 } 569 }) 570 571 case reflect.Struct: 572 return structNodeOf(t) 573 } 574 575 if n == nil { 576 panic("cannot create parquet node from go value of type " + t.String()) 577 } 578 579 return &goNode{Node: n, gotype: t} 580 } 581 582 func split(s string) (head, tail string) { 583 if i := strings.IndexByte(s, ','); i < 0 { 584 head = s 585 } else { 586 head, tail = s[:i], s[i+1:] 587 } 588 return 589 } 590 591 func splitOptionArgs(s string) (option, args string) { 592 if i := strings.IndexByte(s, '('); i >= 0 { 593 option = s[:i] 594 args = s[i:] 595 } else { 596 option = s 597 args = "()" 598 } 599 return 600 } 601 602 func parseDecimalArgs(args string) (scale, precision int, err error) { 603 if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") { 604 return 0, 0, fmt.Errorf("malformed decimal args: %s", args) 605 } 606 args = strings.TrimPrefix(args, "(") 607 args = strings.TrimSuffix(args, ")") 608 parts := strings.Split(args, ":") 609 if len(parts) != 2 { 610 return 0, 0, fmt.Errorf("malformed decimal args: (%s)", args) 611 } 612 s, err := strconv.ParseInt(parts[0], 10, 32) 613 if err != nil { 614 return 0, 0, err 615 } 616 p, err := strconv.ParseInt(parts[1], 10, 32) 617 if err != nil { 618 return 0, 0, err 619 } 620 return int(s), int(p), nil 621 } 622 623 func parseTimestampArgs(args string) (TimeUnit, error) { 624 if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") { 625 return nil, fmt.Errorf("malformed timestamp args: %s", args) 626 } 627 628 args = strings.TrimPrefix(args, "(") 629 args = strings.TrimSuffix(args, ")") 630 631 if len(args) == 0 { 632 return Millisecond, nil 633 } 634 635 switch args { 636 case "millisecond": 637 return Millisecond, nil 638 case "microsecond": 639 return Microsecond, nil 640 case "nanosecond": 641 return Nanosecond, nil 642 default: 643 } 644 645 return nil, fmt.Errorf("unknown time unit: %s", args) 646 } 647 648 type goNode struct { 649 Node 650 gotype reflect.Type 651 } 652 653 func (n *goNode) GoType() reflect.Type { return n.gotype } 654 655 var ( 656 _ RowGroupOption = (*Schema)(nil) 657 _ ReaderOption = (*Schema)(nil) 658 _ WriterOption = (*Schema)(nil) 659 ) 660 661 func makeNodeOf(t reflect.Type, name string, tag []string) Node { 662 var ( 663 node Node 664 optional bool 665 list bool 666 encoded encoding.Encoding 667 compressed compress.Codec 668 ) 669 670 setNode := func(n Node) { 671 if node != nil { 672 throwInvalidNode(t, "struct field has multiple logical parquet types declared", name, tag...) 673 } 674 node = n 675 } 676 677 setOptional := func() { 678 if optional { 679 throwInvalidNode(t, "struct field has multiple declaration of the optional tag", name, tag...) 680 } 681 optional = true 682 } 683 684 setList := func() { 685 if list { 686 throwInvalidNode(t, "struct field has multiple declaration of the list tag", name, tag...) 687 } 688 list = true 689 } 690 691 setEncoding := func(e encoding.Encoding) { 692 if encoded != nil { 693 throwInvalidNode(t, "struct field has encoding declared multiple time", name, tag...) 694 } 695 encoded = e 696 } 697 698 setCompression := func(c compress.Codec) { 699 if compressed != nil { 700 throwInvalidNode(t, "struct field has compression codecs declared multiple times", name, tag...) 701 } 702 compressed = c 703 } 704 705 forEachTagOption(tag, func(option, args string) { 706 if t.Kind() == reflect.Map { 707 node = nodeOf(t, tag) 708 return 709 } 710 switch option { 711 case "": 712 return 713 case "optional": 714 setOptional() 715 716 case "snappy": 717 setCompression(&Snappy) 718 719 case "gzip": 720 setCompression(&Gzip) 721 722 case "brotli": 723 setCompression(&Brotli) 724 725 case "lz4": 726 setCompression(&Lz4Raw) 727 728 case "zstd": 729 setCompression(&Zstd) 730 731 case "uncompressed": 732 setCompression(&Uncompressed) 733 734 case "plain": 735 setEncoding(&Plain) 736 737 case "dict": 738 setEncoding(&RLEDictionary) 739 740 case "json": 741 setNode(JSON()) 742 743 case "delta": 744 switch t.Kind() { 745 case reflect.Int, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint32, reflect.Uint64: 746 setEncoding(&DeltaBinaryPacked) 747 case reflect.String: 748 setEncoding(&DeltaByteArray) 749 case reflect.Slice: 750 if t.Elem().Kind() == reflect.Uint8 { // []byte? 751 setEncoding(&DeltaByteArray) 752 } else { 753 throwInvalidTag(t, name, option) 754 } 755 case reflect.Array: 756 if t.Elem().Kind() == reflect.Uint8 { // [N]byte? 757 setEncoding(&DeltaByteArray) 758 } else { 759 throwInvalidTag(t, name, option) 760 } 761 default: 762 throwInvalidTag(t, name, option) 763 } 764 765 case "split": 766 switch t.Kind() { 767 case reflect.Float32, reflect.Float64: 768 setEncoding(&ByteStreamSplit) 769 default: 770 throwInvalidTag(t, name, option) 771 } 772 773 case "list": 774 switch t.Kind() { 775 case reflect.Slice: 776 element := nodeOf(t.Elem(), nil) 777 setNode(element) 778 setList() 779 default: 780 throwInvalidTag(t, name, option) 781 } 782 783 case "enum": 784 switch t.Kind() { 785 case reflect.String: 786 setNode(Enum()) 787 default: 788 throwInvalidTag(t, name, option) 789 } 790 791 case "uuid": 792 switch t.Kind() { 793 case reflect.Array: 794 if t.Elem().Kind() != reflect.Uint8 || t.Len() != 16 { 795 throwInvalidTag(t, name, option) 796 } 797 default: 798 throwInvalidTag(t, name, option) 799 } 800 801 case "decimal": 802 scale, precision, err := parseDecimalArgs(args) 803 if err != nil { 804 throwInvalidTag(t, name, option+args) 805 } 806 var baseType Type 807 switch t.Kind() { 808 case reflect.Int32: 809 baseType = Int32Type 810 case reflect.Int64: 811 baseType = Int64Type 812 case reflect.Array, reflect.Slice: 813 baseType = FixedLenByteArrayType(decimalFixedLenByteArraySize(precision)) 814 default: 815 throwInvalidTag(t, name, option) 816 } 817 818 setNode(Decimal(scale, precision, baseType)) 819 case "date": 820 switch t.Kind() { 821 case reflect.Int32: 822 setNode(Date()) 823 default: 824 throwInvalidTag(t, name, option) 825 } 826 case "timestamp": 827 switch t.Kind() { 828 case reflect.Int64: 829 timeUnit, err := parseTimestampArgs(args) 830 if err != nil { 831 throwInvalidTag(t, name, option) 832 } 833 setNode(Timestamp(timeUnit)) 834 default: 835 switch t { 836 case reflect.TypeOf(time.Time{}): 837 timeUnit, err := parseTimestampArgs(args) 838 if err != nil { 839 throwInvalidTag(t, name, option) 840 } 841 setNode(Timestamp(timeUnit)) 842 default: 843 throwInvalidTag(t, name, option) 844 } 845 } 846 default: 847 throwUnknownTag(t, name, option) 848 } 849 }) 850 851 // Special case: an "optional" struct tag on a slice applies to the 852 // individual items, not the overall list. The least messy way to 853 // deal with this is at this level, instead of passing down optional 854 // information into the nodeOf function, and then passing back whether an 855 // optional tag was applied. 856 if node == nil && t.Kind() == reflect.Slice { 857 isUint8 := t.Elem().Kind() == reflect.Uint8 858 // Note for strings "optional" applies only to the entire BYTE_ARRAY and 859 // not each individual byte. 860 if optional && !isUint8 { 861 node = Repeated(Optional(nodeOf(t.Elem(), tag))) 862 // Don't also apply "optional" to the whole list. 863 optional = false 864 } 865 } 866 867 if node == nil { 868 node = nodeOf(t, tag) 869 } 870 871 if compressed != nil { 872 node = Compressed(node, compressed) 873 } 874 875 if encoded != nil { 876 node = Encoded(node, encoded) 877 } 878 879 if list { 880 node = List(node) 881 } 882 883 if node.Repeated() && !list { 884 elemKind := node.GoType().Elem().Kind() 885 if elemKind == reflect.Slice { 886 panic("unhandled nested slice on parquet schema without list tag") 887 } 888 } 889 890 if optional { 891 node = Optional(node) 892 } 893 894 return node 895 } 896 897 func forEachTagOption(tags []string, do func(option, args string)) { 898 for _, tag := range tags { 899 _, tag = split(tag) // skip the field name 900 for tag != "" { 901 option := "" 902 option, tag = split(tag) 903 var args string 904 option, args = splitOptionArgs(option) 905 do(option, args) 906 } 907 } 908 }