// Source: github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/schema.go

package parquet

import (
	"fmt"
	"math"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"
	"github.com/parquet-go/parquet-go/compress"
	"github.com/parquet-go/parquet-go/deprecated"
	"github.com/parquet-go/parquet-go/encoding"
)

// Schema represents a parquet schema created from a Go value.
//
// Schema implements the Node interface to represent the root node of a parquet
// schema.
type Schema struct {
	// name is the schema name reported by Name and used by String.
	name string
	// root is the node that every Node method of Schema delegates to.
	root Node
	// deconstruct converts a Go value into per-column values (see Deconstruct).
	deconstruct deconstructFunc
	// reconstruct rebuilds a Go value from per-column values (see Reconstruct).
	reconstruct reconstructFunc
	// mapping resolves column paths to leaf columns (see Lookup).
	mapping columnMapping
	// columns is the list of leaf column paths returned by Columns.
	columns [][]string
}

// SchemaOf constructs a parquet schema from a Go value.
//
// The function can construct parquet schemas from struct or pointer-to-struct
// values only. A panic is raised if a Go value of a different type is passed
// to this function.
//
// When creating a parquet Schema from a Go value, the struct fields may contain
// a "parquet" tag to describe properties of the parquet node. The "parquet" tag
// follows the conventional format of Go struct tags: a comma-separated list of
// values describe the options, with the first one defining the name of the
// parquet column.
//
// The following options are also supported in the "parquet" struct tag:
//
//	optional     | make the parquet column optional
//	snappy       | sets the parquet column compression codec to snappy
//	gzip         | sets the parquet column compression codec to gzip
//	brotli       | sets the parquet column compression codec to brotli
//	lz4          | sets the parquet column compression codec to lz4
//	zstd         | sets the parquet column compression codec to zstd
//	uncompressed | sets the parquet column compression codec to uncompressed
//	plain        | enables the plain encoding (no-op default)
//	dict         | enables dictionary encoding on the parquet column
//	delta        | enables delta encoding on the parquet column
//	list         | for slice types, use the parquet LIST logical type
//	enum         | for string types, use the parquet ENUM logical type
//	json         | use the JSON node type for the field
//	uuid         | accepted on [16]byte types only (see the note below)
//	decimal      | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type
//	date         | for int32 types use the DATE logical type
//	timestamp    | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision
//	split        | for float32/float64, use the BYTE_STREAM_SPLIT encoding
//	id(n)        | where n is int denoting a column field id. Example id(2) for a column with field id of 2
//
// NOTE(review): in this version the uuid option only validates that the field
// is a 16-byte array; it is rejected on string fields and does not by itself
// select a logical type. Confirm whether the UUID logical type was meant to be
// applied for tagged [16]byte fields.
//
// The date logical type is an int32 value of the number of days since the unix
// epoch.
//
// The timestamp precision can be changed by defining which precision to use as an argument.
// Supported precisions are: nanosecond, millisecond and microsecond. Example:
//
//	type Message struct {
//	  TimestampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"`
//	}
//
// The decimal tag must be followed by two integer parameters, the first integer
// representing the scale and the second the precision; for example:
//
//	type Item struct {
//		Cost int64 `parquet:"cost,decimal(0:3)"`
//	}
//
// Invalid combination of struct tags and Go types, or repeating options will
// cause the function to panic.
//
// As a special case, if the field tag is "-", the field is omitted from the schema
// and the data will not be written into the parquet file(s).
// Note that a field with name "-" can still be generated using the tag "-,".
//
// The configuration of Parquet maps is done via two tags:
//   - The `parquet-key` tag allows to configure the key of a map.
//   - The `parquet-value` tag allows users to configure a map's values, for example to declare their native Parquet types.
//
// When configuring a Parquet map, the `parquet` tag will configure the map itself.
//
// For example, the following will set the int64 key of the map to be a timestamp:
//
//	type Actions struct {
//	  Action map[int64]string `parquet:"," parquet-key:",timestamp"`
//	}
//
// The schema name is the Go type name of the value.
99 func SchemaOf(model interface{}) *Schema { 100 return schemaOf(dereference(reflect.TypeOf(model))) 101 } 102 103 var cachedSchemas sync.Map // map[reflect.Type]*Schema 104 105 func schemaOf(model reflect.Type) *Schema { 106 cached, _ := cachedSchemas.Load(model) 107 schema, _ := cached.(*Schema) 108 if schema != nil { 109 return schema 110 } 111 if model.Kind() != reflect.Struct { 112 panic("cannot construct parquet schema from value of type " + model.String()) 113 } 114 schema = NewSchema(model.Name(), nodeOf(model, nil)) 115 if actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded { 116 schema = actual.(*Schema) 117 } 118 return schema 119 } 120 121 // NewSchema constructs a new Schema object with the given name and root node. 122 // 123 // The function panics if Node contains more leaf columns than supported by the 124 // package (see parquet.MaxColumnIndex). 125 func NewSchema(name string, root Node) *Schema { 126 mapping, columns := columnMappingOf(root) 127 return &Schema{ 128 name: name, 129 root: root, 130 deconstruct: makeDeconstructFunc(root), 131 reconstruct: makeReconstructFunc(root), 132 mapping: mapping, 133 columns: columns, 134 } 135 } 136 137 func dereference(t reflect.Type) reflect.Type { 138 for t.Kind() == reflect.Ptr { 139 t = t.Elem() 140 } 141 return t 142 } 143 144 func makeDeconstructFunc(node Node) (deconstruct deconstructFunc) { 145 if schema, _ := node.(*Schema); schema != nil { 146 return schema.deconstruct 147 } 148 if !node.Leaf() { 149 _, deconstruct = deconstructFuncOf(0, node) 150 } 151 return deconstruct 152 } 153 154 func makeReconstructFunc(node Node) (reconstruct reconstructFunc) { 155 if schema, _ := node.(*Schema); schema != nil { 156 return schema.reconstruct 157 } 158 if !node.Leaf() { 159 _, reconstruct = reconstructFuncOf(0, node) 160 } 161 return reconstruct 162 } 163 164 // ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema 165 // instances to be passed to row group constructors to 
pre-declare the schema of 166 // the output parquet file. 167 func (s *Schema) ConfigureRowGroup(config *RowGroupConfig) { config.Schema = s } 168 169 // ConfigureReader satisfies the ReaderOption interface, allowing Schema 170 // instances to be passed to NewReader to pre-declare the schema of rows 171 // read from the reader. 172 func (s *Schema) ConfigureReader(config *ReaderConfig) { config.Schema = s } 173 174 // ConfigureWriter satisfies the WriterOption interface, allowing Schema 175 // instances to be passed to NewWriter to pre-declare the schema of the 176 // output parquet file. 177 func (s *Schema) ConfigureWriter(config *WriterConfig) { config.Schema = s } 178 179 // ID returns field id of the root node. 180 func (s *Schema) ID() int { return s.root.ID() } 181 182 // String returns a parquet schema representation of s. 183 func (s *Schema) String() string { return sprint(s.name, s.root) } 184 185 // Name returns the name of s. 186 func (s *Schema) Name() string { return s.name } 187 188 // Type returns the parquet type of s. 189 func (s *Schema) Type() Type { return s.root.Type() } 190 191 // Optional returns false since the root node of a parquet schema is always required. 192 func (s *Schema) Optional() bool { return s.root.Optional() } 193 194 // Repeated returns false since the root node of a parquet schema is always required. 195 func (s *Schema) Repeated() bool { return s.root.Repeated() } 196 197 // Required returns true since the root node of a parquet schema is always required. 198 func (s *Schema) Required() bool { return s.root.Required() } 199 200 // Leaf returns true if the root node of the parquet schema is a leaf column. 201 func (s *Schema) Leaf() bool { return s.root.Leaf() } 202 203 // Fields returns the list of fields on the root node of the parquet schema. 204 func (s *Schema) Fields() []Field { return s.root.Fields() } 205 206 // Encoding returns the encoding set on the root node of the parquet schema. 
207 func (s *Schema) Encoding() encoding.Encoding { return s.root.Encoding() } 208 209 // Compression returns the compression codec set on the root node of the parquet 210 // schema. 211 func (s *Schema) Compression() compress.Codec { return s.root.Compression() } 212 213 // GoType returns the Go type that best represents the schema. 214 func (s *Schema) GoType() reflect.Type { return s.root.GoType() } 215 216 // Deconstruct deconstructs a Go value and appends it to a row. 217 // 218 // The method panics is the structure of the go value does not match the 219 // parquet schema. 220 func (s *Schema) Deconstruct(row Row, value interface{}) Row { 221 columns := make([][]Value, len(s.columns)) 222 values := make([]Value, len(s.columns)) 223 224 for i := range columns { 225 columns[i] = values[i : i : i+1] 226 } 227 228 s.deconstructValueToColumns(columns, reflect.ValueOf(value)) 229 return appendRow(row, columns) 230 } 231 232 func (s *Schema) deconstructValueToColumns(columns [][]Value, value reflect.Value) { 233 for value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface { 234 if value.IsNil() { 235 value = reflect.Value{} 236 break 237 } 238 value = value.Elem() 239 } 240 s.deconstruct(columns, levels{}, value) 241 } 242 243 // Reconstruct reconstructs a Go value from a row. 244 // 245 // The go value passed as first argument must be a non-nil pointer for the 246 // row to be decoded into. 247 // 248 // The method panics if the structure of the go value and parquet row do not 249 // match. 
250 func (s *Schema) Reconstruct(value interface{}, row Row) error { 251 v := reflect.ValueOf(value) 252 if !v.IsValid() { 253 panic("cannot reconstruct row into go value of type <nil>") 254 } 255 if v.Kind() != reflect.Ptr { 256 panic("cannot reconstruct row into go value of non-pointer type " + v.Type().String()) 257 } 258 if v.IsNil() { 259 panic("cannot reconstruct row into nil pointer of type " + v.Type().String()) 260 } 261 for v.Kind() == reflect.Ptr { 262 if v.IsNil() { 263 v.Set(reflect.New(v.Type().Elem())) 264 } 265 v = v.Elem() 266 } 267 268 b := valuesSliceBufferPool.Get().(*valuesSliceBuffer) 269 270 columns := b.reserve(len(s.columns)) 271 row.Range(func(columnIndex int, columnValues []Value) bool { 272 if columnIndex < len(columns) { 273 columns[columnIndex] = columnValues 274 } 275 return true 276 }) 277 // we avoid the defer penalty by releasing b manually 278 err := s.reconstruct(v, levels{}, columns) 279 b.release() 280 return err 281 } 282 283 type valuesSliceBuffer struct { 284 values [][]Value 285 } 286 287 func (v *valuesSliceBuffer) reserve(n int) [][]Value { 288 if n <= cap(v.values) { 289 return v.values[:n] 290 } 291 // we can try to keep growing by the power of two, but we care more about the 292 // memory footprint so this should suffice. 293 // 294 // The nature of reads tends to be from similar number of columns.The less work 295 // we do here the better performance we can get. 296 v.values = make([][]Value, n) 297 return v.values 298 } 299 300 func (v *valuesSliceBuffer) release() { 301 v.values = v.values[:0] 302 valuesSliceBufferPool.Put(v) 303 } 304 305 var valuesSliceBufferPool = &sync.Pool{ 306 New: func() interface{} { 307 return &valuesSliceBuffer{ 308 // use 64 as a cache friendly base estimate of max column numbers we will be 309 // reading. 310 values: make([][]Value, 0, 64), 311 } 312 }, 313 } 314 315 // Lookup returns the leaf column at the given path. 
316 // 317 // The path is the sequence of column names identifying a leaf column (not 318 // including the root). 319 // 320 // If the path was not found in the mapping, or if it did not represent a 321 // leaf column of the parquet schema, the boolean will be false. 322 func (s *Schema) Lookup(path ...string) (LeafColumn, bool) { 323 leaf := s.mapping.lookup(path) 324 return LeafColumn{ 325 Node: leaf.node, 326 Path: leaf.path, 327 ColumnIndex: int(leaf.columnIndex), 328 MaxRepetitionLevel: int(leaf.maxRepetitionLevel), 329 MaxDefinitionLevel: int(leaf.maxDefinitionLevel), 330 }, leaf.node != nil 331 } 332 333 // Columns returns the list of column paths available in the schema. 334 // 335 // The method always returns the same slice value across calls to ColumnPaths, 336 // applications should treat it as immutable. 337 func (s *Schema) Columns() [][]string { 338 return s.columns 339 } 340 341 // Comparator constructs a comparator function which orders rows according to 342 // the list of sorting columns passed as arguments. 343 func (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int { 344 return compareRowsFuncOf(s, sortingColumns) 345 } 346 347 func (s *Schema) forEachNode(do func(name string, node Node)) { 348 forEachNodeOf(s.Name(), s, do) 349 } 350 351 type structNode struct { 352 gotype reflect.Type 353 fields []structField 354 } 355 356 func structNodeOf(t reflect.Type) *structNode { 357 // Collect struct fields first so we can order them before generating the 358 // column indexes. 
359 fields := structFieldsOf(t) 360 361 s := &structNode{ 362 gotype: t, 363 fields: make([]structField, len(fields)), 364 } 365 366 for i := range fields { 367 field := structField{name: fields[i].Name, index: fields[i].Index} 368 field.Node = makeNodeOf(fields[i].Type, fields[i].Name, []string{ 369 fields[i].Tag.Get("parquet"), 370 fields[i].Tag.Get("parquet-key"), 371 fields[i].Tag.Get("parquet-value"), 372 }) 373 s.fields[i] = field 374 } 375 376 return s 377 } 378 379 func structFieldsOf(t reflect.Type) []reflect.StructField { 380 fields := appendStructFields(t, nil, nil, 0) 381 382 for i := range fields { 383 f := &fields[i] 384 385 if tag := f.Tag.Get("parquet"); tag != "" { 386 name, _ := split(tag) 387 if name != "" { 388 f.Name = name 389 } 390 } 391 } 392 393 return fields 394 } 395 396 func appendStructFields(t reflect.Type, fields []reflect.StructField, index []int, offset uintptr) []reflect.StructField { 397 for i, n := 0, t.NumField(); i < n; i++ { 398 f := t.Field(i) 399 if tag := f.Tag.Get("parquet"); tag != "" { 400 name, _ := split(tag) 401 if tag != "-," && name == "-" { 402 continue 403 } 404 } 405 406 fieldIndex := index[:len(index):len(index)] 407 fieldIndex = append(fieldIndex, i) 408 409 f.Offset += offset 410 411 if f.Anonymous { 412 fields = appendStructFields(f.Type, fields, fieldIndex, f.Offset) 413 } else if f.IsExported() { 414 f.Index = fieldIndex 415 fields = append(fields, f) 416 } 417 } 418 return fields 419 } 420 421 func (s *structNode) Optional() bool { return false } 422 423 func (s *structNode) Repeated() bool { return false } 424 425 func (s *structNode) Required() bool { return true } 426 427 func (s *structNode) Leaf() bool { return false } 428 429 func (s *structNode) Encoding() encoding.Encoding { return nil } 430 431 func (s *structNode) Compression() compress.Codec { return nil } 432 433 func (s *structNode) GoType() reflect.Type { return s.gotype } 434 435 func (s *structNode) ID() int { return 0 } 436 437 func (s 
*structNode) String() string { return sprint("", s) } 438 439 func (s *structNode) Type() Type { return groupType{} } 440 441 func (s *structNode) Fields() []Field { 442 fields := make([]Field, len(s.fields)) 443 for i := range s.fields { 444 fields[i] = &s.fields[i] 445 } 446 return fields 447 } 448 449 // fieldByIndex is like reflect.Value.FieldByIndex but returns the zero-value of 450 // reflect.Value if one of the fields was a nil pointer instead of panicking. 451 func fieldByIndex(v reflect.Value, index []int) reflect.Value { 452 for _, i := range index { 453 if v = v.Field(i); v.Kind() == reflect.Ptr || v.Kind() == reflect.Interface { 454 if v.IsNil() { 455 v.Set(reflect.New(v.Type().Elem())) 456 v = v.Elem() 457 break 458 } else { 459 v = v.Elem() 460 } 461 } 462 } 463 return v 464 } 465 466 type structField struct { 467 Node 468 name string 469 index []int 470 } 471 472 func (f *structField) Name() string { return f.name } 473 474 func (f *structField) Value(base reflect.Value) reflect.Value { 475 switch base.Kind() { 476 case reflect.Map: 477 return base.MapIndex(reflect.ValueOf(&f.name).Elem()) 478 case reflect.Ptr: 479 if base.IsNil() { 480 base.Set(reflect.New(base.Type().Elem())) 481 } 482 return fieldByIndex(base.Elem(), f.index) 483 default: 484 if len(f.index) == 1 { 485 return base.Field(f.index[0]) 486 } else { 487 return fieldByIndex(base, f.index) 488 } 489 } 490 } 491 492 func nodeString(t reflect.Type, name string, tag ...string) string { 493 return fmt.Sprintf("%s %s %v", name, t.String(), tag) 494 } 495 496 func throwInvalidTag(t reflect.Type, name string, tag string) { 497 panic(tag + " is an invalid parquet tag: " + nodeString(t, name, tag)) 498 } 499 500 func throwUnknownTag(t reflect.Type, name string, tag string) { 501 panic(tag + " is an unrecognized parquet tag: " + nodeString(t, name, tag)) 502 } 503 504 func throwInvalidNode(t reflect.Type, msg, name string, tag ...string) { 505 panic(msg + ": " + nodeString(t, name, tag...)) 506 } 
507 508 // FixedLenByteArray decimals are sized based on precision 509 // this function calculates the necessary byte array size. 510 func decimalFixedLenByteArraySize(precision int) int { 511 return int(math.Ceil((math.Log10(2) + float64(precision)) / math.Log10(256))) 512 } 513 514 func forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, option, args string)) { 515 if tag := sf.Tag.Get("parquet"); tag != "" { 516 _, tag = split(tag) // skip the field name 517 for tag != "" { 518 option := "" 519 args := "" 520 option, tag = split(tag) 521 option, args = splitOptionArgs(option) 522 ft := sf.Type 523 if ft.Kind() == reflect.Ptr { 524 ft = ft.Elem() 525 } 526 do(ft, option, args) 527 } 528 } 529 } 530 531 func nodeOf(t reflect.Type, tag []string) Node { 532 switch t { 533 case reflect.TypeOf(deprecated.Int96{}): 534 return Leaf(Int96Type) 535 case reflect.TypeOf(uuid.UUID{}): 536 return UUID() 537 case reflect.TypeOf(time.Time{}): 538 return Timestamp(Nanosecond) 539 } 540 541 var n Node 542 switch t.Kind() { 543 case reflect.Bool: 544 n = Leaf(BooleanType) 545 546 case reflect.Int, reflect.Int64: 547 n = Int(64) 548 549 case reflect.Int8, reflect.Int16, reflect.Int32: 550 n = Int(t.Bits()) 551 552 case reflect.Uint, reflect.Uintptr, reflect.Uint64: 553 n = Uint(64) 554 555 case reflect.Uint8, reflect.Uint16, reflect.Uint32: 556 n = Uint(t.Bits()) 557 558 case reflect.Float32: 559 n = Leaf(FloatType) 560 561 case reflect.Float64: 562 n = Leaf(DoubleType) 563 564 case reflect.String: 565 n = String() 566 567 case reflect.Ptr: 568 n = Optional(nodeOf(t.Elem(), nil)) 569 570 case reflect.Slice: 571 if elem := t.Elem(); elem.Kind() == reflect.Uint8 { // []byte? 
572 n = Leaf(ByteArrayType) 573 } else { 574 n = Repeated(nodeOf(elem, nil)) 575 } 576 577 case reflect.Array: 578 if t.Elem().Kind() == reflect.Uint8 { 579 n = Leaf(FixedLenByteArrayType(t.Len())) 580 } 581 582 case reflect.Map: 583 var mapTag, valueTag, keyTag string 584 if len(tag) > 0 { 585 mapTag = tag[0] 586 if len(tag) > 1 { 587 keyTag = tag[1] 588 } 589 if len(tag) >= 2 { 590 valueTag = tag[2] 591 } 592 } 593 594 if strings.Contains(mapTag, "json") { 595 n = JSON() 596 } else { 597 n = Map( 598 makeNodeOf(t.Key(), t.Name(), []string{keyTag}), 599 makeNodeOf(t.Elem(), t.Name(), []string{valueTag}), 600 ) 601 } 602 603 forEachTagOption([]string{mapTag}, func(option, args string) { 604 switch option { 605 case "", "json": 606 return 607 case "optional": 608 n = Optional(n) 609 case "id": 610 id, err := parseIDArgs(args) 611 if err != nil { 612 throwInvalidTag(t, "map", option) 613 } 614 n = FieldID(n, id) 615 default: 616 throwUnknownTag(t, "map", option) 617 } 618 }) 619 620 case reflect.Struct: 621 return structNodeOf(t) 622 } 623 624 if n == nil { 625 panic("cannot create parquet node from go value of type " + t.String()) 626 } 627 628 return &goNode{Node: n, gotype: t} 629 } 630 631 func split(s string) (head, tail string) { 632 if i := strings.IndexByte(s, ','); i < 0 { 633 head = s 634 } else { 635 head, tail = s[:i], s[i+1:] 636 } 637 return 638 } 639 640 func splitOptionArgs(s string) (option, args string) { 641 if i := strings.IndexByte(s, '('); i >= 0 { 642 option = s[:i] 643 args = s[i:] 644 } else { 645 option = s 646 args = "()" 647 } 648 return 649 } 650 651 func parseDecimalArgs(args string) (scale, precision int, err error) { 652 if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") { 653 return 0, 0, fmt.Errorf("malformed decimal args: %s", args) 654 } 655 args = strings.TrimPrefix(args, "(") 656 args = strings.TrimSuffix(args, ")") 657 parts := strings.Split(args, ":") 658 if len(parts) != 2 { 659 return 0, 0, 
fmt.Errorf("malformed decimal args: (%s)", args) 660 } 661 s, err := strconv.ParseInt(parts[0], 10, 32) 662 if err != nil { 663 return 0, 0, err 664 } 665 p, err := strconv.ParseInt(parts[1], 10, 32) 666 if err != nil { 667 return 0, 0, err 668 } 669 return int(s), int(p), nil 670 } 671 672 func parseIDArgs(args string) (int, error) { 673 if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") { 674 return 0, fmt.Errorf("malformed id args: %s", args) 675 } 676 args = strings.TrimPrefix(args, "(") 677 args = strings.TrimSuffix(args, ")") 678 return strconv.Atoi(args) 679 } 680 681 func parseTimestampArgs(args string) (TimeUnit, error) { 682 if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") { 683 return nil, fmt.Errorf("malformed timestamp args: %s", args) 684 } 685 686 args = strings.TrimPrefix(args, "(") 687 args = strings.TrimSuffix(args, ")") 688 689 if len(args) == 0 { 690 return Millisecond, nil 691 } 692 693 switch args { 694 case "millisecond": 695 return Millisecond, nil 696 case "microsecond": 697 return Microsecond, nil 698 case "nanosecond": 699 return Nanosecond, nil 700 default: 701 } 702 703 return nil, fmt.Errorf("unknown time unit: %s", args) 704 } 705 706 type goNode struct { 707 Node 708 gotype reflect.Type 709 } 710 711 func (n *goNode) GoType() reflect.Type { return n.gotype } 712 713 var ( 714 _ RowGroupOption = (*Schema)(nil) 715 _ ReaderOption = (*Schema)(nil) 716 _ WriterOption = (*Schema)(nil) 717 ) 718 719 func makeNodeOf(t reflect.Type, name string, tag []string) Node { 720 var ( 721 node Node 722 optional bool 723 list bool 724 encoded encoding.Encoding 725 compressed compress.Codec 726 fieldID int 727 ) 728 729 setNode := func(n Node) { 730 if node != nil { 731 throwInvalidNode(t, "struct field has multiple logical parquet types declared", name, tag...) 
732 } 733 node = n 734 } 735 736 setOptional := func() { 737 if optional { 738 throwInvalidNode(t, "struct field has multiple declaration of the optional tag", name, tag...) 739 } 740 optional = true 741 } 742 743 setList := func() { 744 if list { 745 throwInvalidNode(t, "struct field has multiple declaration of the list tag", name, tag...) 746 } 747 list = true 748 } 749 750 setEncoding := func(e encoding.Encoding) { 751 if encoded != nil { 752 throwInvalidNode(t, "struct field has encoding declared multiple time", name, tag...) 753 } 754 encoded = e 755 } 756 757 setCompression := func(c compress.Codec) { 758 if compressed != nil { 759 throwInvalidNode(t, "struct field has compression codecs declared multiple times", name, tag...) 760 } 761 compressed = c 762 } 763 764 forEachTagOption(tag, func(option, args string) { 765 if t.Kind() == reflect.Map { 766 node = nodeOf(t, tag) 767 return 768 } 769 switch option { 770 case "": 771 return 772 case "optional": 773 setOptional() 774 775 case "snappy": 776 setCompression(&Snappy) 777 778 case "gzip": 779 setCompression(&Gzip) 780 781 case "brotli": 782 setCompression(&Brotli) 783 784 case "lz4": 785 setCompression(&Lz4Raw) 786 787 case "zstd": 788 setCompression(&Zstd) 789 790 case "uncompressed": 791 setCompression(&Uncompressed) 792 793 case "plain": 794 setEncoding(&Plain) 795 796 case "dict": 797 setEncoding(&RLEDictionary) 798 799 case "json": 800 setNode(JSON()) 801 802 case "delta": 803 switch t.Kind() { 804 case reflect.Int, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint32, reflect.Uint64: 805 setEncoding(&DeltaBinaryPacked) 806 case reflect.String: 807 setEncoding(&DeltaByteArray) 808 case reflect.Slice: 809 if t.Elem().Kind() == reflect.Uint8 { // []byte? 810 setEncoding(&DeltaByteArray) 811 } else { 812 throwInvalidTag(t, name, option) 813 } 814 case reflect.Array: 815 if t.Elem().Kind() == reflect.Uint8 { // [N]byte? 
816 setEncoding(&DeltaByteArray) 817 } else { 818 throwInvalidTag(t, name, option) 819 } 820 default: 821 switch t { 822 case reflect.TypeOf(time.Time{}): 823 setEncoding(&DeltaBinaryPacked) 824 default: 825 throwInvalidTag(t, name, option) 826 } 827 } 828 829 case "split": 830 switch t.Kind() { 831 case reflect.Float32, reflect.Float64: 832 setEncoding(&ByteStreamSplit) 833 default: 834 throwInvalidTag(t, name, option) 835 } 836 837 case "list": 838 switch t.Kind() { 839 case reflect.Slice: 840 element := nodeOf(t.Elem(), nil) 841 setNode(element) 842 setList() 843 default: 844 throwInvalidTag(t, name, option) 845 } 846 847 case "enum": 848 switch t.Kind() { 849 case reflect.String: 850 setNode(Enum()) 851 default: 852 throwInvalidTag(t, name, option) 853 } 854 855 case "uuid": 856 switch t.Kind() { 857 case reflect.Array: 858 if t.Elem().Kind() != reflect.Uint8 || t.Len() != 16 { 859 throwInvalidTag(t, name, option) 860 } 861 default: 862 throwInvalidTag(t, name, option) 863 } 864 865 case "decimal": 866 scale, precision, err := parseDecimalArgs(args) 867 if err != nil { 868 throwInvalidTag(t, name, option+args) 869 } 870 var baseType Type 871 switch t.Kind() { 872 case reflect.Int32: 873 baseType = Int32Type 874 case reflect.Int64: 875 baseType = Int64Type 876 case reflect.Array, reflect.Slice: 877 baseType = FixedLenByteArrayType(decimalFixedLenByteArraySize(precision)) 878 default: 879 throwInvalidTag(t, name, option) 880 } 881 882 setNode(Decimal(scale, precision, baseType)) 883 case "date": 884 switch t.Kind() { 885 case reflect.Int32: 886 setNode(Date()) 887 default: 888 throwInvalidTag(t, name, option) 889 } 890 case "timestamp": 891 switch t.Kind() { 892 case reflect.Int64: 893 timeUnit, err := parseTimestampArgs(args) 894 if err != nil { 895 throwInvalidTag(t, name, option) 896 } 897 setNode(Timestamp(timeUnit)) 898 default: 899 switch t { 900 case reflect.TypeOf(time.Time{}): 901 timeUnit, err := parseTimestampArgs(args) 902 if err != nil { 903 
throwInvalidTag(t, name, option) 904 } 905 setNode(Timestamp(timeUnit)) 906 default: 907 throwInvalidTag(t, name, option) 908 } 909 } 910 case "id": 911 id, err := parseIDArgs(args) 912 if err != nil { 913 throwInvalidNode(t, "struct field has field id that is not a valid int", name, tag...) 914 } 915 fieldID = id 916 } 917 }) 918 919 // Special case: an "optional" struct tag on a slice applies to the 920 // individual items, not the overall list. The least messy way to 921 // deal with this is at this level, instead of passing down optional 922 // information into the nodeOf function, and then passing back whether an 923 // optional tag was applied. 924 if node == nil && t.Kind() == reflect.Slice { 925 isUint8 := t.Elem().Kind() == reflect.Uint8 926 // Note for strings "optional" applies only to the entire BYTE_ARRAY and 927 // not each individual byte. 928 if optional && !isUint8 { 929 node = Repeated(Optional(nodeOf(t.Elem(), tag))) 930 // Don't also apply "optional" to the whole list. 
931 optional = false 932 } 933 } 934 935 if node == nil { 936 node = nodeOf(t, tag) 937 } 938 939 if compressed != nil { 940 node = Compressed(node, compressed) 941 } 942 943 if encoded != nil { 944 node = Encoded(node, encoded) 945 } 946 947 if list { 948 node = List(node) 949 } 950 951 if node.Repeated() && !list { 952 repeated := node.GoType().Elem() 953 if repeated.Kind() == reflect.Slice { 954 // Special case: allow [][]uint as seen in a logical map of strings 955 if repeated.Elem().Kind() != reflect.Uint8 { 956 panic("unhandled nested slice on parquet schema without list tag") 957 } 958 } 959 } 960 961 if optional { 962 node = Optional(node) 963 } 964 if fieldID != 0 { 965 node = FieldID(node, fieldID) 966 } 967 return node 968 } 969 970 func forEachTagOption(tags []string, do func(option, args string)) { 971 for _, tag := range tags { 972 _, tag = split(tag) // skip the field name 973 for tag != "" { 974 option := "" 975 option, tag = split(tag) 976 var args string 977 option, args = splitOptionArgs(option) 978 do(option, args) 979 } 980 } 981 }