github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/schema.go (about)

     1  package parquet
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"reflect"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/google/uuid"
    13  	"github.com/segmentio/parquet-go/compress"
    14  	"github.com/segmentio/parquet-go/deprecated"
    15  	"github.com/segmentio/parquet-go/encoding"
    16  )
    17  
// Schema represents a parquet schema created from a Go value.
//
// Schema implements the Node interface to represent the root node of a parquet
// schema.
//
// The fields are populated by NewSchema: name and root are the constructor
// arguments, deconstruct and reconstruct are the functions derived from root
// by makeDeconstructFunc and makeReconstructFunc, and mapping and columns are
// the two results of columnMappingOf(root). Lookup searches mapping, and
// Columns returns columns as-is.
type Schema struct {
	name        string
	root        Node
	deconstruct deconstructFunc
	reconstruct reconstructFunc
	mapping     columnMapping
	columns     [][]string
}
    30  
    31  // SchemaOf constructs a parquet schema from a Go value.
    32  //
    33  // The function can construct parquet schemas from struct or pointer-to-struct
    34  // values only. A panic is raised if a Go value of a different type is passed
    35  // to this function.
    36  //
    37  // When creating a parquet Schema from a Go value, the struct fields may contain
    38  // a "parquet" tag to describe properties of the parquet node. The "parquet" tag
    39  // follows the conventional format of Go struct tags: a comma-separated list of
    40  // values describe the options, with the first one defining the name of the
    41  // parquet column.
    42  //
    43  // The following options are also supported in the "parquet" struct tag:
    44  //
    45  //	optional  | make the parquet column optional
    46  //	snappy    | sets the parquet column compression codec to snappy
    47  //	gzip      | sets the parquet column compression codec to gzip
    48  //	brotli    | sets the parquet column compression codec to brotli
    49  //	lz4       | sets the parquet column compression codec to lz4
    50  //	zstd      | sets the parquet column compression codec to zstd
    51  //	plain     | enables the plain encoding (no-op default)
    52  //	dict      | enables dictionary encoding on the parquet column
    53  //	delta     | enables delta encoding on the parquet column
    54  //	list      | for slice types, use the parquet LIST logical type
    55  //	enum      | for string types, use the parquet ENUM logical type
    56  //	uuid      | for string and [16]byte types, use the parquet UUID logical type
    57  //	decimal   | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type
    58  //	date      | for int32 types use the DATE logical type
    59  //	timestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision
    60  //	split     | for float32/float64, use the BYTE_STREAM_SPLIT encoding
    61  //
    62  // # The date logical type is an int32 value of the number of days since the unix epoch
    63  //
    64  // The timestamp precision can be changed by defining which precision to use as an argument.
    65  // Supported precisions are: nanosecond, millisecond and microsecond. Example:
    66  //
    67  //	type Message struct {
    68  //	  TimestrampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"
    69  //	}
    70  //
    71  // The decimal tag must be followed by two integer parameters, the first integer
    72  // representing the scale and the second the precision; for example:
    73  //
    74  //	type Item struct {
    75  //		Cost int64 `parquet:"cost,decimal(0:3)"`
    76  //	}
    77  //
    78  // Invalid combination of struct tags and Go types, or repeating options will
    79  // cause the function to panic.
    80  //
    81  // As a special case, if the field tag is "-", the field is omitted from the schema
    82  // and the data will not be written into the parquet file(s).
    83  // Note that a field with name "-" can still be generated using the tag "-,".
    84  //
    85  // The configuration of Parquet maps are done via two tags:
    86  //   - The `parquet-key` tag allows to configure the key of a map.
    87  //   - The parquet-value tag allows users to configure a map's values, for example to declare their native Parquet types.
    88  //
    89  // When configuring a Parquet map, the `parquet` tag will configure the map itself.
    90  //
    91  // For example, the following will set the int64 key of the map to be a timestamp:
    92  //
    93  //	type Actions struct {
    94  //	  Action map[int64]string `parquet:"," parquet-key:",timestamp"`
    95  //	}
    96  //
    97  // The schema name is the Go type name of the value.
    98  func SchemaOf(model interface{}) *Schema {
    99  	return schemaOf(dereference(reflect.TypeOf(model)))
   100  }
   101  
   102  var cachedSchemas sync.Map // map[reflect.Type]*Schema
   103  
   104  func schemaOf(model reflect.Type) *Schema {
   105  	cached, _ := cachedSchemas.Load(model)
   106  	schema, _ := cached.(*Schema)
   107  	if schema != nil {
   108  		return schema
   109  	}
   110  	if model.Kind() != reflect.Struct {
   111  		panic("cannot construct parquet schema from value of type " + model.String())
   112  	}
   113  	schema = NewSchema(model.Name(), nodeOf(model, nil))
   114  	if actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded {
   115  		schema = actual.(*Schema)
   116  	}
   117  	return schema
   118  }
   119  
   120  // NewSchema constructs a new Schema object with the given name and root node.
   121  //
   122  // The function panics if Node contains more leaf columns than supported by the
   123  // package (see parquet.MaxColumnIndex).
   124  func NewSchema(name string, root Node) *Schema {
   125  	mapping, columns := columnMappingOf(root)
   126  	return &Schema{
   127  		name:        name,
   128  		root:        root,
   129  		deconstruct: makeDeconstructFunc(root),
   130  		reconstruct: makeReconstructFunc(root),
   131  		mapping:     mapping,
   132  		columns:     columns,
   133  	}
   134  }
   135  
   136  func dereference(t reflect.Type) reflect.Type {
   137  	for t.Kind() == reflect.Ptr {
   138  		t = t.Elem()
   139  	}
   140  	return t
   141  }
   142  
   143  func makeDeconstructFunc(node Node) (deconstruct deconstructFunc) {
   144  	if schema, _ := node.(*Schema); schema != nil {
   145  		return schema.deconstruct
   146  	}
   147  	if !node.Leaf() {
   148  		_, deconstruct = deconstructFuncOf(0, node)
   149  	}
   150  	return deconstruct
   151  }
   152  
   153  func makeReconstructFunc(node Node) (reconstruct reconstructFunc) {
   154  	if schema, _ := node.(*Schema); schema != nil {
   155  		return schema.reconstruct
   156  	}
   157  	if !node.Leaf() {
   158  		_, reconstruct = reconstructFuncOf(0, node)
   159  	}
   160  	return reconstruct
   161  }
   162  
   163  // ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema
   164  // instances to be passed to row group constructors to pre-declare the schema of
   165  // the output parquet file.
   166  func (s *Schema) ConfigureRowGroup(config *RowGroupConfig) { config.Schema = s }
   167  
   168  // ConfigureReader satisfies the ReaderOption interface, allowing Schema
   169  // instances to be passed to NewReader to pre-declare the schema of rows
   170  // read from the reader.
   171  func (s *Schema) ConfigureReader(config *ReaderConfig) { config.Schema = s }
   172  
   173  // ConfigureWriter satisfies the WriterOption interface, allowing Schema
   174  // instances to be passed to NewWriter to pre-declare the schema of the
   175  // output parquet file.
   176  func (s *Schema) ConfigureWriter(config *WriterConfig) { config.Schema = s }
   177  
   178  // String returns a parquet schema representation of s.
   179  func (s *Schema) String() string { return sprint(s.name, s.root) }
   180  
   181  // Name returns the name of s.
   182  func (s *Schema) Name() string { return s.name }
   183  
   184  // Type returns the parquet type of s.
   185  func (s *Schema) Type() Type { return s.root.Type() }
   186  
   187  // Optional returns false since the root node of a parquet schema is always required.
   188  func (s *Schema) Optional() bool { return s.root.Optional() }
   189  
   190  // Repeated returns false since the root node of a parquet schema is always required.
   191  func (s *Schema) Repeated() bool { return s.root.Repeated() }
   192  
   193  // Required returns true since the root node of a parquet schema is always required.
   194  func (s *Schema) Required() bool { return s.root.Required() }
   195  
   196  // Leaf returns true if the root node of the parquet schema is a leaf column.
   197  func (s *Schema) Leaf() bool { return s.root.Leaf() }
   198  
   199  // Fields returns the list of fields on the root node of the parquet schema.
   200  func (s *Schema) Fields() []Field { return s.root.Fields() }
   201  
   202  // Encoding returns the encoding set on the root node of the parquet schema.
   203  func (s *Schema) Encoding() encoding.Encoding { return s.root.Encoding() }
   204  
   205  // Compression returns the compression codec set on the root node of the parquet
   206  // schema.
   207  func (s *Schema) Compression() compress.Codec { return s.root.Compression() }
   208  
   209  // GoType returns the Go type that best represents the schema.
   210  func (s *Schema) GoType() reflect.Type { return s.root.GoType() }
   211  
   212  // Deconstruct deconstructs a Go value and appends it to a row.
   213  //
   214  // The method panics is the structure of the go value does not match the
   215  // parquet schema.
   216  func (s *Schema) Deconstruct(row Row, value interface{}) Row {
   217  	columns := make([][]Value, len(s.columns))
   218  	values := make([]Value, len(s.columns))
   219  
   220  	for i := range columns {
   221  		columns[i] = values[i : i : i+1]
   222  	}
   223  
   224  	s.deconstructValueToColumns(columns, reflect.ValueOf(value))
   225  	return appendRow(row, columns)
   226  }
   227  
   228  func (s *Schema) deconstructValueToColumns(columns [][]Value, value reflect.Value) {
   229  	for value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface {
   230  		if value.IsNil() {
   231  			value = reflect.Value{}
   232  			break
   233  		}
   234  		value = value.Elem()
   235  	}
   236  	s.deconstruct(columns, levels{}, value)
   237  }
   238  
   239  // Reconstruct reconstructs a Go value from a row.
   240  //
   241  // The go value passed as first argument must be a non-nil pointer for the
   242  // row to be decoded into.
   243  //
   244  // The method panics if the structure of the go value and parquet row do not
   245  // match.
   246  func (s *Schema) Reconstruct(value interface{}, row Row) error {
   247  	v := reflect.ValueOf(value)
   248  	if !v.IsValid() {
   249  		panic("cannot reconstruct row into go value of type <nil>")
   250  	}
   251  	if v.Kind() != reflect.Ptr {
   252  		panic("cannot reconstruct row into go value of non-pointer type " + v.Type().String())
   253  	}
   254  	if v.IsNil() {
   255  		panic("cannot reconstruct row into nil pointer of type " + v.Type().String())
   256  	}
   257  	for v.Kind() == reflect.Ptr {
   258  		if v.IsNil() {
   259  			v.Set(reflect.New(v.Type().Elem()))
   260  		}
   261  		v = v.Elem()
   262  	}
   263  
   264  	columns := make([][]Value, len(s.columns))
   265  	row.Range(func(columnIndex int, columnValues []Value) bool {
   266  		if columnIndex < len(columns) {
   267  			columns[columnIndex] = columnValues
   268  		}
   269  		return true
   270  	})
   271  
   272  	return s.reconstruct(v, levels{}, columns)
   273  }
   274  
   275  // Lookup returns the leaf column at the given path.
   276  //
   277  // The path is the sequence of column names identifying a leaf column (not
   278  // including the root).
   279  //
   280  // If the path was not found in the mapping, or if it did not represent a
   281  // leaf column of the parquet schema, the boolean will be false.
   282  func (s *Schema) Lookup(path ...string) (LeafColumn, bool) {
   283  	leaf := s.mapping.lookup(path)
   284  	return LeafColumn{
   285  		Node:               leaf.node,
   286  		Path:               leaf.path,
   287  		ColumnIndex:        int(leaf.columnIndex),
   288  		MaxRepetitionLevel: int(leaf.maxRepetitionLevel),
   289  		MaxDefinitionLevel: int(leaf.maxDefinitionLevel),
   290  	}, leaf.node != nil
   291  }
   292  
   293  // Columns returns the list of column paths available in the schema.
   294  //
   295  // The method always returns the same slice value across calls to ColumnPaths,
   296  // applications should treat it as immutable.
   297  func (s *Schema) Columns() [][]string {
   298  	return s.columns
   299  }
   300  
   301  // Comparator constructs a comparator function which orders rows according to
   302  // the list of sorting columns passed as arguments.
   303  func (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int {
   304  	return compareRowsFuncOf(s, sortingColumns)
   305  }
   306  
   307  func (s *Schema) forEachNode(do func(name string, node Node)) {
   308  	forEachNodeOf(s.Name(), s, do)
   309  }
   310  
// structNode is the node created by structNodeOf for a Go struct type.
//
// gotype is the struct type the node was built from (returned by GoType), and
// fields holds one entry per struct field selected by structFieldsOf, in the
// order that function returned them.
type structNode struct {
	gotype reflect.Type
	fields []structField
}
   315  
   316  func structNodeOf(t reflect.Type) *structNode {
   317  	// Collect struct fields first so we can order them before generating the
   318  	// column indexes.
   319  	fields := structFieldsOf(t)
   320  
   321  	s := &structNode{
   322  		gotype: t,
   323  		fields: make([]structField, len(fields)),
   324  	}
   325  
   326  	for i := range fields {
   327  		field := structField{name: fields[i].Name, index: fields[i].Index}
   328  		field.Node = makeNodeOf(fields[i].Type, fields[i].Name, []string{
   329  			fields[i].Tag.Get("parquet"),
   330  			fields[i].Tag.Get("parquet-key"),
   331  			fields[i].Tag.Get("parquet-value"),
   332  		})
   333  		s.fields[i] = field
   334  	}
   335  
   336  	return s
   337  }
   338  
   339  func structFieldsOf(t reflect.Type) []reflect.StructField {
   340  	fields := appendStructFields(t, nil, nil, 0)
   341  
   342  	for i := range fields {
   343  		f := &fields[i]
   344  
   345  		if tag := f.Tag.Get("parquet"); tag != "" {
   346  			name, _ := split(tag)
   347  			if name != "" {
   348  				f.Name = name
   349  			}
   350  		}
   351  	}
   352  
   353  	return fields
   354  }
   355  
   356  func appendStructFields(t reflect.Type, fields []reflect.StructField, index []int, offset uintptr) []reflect.StructField {
   357  	for i, n := 0, t.NumField(); i < n; i++ {
   358  		f := t.Field(i)
   359  		if tag := f.Tag.Get("parquet"); tag != "" {
   360  			name, _ := split(tag)
   361  			if tag != "-," && name == "-" {
   362  				continue
   363  			}
   364  		}
   365  
   366  		fieldIndex := index[:len(index):len(index)]
   367  		fieldIndex = append(fieldIndex, i)
   368  
   369  		f.Offset += offset
   370  
   371  		if f.Anonymous {
   372  			fields = appendStructFields(f.Type, fields, fieldIndex, f.Offset)
   373  		} else if f.IsExported() {
   374  			f.Index = fieldIndex
   375  			fields = append(fields, f)
   376  		}
   377  	}
   378  	return fields
   379  }
   380  
   381  func (s *structNode) Optional() bool { return false }
   382  
   383  func (s *structNode) Repeated() bool { return false }
   384  
   385  func (s *structNode) Required() bool { return true }
   386  
   387  func (s *structNode) Leaf() bool { return false }
   388  
   389  func (s *structNode) Encoding() encoding.Encoding { return nil }
   390  
   391  func (s *structNode) Compression() compress.Codec { return nil }
   392  
   393  func (s *structNode) GoType() reflect.Type { return s.gotype }
   394  
   395  func (s *structNode) String() string { return sprint("", s) }
   396  
   397  func (s *structNode) Type() Type { return groupType{} }
   398  
   399  func (s *structNode) Fields() []Field {
   400  	fields := make([]Field, len(s.fields))
   401  	for i := range s.fields {
   402  		fields[i] = &s.fields[i]
   403  	}
   404  	return fields
   405  }
   406  
   407  // fieldByIndex is like reflect.Value.FieldByIndex but returns the zero-value of
   408  // reflect.Value if one of the fields was a nil pointer instead of panicking.
   409  func fieldByIndex(v reflect.Value, index []int) reflect.Value {
   410  	for _, i := range index {
   411  		if v = v.Field(i); v.Kind() == reflect.Ptr || v.Kind() == reflect.Interface {
   412  			if v.IsNil() {
   413  				v = reflect.Value{}
   414  				break
   415  			} else {
   416  				v = v.Elem()
   417  			}
   418  		}
   419  	}
   420  	return v
   421  }
   422  
// structField is one field of a structNode.
//
// The embedded Node is the parquet node built for the field by makeNodeOf,
// name is the field name returned by Name, and index is the reflect field
// index path used by Value to locate the field in a Go struct value.
type structField struct {
	Node
	name  string
	index []int
}
   428  
   429  func (f *structField) Name() string { return f.name }
   430  
   431  func (f *structField) Value(base reflect.Value) reflect.Value {
   432  	switch base.Kind() {
   433  	case reflect.Map:
   434  		return base.MapIndex(reflect.ValueOf(&f.name).Elem())
   435  	case reflect.Ptr:
   436  		if base.IsNil() {
   437  			base.Set(reflect.New(base.Type().Elem()))
   438  		}
   439  		return fieldByIndex(base.Elem(), f.index)
   440  	default:
   441  		if len(f.index) == 1 {
   442  			return base.Field(f.index[0])
   443  		} else {
   444  			return fieldByIndex(base, f.index)
   445  		}
   446  	}
   447  }
   448  
   449  func nodeString(t reflect.Type, name string, tag ...string) string {
   450  	return fmt.Sprintf("%s %s %v", name, t.String(), tag)
   451  }
   452  
   453  func throwInvalidTag(t reflect.Type, name string, tag string) {
   454  	panic(tag + " is an invalid parquet tag: " + nodeString(t, name, tag))
   455  }
   456  
   457  func throwUnknownTag(t reflect.Type, name string, tag string) {
   458  	panic(tag + " is an unrecognized parquet tag: " + nodeString(t, name, tag))
   459  }
   460  
   461  func throwInvalidNode(t reflect.Type, msg, name string, tag ...string) {
   462  	panic(msg + ": " + nodeString(t, name, tag...))
   463  }
   464  
   465  // FixedLenByteArray decimals are sized based on precision
   466  // this function calculates the necessary byte array size.
   467  func decimalFixedLenByteArraySize(precision int) int {
   468  	return int(math.Ceil((math.Log10(2) + float64(precision)) / math.Log10(256)))
   469  }
   470  
   471  func forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, option, args string)) {
   472  	if tag := sf.Tag.Get("parquet"); tag != "" {
   473  		_, tag = split(tag) // skip the field name
   474  		for tag != "" {
   475  			option := ""
   476  			args := ""
   477  			option, tag = split(tag)
   478  			option, args = splitOptionArgs(option)
   479  			ft := sf.Type
   480  			if ft.Kind() == reflect.Ptr {
   481  				ft = ft.Elem()
   482  			}
   483  			do(ft, option, args)
   484  		}
   485  	}
   486  }
   487  
   488  func nodeOf(t reflect.Type, tag []string) Node {
   489  	switch t {
   490  	case reflect.TypeOf(deprecated.Int96{}):
   491  		return Leaf(Int96Type)
   492  	case reflect.TypeOf(uuid.UUID{}):
   493  		return UUID()
   494  	case reflect.TypeOf(time.Time{}):
   495  		return Timestamp(Nanosecond)
   496  	}
   497  
   498  	var n Node
   499  	switch t.Kind() {
   500  	case reflect.Bool:
   501  		n = Leaf(BooleanType)
   502  
   503  	case reflect.Int, reflect.Int64:
   504  		n = Int(64)
   505  
   506  	case reflect.Int8, reflect.Int16, reflect.Int32:
   507  		n = Int(t.Bits())
   508  
   509  	case reflect.Uint, reflect.Uintptr, reflect.Uint64:
   510  		n = Uint(64)
   511  
   512  	case reflect.Uint8, reflect.Uint16, reflect.Uint32:
   513  		n = Uint(t.Bits())
   514  
   515  	case reflect.Float32:
   516  		n = Leaf(FloatType)
   517  
   518  	case reflect.Float64:
   519  		n = Leaf(DoubleType)
   520  
   521  	case reflect.String:
   522  		n = String()
   523  
   524  	case reflect.Ptr:
   525  		n = Optional(nodeOf(t.Elem(), nil))
   526  
   527  	case reflect.Slice:
   528  		if elem := t.Elem(); elem.Kind() == reflect.Uint8 { // []byte?
   529  			n = Leaf(ByteArrayType)
   530  		} else {
   531  			n = Repeated(nodeOf(elem, nil))
   532  		}
   533  
   534  	case reflect.Array:
   535  		if t.Elem().Kind() == reflect.Uint8 {
   536  			n = Leaf(FixedLenByteArrayType(t.Len()))
   537  		}
   538  
   539  	case reflect.Map:
   540  		var mapTag, valueTag, keyTag string
   541  		if len(tag) > 0 {
   542  			mapTag = tag[0]
   543  			if len(tag) > 1 {
   544  				keyTag = tag[1]
   545  			}
   546  			if len(tag) >= 2 {
   547  				valueTag = tag[2]
   548  			}
   549  		}
   550  
   551  		if strings.Contains(mapTag, "json") {
   552  			n = JSON()
   553  		} else {
   554  			n = Map(
   555  				makeNodeOf(t.Key(), t.Name(), []string{keyTag}),
   556  				makeNodeOf(t.Elem(), t.Name(), []string{valueTag}),
   557  			)
   558  		}
   559  
   560  		forEachTagOption([]string{mapTag}, func(option, args string) {
   561  			switch option {
   562  			case "", "json":
   563  				return
   564  			case "optional":
   565  				n = Optional(n)
   566  			default:
   567  				throwUnknownTag(t, "map", option)
   568  			}
   569  		})
   570  
   571  	case reflect.Struct:
   572  		return structNodeOf(t)
   573  	}
   574  
   575  	if n == nil {
   576  		panic("cannot create parquet node from go value of type " + t.String())
   577  	}
   578  
   579  	return &goNode{Node: n, gotype: t}
   580  }
   581  
   582  func split(s string) (head, tail string) {
   583  	if i := strings.IndexByte(s, ','); i < 0 {
   584  		head = s
   585  	} else {
   586  		head, tail = s[:i], s[i+1:]
   587  	}
   588  	return
   589  }
   590  
   591  func splitOptionArgs(s string) (option, args string) {
   592  	if i := strings.IndexByte(s, '('); i >= 0 {
   593  		option = s[:i]
   594  		args = s[i:]
   595  	} else {
   596  		option = s
   597  		args = "()"
   598  	}
   599  	return
   600  }
   601  
   602  func parseDecimalArgs(args string) (scale, precision int, err error) {
   603  	if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") {
   604  		return 0, 0, fmt.Errorf("malformed decimal args: %s", args)
   605  	}
   606  	args = strings.TrimPrefix(args, "(")
   607  	args = strings.TrimSuffix(args, ")")
   608  	parts := strings.Split(args, ":")
   609  	if len(parts) != 2 {
   610  		return 0, 0, fmt.Errorf("malformed decimal args: (%s)", args)
   611  	}
   612  	s, err := strconv.ParseInt(parts[0], 10, 32)
   613  	if err != nil {
   614  		return 0, 0, err
   615  	}
   616  	p, err := strconv.ParseInt(parts[1], 10, 32)
   617  	if err != nil {
   618  		return 0, 0, err
   619  	}
   620  	return int(s), int(p), nil
   621  }
   622  
   623  func parseTimestampArgs(args string) (TimeUnit, error) {
   624  	if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") {
   625  		return nil, fmt.Errorf("malformed timestamp args: %s", args)
   626  	}
   627  
   628  	args = strings.TrimPrefix(args, "(")
   629  	args = strings.TrimSuffix(args, ")")
   630  
   631  	if len(args) == 0 {
   632  		return Millisecond, nil
   633  	}
   634  
   635  	switch args {
   636  	case "millisecond":
   637  		return Millisecond, nil
   638  	case "microsecond":
   639  		return Microsecond, nil
   640  	case "nanosecond":
   641  		return Nanosecond, nil
   642  	default:
   643  	}
   644  
   645  	return nil, fmt.Errorf("unknown time unit: %s", args)
   646  }
   647  
// goNode wraps a Node to override the Go type it reports: the embedded Node
// provides every other method, while GoType returns gotype. nodeOf uses it to
// attach the originating Go type to the nodes it creates.
type goNode struct {
	Node
	gotype reflect.Type
}
   652  
   653  func (n *goNode) GoType() reflect.Type { return n.gotype }
   654  
// Compile-time checks that *Schema satisfies the option interfaces accepted by
// row group, reader and writer constructors.
var (
	_ RowGroupOption = (*Schema)(nil)
	_ ReaderOption   = (*Schema)(nil)
	_ WriterOption   = (*Schema)(nil)
)
   660  
   661  func makeNodeOf(t reflect.Type, name string, tag []string) Node {
   662  	var (
   663  		node       Node
   664  		optional   bool
   665  		list       bool
   666  		encoded    encoding.Encoding
   667  		compressed compress.Codec
   668  	)
   669  
   670  	setNode := func(n Node) {
   671  		if node != nil {
   672  			throwInvalidNode(t, "struct field has multiple logical parquet types declared", name, tag...)
   673  		}
   674  		node = n
   675  	}
   676  
   677  	setOptional := func() {
   678  		if optional {
   679  			throwInvalidNode(t, "struct field has multiple declaration of the optional tag", name, tag...)
   680  		}
   681  		optional = true
   682  	}
   683  
   684  	setList := func() {
   685  		if list {
   686  			throwInvalidNode(t, "struct field has multiple declaration of the list tag", name, tag...)
   687  		}
   688  		list = true
   689  	}
   690  
   691  	setEncoding := func(e encoding.Encoding) {
   692  		if encoded != nil {
   693  			throwInvalidNode(t, "struct field has encoding declared multiple time", name, tag...)
   694  		}
   695  		encoded = e
   696  	}
   697  
   698  	setCompression := func(c compress.Codec) {
   699  		if compressed != nil {
   700  			throwInvalidNode(t, "struct field has compression codecs declared multiple times", name, tag...)
   701  		}
   702  		compressed = c
   703  	}
   704  
   705  	forEachTagOption(tag, func(option, args string) {
   706  		if t.Kind() == reflect.Map {
   707  			node = nodeOf(t, tag)
   708  			return
   709  		}
   710  		switch option {
   711  		case "":
   712  			return
   713  		case "optional":
   714  			setOptional()
   715  
   716  		case "snappy":
   717  			setCompression(&Snappy)
   718  
   719  		case "gzip":
   720  			setCompression(&Gzip)
   721  
   722  		case "brotli":
   723  			setCompression(&Brotli)
   724  
   725  		case "lz4":
   726  			setCompression(&Lz4Raw)
   727  
   728  		case "zstd":
   729  			setCompression(&Zstd)
   730  
   731  		case "uncompressed":
   732  			setCompression(&Uncompressed)
   733  
   734  		case "plain":
   735  			setEncoding(&Plain)
   736  
   737  		case "dict":
   738  			setEncoding(&RLEDictionary)
   739  
   740  		case "json":
   741  			setNode(JSON())
   742  
   743  		case "delta":
   744  			switch t.Kind() {
   745  			case reflect.Int, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint32, reflect.Uint64:
   746  				setEncoding(&DeltaBinaryPacked)
   747  			case reflect.String:
   748  				setEncoding(&DeltaByteArray)
   749  			case reflect.Slice:
   750  				if t.Elem().Kind() == reflect.Uint8 { // []byte?
   751  					setEncoding(&DeltaByteArray)
   752  				} else {
   753  					throwInvalidTag(t, name, option)
   754  				}
   755  			case reflect.Array:
   756  				if t.Elem().Kind() == reflect.Uint8 { // [N]byte?
   757  					setEncoding(&DeltaByteArray)
   758  				} else {
   759  					throwInvalidTag(t, name, option)
   760  				}
   761  			default:
   762  				throwInvalidTag(t, name, option)
   763  			}
   764  
   765  		case "split":
   766  			switch t.Kind() {
   767  			case reflect.Float32, reflect.Float64:
   768  				setEncoding(&ByteStreamSplit)
   769  			default:
   770  				throwInvalidTag(t, name, option)
   771  			}
   772  
   773  		case "list":
   774  			switch t.Kind() {
   775  			case reflect.Slice:
   776  				element := nodeOf(t.Elem(), nil)
   777  				setNode(element)
   778  				setList()
   779  			default:
   780  				throwInvalidTag(t, name, option)
   781  			}
   782  
   783  		case "enum":
   784  			switch t.Kind() {
   785  			case reflect.String:
   786  				setNode(Enum())
   787  			default:
   788  				throwInvalidTag(t, name, option)
   789  			}
   790  
   791  		case "uuid":
   792  			switch t.Kind() {
   793  			case reflect.Array:
   794  				if t.Elem().Kind() != reflect.Uint8 || t.Len() != 16 {
   795  					throwInvalidTag(t, name, option)
   796  				}
   797  			default:
   798  				throwInvalidTag(t, name, option)
   799  			}
   800  
   801  		case "decimal":
   802  			scale, precision, err := parseDecimalArgs(args)
   803  			if err != nil {
   804  				throwInvalidTag(t, name, option+args)
   805  			}
   806  			var baseType Type
   807  			switch t.Kind() {
   808  			case reflect.Int32:
   809  				baseType = Int32Type
   810  			case reflect.Int64:
   811  				baseType = Int64Type
   812  			case reflect.Array, reflect.Slice:
   813  				baseType = FixedLenByteArrayType(decimalFixedLenByteArraySize(precision))
   814  			default:
   815  				throwInvalidTag(t, name, option)
   816  			}
   817  
   818  			setNode(Decimal(scale, precision, baseType))
   819  		case "date":
   820  			switch t.Kind() {
   821  			case reflect.Int32:
   822  				setNode(Date())
   823  			default:
   824  				throwInvalidTag(t, name, option)
   825  			}
   826  		case "timestamp":
   827  			switch t.Kind() {
   828  			case reflect.Int64:
   829  				timeUnit, err := parseTimestampArgs(args)
   830  				if err != nil {
   831  					throwInvalidTag(t, name, option)
   832  				}
   833  				setNode(Timestamp(timeUnit))
   834  			default:
   835  				switch t {
   836  				case reflect.TypeOf(time.Time{}):
   837  					timeUnit, err := parseTimestampArgs(args)
   838  					if err != nil {
   839  						throwInvalidTag(t, name, option)
   840  					}
   841  					setNode(Timestamp(timeUnit))
   842  				default:
   843  					throwInvalidTag(t, name, option)
   844  				}
   845  			}
   846  		default:
   847  			throwUnknownTag(t, name, option)
   848  		}
   849  	})
   850  
   851  	// Special case: an "optional" struct tag on a slice applies to the
   852  	// individual items, not the overall list. The least messy way to
   853  	// deal with this is at this level, instead of passing down optional
   854  	// information into the nodeOf function, and then passing back whether an
   855  	// optional tag was applied.
   856  	if node == nil && t.Kind() == reflect.Slice {
   857  		isUint8 := t.Elem().Kind() == reflect.Uint8
   858  		// Note for strings "optional" applies only to the entire BYTE_ARRAY and
   859  		// not each individual byte.
   860  		if optional && !isUint8 {
   861  			node = Repeated(Optional(nodeOf(t.Elem(), tag)))
   862  			// Don't also apply "optional" to the whole list.
   863  			optional = false
   864  		}
   865  	}
   866  
   867  	if node == nil {
   868  		node = nodeOf(t, tag)
   869  	}
   870  
   871  	if compressed != nil {
   872  		node = Compressed(node, compressed)
   873  	}
   874  
   875  	if encoded != nil {
   876  		node = Encoded(node, encoded)
   877  	}
   878  
   879  	if list {
   880  		node = List(node)
   881  	}
   882  
   883  	if node.Repeated() && !list {
   884  		elemKind := node.GoType().Elem().Kind()
   885  		if elemKind == reflect.Slice {
   886  			panic("unhandled nested slice on parquet schema without list tag")
   887  		}
   888  	}
   889  
   890  	if optional {
   891  		node = Optional(node)
   892  	}
   893  
   894  	return node
   895  }
   896  
   897  func forEachTagOption(tags []string, do func(option, args string)) {
   898  	for _, tag := range tags {
   899  		_, tag = split(tag) // skip the field name
   900  		for tag != "" {
   901  			option := ""
   902  			option, tag = split(tag)
   903  			var args string
   904  			option, args = splitOptionArgs(option)
   905  			do(option, args)
   906  		}
   907  	}
   908  }