github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/schema.go

github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/schema.go (about)

     1  package parquet
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"reflect"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/google/uuid"
    13  	"github.com/parquet-go/parquet-go/compress"
    14  	"github.com/parquet-go/parquet-go/deprecated"
    15  	"github.com/parquet-go/parquet-go/encoding"
    16  )
    17  
// Schema represents a parquet schema created from a Go value.
//
// Schema implements the Node interface to represent the root node of a parquet
// schema.
type Schema struct {
	// name is the schema name reported by Name and used by String.
	name string
	// root is the node that all Node methods of Schema delegate to.
	root Node
	// deconstruct is produced by makeDeconstructFunc and used by Deconstruct;
	// it is left nil when the root node is a leaf.
	deconstruct deconstructFunc
	// reconstruct is produced by makeReconstructFunc and used by Reconstruct;
	// it is left nil when the root node is a leaf.
	reconstruct reconstructFunc
	// mapping is the column lookup structure queried by Lookup.
	mapping columnMapping
	// columns holds the column paths returned by Columns; its length is also
	// used to size the per-column buffers in Deconstruct and Reconstruct.
	columns [][]string
}
    30  
    31  // SchemaOf constructs a parquet schema from a Go value.
    32  //
    33  // The function can construct parquet schemas from struct or pointer-to-struct
    34  // values only. A panic is raised if a Go value of a different type is passed
    35  // to this function.
    36  //
    37  // When creating a parquet Schema from a Go value, the struct fields may contain
    38  // a "parquet" tag to describe properties of the parquet node. The "parquet" tag
    39  // follows the conventional format of Go struct tags: a comma-separated list of
    40  // values describe the options, with the first one defining the name of the
    41  // parquet column.
    42  //
    43  // The following options are also supported in the "parquet" struct tag:
    44  //
    45  //	optional  | make the parquet column optional
    46  //	snappy    | sets the parquet column compression codec to snappy
    47  //	gzip      | sets the parquet column compression codec to gzip
    48  //	brotli    | sets the parquet column compression codec to brotli
    49  //	lz4       | sets the parquet column compression codec to lz4
    50  //	zstd      | sets the parquet column compression codec to zstd
    51  //	plain     | enables the plain encoding (no-op default)
    52  //	dict      | enables dictionary encoding on the parquet column
    53  //	delta     | enables delta encoding on the parquet column
    54  //	list      | for slice types, use the parquet LIST logical type
    55  //	enum      | for string types, use the parquet ENUM logical type
    56  //	uuid      | for string and [16]byte types, use the parquet UUID logical type
    57  //	decimal   | for int32, int64 and [n]byte types, use the parquet DECIMAL logical type
    58  //	date      | for int32 types use the DATE logical type
    59  //	timestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision
    60  //	split     | for float32/float64, use the BYTE_STREAM_SPLIT encoding
    61  //	id(n)     | where n is int denoting a column field id. Example id(2) for a column with field id of 2
    62  //
    63  // # The date logical type is an int32 value of the number of days since the unix epoch
    64  //
    65  // The timestamp precision can be changed by defining which precision to use as an argument.
    66  // Supported precisions are: nanosecond, millisecond and microsecond. Example:
    67  //
    68  //	type Message struct {
    69  //	  TimestrampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"
    70  //	}
    71  //
    72  // The decimal tag must be followed by two integer parameters, the first integer
    73  // representing the scale and the second the precision; for example:
    74  //
    75  //	type Item struct {
    76  //		Cost int64 `parquet:"cost,decimal(0:3)"`
    77  //	}
    78  //
    79  // Invalid combination of struct tags and Go types, or repeating options will
    80  // cause the function to panic.
    81  //
    82  // As a special case, if the field tag is "-", the field is omitted from the schema
    83  // and the data will not be written into the parquet file(s).
    84  // Note that a field with name "-" can still be generated using the tag "-,".
    85  //
    86  // The configuration of Parquet maps are done via two tags:
    87  //   - The `parquet-key` tag allows to configure the key of a map.
    88  //   - The parquet-value tag allows users to configure a map's values, for example to declare their native Parquet types.
    89  //
    90  // When configuring a Parquet map, the `parquet` tag will configure the map itself.
    91  //
    92  // For example, the following will set the int64 key of the map to be a timestamp:
    93  //
    94  //	type Actions struct {
    95  //	  Action map[int64]string `parquet:"," parquet-key:",timestamp"`
    96  //	}
    97  //
    98  // The schema name is the Go type name of the value.
    99  func SchemaOf(model interface{}) *Schema {
   100  	return schemaOf(dereference(reflect.TypeOf(model)))
   101  }
   102  
// cachedSchemas memoizes the schemas built by schemaOf, keyed by the Go type
// they were derived from.
var cachedSchemas sync.Map // map[reflect.Type]*Schema
   104  
   105  func schemaOf(model reflect.Type) *Schema {
   106  	cached, _ := cachedSchemas.Load(model)
   107  	schema, _ := cached.(*Schema)
   108  	if schema != nil {
   109  		return schema
   110  	}
   111  	if model.Kind() != reflect.Struct {
   112  		panic("cannot construct parquet schema from value of type " + model.String())
   113  	}
   114  	schema = NewSchema(model.Name(), nodeOf(model, nil))
   115  	if actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded {
   116  		schema = actual.(*Schema)
   117  	}
   118  	return schema
   119  }
   120  
   121  // NewSchema constructs a new Schema object with the given name and root node.
   122  //
   123  // The function panics if Node contains more leaf columns than supported by the
   124  // package (see parquet.MaxColumnIndex).
   125  func NewSchema(name string, root Node) *Schema {
   126  	mapping, columns := columnMappingOf(root)
   127  	return &Schema{
   128  		name:        name,
   129  		root:        root,
   130  		deconstruct: makeDeconstructFunc(root),
   131  		reconstruct: makeReconstructFunc(root),
   132  		mapping:     mapping,
   133  		columns:     columns,
   134  	}
   135  }
   136  
   137  func dereference(t reflect.Type) reflect.Type {
   138  	for t.Kind() == reflect.Ptr {
   139  		t = t.Elem()
   140  	}
   141  	return t
   142  }
   143  
   144  func makeDeconstructFunc(node Node) (deconstruct deconstructFunc) {
   145  	if schema, _ := node.(*Schema); schema != nil {
   146  		return schema.deconstruct
   147  	}
   148  	if !node.Leaf() {
   149  		_, deconstruct = deconstructFuncOf(0, node)
   150  	}
   151  	return deconstruct
   152  }
   153  
   154  func makeReconstructFunc(node Node) (reconstruct reconstructFunc) {
   155  	if schema, _ := node.(*Schema); schema != nil {
   156  		return schema.reconstruct
   157  	}
   158  	if !node.Leaf() {
   159  		_, reconstruct = reconstructFuncOf(0, node)
   160  	}
   161  	return reconstruct
   162  }
   163  
// ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema
// instances to be passed to row group constructors to pre-declare the schema of
// the output parquet file. It sets config.Schema to s.
func (s *Schema) ConfigureRowGroup(config *RowGroupConfig) { config.Schema = s }

// ConfigureReader satisfies the ReaderOption interface, allowing Schema
// instances to be passed to NewReader to pre-declare the schema of rows
// read from the reader. It sets config.Schema to s.
func (s *Schema) ConfigureReader(config *ReaderConfig) { config.Schema = s }

// ConfigureWriter satisfies the WriterOption interface, allowing Schema
// instances to be passed to NewWriter to pre-declare the schema of the
// output parquet file. It sets config.Schema to s.
func (s *Schema) ConfigureWriter(config *WriterConfig) { config.Schema = s }
   178  
// ID returns field id of the root node.
func (s *Schema) ID() int { return s.root.ID() }

// String returns a parquet schema representation of s.
func (s *Schema) String() string { return sprint(s.name, s.root) }

// Name returns the name of s.
func (s *Schema) Name() string { return s.name }

// Type returns the parquet type of s.
func (s *Schema) Type() Type { return s.root.Type() }

// Optional reports whether the root node of the schema is optional. The value
// is delegated to the root node rather than being a constant.
func (s *Schema) Optional() bool { return s.root.Optional() }

// Repeated reports whether the root node of the schema is repeated. The value
// is delegated to the root node rather than being a constant.
func (s *Schema) Repeated() bool { return s.root.Repeated() }

// Required reports whether the root node of the schema is required. The value
// is delegated to the root node rather than being a constant.
func (s *Schema) Required() bool { return s.root.Required() }

// Leaf returns true if the root node of the parquet schema is a leaf column.
func (s *Schema) Leaf() bool { return s.root.Leaf() }

// Fields returns the list of fields on the root node of the parquet schema.
func (s *Schema) Fields() []Field { return s.root.Fields() }

// Encoding returns the encoding set on the root node of the parquet schema.
func (s *Schema) Encoding() encoding.Encoding { return s.root.Encoding() }

// Compression returns the compression codec set on the root node of the parquet
// schema.
func (s *Schema) Compression() compress.Codec { return s.root.Compression() }

// GoType returns the Go type that best represents the schema.
func (s *Schema) GoType() reflect.Type { return s.root.GoType() }
   215  
   216  // Deconstruct deconstructs a Go value and appends it to a row.
   217  //
   218  // The method panics is the structure of the go value does not match the
   219  // parquet schema.
   220  func (s *Schema) Deconstruct(row Row, value interface{}) Row {
   221  	columns := make([][]Value, len(s.columns))
   222  	values := make([]Value, len(s.columns))
   223  
   224  	for i := range columns {
   225  		columns[i] = values[i : i : i+1]
   226  	}
   227  
   228  	s.deconstructValueToColumns(columns, reflect.ValueOf(value))
   229  	return appendRow(row, columns)
   230  }
   231  
   232  func (s *Schema) deconstructValueToColumns(columns [][]Value, value reflect.Value) {
   233  	for value.Kind() == reflect.Ptr || value.Kind() == reflect.Interface {
   234  		if value.IsNil() {
   235  			value = reflect.Value{}
   236  			break
   237  		}
   238  		value = value.Elem()
   239  	}
   240  	s.deconstruct(columns, levels{}, value)
   241  }
   242  
   243  // Reconstruct reconstructs a Go value from a row.
   244  //
   245  // The go value passed as first argument must be a non-nil pointer for the
   246  // row to be decoded into.
   247  //
   248  // The method panics if the structure of the go value and parquet row do not
   249  // match.
   250  func (s *Schema) Reconstruct(value interface{}, row Row) error {
   251  	v := reflect.ValueOf(value)
   252  	if !v.IsValid() {
   253  		panic("cannot reconstruct row into go value of type <nil>")
   254  	}
   255  	if v.Kind() != reflect.Ptr {
   256  		panic("cannot reconstruct row into go value of non-pointer type " + v.Type().String())
   257  	}
   258  	if v.IsNil() {
   259  		panic("cannot reconstruct row into nil pointer of type " + v.Type().String())
   260  	}
   261  	for v.Kind() == reflect.Ptr {
   262  		if v.IsNil() {
   263  			v.Set(reflect.New(v.Type().Elem()))
   264  		}
   265  		v = v.Elem()
   266  	}
   267  
   268  	b := valuesSliceBufferPool.Get().(*valuesSliceBuffer)
   269  
   270  	columns := b.reserve(len(s.columns))
   271  	row.Range(func(columnIndex int, columnValues []Value) bool {
   272  		if columnIndex < len(columns) {
   273  			columns[columnIndex] = columnValues
   274  		}
   275  		return true
   276  	})
   277  	// we avoid the defer penalty by releasing b manually
   278  	err := s.reconstruct(v, levels{}, columns)
   279  	b.release()
   280  	return err
   281  }
   282  
   283  type valuesSliceBuffer struct {
   284  	values [][]Value
   285  }
   286  
   287  func (v *valuesSliceBuffer) reserve(n int) [][]Value {
   288  	if n <= cap(v.values) {
   289  		return v.values[:n]
   290  	}
   291  	// we can try to keep growing by the power of two, but we care more about the
   292  	// memory footprint so  this should suffice.
   293  	//
   294  	// The nature of reads tends to be from similar number of columns.The less work
   295  	// we do here the better performance we can get.
   296  	v.values = make([][]Value, n)
   297  	return v.values
   298  }
   299  
   300  func (v *valuesSliceBuffer) release() {
   301  	v.values = v.values[:0]
   302  	valuesSliceBufferPool.Put(v)
   303  }
   304  
   305  var valuesSliceBufferPool = &sync.Pool{
   306  	New: func() interface{} {
   307  		return &valuesSliceBuffer{
   308  			// use 64 as a cache friendly base estimate of max column numbers we will be
   309  			// reading.
   310  			values: make([][]Value, 0, 64),
   311  		}
   312  	},
   313  }
   314  
   315  // Lookup returns the leaf column at the given path.
   316  //
   317  // The path is the sequence of column names identifying a leaf column (not
   318  // including the root).
   319  //
   320  // If the path was not found in the mapping, or if it did not represent a
   321  // leaf column of the parquet schema, the boolean will be false.
   322  func (s *Schema) Lookup(path ...string) (LeafColumn, bool) {
   323  	leaf := s.mapping.lookup(path)
   324  	return LeafColumn{
   325  		Node:               leaf.node,
   326  		Path:               leaf.path,
   327  		ColumnIndex:        int(leaf.columnIndex),
   328  		MaxRepetitionLevel: int(leaf.maxRepetitionLevel),
   329  		MaxDefinitionLevel: int(leaf.maxDefinitionLevel),
   330  	}, leaf.node != nil
   331  }
   332  
// Columns returns the list of column paths available in the schema.
//
// The method always returns the same slice value across calls to Columns,
// applications should treat it as immutable.
func (s *Schema) Columns() [][]string {
	return s.columns
}

// Comparator constructs a comparator function which orders rows according to
// the list of sorting columns passed as arguments.
func (s *Schema) Comparator(sortingColumns ...SortingColumn) func(Row, Row) int {
	return compareRowsFuncOf(s, sortingColumns)
}

// forEachNode calls do for the nodes of the schema, delegating the traversal
// to forEachNodeOf with the schema name and the schema itself as the root.
func (s *Schema) forEachNode(do func(name string, node Node)) {
	forEachNodeOf(s.Name(), s, do)
}
   350  
// structNode is the Node implementation built from a Go struct type by
// structNodeOf.
type structNode struct {
	// gotype is the Go struct type the node was built from.
	gotype reflect.Type
	// fields holds one entry per field returned by structFieldsOf, in the
	// same order.
	fields []structField
}
   355  
   356  func structNodeOf(t reflect.Type) *structNode {
   357  	// Collect struct fields first so we can order them before generating the
   358  	// column indexes.
   359  	fields := structFieldsOf(t)
   360  
   361  	s := &structNode{
   362  		gotype: t,
   363  		fields: make([]structField, len(fields)),
   364  	}
   365  
   366  	for i := range fields {
   367  		field := structField{name: fields[i].Name, index: fields[i].Index}
   368  		field.Node = makeNodeOf(fields[i].Type, fields[i].Name, []string{
   369  			fields[i].Tag.Get("parquet"),
   370  			fields[i].Tag.Get("parquet-key"),
   371  			fields[i].Tag.Get("parquet-value"),
   372  		})
   373  		s.fields[i] = field
   374  	}
   375  
   376  	return s
   377  }
   378  
   379  func structFieldsOf(t reflect.Type) []reflect.StructField {
   380  	fields := appendStructFields(t, nil, nil, 0)
   381  
   382  	for i := range fields {
   383  		f := &fields[i]
   384  
   385  		if tag := f.Tag.Get("parquet"); tag != "" {
   386  			name, _ := split(tag)
   387  			if name != "" {
   388  				f.Name = name
   389  			}
   390  		}
   391  	}
   392  
   393  	return fields
   394  }
   395  
   396  func appendStructFields(t reflect.Type, fields []reflect.StructField, index []int, offset uintptr) []reflect.StructField {
   397  	for i, n := 0, t.NumField(); i < n; i++ {
   398  		f := t.Field(i)
   399  		if tag := f.Tag.Get("parquet"); tag != "" {
   400  			name, _ := split(tag)
   401  			if tag != "-," && name == "-" {
   402  				continue
   403  			}
   404  		}
   405  
   406  		fieldIndex := index[:len(index):len(index)]
   407  		fieldIndex = append(fieldIndex, i)
   408  
   409  		f.Offset += offset
   410  
   411  		if f.Anonymous {
   412  			fields = appendStructFields(f.Type, fields, fieldIndex, f.Offset)
   413  		} else if f.IsExported() {
   414  			f.Index = fieldIndex
   415  			fields = append(fields, f)
   416  		}
   417  	}
   418  	return fields
   419  }
   420  
// Optional always returns false for struct nodes.
func (s *structNode) Optional() bool { return false }

// Repeated always returns false for struct nodes.
func (s *structNode) Repeated() bool { return false }

// Required always returns true for struct nodes.
func (s *structNode) Required() bool { return true }

// Leaf always returns false for struct nodes.
func (s *structNode) Leaf() bool { return false }

// Encoding always returns nil for struct nodes.
func (s *structNode) Encoding() encoding.Encoding { return nil }

// Compression always returns nil for struct nodes.
func (s *structNode) Compression() compress.Codec { return nil }

// GoType returns the Go struct type the node was built from.
func (s *structNode) GoType() reflect.Type { return s.gotype }

// ID always returns 0 for struct nodes.
func (s *structNode) ID() int { return 0 }

// String returns the representation of the node produced by sprint with an
// empty name.
func (s *structNode) String() string { return sprint("", s) }

// Type returns a groupType value.
func (s *structNode) Type() Type { return groupType{} }
   440  
   441  func (s *structNode) Fields() []Field {
   442  	fields := make([]Field, len(s.fields))
   443  	for i := range s.fields {
   444  		fields[i] = &s.fields[i]
   445  	}
   446  	return fields
   447  }
   448  
   449  // fieldByIndex is like reflect.Value.FieldByIndex but returns the zero-value of
   450  // reflect.Value if one of the fields was a nil pointer instead of panicking.
   451  func fieldByIndex(v reflect.Value, index []int) reflect.Value {
   452  	for _, i := range index {
   453  		if v = v.Field(i); v.Kind() == reflect.Ptr || v.Kind() == reflect.Interface {
   454  			if v.IsNil() {
   455  				v.Set(reflect.New(v.Type().Elem()))
   456  				v = v.Elem()
   457  				break
   458  			} else {
   459  				v = v.Elem()
   460  			}
   461  		}
   462  	}
   463  	return v
   464  }
   465  
// structField is a field of a structNode. It embeds the Node built for the
// field by makeNodeOf.
type structField struct {
	Node
	// name is the column name: the Go field name, or the name given in the
	// "parquet" struct tag when there is one.
	name string
	// index is the reflect index path of the field from the outermost struct,
	// crossing embedded structs.
	index []int
}

// Name returns the column name of the field.
func (f *structField) Name() string { return f.name }
   473  
   474  func (f *structField) Value(base reflect.Value) reflect.Value {
   475  	switch base.Kind() {
   476  	case reflect.Map:
   477  		return base.MapIndex(reflect.ValueOf(&f.name).Elem())
   478  	case reflect.Ptr:
   479  		if base.IsNil() {
   480  			base.Set(reflect.New(base.Type().Elem()))
   481  		}
   482  		return fieldByIndex(base.Elem(), f.index)
   483  	default:
   484  		if len(f.index) == 1 {
   485  			return base.Field(f.index[0])
   486  		} else {
   487  			return fieldByIndex(base, f.index)
   488  		}
   489  	}
   490  }
   491  
   492  func nodeString(t reflect.Type, name string, tag ...string) string {
   493  	return fmt.Sprintf("%s %s %v", name, t.String(), tag)
   494  }
   495  
   496  func throwInvalidTag(t reflect.Type, name string, tag string) {
   497  	panic(tag + " is an invalid parquet tag: " + nodeString(t, name, tag))
   498  }
   499  
   500  func throwUnknownTag(t reflect.Type, name string, tag string) {
   501  	panic(tag + " is an unrecognized parquet tag: " + nodeString(t, name, tag))
   502  }
   503  
   504  func throwInvalidNode(t reflect.Type, msg, name string, tag ...string) {
   505  	panic(msg + ": " + nodeString(t, name, tag...))
   506  }
   507  
   508  // FixedLenByteArray decimals are sized based on precision
   509  // this function calculates the necessary byte array size.
   510  func decimalFixedLenByteArraySize(precision int) int {
   511  	return int(math.Ceil((math.Log10(2) + float64(precision)) / math.Log10(256)))
   512  }
   513  
   514  func forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, option, args string)) {
   515  	if tag := sf.Tag.Get("parquet"); tag != "" {
   516  		_, tag = split(tag) // skip the field name
   517  		for tag != "" {
   518  			option := ""
   519  			args := ""
   520  			option, tag = split(tag)
   521  			option, args = splitOptionArgs(option)
   522  			ft := sf.Type
   523  			if ft.Kind() == reflect.Ptr {
   524  				ft = ft.Elem()
   525  			}
   526  			do(ft, option, args)
   527  		}
   528  	}
   529  }
   530  
   531  func nodeOf(t reflect.Type, tag []string) Node {
   532  	switch t {
   533  	case reflect.TypeOf(deprecated.Int96{}):
   534  		return Leaf(Int96Type)
   535  	case reflect.TypeOf(uuid.UUID{}):
   536  		return UUID()
   537  	case reflect.TypeOf(time.Time{}):
   538  		return Timestamp(Nanosecond)
   539  	}
   540  
   541  	var n Node
   542  	switch t.Kind() {
   543  	case reflect.Bool:
   544  		n = Leaf(BooleanType)
   545  
   546  	case reflect.Int, reflect.Int64:
   547  		n = Int(64)
   548  
   549  	case reflect.Int8, reflect.Int16, reflect.Int32:
   550  		n = Int(t.Bits())
   551  
   552  	case reflect.Uint, reflect.Uintptr, reflect.Uint64:
   553  		n = Uint(64)
   554  
   555  	case reflect.Uint8, reflect.Uint16, reflect.Uint32:
   556  		n = Uint(t.Bits())
   557  
   558  	case reflect.Float32:
   559  		n = Leaf(FloatType)
   560  
   561  	case reflect.Float64:
   562  		n = Leaf(DoubleType)
   563  
   564  	case reflect.String:
   565  		n = String()
   566  
   567  	case reflect.Ptr:
   568  		n = Optional(nodeOf(t.Elem(), nil))
   569  
   570  	case reflect.Slice:
   571  		if elem := t.Elem(); elem.Kind() == reflect.Uint8 { // []byte?
   572  			n = Leaf(ByteArrayType)
   573  		} else {
   574  			n = Repeated(nodeOf(elem, nil))
   575  		}
   576  
   577  	case reflect.Array:
   578  		if t.Elem().Kind() == reflect.Uint8 {
   579  			n = Leaf(FixedLenByteArrayType(t.Len()))
   580  		}
   581  
   582  	case reflect.Map:
   583  		var mapTag, valueTag, keyTag string
   584  		if len(tag) > 0 {
   585  			mapTag = tag[0]
   586  			if len(tag) > 1 {
   587  				keyTag = tag[1]
   588  			}
   589  			if len(tag) >= 2 {
   590  				valueTag = tag[2]
   591  			}
   592  		}
   593  
   594  		if strings.Contains(mapTag, "json") {
   595  			n = JSON()
   596  		} else {
   597  			n = Map(
   598  				makeNodeOf(t.Key(), t.Name(), []string{keyTag}),
   599  				makeNodeOf(t.Elem(), t.Name(), []string{valueTag}),
   600  			)
   601  		}
   602  
   603  		forEachTagOption([]string{mapTag}, func(option, args string) {
   604  			switch option {
   605  			case "", "json":
   606  				return
   607  			case "optional":
   608  				n = Optional(n)
   609  			case "id":
   610  				id, err := parseIDArgs(args)
   611  				if err != nil {
   612  					throwInvalidTag(t, "map", option)
   613  				}
   614  				n = FieldID(n, id)
   615  			default:
   616  				throwUnknownTag(t, "map", option)
   617  			}
   618  		})
   619  
   620  	case reflect.Struct:
   621  		return structNodeOf(t)
   622  	}
   623  
   624  	if n == nil {
   625  		panic("cannot create parquet node from go value of type " + t.String())
   626  	}
   627  
   628  	return &goNode{Node: n, gotype: t}
   629  }
   630  
   631  func split(s string) (head, tail string) {
   632  	if i := strings.IndexByte(s, ','); i < 0 {
   633  		head = s
   634  	} else {
   635  		head, tail = s[:i], s[i+1:]
   636  	}
   637  	return
   638  }
   639  
   640  func splitOptionArgs(s string) (option, args string) {
   641  	if i := strings.IndexByte(s, '('); i >= 0 {
   642  		option = s[:i]
   643  		args = s[i:]
   644  	} else {
   645  		option = s
   646  		args = "()"
   647  	}
   648  	return
   649  }
   650  
   651  func parseDecimalArgs(args string) (scale, precision int, err error) {
   652  	if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") {
   653  		return 0, 0, fmt.Errorf("malformed decimal args: %s", args)
   654  	}
   655  	args = strings.TrimPrefix(args, "(")
   656  	args = strings.TrimSuffix(args, ")")
   657  	parts := strings.Split(args, ":")
   658  	if len(parts) != 2 {
   659  		return 0, 0, fmt.Errorf("malformed decimal args: (%s)", args)
   660  	}
   661  	s, err := strconv.ParseInt(parts[0], 10, 32)
   662  	if err != nil {
   663  		return 0, 0, err
   664  	}
   665  	p, err := strconv.ParseInt(parts[1], 10, 32)
   666  	if err != nil {
   667  		return 0, 0, err
   668  	}
   669  	return int(s), int(p), nil
   670  }
   671  
   672  func parseIDArgs(args string) (int, error) {
   673  	if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") {
   674  		return 0, fmt.Errorf("malformed id args: %s", args)
   675  	}
   676  	args = strings.TrimPrefix(args, "(")
   677  	args = strings.TrimSuffix(args, ")")
   678  	return strconv.Atoi(args)
   679  }
   680  
   681  func parseTimestampArgs(args string) (TimeUnit, error) {
   682  	if !strings.HasPrefix(args, "(") || !strings.HasSuffix(args, ")") {
   683  		return nil, fmt.Errorf("malformed timestamp args: %s", args)
   684  	}
   685  
   686  	args = strings.TrimPrefix(args, "(")
   687  	args = strings.TrimSuffix(args, ")")
   688  
   689  	if len(args) == 0 {
   690  		return Millisecond, nil
   691  	}
   692  
   693  	switch args {
   694  	case "millisecond":
   695  		return Millisecond, nil
   696  	case "microsecond":
   697  		return Microsecond, nil
   698  	case "nanosecond":
   699  		return Nanosecond, nil
   700  	default:
   701  	}
   702  
   703  	return nil, fmt.Errorf("unknown time unit: %s", args)
   704  }
   705  
// goNode wraps a Node to override the Go type it reports. nodeOf uses it to
// attach the original Go type to the node it built.
type goNode struct {
	Node
	// gotype is the Go type returned by GoType in place of the embedded
	// node's own answer.
	gotype reflect.Type
}

// GoType returns the Go type recorded when the node was created.
func (n *goNode) GoType() reflect.Type { return n.gotype }
   712  
// Compile-time checks that *Schema implements the option interfaces it is
// documented to satisfy.
var (
	_ RowGroupOption = (*Schema)(nil)
	_ ReaderOption   = (*Schema)(nil)
	_ WriterOption   = (*Schema)(nil)
)
   718  
   719  func makeNodeOf(t reflect.Type, name string, tag []string) Node {
   720  	var (
   721  		node       Node
   722  		optional   bool
   723  		list       bool
   724  		encoded    encoding.Encoding
   725  		compressed compress.Codec
   726  		fieldID    int
   727  	)
   728  
   729  	setNode := func(n Node) {
   730  		if node != nil {
   731  			throwInvalidNode(t, "struct field has multiple logical parquet types declared", name, tag...)
   732  		}
   733  		node = n
   734  	}
   735  
   736  	setOptional := func() {
   737  		if optional {
   738  			throwInvalidNode(t, "struct field has multiple declaration of the optional tag", name, tag...)
   739  		}
   740  		optional = true
   741  	}
   742  
   743  	setList := func() {
   744  		if list {
   745  			throwInvalidNode(t, "struct field has multiple declaration of the list tag", name, tag...)
   746  		}
   747  		list = true
   748  	}
   749  
   750  	setEncoding := func(e encoding.Encoding) {
   751  		if encoded != nil {
   752  			throwInvalidNode(t, "struct field has encoding declared multiple time", name, tag...)
   753  		}
   754  		encoded = e
   755  	}
   756  
   757  	setCompression := func(c compress.Codec) {
   758  		if compressed != nil {
   759  			throwInvalidNode(t, "struct field has compression codecs declared multiple times", name, tag...)
   760  		}
   761  		compressed = c
   762  	}
   763  
   764  	forEachTagOption(tag, func(option, args string) {
   765  		if t.Kind() == reflect.Map {
   766  			node = nodeOf(t, tag)
   767  			return
   768  		}
   769  		switch option {
   770  		case "":
   771  			return
   772  		case "optional":
   773  			setOptional()
   774  
   775  		case "snappy":
   776  			setCompression(&Snappy)
   777  
   778  		case "gzip":
   779  			setCompression(&Gzip)
   780  
   781  		case "brotli":
   782  			setCompression(&Brotli)
   783  
   784  		case "lz4":
   785  			setCompression(&Lz4Raw)
   786  
   787  		case "zstd":
   788  			setCompression(&Zstd)
   789  
   790  		case "uncompressed":
   791  			setCompression(&Uncompressed)
   792  
   793  		case "plain":
   794  			setEncoding(&Plain)
   795  
   796  		case "dict":
   797  			setEncoding(&RLEDictionary)
   798  
   799  		case "json":
   800  			setNode(JSON())
   801  
   802  		case "delta":
   803  			switch t.Kind() {
   804  			case reflect.Int, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint32, reflect.Uint64:
   805  				setEncoding(&DeltaBinaryPacked)
   806  			case reflect.String:
   807  				setEncoding(&DeltaByteArray)
   808  			case reflect.Slice:
   809  				if t.Elem().Kind() == reflect.Uint8 { // []byte?
   810  					setEncoding(&DeltaByteArray)
   811  				} else {
   812  					throwInvalidTag(t, name, option)
   813  				}
   814  			case reflect.Array:
   815  				if t.Elem().Kind() == reflect.Uint8 { // [N]byte?
   816  					setEncoding(&DeltaByteArray)
   817  				} else {
   818  					throwInvalidTag(t, name, option)
   819  				}
   820  			default:
   821  				switch t {
   822  				case reflect.TypeOf(time.Time{}):
   823  					setEncoding(&DeltaBinaryPacked)
   824  				default:
   825  					throwInvalidTag(t, name, option)
   826  				}
   827  			}
   828  
   829  		case "split":
   830  			switch t.Kind() {
   831  			case reflect.Float32, reflect.Float64:
   832  				setEncoding(&ByteStreamSplit)
   833  			default:
   834  				throwInvalidTag(t, name, option)
   835  			}
   836  
   837  		case "list":
   838  			switch t.Kind() {
   839  			case reflect.Slice:
   840  				element := nodeOf(t.Elem(), nil)
   841  				setNode(element)
   842  				setList()
   843  			default:
   844  				throwInvalidTag(t, name, option)
   845  			}
   846  
   847  		case "enum":
   848  			switch t.Kind() {
   849  			case reflect.String:
   850  				setNode(Enum())
   851  			default:
   852  				throwInvalidTag(t, name, option)
   853  			}
   854  
   855  		case "uuid":
   856  			switch t.Kind() {
   857  			case reflect.Array:
   858  				if t.Elem().Kind() != reflect.Uint8 || t.Len() != 16 {
   859  					throwInvalidTag(t, name, option)
   860  				}
   861  			default:
   862  				throwInvalidTag(t, name, option)
   863  			}
   864  
   865  		case "decimal":
   866  			scale, precision, err := parseDecimalArgs(args)
   867  			if err != nil {
   868  				throwInvalidTag(t, name, option+args)
   869  			}
   870  			var baseType Type
   871  			switch t.Kind() {
   872  			case reflect.Int32:
   873  				baseType = Int32Type
   874  			case reflect.Int64:
   875  				baseType = Int64Type
   876  			case reflect.Array, reflect.Slice:
   877  				baseType = FixedLenByteArrayType(decimalFixedLenByteArraySize(precision))
   878  			default:
   879  				throwInvalidTag(t, name, option)
   880  			}
   881  
   882  			setNode(Decimal(scale, precision, baseType))
   883  		case "date":
   884  			switch t.Kind() {
   885  			case reflect.Int32:
   886  				setNode(Date())
   887  			default:
   888  				throwInvalidTag(t, name, option)
   889  			}
   890  		case "timestamp":
   891  			switch t.Kind() {
   892  			case reflect.Int64:
   893  				timeUnit, err := parseTimestampArgs(args)
   894  				if err != nil {
   895  					throwInvalidTag(t, name, option)
   896  				}
   897  				setNode(Timestamp(timeUnit))
   898  			default:
   899  				switch t {
   900  				case reflect.TypeOf(time.Time{}):
   901  					timeUnit, err := parseTimestampArgs(args)
   902  					if err != nil {
   903  						throwInvalidTag(t, name, option)
   904  					}
   905  					setNode(Timestamp(timeUnit))
   906  				default:
   907  					throwInvalidTag(t, name, option)
   908  				}
   909  			}
   910  		case "id":
   911  			id, err := parseIDArgs(args)
   912  			if err != nil {
   913  				throwInvalidNode(t, "struct field has field id that is not a valid int", name, tag...)
   914  			}
   915  			fieldID = id
   916  		}
   917  	})
   918  
   919  	// Special case: an "optional" struct tag on a slice applies to the
   920  	// individual items, not the overall list. The least messy way to
   921  	// deal with this is at this level, instead of passing down optional
   922  	// information into the nodeOf function, and then passing back whether an
   923  	// optional tag was applied.
   924  	if node == nil && t.Kind() == reflect.Slice {
   925  		isUint8 := t.Elem().Kind() == reflect.Uint8
   926  		// Note for strings "optional" applies only to the entire BYTE_ARRAY and
   927  		// not each individual byte.
   928  		if optional && !isUint8 {
   929  			node = Repeated(Optional(nodeOf(t.Elem(), tag)))
   930  			// Don't also apply "optional" to the whole list.
   931  			optional = false
   932  		}
   933  	}
   934  
   935  	if node == nil {
   936  		node = nodeOf(t, tag)
   937  	}
   938  
   939  	if compressed != nil {
   940  		node = Compressed(node, compressed)
   941  	}
   942  
   943  	if encoded != nil {
   944  		node = Encoded(node, encoded)
   945  	}
   946  
   947  	if list {
   948  		node = List(node)
   949  	}
   950  
   951  	if node.Repeated() && !list {
   952  		repeated := node.GoType().Elem()
   953  		if repeated.Kind() == reflect.Slice {
   954  			// Special case: allow [][]uint as seen in a logical map of strings
   955  			if repeated.Elem().Kind() != reflect.Uint8 {
   956  				panic("unhandled nested slice on parquet schema without list tag")
   957  			}
   958  		}
   959  	}
   960  
   961  	if optional {
   962  		node = Optional(node)
   963  	}
   964  	if fieldID != 0 {
   965  		node = FieldID(node, fieldID)
   966  	}
   967  	return node
   968  }
   969  
   970  func forEachTagOption(tags []string, do func(option, args string)) {
   971  	for _, tag := range tags {
   972  		_, tag = split(tag) // skip the field name
   973  		for tag != "" {
   974  			option := ""
   975  			option, tag = split(tag)
   976  			var args string
   977  			option, args = splitOptionArgs(option)
   978  			do(option, args)
   979  		}
   980  	}
   981  }