github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/node.go (about)

     1  package parquet
     2  
     3  import (
     4  	"reflect"
     5  	"sort"
     6  	"unicode"
     7  	"unicode/utf8"
     8  
     9  	"github.com/parquet-go/parquet-go/compress"
    10  	"github.com/parquet-go/parquet-go/deprecated"
    11  	"github.com/parquet-go/parquet-go/encoding"
    12  	"github.com/parquet-go/parquet-go/format"
    13  )
    14  
// Node values represent nodes of a parquet schema.
//
// Nodes carry the type of values, as well as properties like whether the values
// are optional or repeated. Nodes with one or more children represent parquet
// groups and therefore do not have a logical type.
//
// Nodes are immutable values and therefore safe to use concurrently from
// multiple goroutines.
type Node interface {
	// The id of this node in its parent node. The zero value means that no id
	// is set. The id only needs to be unique within its parent context.
	//
	// This is the same as the parquet field_id.
	ID() int

	// Returns a human-readable representation of the parquet node.
	String() string

	// For leaf nodes, returns the type of values of the parquet column.
	//
	// Calling this method on non-leaf nodes will panic.
	Type() Type

	// Returns whether the parquet column is optional.
	Optional() bool

	// Returns whether the parquet column is repeated.
	Repeated() bool

	// Returns whether the parquet column is required.
	Required() bool

	// Returns true if this is a leaf node.
	Leaf() bool

	// Returns a mapping of the node's fields.
	//
	// As an optimization, the same slices may be returned by multiple calls to
	// this method; programs must treat the returned values as immutable.
	//
	// This method returns an empty mapping when called on leaf nodes.
	Fields() []Field

	// Returns the encoding used by the node.
	//
	// The method may return nil to indicate that no specific encoding was
	// configured on the node, in which case a default encoding might be used.
	Encoding() encoding.Encoding

	// Returns the compression codec used by the node.
	//
	// The method may return nil to indicate that no specific compression codec
	// was configured on the node, in which case a default compression might be
	// used.
	Compression() compress.Codec

	// Returns the Go type that best represents the parquet node.
	//
	// For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96,
	// float32, float64, string, []byte, or [N]byte.
	//
	// For groups, the method returns a struct type.
	//
	// If the method is called on a repeated node, the method returns a slice of
	// the underlying type.
	//
	// For optional nodes, the method returns a pointer to the underlying type.
	//
	// For nodes that were constructed from Go values (e.g. using SchemaOf), the
	// method returns the original Go type.
	GoType() reflect.Type
}
    87  
// Field instances represent fields of a parquet node; a field associates a
// node with the name it has in its parent node.
type Field interface {
	Node

	// Returns the name of this field in its parent node.
	Name() string

	// Given a reference to the Go value matching the structure of the parent
	// node, returns the Go value of the field.
	Value(base reflect.Value) reflect.Value
}
   100  
   101  // Encoded wraps the node passed as argument to use the given encoding.
   102  //
   103  // The function panics if it is called on a non-leaf node, or if the
   104  // encoding does not support the node type.
   105  func Encoded(node Node, encoding encoding.Encoding) Node {
   106  	if !node.Leaf() {
   107  		panic("cannot add encoding to a non-leaf node")
   108  	}
   109  	if encoding != nil {
   110  		kind := node.Type().Kind()
   111  		if !canEncode(encoding, kind) {
   112  			panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String())
   113  		}
   114  	}
   115  	return &encodedNode{
   116  		Node:     node,
   117  		encoding: encoding,
   118  	}
   119  }
   120  
   121  type encodedNode struct {
   122  	Node
   123  	encoding encoding.Encoding
   124  }
   125  
   126  func (n *encodedNode) Encoding() encoding.Encoding {
   127  	return n.encoding
   128  }
   129  
   130  // Compressed wraps the node passed as argument to use the given compression
   131  // codec.
   132  //
   133  // If the codec is nil, the node's compression is left unchanged.
   134  //
   135  // The function panics if it is called on a non-leaf node.
   136  func Compressed(node Node, codec compress.Codec) Node {
   137  	if !node.Leaf() {
   138  		panic("cannot add compression codec to a non-leaf node")
   139  	}
   140  	return &compressedNode{
   141  		Node:  node,
   142  		codec: codec,
   143  	}
   144  }
   145  
   146  type compressedNode struct {
   147  	Node
   148  	codec compress.Codec
   149  }
   150  
   151  func (n *compressedNode) Compression() compress.Codec {
   152  	return n.codec
   153  }
   154  
   155  // Optional wraps the given node to make it optional.
   156  func Optional(node Node) Node { return &optionalNode{node} }
   157  
   158  type optionalNode struct{ Node }
   159  
   160  func (opt *optionalNode) Optional() bool       { return true }
   161  func (opt *optionalNode) Repeated() bool       { return false }
   162  func (opt *optionalNode) Required() bool       { return false }
   163  func (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) }
   164  
   165  // FieldID wraps a node to provide node field id
   166  func FieldID(node Node, id int) Node { return &fieldIDNode{Node: node, id: id} }
   167  
   168  type fieldIDNode struct {
   169  	Node
   170  	id int
   171  }
   172  
   173  func (f *fieldIDNode) ID() int { return f.id }
   174  
   175  // Repeated wraps the given node to make it repeated.
   176  func Repeated(node Node) Node { return &repeatedNode{node} }
   177  
   178  type repeatedNode struct{ Node }
   179  
   180  func (rep *repeatedNode) Optional() bool       { return false }
   181  func (rep *repeatedNode) Repeated() bool       { return true }
   182  func (rep *repeatedNode) Required() bool       { return false }
   183  func (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) }
   184  
   185  // Required wraps the given node to make it required.
   186  func Required(node Node) Node { return &requiredNode{node} }
   187  
   188  type requiredNode struct{ Node }
   189  
   190  func (req *requiredNode) Optional() bool       { return false }
   191  func (req *requiredNode) Repeated() bool       { return false }
   192  func (req *requiredNode) Required() bool       { return true }
   193  func (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() }
   194  
// node is an empty struct type.
//
// NOTE(review): nothing in the visible part of this file references it;
// confirm whether it is still used elsewhere in the package.
type node struct{}
   196  
   197  // Leaf returns a leaf node of the given type.
   198  func Leaf(typ Type) Node {
   199  	return &leafNode{typ: typ}
   200  }
   201  
   202  type leafNode struct{ typ Type }
   203  
   204  func (n *leafNode) ID() int { return 0 }
   205  
   206  func (n *leafNode) String() string { return sprint("", n) }
   207  
   208  func (n *leafNode) Type() Type { return n.typ }
   209  
   210  func (n *leafNode) Optional() bool { return false }
   211  
   212  func (n *leafNode) Repeated() bool { return false }
   213  
   214  func (n *leafNode) Required() bool { return true }
   215  
   216  func (n *leafNode) Leaf() bool { return true }
   217  
   218  func (n *leafNode) Fields() []Field { return nil }
   219  
   220  func (n *leafNode) Encoding() encoding.Encoding { return nil }
   221  
   222  func (n *leafNode) Compression() compress.Codec { return nil }
   223  
   224  func (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) }
   225  
   226  var repetitionTypes = [...]format.FieldRepetitionType{
   227  	0: format.Required,
   228  	1: format.Optional,
   229  	2: format.Repeated,
   230  }
   231  
   232  func fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType {
   233  	switch {
   234  	case node.Required():
   235  		return &repetitionTypes[format.Required]
   236  	case node.Optional():
   237  		return &repetitionTypes[format.Optional]
   238  	case node.Repeated():
   239  		return &repetitionTypes[format.Repeated]
   240  	default:
   241  		return nil
   242  	}
   243  }
   244  
   245  func fieldRepetitionTypeOf(node Node) format.FieldRepetitionType {
   246  	switch {
   247  	case node.Optional():
   248  		return format.Optional
   249  	case node.Repeated():
   250  		return format.Repeated
   251  	default:
   252  		return format.Required
   253  	}
   254  }
   255  
   256  func applyFieldRepetitionType(t format.FieldRepetitionType, repetitionLevel, definitionLevel byte) (byte, byte) {
   257  	switch t {
   258  	case format.Optional:
   259  		definitionLevel++
   260  	case format.Repeated:
   261  		repetitionLevel++
   262  		definitionLevel++
   263  	}
   264  	return repetitionLevel, definitionLevel
   265  }
   266  
   267  type Group map[string]Node
   268  
   269  func (g Group) ID() int { return 0 }
   270  
   271  func (g Group) String() string { return sprint("", g) }
   272  
   273  func (g Group) Type() Type { return groupType{} }
   274  
   275  func (g Group) Optional() bool { return false }
   276  
   277  func (g Group) Repeated() bool { return false }
   278  
   279  func (g Group) Required() bool { return true }
   280  
   281  func (g Group) Leaf() bool { return false }
   282  
   283  func (g Group) Fields() []Field {
   284  	groupFields := make([]groupField, 0, len(g))
   285  	for name, node := range g {
   286  		groupFields = append(groupFields, groupField{
   287  			Node: node,
   288  			name: name,
   289  		})
   290  	}
   291  	sort.Slice(groupFields, func(i, j int) bool {
   292  		return groupFields[i].name < groupFields[j].name
   293  	})
   294  	fields := make([]Field, len(groupFields))
   295  	for i := range groupFields {
   296  		fields[i] = &groupFields[i]
   297  	}
   298  	return fields
   299  }
   300  
   301  func (g Group) Encoding() encoding.Encoding { return nil }
   302  
   303  func (g Group) Compression() compress.Codec { return nil }
   304  
   305  func (g Group) GoType() reflect.Type { return goTypeOfGroup(g) }
   306  
   307  type groupField struct {
   308  	Node
   309  	name string
   310  }
   311  
   312  func (f *groupField) Name() string { return f.name }
   313  
   314  func (f *groupField) Value(base reflect.Value) reflect.Value {
   315  	if base.Kind() == reflect.Interface {
   316  		if base.IsNil() {
   317  			return reflect.ValueOf(nil)
   318  		}
   319  		if base = base.Elem(); base.Kind() == reflect.Pointer && base.IsNil() {
   320  			return reflect.ValueOf(nil)
   321  		}
   322  	}
   323  	return base.MapIndex(reflect.ValueOf(&f.name).Elem())
   324  }
   325  
   326  func goTypeOf(node Node) reflect.Type {
   327  	switch {
   328  	case node.Optional():
   329  		return goTypeOfOptional(node)
   330  	case node.Repeated():
   331  		return goTypeOfRepeated(node)
   332  	default:
   333  		return goTypeOfRequired(node)
   334  	}
   335  }
   336  
   337  func goTypeOfOptional(node Node) reflect.Type {
   338  	return reflect.PtrTo(goTypeOfRequired(node))
   339  }
   340  
   341  func goTypeOfRepeated(node Node) reflect.Type {
   342  	return reflect.SliceOf(goTypeOfRequired(node))
   343  }
   344  
   345  func goTypeOfRequired(node Node) reflect.Type {
   346  	if node.Leaf() {
   347  		return goTypeOfLeaf(node)
   348  	} else {
   349  		return goTypeOfGroup(node)
   350  	}
   351  }
   352  
   353  func goTypeOfLeaf(node Node) reflect.Type {
   354  	t := node.Type()
   355  	if convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok {
   356  		return convertibleType.GoType()
   357  	}
   358  	switch t.Kind() {
   359  	case Boolean:
   360  		return reflect.TypeOf(false)
   361  	case Int32:
   362  		return reflect.TypeOf(int32(0))
   363  	case Int64:
   364  		return reflect.TypeOf(int64(0))
   365  	case Int96:
   366  		return reflect.TypeOf(deprecated.Int96{})
   367  	case Float:
   368  		return reflect.TypeOf(float32(0))
   369  	case Double:
   370  		return reflect.TypeOf(float64(0))
   371  	case ByteArray:
   372  		return reflect.TypeOf(([]byte)(nil))
   373  	case FixedLenByteArray:
   374  		return reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0)))
   375  	default:
   376  		panic("BUG: parquet type returned an unsupported kind")
   377  	}
   378  }
   379  
   380  func goTypeOfGroup(node Node) reflect.Type {
   381  	fields := node.Fields()
   382  	structFields := make([]reflect.StructField, len(fields))
   383  	for i, field := range fields {
   384  		structFields[i].Name = exportedStructFieldName(field.Name())
   385  		structFields[i].Type = field.GoType()
   386  		// TODO: can we reconstruct a struct tag that would be valid if a value
   387  		// of this type were passed to SchemaOf?
   388  	}
   389  	return reflect.StructOf(structFields)
   390  }
   391  
   392  func exportedStructFieldName(name string) string {
   393  	firstRune, size := utf8.DecodeRuneInString(name)
   394  	return string([]rune{unicode.ToUpper(firstRune)}) + name[size:]
   395  }
   396  
   397  func isList(node Node) bool {
   398  	logicalType := node.Type().LogicalType()
   399  	return logicalType != nil && logicalType.List != nil
   400  }
   401  
   402  func isMap(node Node) bool {
   403  	logicalType := node.Type().LogicalType()
   404  	return logicalType != nil && logicalType.Map != nil
   405  }
   406  
   407  func numLeafColumnsOf(node Node) int16 {
   408  	return makeColumnIndex(numLeafColumns(node, 0))
   409  }
   410  
   411  func numLeafColumns(node Node, columnIndex int) int {
   412  	if node.Leaf() {
   413  		return columnIndex + 1
   414  	}
   415  	for _, field := range node.Fields() {
   416  		columnIndex = numLeafColumns(field, columnIndex)
   417  	}
   418  	return columnIndex
   419  }
   420  
   421  func listElementOf(node Node) Node {
   422  	if !node.Leaf() {
   423  		if list := fieldByName(node, "list"); list != nil {
   424  			if elem := fieldByName(list, "element"); elem != nil {
   425  				return elem
   426  			}
   427  		}
   428  	}
   429  	panic("node with logical type LIST is not composed of a repeated .list.element")
   430  }
   431  
   432  func mapKeyValueOf(node Node) Node {
   433  	if !node.Leaf() && (node.Required() || node.Optional()) {
   434  		if keyValue := fieldByName(node, "key_value"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() {
   435  			k := fieldByName(keyValue, "key")
   436  			v := fieldByName(keyValue, "value")
   437  			if k != nil && v != nil && k.Required() {
   438  				return keyValue
   439  			}
   440  		}
   441  	}
   442  	panic("node with logical type MAP is not composed of a repeated .key_value group with key and value fields")
   443  }
   444  
   445  func encodingOf(node Node) encoding.Encoding {
   446  	encoding := node.Encoding()
   447  	// The parquet-format documentation states that the
   448  	// DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when
   449  	// encoding BYTE_ARRAY values. We apply it as a default if
   450  	// none were explicitly specified, which gives the application
   451  	// the opportunity to override this behavior if needed.
   452  	//
   453  	// https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
   454  	if node.Type().Kind() == ByteArray && encoding == nil {
   455  		encoding = &DeltaLengthByteArray
   456  	}
   457  	if encoding == nil {
   458  		encoding = &Plain
   459  	}
   460  	return encoding
   461  }
   462  
   463  func forEachNodeOf(name string, node Node, do func(string, Node)) {
   464  	do(name, node)
   465  
   466  	for _, f := range node.Fields() {
   467  		forEachNodeOf(f.Name(), f, do)
   468  	}
   469  }
   470  
   471  func fieldByName(node Node, name string) Field {
   472  	for _, f := range node.Fields() {
   473  		if f.Name() == name {
   474  			return f
   475  		}
   476  	}
   477  	return nil
   478  }
   479  
   480  func nodesAreEqual(node1, node2 Node) bool {
   481  	if node1.Leaf() {
   482  		return node2.Leaf() && leafNodesAreEqual(node1, node2)
   483  	} else {
   484  		return !node2.Leaf() && groupNodesAreEqual(node1, node2)
   485  	}
   486  }
   487  
   488  func typesAreEqual(type1, type2 Type) bool {
   489  	return type1.Kind() == type2.Kind() &&
   490  		type1.Length() == type2.Length() &&
   491  		reflect.DeepEqual(type1.LogicalType(), type2.LogicalType())
   492  }
   493  
   494  func repetitionsAreEqual(node1, node2 Node) bool {
   495  	return node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated()
   496  }
   497  
   498  func leafNodesAreEqual(node1, node2 Node) bool {
   499  	return typesAreEqual(node1.Type(), node2.Type()) && repetitionsAreEqual(node1, node2)
   500  }
   501  
   502  func groupNodesAreEqual(node1, node2 Node) bool {
   503  	fields1 := node1.Fields()
   504  	fields2 := node2.Fields()
   505  
   506  	if len(fields1) != len(fields2) {
   507  		return false
   508  	}
   509  
   510  	if !repetitionsAreEqual(node1, node2) {
   511  		return false
   512  	}
   513  
   514  	for i := range fields1 {
   515  		f1 := fields1[i]
   516  		f2 := fields2[i]
   517  
   518  		if f1.Name() != f2.Name() {
   519  			return false
   520  		}
   521  
   522  		if !nodesAreEqual(f1, f2) {
   523  			return false
   524  		}
   525  	}
   526  
   527  	return true
   528  }