github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/node.go (about)

     1  package parquet
     2  
     3  import (
     4  	"reflect"
     5  	"sort"
     6  	"unicode"
     7  	"unicode/utf8"
     8  
     9  	"github.com/vc42/parquet-go/compress"
    10  	"github.com/vc42/parquet-go/deprecated"
    11  	"github.com/vc42/parquet-go/encoding"
    12  	"github.com/vc42/parquet-go/format"
    13  )
    14  
// Node values represent nodes of a parquet schema.
//
// Nodes carry the type of values, as well as properties like whether the values
// are optional or repeat. Nodes with one or more children represent parquet
// groups and therefore do not have a logical type.
//
// Nodes are immutable values and therefore safe to use concurrently from
// multiple goroutines.
type Node interface {
	// Returns a human-readable representation of the parquet node.
	String() string

	// For leaf nodes, returns the type of values of the parquet column.
	//
	// Calling this method on non-leaf nodes may panic, depending on the
	// implementation (Group, for instance, returns a groupType value
	// instead).
	Type() Type

	// Returns whether the parquet column is optional.
	Optional() bool

	// Returns whether the parquet column is repeated.
	Repeated() bool

	// Returns whether the parquet column is required.
	Required() bool

	// Returns true if this is a leaf node.
	Leaf() bool

	// Returns the list of the node's fields.
	//
	// As an optimization, the same slices may be returned by multiple calls to
	// this method, programs must treat the returned values as immutable.
	//
	// This method returns an empty list when called on leaf nodes.
	Fields() []Field

	// Returns the encoding used by the node.
	//
	// The method may return nil to indicate that no specific encoding was
	// configured on the node, in which case a default encoding might be used.
	Encoding() encoding.Encoding

	// Returns the compression codec used by the node.
	//
	// The method may return nil to indicate that no specific compression codec
	// was configured on the node, in which case a default compression might be
	// used.
	Compression() compress.Codec

	// Returns the Go type that best represents the parquet node.
	//
	// For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96,
	// float32, float64, string, []byte, or [N]byte.
	//
	// For groups, the method returns a struct type.
	//
	// If the method is called on a repeated node, the method returns a slice of
	// the underlying type.
	//
	// For optional nodes, the method returns a pointer to the underlying type.
	//
	// For nodes that were constructed from Go values (e.g. using SchemaOf), the
	// method returns the original Go type.
	GoType() reflect.Type
}
    81  
// Field instances represent fields of a parquet node, which associate a node
// with its name in the parent node.
//
// A Field is itself a Node, so it can be passed anywhere a Node is expected.
type Field interface {
	Node

	// Returns the name of this field in its parent node.
	Name() string

	// Given a reference to the Go value matching the structure of the parent
	// node, returns the Go value of the field.
	//
	// For the fields of a Group, base is looked up as a map indexed by the
	// field name.
	Value(base reflect.Value) reflect.Value
}
    94  
    95  // Encoded wraps the node passed as argument to use the given encoding.
    96  //
    97  // The function panics if it is called on a non-leaf node, or if the
    98  // encoding does not support the node type.
    99  func Encoded(node Node, encoding encoding.Encoding) Node {
   100  	if !node.Leaf() {
   101  		panic("cannot add encoding to a non-leaf node")
   102  	}
   103  	if encoding != nil {
   104  		kind := node.Type().Kind()
   105  		if !canEncode(encoding, kind) {
   106  			panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String())
   107  		}
   108  	}
   109  	return &encodedNode{
   110  		Node:     node,
   111  		encoding: encoding,
   112  	}
   113  }
   114  
   115  type encodedNode struct {
   116  	Node
   117  	encoding encoding.Encoding
   118  }
   119  
   120  func (n *encodedNode) Encoding() encoding.Encoding {
   121  	return n.encoding
   122  }
   123  
   124  // Compressed wraps the node passed as argument to use the given compression
   125  // codec.
   126  //
   127  // If the codec is nil, the node's compression is left unchanged.
   128  //
   129  // The function panics if it is called on a non-leaf node.
   130  func Compressed(node Node, codec compress.Codec) Node {
   131  	if !node.Leaf() {
   132  		panic("cannot add compression codec to a non-leaf node")
   133  	}
   134  	return &compressedNode{
   135  		Node:  node,
   136  		codec: codec,
   137  	}
   138  }
   139  
   140  type compressedNode struct {
   141  	Node
   142  	codec compress.Codec
   143  }
   144  
   145  func (n *compressedNode) Compression() compress.Codec {
   146  	return n.codec
   147  }
   148  
   149  // Optional wraps the given node to make it optional.
   150  func Optional(node Node) Node { return &optionalNode{node} }
   151  
   152  type optionalNode struct{ Node }
   153  
   154  func (opt *optionalNode) Optional() bool       { return true }
   155  func (opt *optionalNode) Repeated() bool       { return false }
   156  func (opt *optionalNode) Required() bool       { return false }
   157  func (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) }
   158  
   159  // Repeated wraps the given node to make it repeated.
   160  func Repeated(node Node) Node { return &repeatedNode{node} }
   161  
   162  type repeatedNode struct{ Node }
   163  
   164  func (rep *repeatedNode) Optional() bool       { return false }
   165  func (rep *repeatedNode) Repeated() bool       { return true }
   166  func (rep *repeatedNode) Required() bool       { return false }
   167  func (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) }
   168  
   169  // Required wraps the given node to make it required.
   170  func Required(node Node) Node { return &requiredNode{node} }
   171  
   172  type requiredNode struct{ Node }
   173  
   174  func (req *requiredNode) Optional() bool       { return false }
   175  func (req *requiredNode) Repeated() bool       { return false }
   176  func (req *requiredNode) Required() bool       { return true }
   177  func (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() }
   178  
// node is an empty struct type; nothing in the visible part of this file
// refers to it.
//
// NOTE(review): presumably used elsewhere in the package — confirm before
// removing.
type node struct{}
   180  
   181  // Leaf returns a leaf node of the given type.
   182  func Leaf(typ Type) Node {
   183  	return &leafNode{typ: typ}
   184  }
   185  
   186  type leafNode struct{ typ Type }
   187  
   188  func (n *leafNode) String() string { return sprint("", n) }
   189  
   190  func (n *leafNode) Type() Type { return n.typ }
   191  
   192  func (n *leafNode) Optional() bool { return false }
   193  
   194  func (n *leafNode) Repeated() bool { return false }
   195  
   196  func (n *leafNode) Required() bool { return true }
   197  
   198  func (n *leafNode) Leaf() bool { return true }
   199  
   200  func (n *leafNode) Fields() []Field { return nil }
   201  
   202  func (n *leafNode) Encoding() encoding.Encoding { return nil }
   203  
   204  func (n *leafNode) Compression() compress.Codec { return nil }
   205  
   206  func (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) }
   207  
// repetitionTypes holds one addressable value per field repetition type, so
// that fieldRepetitionTypePtrOf can hand out pointers into this array.
//
// The array is indexed with the format.Required, format.Optional and
// format.Repeated constants; the explicit indexes below presumably mirror
// their numeric values — verify against the format package.
var repetitionTypes = [...]format.FieldRepetitionType{
	0: format.Required,
	1: format.Optional,
	2: format.Repeated,
}
   213  
   214  func fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType {
   215  	switch {
   216  	case node.Required():
   217  		return &repetitionTypes[format.Required]
   218  	case node.Optional():
   219  		return &repetitionTypes[format.Optional]
   220  	case node.Repeated():
   221  		return &repetitionTypes[format.Repeated]
   222  	default:
   223  		return nil
   224  	}
   225  }
   226  
   227  func fieldRepetitionTypeOf(node Node) format.FieldRepetitionType {
   228  	switch {
   229  	case node.Optional():
   230  		return format.Optional
   231  	case node.Repeated():
   232  		return format.Repeated
   233  	default:
   234  		return format.Required
   235  	}
   236  }
   237  
   238  type Group map[string]Node
   239  
   240  func (g Group) String() string { return sprint("", g) }
   241  
   242  func (g Group) Type() Type { return groupType{} }
   243  
   244  func (g Group) Optional() bool { return false }
   245  
   246  func (g Group) Repeated() bool { return false }
   247  
   248  func (g Group) Required() bool { return true }
   249  
   250  func (g Group) Leaf() bool { return false }
   251  
   252  func (g Group) Fields() []Field {
   253  	groupFields := make([]groupField, 0, len(g))
   254  	for name, node := range g {
   255  		groupFields = append(groupFields, groupField{
   256  			Node: node,
   257  			name: name,
   258  		})
   259  	}
   260  	sort.Slice(groupFields, func(i, j int) bool {
   261  		return groupFields[i].name < groupFields[j].name
   262  	})
   263  	fields := make([]Field, len(groupFields))
   264  	for i := range groupFields {
   265  		fields[i] = &groupFields[i]
   266  	}
   267  	return fields
   268  }
   269  
   270  func (g Group) Encoding() encoding.Encoding { return nil }
   271  
   272  func (g Group) Compression() compress.Codec { return nil }
   273  
   274  func (g Group) GoType() reflect.Type { return goTypeOfGroup(g) }
   275  
   276  type groupField struct {
   277  	Node
   278  	name string
   279  }
   280  
   281  func (f *groupField) Name() string { return f.name }
   282  
   283  func (f *groupField) Value(base reflect.Value) reflect.Value {
   284  	return base.MapIndex(reflect.ValueOf(&f.name).Elem())
   285  }
   286  
   287  func goTypeOf(node Node) reflect.Type {
   288  	switch {
   289  	case node.Optional():
   290  		return goTypeOfOptional(node)
   291  	case node.Repeated():
   292  		return goTypeOfRepeated(node)
   293  	default:
   294  		return goTypeOfRequired(node)
   295  	}
   296  }
   297  
   298  func goTypeOfOptional(node Node) reflect.Type {
   299  	return reflect.PtrTo(goTypeOfRequired(node))
   300  }
   301  
   302  func goTypeOfRepeated(node Node) reflect.Type {
   303  	return reflect.SliceOf(goTypeOfRequired(node))
   304  }
   305  
   306  func goTypeOfRequired(node Node) reflect.Type {
   307  	if node.Leaf() {
   308  		return goTypeOfLeaf(node)
   309  	} else {
   310  		return goTypeOfGroup(node)
   311  	}
   312  }
   313  
   314  func goTypeOfLeaf(node Node) reflect.Type {
   315  	t := node.Type()
   316  	if convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok {
   317  		return convertibleType.GoType()
   318  	}
   319  	switch t.Kind() {
   320  	case Boolean:
   321  		return reflect.TypeOf(false)
   322  	case Int32:
   323  		return reflect.TypeOf(int32(0))
   324  	case Int64:
   325  		return reflect.TypeOf(int64(0))
   326  	case Int96:
   327  		return reflect.TypeOf(deprecated.Int96{})
   328  	case Float:
   329  		return reflect.TypeOf(float32(0))
   330  	case Double:
   331  		return reflect.TypeOf(float64(0))
   332  	case ByteArray:
   333  		return reflect.TypeOf(([]byte)(nil))
   334  	case FixedLenByteArray:
   335  		return reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0)))
   336  	default:
   337  		panic("BUG: parquet type returned an unsupported kind")
   338  	}
   339  }
   340  
   341  func goTypeOfGroup(node Node) reflect.Type {
   342  	fields := node.Fields()
   343  	structFields := make([]reflect.StructField, len(fields))
   344  	for i, field := range fields {
   345  		structFields[i].Name = exportedStructFieldName(field.Name())
   346  		structFields[i].Type = field.GoType()
   347  		// TODO: can we reconstruct a struct tag that would be valid if a value
   348  		// of this type were passed to SchemaOf?
   349  	}
   350  	return reflect.StructOf(structFields)
   351  }
   352  
   353  func exportedStructFieldName(name string) string {
   354  	firstRune, size := utf8.DecodeRuneInString(name)
   355  	return string([]rune{unicode.ToUpper(firstRune)}) + name[size:]
   356  }
   357  
   358  func isList(node Node) bool {
   359  	logicalType := node.Type().LogicalType()
   360  	return logicalType != nil && logicalType.List != nil
   361  }
   362  
   363  func isMap(node Node) bool {
   364  	logicalType := node.Type().LogicalType()
   365  	return logicalType != nil && logicalType.Map != nil
   366  }
   367  
   368  func numLeafColumnsOf(node Node) int16 {
   369  	return makeColumnIndex(numLeafColumns(node, 0))
   370  }
   371  
   372  func numLeafColumns(node Node, columnIndex int) int {
   373  	if node.Leaf() {
   374  		return columnIndex + 1
   375  	}
   376  	for _, field := range node.Fields() {
   377  		columnIndex = numLeafColumns(field, columnIndex)
   378  	}
   379  	return columnIndex
   380  }
   381  
   382  func listElementOf(node Node) Node {
   383  	if !node.Leaf() {
   384  		if list := childByName(node, "list"); list != nil {
   385  			if elem := childByName(list, "element"); elem != nil {
   386  				return elem
   387  			}
   388  		}
   389  	}
   390  	panic("node with logical type LIST is not composed of a repeated .list.element")
   391  }
   392  
   393  func mapKeyValueOf(node Node) Node {
   394  	if !node.Leaf() && (node.Required() || node.Optional()) {
   395  		if keyValue := childByName(node, "key_value"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() {
   396  			k := childByName(keyValue, "key")
   397  			v := childByName(keyValue, "value")
   398  			if k != nil && v != nil && k.Required() {
   399  				return keyValue
   400  			}
   401  		}
   402  	}
   403  	panic("node with logical type MAP is not composed of a repeated .key_value group with key and value fields")
   404  }
   405  
   406  func encodingOf(node Node) encoding.Encoding {
   407  	encoding := node.Encoding()
   408  	// The parquet-format documentation states that the
   409  	// DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when
   410  	// encoding BYTE_ARRAY values. We apply it as a default if
   411  	// none were explicitly specified, which gives the application
   412  	// the opportunity to override this behavior if needed.
   413  	//
   414  	// https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
   415  	if node.Type().Kind() == ByteArray && encoding == nil {
   416  		encoding = &DeltaLengthByteArray
   417  	}
   418  	if encoding == nil {
   419  		encoding = &Plain
   420  	}
   421  	return encoding
   422  }
   423  
   424  func forEachNodeOf(name string, node Node, do func(string, Node)) {
   425  	do(name, node)
   426  
   427  	for _, f := range node.Fields() {
   428  		forEachNodeOf(f.Name(), f, do)
   429  	}
   430  }
   431  
   432  func childByName(node Node, name string) Node {
   433  	for _, f := range node.Fields() {
   434  		if f.Name() == name {
   435  			return f
   436  		}
   437  	}
   438  	return nil
   439  }
   440  
   441  func nodesAreEqual(node1, node2 Node) bool {
   442  	if node1.Leaf() {
   443  		return node2.Leaf() && leafNodesAreEqual(node1, node2)
   444  	} else {
   445  		return !node2.Leaf() && groupNodesAreEqual(node1, node2)
   446  	}
   447  }
   448  
   449  func typesAreEqual(node1, node2 Node) bool {
   450  	return node1.Type().Kind() == node2.Type().Kind()
   451  }
   452  
   453  func repetitionsAreEqual(node1, node2 Node) bool {
   454  	return node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated()
   455  }
   456  
   457  func leafNodesAreEqual(node1, node2 Node) bool {
   458  	return typesAreEqual(node1, node2) && repetitionsAreEqual(node1, node2)
   459  }
   460  
   461  func groupNodesAreEqual(node1, node2 Node) bool {
   462  	fields1 := node1.Fields()
   463  	fields2 := node2.Fields()
   464  
   465  	if len(fields1) != len(fields2) {
   466  		return false
   467  	}
   468  
   469  	for i := range fields1 {
   470  		f1 := fields1[i]
   471  		f2 := fields2[i]
   472  
   473  		if f1.Name() != f2.Name() {
   474  			return false
   475  		}
   476  
   477  		if !nodesAreEqual(f1, f2) {
   478  			return false
   479  		}
   480  	}
   481  
   482  	return true
   483  }