github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/node.go (about)

     1  package parquet
     2  
     3  import (
     4  	"reflect"
     5  	"sort"
     6  	"unicode"
     7  	"unicode/utf8"
     8  
     9  	"github.com/segmentio/parquet-go/compress"
    10  	"github.com/segmentio/parquet-go/deprecated"
    11  	"github.com/segmentio/parquet-go/encoding"
    12  	"github.com/segmentio/parquet-go/format"
    13  )
    14  
// Node values represent nodes of a parquet schema.
//
// Nodes carry the type of values, as well as properties like whether the values
// are optional or repeated. Nodes with one or more children represent parquet
// groups and therefore do not have a logical type.
//
// Nodes are immutable values and therefore safe to use concurrently from
// multiple goroutines.
type Node interface {
	// Returns a human-readable representation of the parquet node.
	String() string

	// For leaf nodes, returns the type of values of the parquet column.
	//
	// Implementations may panic when this method is called on non-leaf nodes.
	Type() Type

	// Returns whether the parquet column is optional.
	Optional() bool

	// Returns whether the parquet column is repeated.
	Repeated() bool

	// Returns whether the parquet column is required.
	Required() bool

	// Returns true if this is a leaf node.
	Leaf() bool

	// Returns the fields of the node, each associating a child node with its
	// name.
	//
	// As an optimization, the same slices may be returned by multiple calls to
	// this method, programs must treat the returned values as immutable.
	//
	// This method returns an empty slice when called on leaf nodes.
	Fields() []Field

	// Returns the encoding used by the node.
	//
	// The method may return nil to indicate that no specific encoding was
	// configured on the node, in which case a default encoding might be used.
	Encoding() encoding.Encoding

	// Returns the compression codec used by the node.
	//
	// The method may return nil to indicate that no specific compression codec
	// was configured on the node, in which case a default compression might be
	// used.
	Compression() compress.Codec

	// Returns the Go type that best represents the parquet node.
	//
	// For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96,
	// float32, float64, string, []byte, or [N]byte.
	//
	// For groups, the method returns a struct type.
	//
	// If the method is called on a repeated node, the method returns a slice of
	// the underlying type.
	//
	// For optional nodes, the method returns a pointer of the underlying type.
	//
	// For nodes that were constructed from Go values (e.g. using SchemaOf), the
	// method returns the original Go type.
	GoType() reflect.Type
}
    81  
// Field instances represent fields of a parquet node; each one associates a
// node with its name in the parent node.
type Field interface {
	Node

	// Returns the name of this field in its parent node.
	Name() string

	// Given a reference to the Go value matching the structure of the parent
	// node, returns the Go value of the field.
	Value(base reflect.Value) reflect.Value
}
    94  
    95  // Encoded wraps the node passed as argument to use the given encoding.
    96  //
    97  // The function panics if it is called on a non-leaf node, or if the
    98  // encoding does not support the node type.
    99  func Encoded(node Node, encoding encoding.Encoding) Node {
   100  	if !node.Leaf() {
   101  		panic("cannot add encoding to a non-leaf node")
   102  	}
   103  	if encoding != nil {
   104  		kind := node.Type().Kind()
   105  		if !canEncode(encoding, kind) {
   106  			panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String())
   107  		}
   108  	}
   109  	return &encodedNode{
   110  		Node:     node,
   111  		encoding: encoding,
   112  	}
   113  }
   114  
   115  type encodedNode struct {
   116  	Node
   117  	encoding encoding.Encoding
   118  }
   119  
   120  func (n *encodedNode) Encoding() encoding.Encoding {
   121  	return n.encoding
   122  }
   123  
   124  // Compressed wraps the node passed as argument to use the given compression
   125  // codec.
   126  //
   127  // If the codec is nil, the node's compression is left unchanged.
   128  //
   129  // The function panics if it is called on a non-leaf node.
   130  func Compressed(node Node, codec compress.Codec) Node {
   131  	if !node.Leaf() {
   132  		panic("cannot add compression codec to a non-leaf node")
   133  	}
   134  	return &compressedNode{
   135  		Node:  node,
   136  		codec: codec,
   137  	}
   138  }
   139  
   140  type compressedNode struct {
   141  	Node
   142  	codec compress.Codec
   143  }
   144  
   145  func (n *compressedNode) Compression() compress.Codec {
   146  	return n.codec
   147  }
   148  
   149  // Optional wraps the given node to make it optional.
   150  func Optional(node Node) Node { return &optionalNode{node} }
   151  
   152  type optionalNode struct{ Node }
   153  
   154  func (opt *optionalNode) Optional() bool       { return true }
   155  func (opt *optionalNode) Repeated() bool       { return false }
   156  func (opt *optionalNode) Required() bool       { return false }
   157  func (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) }
   158  
   159  // Repeated wraps the given node to make it repeated.
   160  func Repeated(node Node) Node { return &repeatedNode{node} }
   161  
   162  type repeatedNode struct{ Node }
   163  
   164  func (rep *repeatedNode) Optional() bool       { return false }
   165  func (rep *repeatedNode) Repeated() bool       { return true }
   166  func (rep *repeatedNode) Required() bool       { return false }
   167  func (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) }
   168  
   169  // Required wraps the given node to make it required.
   170  func Required(node Node) Node { return &requiredNode{node} }
   171  
   172  type requiredNode struct{ Node }
   173  
   174  func (req *requiredNode) Optional() bool       { return false }
   175  func (req *requiredNode) Repeated() bool       { return false }
   176  func (req *requiredNode) Required() bool       { return true }
   177  func (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() }
   178  
// node is an empty struct type with no methods declared in this file.
// NOTE(review): it is not referenced anywhere in this file; confirm whether
// other files of the package still depend on it before removing it.
type node struct{}
   180  
   181  // Leaf returns a leaf node of the given type.
   182  func Leaf(typ Type) Node {
   183  	return &leafNode{typ: typ}
   184  }
   185  
   186  type leafNode struct{ typ Type }
   187  
   188  func (n *leafNode) String() string { return sprint("", n) }
   189  
   190  func (n *leafNode) Type() Type { return n.typ }
   191  
   192  func (n *leafNode) Optional() bool { return false }
   193  
   194  func (n *leafNode) Repeated() bool { return false }
   195  
   196  func (n *leafNode) Required() bool { return true }
   197  
   198  func (n *leafNode) Leaf() bool { return true }
   199  
   200  func (n *leafNode) Fields() []Field { return nil }
   201  
   202  func (n *leafNode) Encoding() encoding.Encoding { return nil }
   203  
   204  func (n *leafNode) Compression() compress.Codec { return nil }
   205  
   206  func (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) }
   207  
   208  var repetitionTypes = [...]format.FieldRepetitionType{
   209  	0: format.Required,
   210  	1: format.Optional,
   211  	2: format.Repeated,
   212  }
   213  
   214  func fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType {
   215  	switch {
   216  	case node.Required():
   217  		return &repetitionTypes[format.Required]
   218  	case node.Optional():
   219  		return &repetitionTypes[format.Optional]
   220  	case node.Repeated():
   221  		return &repetitionTypes[format.Repeated]
   222  	default:
   223  		return nil
   224  	}
   225  }
   226  
   227  func fieldRepetitionTypeOf(node Node) format.FieldRepetitionType {
   228  	switch {
   229  	case node.Optional():
   230  		return format.Optional
   231  	case node.Repeated():
   232  		return format.Repeated
   233  	default:
   234  		return format.Required
   235  	}
   236  }
   237  
   238  func applyFieldRepetitionType(t format.FieldRepetitionType, repetitionLevel, definitionLevel byte) (byte, byte) {
   239  	switch t {
   240  	case format.Optional:
   241  		definitionLevel++
   242  	case format.Repeated:
   243  		repetitionLevel++
   244  		definitionLevel++
   245  	}
   246  	return repetitionLevel, definitionLevel
   247  }
   248  
   249  type Group map[string]Node
   250  
   251  func (g Group) String() string { return sprint("", g) }
   252  
   253  func (g Group) Type() Type { return groupType{} }
   254  
   255  func (g Group) Optional() bool { return false }
   256  
   257  func (g Group) Repeated() bool { return false }
   258  
   259  func (g Group) Required() bool { return true }
   260  
   261  func (g Group) Leaf() bool { return false }
   262  
   263  func (g Group) Fields() []Field {
   264  	groupFields := make([]groupField, 0, len(g))
   265  	for name, node := range g {
   266  		groupFields = append(groupFields, groupField{
   267  			Node: node,
   268  			name: name,
   269  		})
   270  	}
   271  	sort.Slice(groupFields, func(i, j int) bool {
   272  		return groupFields[i].name < groupFields[j].name
   273  	})
   274  	fields := make([]Field, len(groupFields))
   275  	for i := range groupFields {
   276  		fields[i] = &groupFields[i]
   277  	}
   278  	return fields
   279  }
   280  
   281  func (g Group) Encoding() encoding.Encoding { return nil }
   282  
   283  func (g Group) Compression() compress.Codec { return nil }
   284  
   285  func (g Group) GoType() reflect.Type { return goTypeOfGroup(g) }
   286  
   287  type groupField struct {
   288  	Node
   289  	name string
   290  }
   291  
   292  func (f *groupField) Name() string { return f.name }
   293  
   294  func (f *groupField) Value(base reflect.Value) reflect.Value {
   295  	return base.MapIndex(reflect.ValueOf(&f.name).Elem())
   296  }
   297  
   298  func goTypeOf(node Node) reflect.Type {
   299  	switch {
   300  	case node.Optional():
   301  		return goTypeOfOptional(node)
   302  	case node.Repeated():
   303  		return goTypeOfRepeated(node)
   304  	default:
   305  		return goTypeOfRequired(node)
   306  	}
   307  }
   308  
   309  func goTypeOfOptional(node Node) reflect.Type {
   310  	return reflect.PtrTo(goTypeOfRequired(node))
   311  }
   312  
   313  func goTypeOfRepeated(node Node) reflect.Type {
   314  	return reflect.SliceOf(goTypeOfRequired(node))
   315  }
   316  
   317  func goTypeOfRequired(node Node) reflect.Type {
   318  	if node.Leaf() {
   319  		return goTypeOfLeaf(node)
   320  	} else {
   321  		return goTypeOfGroup(node)
   322  	}
   323  }
   324  
   325  func goTypeOfLeaf(node Node) reflect.Type {
   326  	t := node.Type()
   327  	if convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok {
   328  		return convertibleType.GoType()
   329  	}
   330  	switch t.Kind() {
   331  	case Boolean:
   332  		return reflect.TypeOf(false)
   333  	case Int32:
   334  		return reflect.TypeOf(int32(0))
   335  	case Int64:
   336  		return reflect.TypeOf(int64(0))
   337  	case Int96:
   338  		return reflect.TypeOf(deprecated.Int96{})
   339  	case Float:
   340  		return reflect.TypeOf(float32(0))
   341  	case Double:
   342  		return reflect.TypeOf(float64(0))
   343  	case ByteArray:
   344  		return reflect.TypeOf(([]byte)(nil))
   345  	case FixedLenByteArray:
   346  		return reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0)))
   347  	default:
   348  		panic("BUG: parquet type returned an unsupported kind")
   349  	}
   350  }
   351  
   352  func goTypeOfGroup(node Node) reflect.Type {
   353  	fields := node.Fields()
   354  	structFields := make([]reflect.StructField, len(fields))
   355  	for i, field := range fields {
   356  		structFields[i].Name = exportedStructFieldName(field.Name())
   357  		structFields[i].Type = field.GoType()
   358  		// TODO: can we reconstruct a struct tag that would be valid if a value
   359  		// of this type were passed to SchemaOf?
   360  	}
   361  	return reflect.StructOf(structFields)
   362  }
   363  
   364  func exportedStructFieldName(name string) string {
   365  	firstRune, size := utf8.DecodeRuneInString(name)
   366  	return string([]rune{unicode.ToUpper(firstRune)}) + name[size:]
   367  }
   368  
   369  func isList(node Node) bool {
   370  	logicalType := node.Type().LogicalType()
   371  	return logicalType != nil && logicalType.List != nil
   372  }
   373  
   374  func isMap(node Node) bool {
   375  	logicalType := node.Type().LogicalType()
   376  	return logicalType != nil && logicalType.Map != nil
   377  }
   378  
   379  func numLeafColumnsOf(node Node) int16 {
   380  	return makeColumnIndex(numLeafColumns(node, 0))
   381  }
   382  
   383  func numLeafColumns(node Node, columnIndex int) int {
   384  	if node.Leaf() {
   385  		return columnIndex + 1
   386  	}
   387  	for _, field := range node.Fields() {
   388  		columnIndex = numLeafColumns(field, columnIndex)
   389  	}
   390  	return columnIndex
   391  }
   392  
   393  func listElementOf(node Node) Node {
   394  	if !node.Leaf() {
   395  		if list := fieldByName(node, "list"); list != nil {
   396  			if elem := fieldByName(list, "element"); elem != nil {
   397  				return elem
   398  			}
   399  		}
   400  	}
   401  	panic("node with logical type LIST is not composed of a repeated .list.element")
   402  }
   403  
   404  func mapKeyValueOf(node Node) Node {
   405  	if !node.Leaf() && (node.Required() || node.Optional()) {
   406  		if keyValue := fieldByName(node, "key_value"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() {
   407  			k := fieldByName(keyValue, "key")
   408  			v := fieldByName(keyValue, "value")
   409  			if k != nil && v != nil && k.Required() {
   410  				return keyValue
   411  			}
   412  		}
   413  	}
   414  	panic("node with logical type MAP is not composed of a repeated .key_value group with key and value fields")
   415  }
   416  
   417  func encodingOf(node Node) encoding.Encoding {
   418  	encoding := node.Encoding()
   419  	// The parquet-format documentation states that the
   420  	// DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when
   421  	// encoding BYTE_ARRAY values. We apply it as a default if
   422  	// none were explicitly specified, which gives the application
   423  	// the opportunity to override this behavior if needed.
   424  	//
   425  	// https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6
   426  	if node.Type().Kind() == ByteArray && encoding == nil {
   427  		encoding = &DeltaLengthByteArray
   428  	}
   429  	if encoding == nil {
   430  		encoding = &Plain
   431  	}
   432  	return encoding
   433  }
   434  
   435  func forEachNodeOf(name string, node Node, do func(string, Node)) {
   436  	do(name, node)
   437  
   438  	for _, f := range node.Fields() {
   439  		forEachNodeOf(f.Name(), f, do)
   440  	}
   441  }
   442  
   443  func fieldByName(node Node, name string) Field {
   444  	for _, f := range node.Fields() {
   445  		if f.Name() == name {
   446  			return f
   447  		}
   448  	}
   449  	return nil
   450  }
   451  
   452  func nodesAreEqual(node1, node2 Node) bool {
   453  	if node1.Leaf() {
   454  		return node2.Leaf() && leafNodesAreEqual(node1, node2)
   455  	} else {
   456  		return !node2.Leaf() && groupNodesAreEqual(node1, node2)
   457  	}
   458  }
   459  
   460  func typesAreEqual(type1, type2 Type) bool {
   461  	return type1.Kind() == type2.Kind() &&
   462  		type1.Length() == type2.Length() &&
   463  		reflect.DeepEqual(type1.LogicalType(), type2.LogicalType())
   464  }
   465  
   466  func repetitionsAreEqual(node1, node2 Node) bool {
   467  	return node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated()
   468  }
   469  
   470  func leafNodesAreEqual(node1, node2 Node) bool {
   471  	return typesAreEqual(node1.Type(), node2.Type()) && repetitionsAreEqual(node1, node2)
   472  }
   473  
   474  func groupNodesAreEqual(node1, node2 Node) bool {
   475  	fields1 := node1.Fields()
   476  	fields2 := node2.Fields()
   477  
   478  	if len(fields1) != len(fields2) {
   479  		return false
   480  	}
   481  
   482  	if !repetitionsAreEqual(node1, node2) {
   483  		return false
   484  	}
   485  
   486  	for i := range fields1 {
   487  		f1 := fields1[i]
   488  		f2 := fields2[i]
   489  
   490  		if f1.Name() != f2.Name() {
   491  			return false
   492  		}
   493  
   494  		if !nodesAreEqual(f1, f2) {
   495  			return false
   496  		}
   497  	}
   498  
   499  	return true
   500  }