// Source: github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/node.go

package parquet

import (
	"reflect"
	"sort"
	"unicode"
	"unicode/utf8"

	"github.com/parquet-go/parquet-go/compress"
	"github.com/parquet-go/parquet-go/deprecated"
	"github.com/parquet-go/parquet-go/encoding"
	"github.com/parquet-go/parquet-go/format"
)

// Node values represent nodes of a parquet schema.
//
// Nodes carry the type of values, as well as properties like whether the values
// are optional or repeated. Nodes with one or more children represent parquet
// groups and therefore do not have a logical type.
//
// Nodes are immutable values and therefore safe to use concurrently from
// multiple goroutines.
type Node interface {
	// Returns the id of this node in its parent node. The zero value means
	// that no id is set. IDs only need to be unique within their parent
	// context.
	//
	// This is the same as the parquet field_id.
	ID() int

	// Returns a human-readable representation of the parquet node.
	String() string

	// For leaf nodes, returns the type of values of the parquet column.
	//
	// The result is only meaningful for leaf nodes; implementations may panic
	// when the method is called on a non-leaf node.
	Type() Type

	// Returns whether the parquet column is optional.
	Optional() bool

	// Returns whether the parquet column is repeated.
	Repeated() bool

	// Returns whether the parquet column is required.
	Required() bool

	// Returns true if this is a leaf node.
	Leaf() bool

	// Returns the fields of the node.
	//
	// As an optimization, the same slices may be returned by multiple calls to
	// this method, programs must treat the returned values as immutable.
	//
	// This method returns no fields when called on leaf nodes.
	Fields() []Field

	// Returns the encoding used by the node.
	//
	// The method may return nil to indicate that no specific encoding was
	// configured on the node, in which case a default encoding might be used.
	Encoding() encoding.Encoding

	// Returns the compression codec used by the node.
	//
	// The method may return nil to indicate that no specific compression codec
	// was configured on the node, in which case a default compression might be
	// used.
	Compression() compress.Codec

	// Returns the Go type that best represents the parquet node.
	//
	// For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96,
	// float32, float64, string, []byte, or [N]byte.
	//
	// For groups, the method returns a struct type.
	//
	// If the method is called on a repeated node, the method returns a slice of
	// the underlying type.
	//
	// For optional nodes, the method returns a pointer of the underlying type.
	//
	// For nodes that were constructed from Go values (e.g. using SchemaOf), the
	// method returns the original Go type.
	GoType() reflect.Type
}

// Field instances represent fields of a parquet node, which associate a node to
// their name in their parent node.
type Field interface {
	Node

	// Returns the name of this field in its parent node.
	Name() string

	// Given a reference to the Go value matching the structure of the parent
	// node, returns the Go value of the field.
	Value(base reflect.Value) reflect.Value
}

// Encoded wraps the node passed as argument to use the given encoding.
//
// The function panics if it is called on a non-leaf node, or if the
// encoding does not support the node type.
105 func Encoded(node Node, encoding encoding.Encoding) Node { 106 if !node.Leaf() { 107 panic("cannot add encoding to a non-leaf node") 108 } 109 if encoding != nil { 110 kind := node.Type().Kind() 111 if !canEncode(encoding, kind) { 112 panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String()) 113 } 114 } 115 return &encodedNode{ 116 Node: node, 117 encoding: encoding, 118 } 119 } 120 121 type encodedNode struct { 122 Node 123 encoding encoding.Encoding 124 } 125 126 func (n *encodedNode) Encoding() encoding.Encoding { 127 return n.encoding 128 } 129 130 // Compressed wraps the node passed as argument to use the given compression 131 // codec. 132 // 133 // If the codec is nil, the node's compression is left unchanged. 134 // 135 // The function panics if it is called on a non-leaf node. 136 func Compressed(node Node, codec compress.Codec) Node { 137 if !node.Leaf() { 138 panic("cannot add compression codec to a non-leaf node") 139 } 140 return &compressedNode{ 141 Node: node, 142 codec: codec, 143 } 144 } 145 146 type compressedNode struct { 147 Node 148 codec compress.Codec 149 } 150 151 func (n *compressedNode) Compression() compress.Codec { 152 return n.codec 153 } 154 155 // Optional wraps the given node to make it optional. 156 func Optional(node Node) Node { return &optionalNode{node} } 157 158 type optionalNode struct{ Node } 159 160 func (opt *optionalNode) Optional() bool { return true } 161 func (opt *optionalNode) Repeated() bool { return false } 162 func (opt *optionalNode) Required() bool { return false } 163 func (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) } 164 165 // FieldID wraps a node to provide node field id 166 func FieldID(node Node, id int) Node { return &fieldIDNode{Node: node, id: id} } 167 168 type fieldIDNode struct { 169 Node 170 id int 171 } 172 173 func (f *fieldIDNode) ID() int { return f.id } 174 175 // Repeated wraps the given node to make it repeated. 
176 func Repeated(node Node) Node { return &repeatedNode{node} } 177 178 type repeatedNode struct{ Node } 179 180 func (rep *repeatedNode) Optional() bool { return false } 181 func (rep *repeatedNode) Repeated() bool { return true } 182 func (rep *repeatedNode) Required() bool { return false } 183 func (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) } 184 185 // Required wraps the given node to make it required. 186 func Required(node Node) Node { return &requiredNode{node} } 187 188 type requiredNode struct{ Node } 189 190 func (req *requiredNode) Optional() bool { return false } 191 func (req *requiredNode) Repeated() bool { return false } 192 func (req *requiredNode) Required() bool { return true } 193 func (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() } 194 195 type node struct{} 196 197 // Leaf returns a leaf node of the given type. 198 func Leaf(typ Type) Node { 199 return &leafNode{typ: typ} 200 } 201 202 type leafNode struct{ typ Type } 203 204 func (n *leafNode) ID() int { return 0 } 205 206 func (n *leafNode) String() string { return sprint("", n) } 207 208 func (n *leafNode) Type() Type { return n.typ } 209 210 func (n *leafNode) Optional() bool { return false } 211 212 func (n *leafNode) Repeated() bool { return false } 213 214 func (n *leafNode) Required() bool { return true } 215 216 func (n *leafNode) Leaf() bool { return true } 217 218 func (n *leafNode) Fields() []Field { return nil } 219 220 func (n *leafNode) Encoding() encoding.Encoding { return nil } 221 222 func (n *leafNode) Compression() compress.Codec { return nil } 223 224 func (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) } 225 226 var repetitionTypes = [...]format.FieldRepetitionType{ 227 0: format.Required, 228 1: format.Optional, 229 2: format.Repeated, 230 } 231 232 func fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType { 233 switch { 234 case node.Required(): 235 return 
&repetitionTypes[format.Required] 236 case node.Optional(): 237 return &repetitionTypes[format.Optional] 238 case node.Repeated(): 239 return &repetitionTypes[format.Repeated] 240 default: 241 return nil 242 } 243 } 244 245 func fieldRepetitionTypeOf(node Node) format.FieldRepetitionType { 246 switch { 247 case node.Optional(): 248 return format.Optional 249 case node.Repeated(): 250 return format.Repeated 251 default: 252 return format.Required 253 } 254 } 255 256 func applyFieldRepetitionType(t format.FieldRepetitionType, repetitionLevel, definitionLevel byte) (byte, byte) { 257 switch t { 258 case format.Optional: 259 definitionLevel++ 260 case format.Repeated: 261 repetitionLevel++ 262 definitionLevel++ 263 } 264 return repetitionLevel, definitionLevel 265 } 266 267 type Group map[string]Node 268 269 func (g Group) ID() int { return 0 } 270 271 func (g Group) String() string { return sprint("", g) } 272 273 func (g Group) Type() Type { return groupType{} } 274 275 func (g Group) Optional() bool { return false } 276 277 func (g Group) Repeated() bool { return false } 278 279 func (g Group) Required() bool { return true } 280 281 func (g Group) Leaf() bool { return false } 282 283 func (g Group) Fields() []Field { 284 groupFields := make([]groupField, 0, len(g)) 285 for name, node := range g { 286 groupFields = append(groupFields, groupField{ 287 Node: node, 288 name: name, 289 }) 290 } 291 sort.Slice(groupFields, func(i, j int) bool { 292 return groupFields[i].name < groupFields[j].name 293 }) 294 fields := make([]Field, len(groupFields)) 295 for i := range groupFields { 296 fields[i] = &groupFields[i] 297 } 298 return fields 299 } 300 301 func (g Group) Encoding() encoding.Encoding { return nil } 302 303 func (g Group) Compression() compress.Codec { return nil } 304 305 func (g Group) GoType() reflect.Type { return goTypeOfGroup(g) } 306 307 type groupField struct { 308 Node 309 name string 310 } 311 312 func (f *groupField) Name() string { return f.name } 313 
314 func (f *groupField) Value(base reflect.Value) reflect.Value { 315 if base.Kind() == reflect.Interface { 316 if base.IsNil() { 317 return reflect.ValueOf(nil) 318 } 319 if base = base.Elem(); base.Kind() == reflect.Pointer && base.IsNil() { 320 return reflect.ValueOf(nil) 321 } 322 } 323 return base.MapIndex(reflect.ValueOf(&f.name).Elem()) 324 } 325 326 func goTypeOf(node Node) reflect.Type { 327 switch { 328 case node.Optional(): 329 return goTypeOfOptional(node) 330 case node.Repeated(): 331 return goTypeOfRepeated(node) 332 default: 333 return goTypeOfRequired(node) 334 } 335 } 336 337 func goTypeOfOptional(node Node) reflect.Type { 338 return reflect.PtrTo(goTypeOfRequired(node)) 339 } 340 341 func goTypeOfRepeated(node Node) reflect.Type { 342 return reflect.SliceOf(goTypeOfRequired(node)) 343 } 344 345 func goTypeOfRequired(node Node) reflect.Type { 346 if node.Leaf() { 347 return goTypeOfLeaf(node) 348 } else { 349 return goTypeOfGroup(node) 350 } 351 } 352 353 func goTypeOfLeaf(node Node) reflect.Type { 354 t := node.Type() 355 if convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok { 356 return convertibleType.GoType() 357 } 358 switch t.Kind() { 359 case Boolean: 360 return reflect.TypeOf(false) 361 case Int32: 362 return reflect.TypeOf(int32(0)) 363 case Int64: 364 return reflect.TypeOf(int64(0)) 365 case Int96: 366 return reflect.TypeOf(deprecated.Int96{}) 367 case Float: 368 return reflect.TypeOf(float32(0)) 369 case Double: 370 return reflect.TypeOf(float64(0)) 371 case ByteArray: 372 return reflect.TypeOf(([]byte)(nil)) 373 case FixedLenByteArray: 374 return reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0))) 375 default: 376 panic("BUG: parquet type returned an unsupported kind") 377 } 378 } 379 380 func goTypeOfGroup(node Node) reflect.Type { 381 fields := node.Fields() 382 structFields := make([]reflect.StructField, len(fields)) 383 for i, field := range fields { 384 structFields[i].Name = exportedStructFieldName(field.Name()) 
385 structFields[i].Type = field.GoType() 386 // TODO: can we reconstruct a struct tag that would be valid if a value 387 // of this type were passed to SchemaOf? 388 } 389 return reflect.StructOf(structFields) 390 } 391 392 func exportedStructFieldName(name string) string { 393 firstRune, size := utf8.DecodeRuneInString(name) 394 return string([]rune{unicode.ToUpper(firstRune)}) + name[size:] 395 } 396 397 func isList(node Node) bool { 398 logicalType := node.Type().LogicalType() 399 return logicalType != nil && logicalType.List != nil 400 } 401 402 func isMap(node Node) bool { 403 logicalType := node.Type().LogicalType() 404 return logicalType != nil && logicalType.Map != nil 405 } 406 407 func numLeafColumnsOf(node Node) int16 { 408 return makeColumnIndex(numLeafColumns(node, 0)) 409 } 410 411 func numLeafColumns(node Node, columnIndex int) int { 412 if node.Leaf() { 413 return columnIndex + 1 414 } 415 for _, field := range node.Fields() { 416 columnIndex = numLeafColumns(field, columnIndex) 417 } 418 return columnIndex 419 } 420 421 func listElementOf(node Node) Node { 422 if !node.Leaf() { 423 if list := fieldByName(node, "list"); list != nil { 424 if elem := fieldByName(list, "element"); elem != nil { 425 return elem 426 } 427 } 428 } 429 panic("node with logical type LIST is not composed of a repeated .list.element") 430 } 431 432 func mapKeyValueOf(node Node) Node { 433 if !node.Leaf() && (node.Required() || node.Optional()) { 434 if keyValue := fieldByName(node, "key_value"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() { 435 k := fieldByName(keyValue, "key") 436 v := fieldByName(keyValue, "value") 437 if k != nil && v != nil && k.Required() { 438 return keyValue 439 } 440 } 441 } 442 panic("node with logical type MAP is not composed of a repeated .key_value group with key and value fields") 443 } 444 445 func encodingOf(node Node) encoding.Encoding { 446 encoding := node.Encoding() 447 // The parquet-format documentation states that the 448 
// DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when 449 // encoding BYTE_ARRAY values. We apply it as a default if 450 // none were explicitly specified, which gives the application 451 // the opportunity to override this behavior if needed. 452 // 453 // https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6 454 if node.Type().Kind() == ByteArray && encoding == nil { 455 encoding = &DeltaLengthByteArray 456 } 457 if encoding == nil { 458 encoding = &Plain 459 } 460 return encoding 461 } 462 463 func forEachNodeOf(name string, node Node, do func(string, Node)) { 464 do(name, node) 465 466 for _, f := range node.Fields() { 467 forEachNodeOf(f.Name(), f, do) 468 } 469 } 470 471 func fieldByName(node Node, name string) Field { 472 for _, f := range node.Fields() { 473 if f.Name() == name { 474 return f 475 } 476 } 477 return nil 478 } 479 480 func nodesAreEqual(node1, node2 Node) bool { 481 if node1.Leaf() { 482 return node2.Leaf() && leafNodesAreEqual(node1, node2) 483 } else { 484 return !node2.Leaf() && groupNodesAreEqual(node1, node2) 485 } 486 } 487 488 func typesAreEqual(type1, type2 Type) bool { 489 return type1.Kind() == type2.Kind() && 490 type1.Length() == type2.Length() && 491 reflect.DeepEqual(type1.LogicalType(), type2.LogicalType()) 492 } 493 494 func repetitionsAreEqual(node1, node2 Node) bool { 495 return node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated() 496 } 497 498 func leafNodesAreEqual(node1, node2 Node) bool { 499 return typesAreEqual(node1.Type(), node2.Type()) && repetitionsAreEqual(node1, node2) 500 } 501 502 func groupNodesAreEqual(node1, node2 Node) bool { 503 fields1 := node1.Fields() 504 fields2 := node2.Fields() 505 506 if len(fields1) != len(fields2) { 507 return false 508 } 509 510 if !repetitionsAreEqual(node1, node2) { 511 return false 512 } 513 514 for i := range fields1 { 515 f1 := fields1[i] 516 f2 := fields2[i] 517 518 if f1.Name() != 
f2.Name() { 519 return false 520 } 521 522 if !nodesAreEqual(f1, f2) { 523 return false 524 } 525 } 526 527 return true 528 }