// Source: github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/node.go

package parquet

import (
	"reflect"
	"sort"
	"unicode"
	"unicode/utf8"

	"github.com/segmentio/parquet-go/compress"
	"github.com/segmentio/parquet-go/deprecated"
	"github.com/segmentio/parquet-go/encoding"
	"github.com/segmentio/parquet-go/format"
)

// Node values represent nodes of a parquet schema.
//
// Nodes carry the type of values, as well as properties like whether the values
// are optional or repeated. Nodes with one or more children represent parquet
// groups and therefore do not have a logical type.
//
// Nodes are immutable values and therefore safe to use concurrently from
// multiple goroutines.
type Node interface {
	// Returns a human-readable representation of the parquet node.
	String() string

	// For leaf nodes, returns the type of values of the parquet column.
	//
	// Calling this method on non-leaf nodes will panic.
	//
	// NOTE(review): the Group implementation in this file returns a group
	// type value from Type instead of panicking; confirm which of the two
	// behaviors callers are meant to rely on.
	Type() Type

	// Returns whether the parquet column is optional.
	Optional() bool

	// Returns whether the parquet column is repeated.
	Repeated() bool

	// Returns whether the parquet column is required.
	Required() bool

	// Returns true if this is a leaf node.
	Leaf() bool

	// Returns the fields of the node.
	//
	// As an optimization, the same slices may be returned by multiple calls to
	// this method, programs must treat the returned values as immutable.
	//
	// This method returns an empty slice when called on leaf nodes.
	Fields() []Field

	// Returns the encoding used by the node.
	//
	// The method may return nil to indicate that no specific encoding was
	// configured on the node, in which case a default encoding might be used.
	Encoding() encoding.Encoding

	// Returns the compression codec used by the node.
	//
	// The method may return nil to indicate that no specific compression codec
	// was configured on the node, in which case a default compression might be
	// used.
	Compression() compress.Codec

	// Returns the Go type that best represents the parquet node.
	//
	// For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96,
	// float32, float64, string, []byte, or [N]byte.
	//
	// For groups, the method returns a struct type.
	//
	// If the method is called on a repeated node, the method returns a slice of
	// the underlying type.
	//
	// For optional nodes, the method returns a pointer of the underlying type.
	//
	// For nodes that were constructed from Go values (e.g. using SchemaOf), the
	// method returns the original Go type.
	GoType() reflect.Type
}

// Field instances represent fields of a parquet node, which associate a node to
// their name in their parent node.
type Field interface {
	Node

	// Returns the name of this field in its parent node.
	Name() string

	// Given a reference to the Go value matching the structure of the parent
	// node, returns the Go value of the field.
	Value(base reflect.Value) reflect.Value
}

// Encoded wraps the node passed as argument to use the given encoding.
//
// The function panics if it is called on a non-leaf node, or if the
// encoding does not support the node type.
99 func Encoded(node Node, encoding encoding.Encoding) Node { 100 if !node.Leaf() { 101 panic("cannot add encoding to a non-leaf node") 102 } 103 if encoding != nil { 104 kind := node.Type().Kind() 105 if !canEncode(encoding, kind) { 106 panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String()) 107 } 108 } 109 return &encodedNode{ 110 Node: node, 111 encoding: encoding, 112 } 113 } 114 115 type encodedNode struct { 116 Node 117 encoding encoding.Encoding 118 } 119 120 func (n *encodedNode) Encoding() encoding.Encoding { 121 return n.encoding 122 } 123 124 // Compressed wraps the node passed as argument to use the given compression 125 // codec. 126 // 127 // If the codec is nil, the node's compression is left unchanged. 128 // 129 // The function panics if it is called on a non-leaf node. 130 func Compressed(node Node, codec compress.Codec) Node { 131 if !node.Leaf() { 132 panic("cannot add compression codec to a non-leaf node") 133 } 134 return &compressedNode{ 135 Node: node, 136 codec: codec, 137 } 138 } 139 140 type compressedNode struct { 141 Node 142 codec compress.Codec 143 } 144 145 func (n *compressedNode) Compression() compress.Codec { 146 return n.codec 147 } 148 149 // Optional wraps the given node to make it optional. 150 func Optional(node Node) Node { return &optionalNode{node} } 151 152 type optionalNode struct{ Node } 153 154 func (opt *optionalNode) Optional() bool { return true } 155 func (opt *optionalNode) Repeated() bool { return false } 156 func (opt *optionalNode) Required() bool { return false } 157 func (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) } 158 159 // Repeated wraps the given node to make it repeated. 
160 func Repeated(node Node) Node { return &repeatedNode{node} } 161 162 type repeatedNode struct{ Node } 163 164 func (rep *repeatedNode) Optional() bool { return false } 165 func (rep *repeatedNode) Repeated() bool { return true } 166 func (rep *repeatedNode) Required() bool { return false } 167 func (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) } 168 169 // Required wraps the given node to make it required. 170 func Required(node Node) Node { return &requiredNode{node} } 171 172 type requiredNode struct{ Node } 173 174 func (req *requiredNode) Optional() bool { return false } 175 func (req *requiredNode) Repeated() bool { return false } 176 func (req *requiredNode) Required() bool { return true } 177 func (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() } 178 179 type node struct{} 180 181 // Leaf returns a leaf node of the given type. 182 func Leaf(typ Type) Node { 183 return &leafNode{typ: typ} 184 } 185 186 type leafNode struct{ typ Type } 187 188 func (n *leafNode) String() string { return sprint("", n) } 189 190 func (n *leafNode) Type() Type { return n.typ } 191 192 func (n *leafNode) Optional() bool { return false } 193 194 func (n *leafNode) Repeated() bool { return false } 195 196 func (n *leafNode) Required() bool { return true } 197 198 func (n *leafNode) Leaf() bool { return true } 199 200 func (n *leafNode) Fields() []Field { return nil } 201 202 func (n *leafNode) Encoding() encoding.Encoding { return nil } 203 204 func (n *leafNode) Compression() compress.Codec { return nil } 205 206 func (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) } 207 208 var repetitionTypes = [...]format.FieldRepetitionType{ 209 0: format.Required, 210 1: format.Optional, 211 2: format.Repeated, 212 } 213 214 func fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType { 215 switch { 216 case node.Required(): 217 return &repetitionTypes[format.Required] 218 case node.Optional(): 219 return 
&repetitionTypes[format.Optional] 220 case node.Repeated(): 221 return &repetitionTypes[format.Repeated] 222 default: 223 return nil 224 } 225 } 226 227 func fieldRepetitionTypeOf(node Node) format.FieldRepetitionType { 228 switch { 229 case node.Optional(): 230 return format.Optional 231 case node.Repeated(): 232 return format.Repeated 233 default: 234 return format.Required 235 } 236 } 237 238 func applyFieldRepetitionType(t format.FieldRepetitionType, repetitionLevel, definitionLevel byte) (byte, byte) { 239 switch t { 240 case format.Optional: 241 definitionLevel++ 242 case format.Repeated: 243 repetitionLevel++ 244 definitionLevel++ 245 } 246 return repetitionLevel, definitionLevel 247 } 248 249 type Group map[string]Node 250 251 func (g Group) String() string { return sprint("", g) } 252 253 func (g Group) Type() Type { return groupType{} } 254 255 func (g Group) Optional() bool { return false } 256 257 func (g Group) Repeated() bool { return false } 258 259 func (g Group) Required() bool { return true } 260 261 func (g Group) Leaf() bool { return false } 262 263 func (g Group) Fields() []Field { 264 groupFields := make([]groupField, 0, len(g)) 265 for name, node := range g { 266 groupFields = append(groupFields, groupField{ 267 Node: node, 268 name: name, 269 }) 270 } 271 sort.Slice(groupFields, func(i, j int) bool { 272 return groupFields[i].name < groupFields[j].name 273 }) 274 fields := make([]Field, len(groupFields)) 275 for i := range groupFields { 276 fields[i] = &groupFields[i] 277 } 278 return fields 279 } 280 281 func (g Group) Encoding() encoding.Encoding { return nil } 282 283 func (g Group) Compression() compress.Codec { return nil } 284 285 func (g Group) GoType() reflect.Type { return goTypeOfGroup(g) } 286 287 type groupField struct { 288 Node 289 name string 290 } 291 292 func (f *groupField) Name() string { return f.name } 293 294 func (f *groupField) Value(base reflect.Value) reflect.Value { 295 return 
base.MapIndex(reflect.ValueOf(&f.name).Elem()) 296 } 297 298 func goTypeOf(node Node) reflect.Type { 299 switch { 300 case node.Optional(): 301 return goTypeOfOptional(node) 302 case node.Repeated(): 303 return goTypeOfRepeated(node) 304 default: 305 return goTypeOfRequired(node) 306 } 307 } 308 309 func goTypeOfOptional(node Node) reflect.Type { 310 return reflect.PtrTo(goTypeOfRequired(node)) 311 } 312 313 func goTypeOfRepeated(node Node) reflect.Type { 314 return reflect.SliceOf(goTypeOfRequired(node)) 315 } 316 317 func goTypeOfRequired(node Node) reflect.Type { 318 if node.Leaf() { 319 return goTypeOfLeaf(node) 320 } else { 321 return goTypeOfGroup(node) 322 } 323 } 324 325 func goTypeOfLeaf(node Node) reflect.Type { 326 t := node.Type() 327 if convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok { 328 return convertibleType.GoType() 329 } 330 switch t.Kind() { 331 case Boolean: 332 return reflect.TypeOf(false) 333 case Int32: 334 return reflect.TypeOf(int32(0)) 335 case Int64: 336 return reflect.TypeOf(int64(0)) 337 case Int96: 338 return reflect.TypeOf(deprecated.Int96{}) 339 case Float: 340 return reflect.TypeOf(float32(0)) 341 case Double: 342 return reflect.TypeOf(float64(0)) 343 case ByteArray: 344 return reflect.TypeOf(([]byte)(nil)) 345 case FixedLenByteArray: 346 return reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0))) 347 default: 348 panic("BUG: parquet type returned an unsupported kind") 349 } 350 } 351 352 func goTypeOfGroup(node Node) reflect.Type { 353 fields := node.Fields() 354 structFields := make([]reflect.StructField, len(fields)) 355 for i, field := range fields { 356 structFields[i].Name = exportedStructFieldName(field.Name()) 357 structFields[i].Type = field.GoType() 358 // TODO: can we reconstruct a struct tag that would be valid if a value 359 // of this type were passed to SchemaOf? 
360 } 361 return reflect.StructOf(structFields) 362 } 363 364 func exportedStructFieldName(name string) string { 365 firstRune, size := utf8.DecodeRuneInString(name) 366 return string([]rune{unicode.ToUpper(firstRune)}) + name[size:] 367 } 368 369 func isList(node Node) bool { 370 logicalType := node.Type().LogicalType() 371 return logicalType != nil && logicalType.List != nil 372 } 373 374 func isMap(node Node) bool { 375 logicalType := node.Type().LogicalType() 376 return logicalType != nil && logicalType.Map != nil 377 } 378 379 func numLeafColumnsOf(node Node) int16 { 380 return makeColumnIndex(numLeafColumns(node, 0)) 381 } 382 383 func numLeafColumns(node Node, columnIndex int) int { 384 if node.Leaf() { 385 return columnIndex + 1 386 } 387 for _, field := range node.Fields() { 388 columnIndex = numLeafColumns(field, columnIndex) 389 } 390 return columnIndex 391 } 392 393 func listElementOf(node Node) Node { 394 if !node.Leaf() { 395 if list := fieldByName(node, "list"); list != nil { 396 if elem := fieldByName(list, "element"); elem != nil { 397 return elem 398 } 399 } 400 } 401 panic("node with logical type LIST is not composed of a repeated .list.element") 402 } 403 404 func mapKeyValueOf(node Node) Node { 405 if !node.Leaf() && (node.Required() || node.Optional()) { 406 if keyValue := fieldByName(node, "key_value"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() { 407 k := fieldByName(keyValue, "key") 408 v := fieldByName(keyValue, "value") 409 if k != nil && v != nil && k.Required() { 410 return keyValue 411 } 412 } 413 } 414 panic("node with logical type MAP is not composed of a repeated .key_value group with key and value fields") 415 } 416 417 func encodingOf(node Node) encoding.Encoding { 418 encoding := node.Encoding() 419 // The parquet-format documentation states that the 420 // DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when 421 // encoding BYTE_ARRAY values. 
We apply it as a default if 422 // none were explicitly specified, which gives the application 423 // the opportunity to override this behavior if needed. 424 // 425 // https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6 426 if node.Type().Kind() == ByteArray && encoding == nil { 427 encoding = &DeltaLengthByteArray 428 } 429 if encoding == nil { 430 encoding = &Plain 431 } 432 return encoding 433 } 434 435 func forEachNodeOf(name string, node Node, do func(string, Node)) { 436 do(name, node) 437 438 for _, f := range node.Fields() { 439 forEachNodeOf(f.Name(), f, do) 440 } 441 } 442 443 func fieldByName(node Node, name string) Field { 444 for _, f := range node.Fields() { 445 if f.Name() == name { 446 return f 447 } 448 } 449 return nil 450 } 451 452 func nodesAreEqual(node1, node2 Node) bool { 453 if node1.Leaf() { 454 return node2.Leaf() && leafNodesAreEqual(node1, node2) 455 } else { 456 return !node2.Leaf() && groupNodesAreEqual(node1, node2) 457 } 458 } 459 460 func typesAreEqual(type1, type2 Type) bool { 461 return type1.Kind() == type2.Kind() && 462 type1.Length() == type2.Length() && 463 reflect.DeepEqual(type1.LogicalType(), type2.LogicalType()) 464 } 465 466 func repetitionsAreEqual(node1, node2 Node) bool { 467 return node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated() 468 } 469 470 func leafNodesAreEqual(node1, node2 Node) bool { 471 return typesAreEqual(node1.Type(), node2.Type()) && repetitionsAreEqual(node1, node2) 472 } 473 474 func groupNodesAreEqual(node1, node2 Node) bool { 475 fields1 := node1.Fields() 476 fields2 := node2.Fields() 477 478 if len(fields1) != len(fields2) { 479 return false 480 } 481 482 if !repetitionsAreEqual(node1, node2) { 483 return false 484 } 485 486 for i := range fields1 { 487 f1 := fields1[i] 488 f2 := fields2[i] 489 490 if f1.Name() != f2.Name() { 491 return false 492 } 493 494 if !nodesAreEqual(f1, f2) { 495 return false 496 } 
497 } 498 499 return true 500 }