github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/node.go (about) 1 package parquet 2 3 import ( 4 "reflect" 5 "sort" 6 "unicode" 7 "unicode/utf8" 8 9 "github.com/vc42/parquet-go/compress" 10 "github.com/vc42/parquet-go/deprecated" 11 "github.com/vc42/parquet-go/encoding" 12 "github.com/vc42/parquet-go/format" 13 ) 14 15 // Node values represent nodes of a parquet schema. 16 // 17 // Nodes carry the type of values, as well as properties like whether the values 18 // are optional or repeat. Nodes with one or more children represent parquet 19 // groups and therefore do not have a logical type. 20 // 21 // Nodes are immutable values and therefore safe to use concurrently from 22 // multiple goroutines. 23 type Node interface { 24 // Returns a human-readable representation of the parquet node. 25 String() string 26 27 // For leaf nodes, returns the type of values of the parquet column. 28 // 29 // Calling this method on non-leaf nodes will panic. 30 Type() Type 31 32 // Returns whether the parquet column is optional. 33 Optional() bool 34 35 // Returns whether the parquet column is repeated. 36 Repeated() bool 37 38 // Returns whether the parquet column is required. 39 Required() bool 40 41 // Returns true if this a leaf node. 42 Leaf() bool 43 44 // Returns a mapping of the node's fields. 45 // 46 // As an optimization, the same slices may be returned by multiple calls to 47 // this method, programs must treat the returned values as immutable. 48 // 49 // This method returns an empty mapping when called on leaf nodes. 50 Fields() []Field 51 52 // Returns the encoding used by the node. 53 // 54 // The method may return nil to indicate that no specific encoding was 55 // configured on the node, in which case a default encoding might be used. 56 Encoding() encoding.Encoding 57 58 // Returns compression codec used by the node. 59 // 60 // The method may return nil to indicate that no specific compression codec 61 // was configured on the node, in which case a default compression might be 62 // used. 63 Compression() compress.Codec 64 65 // Returns the Go type that best represents the parquet node. 66 // 67 // For leaf nodes, this will be one of bool, int32, int64, deprecated.Int96, 68 // float32, float64, string, []byte, or [N]byte. 69 // 70 // For groups, the method returns a struct type. 71 // 72 // If the method is called on a repeated node, the method returns a slice of 73 // the underlying type. 74 // 75 // For optional nodes, the method returns a pointer of the underlying type. 76 // 77 // For nodes that were constructed from Go values (e.g. using SchemaOf), the 78 // method returns the original Go type. 79 GoType() reflect.Type 80 } 81 82 // Field instances represent fields of a parquet node, which associate a node to 83 // their name in their parent node. 84 type Field interface { 85 Node 86 87 // Returns the name of this field in its parent node. 88 Name() string 89 90 // Given a reference to the Go value matching the structure of the parent 91 // node, returns the Go value of the field. 92 Value(base reflect.Value) reflect.Value 93 } 94 95 // Encoded wraps the node passed as argument to use the given encoding. 96 // 97 // The function panics if it is called on a non-leaf node, or if the 98 // encoding does not support the node type. 99 func Encoded(node Node, encoding encoding.Encoding) Node { 100 if !node.Leaf() { 101 panic("cannot add encoding to a non-leaf node") 102 } 103 if encoding != nil { 104 kind := node.Type().Kind() 105 if !canEncode(encoding, kind) { 106 panic("cannot apply " + encoding.Encoding().String() + " to node of type " + kind.String()) 107 } 108 } 109 return &encodedNode{ 110 Node: node, 111 encoding: encoding, 112 } 113 } 114 115 type encodedNode struct { 116 Node 117 encoding encoding.Encoding 118 } 119 120 func (n *encodedNode) Encoding() encoding.Encoding { 121 return n.encoding 122 } 123 124 // Compressed wraps the node passed as argument to use the given compression 125 // codec. 126 // 127 // If the codec is nil, the node's compression is left unchanged. 128 // 129 // The function panics if it is called on a non-leaf node. 130 func Compressed(node Node, codec compress.Codec) Node { 131 if !node.Leaf() { 132 panic("cannot add compression codec to a non-leaf node") 133 } 134 return &compressedNode{ 135 Node: node, 136 codec: codec, 137 } 138 } 139 140 type compressedNode struct { 141 Node 142 codec compress.Codec 143 } 144 145 func (n *compressedNode) Compression() compress.Codec { 146 return n.codec 147 } 148 149 // Optional wraps the given node to make it optional. 150 func Optional(node Node) Node { return &optionalNode{node} } 151 152 type optionalNode struct{ Node } 153 154 func (opt *optionalNode) Optional() bool { return true } 155 func (opt *optionalNode) Repeated() bool { return false } 156 func (opt *optionalNode) Required() bool { return false } 157 func (opt *optionalNode) GoType() reflect.Type { return reflect.PtrTo(opt.Node.GoType()) } 158 159 // Repeated wraps the given node to make it repeated. 160 func Repeated(node Node) Node { return &repeatedNode{node} } 161 162 type repeatedNode struct{ Node } 163 164 func (rep *repeatedNode) Optional() bool { return false } 165 func (rep *repeatedNode) Repeated() bool { return true } 166 func (rep *repeatedNode) Required() bool { return false } 167 func (rep *repeatedNode) GoType() reflect.Type { return reflect.SliceOf(rep.Node.GoType()) } 168 169 // Required wraps the given node to make it required. 170 func Required(node Node) Node { return &requiredNode{node} } 171 172 type requiredNode struct{ Node } 173 174 func (req *requiredNode) Optional() bool { return false } 175 func (req *requiredNode) Repeated() bool { return false } 176 func (req *requiredNode) Required() bool { return true } 177 func (req *requiredNode) GoType() reflect.Type { return req.Node.GoType() } 178 179 type node struct{} 180 181 // Leaf returns a leaf node of the given type. 182 func Leaf(typ Type) Node { 183 return &leafNode{typ: typ} 184 } 185 186 type leafNode struct{ typ Type } 187 188 func (n *leafNode) String() string { return sprint("", n) } 189 190 func (n *leafNode) Type() Type { return n.typ } 191 192 func (n *leafNode) Optional() bool { return false } 193 194 func (n *leafNode) Repeated() bool { return false } 195 196 func (n *leafNode) Required() bool { return true } 197 198 func (n *leafNode) Leaf() bool { return true } 199 200 func (n *leafNode) Fields() []Field { return nil } 201 202 func (n *leafNode) Encoding() encoding.Encoding { return nil } 203 204 func (n *leafNode) Compression() compress.Codec { return nil } 205 206 func (n *leafNode) GoType() reflect.Type { return goTypeOfLeaf(n) } 207 208 var repetitionTypes = [...]format.FieldRepetitionType{ 209 0: format.Required, 210 1: format.Optional, 211 2: format.Repeated, 212 } 213 214 func fieldRepetitionTypePtrOf(node Node) *format.FieldRepetitionType { 215 switch { 216 case node.Required(): 217 return &repetitionTypes[format.Required] 218 case node.Optional(): 219 return &repetitionTypes[format.Optional] 220 case node.Repeated(): 221 return &repetitionTypes[format.Repeated] 222 default: 223 return nil 224 } 225 } 226 227 func fieldRepetitionTypeOf(node Node) format.FieldRepetitionType { 228 switch { 229 case node.Optional(): 230 return format.Optional 231 case node.Repeated(): 232 return format.Repeated 233 default: 234 return format.Required 235 } 236 } 237 238 type Group map[string]Node 239 240 func (g Group) String() string { return sprint("", g) } 241 242 func (g Group) Type() Type { return groupType{} } 243 244 func (g Group) Optional() bool { return false } 245 246 func (g Group) Repeated() bool { return false } 247 248 func (g Group) Required() bool { return true } 249 250 func (g Group) Leaf() bool { return false } 251 252 func (g Group) Fields() []Field { 253 groupFields := make([]groupField, 0, len(g)) 254 for name, node := range g { 255 groupFields = append(groupFields, groupField{ 256 Node: node, 257 name: name, 258 }) 259 } 260 sort.Slice(groupFields, func(i, j int) bool { 261 return groupFields[i].name < groupFields[j].name 262 }) 263 fields := make([]Field, len(groupFields)) 264 for i := range groupFields { 265 fields[i] = &groupFields[i] 266 } 267 return fields 268 } 269 270 func (g Group) Encoding() encoding.Encoding { return nil } 271 272 func (g Group) Compression() compress.Codec { return nil } 273 274 func (g Group) GoType() reflect.Type { return goTypeOfGroup(g) } 275 276 type groupField struct { 277 Node 278 name string 279 } 280 281 func (f *groupField) Name() string { return f.name } 282 283 func (f *groupField) Value(base reflect.Value) reflect.Value { 284 return base.MapIndex(reflect.ValueOf(&f.name).Elem()) 285 } 286 287 func goTypeOf(node Node) reflect.Type { 288 switch { 289 case node.Optional(): 290 return goTypeOfOptional(node) 291 case node.Repeated(): 292 return goTypeOfRepeated(node) 293 default: 294 return goTypeOfRequired(node) 295 } 296 } 297 298 func goTypeOfOptional(node Node) reflect.Type { 299 return reflect.PtrTo(goTypeOfRequired(node)) 300 } 301 302 func goTypeOfRepeated(node Node) reflect.Type { 303 return reflect.SliceOf(goTypeOfRequired(node)) 304 } 305 306 func goTypeOfRequired(node Node) reflect.Type { 307 if node.Leaf() { 308 return goTypeOfLeaf(node) 309 } else { 310 return goTypeOfGroup(node) 311 } 312 } 313 314 func goTypeOfLeaf(node Node) reflect.Type { 315 t := node.Type() 316 if convertibleType, ok := t.(interface{ GoType() reflect.Type }); ok { 317 return convertibleType.GoType() 318 } 319 switch t.Kind() { 320 case Boolean: 321 return reflect.TypeOf(false) 322 case Int32: 323 return reflect.TypeOf(int32(0)) 324 case Int64: 325 return reflect.TypeOf(int64(0)) 326 case Int96: 327 return reflect.TypeOf(deprecated.Int96{}) 328 case Float: 329 return reflect.TypeOf(float32(0)) 330 case Double: 331 return reflect.TypeOf(float64(0)) 332 case ByteArray: 333 return reflect.TypeOf(([]byte)(nil)) 334 case FixedLenByteArray: 335 return reflect.ArrayOf(t.Length(), reflect.TypeOf(byte(0))) 336 default: 337 panic("BUG: parquet type returned an unsupported kind") 338 } 339 } 340 341 func goTypeOfGroup(node Node) reflect.Type { 342 fields := node.Fields() 343 structFields := make([]reflect.StructField, len(fields)) 344 for i, field := range fields { 345 structFields[i].Name = exportedStructFieldName(field.Name()) 346 structFields[i].Type = field.GoType() 347 // TODO: can we reconstruct a struct tag that would be valid if a value 348 // of this type were passed to SchemaOf? 349 } 350 return reflect.StructOf(structFields) 351 } 352 353 func exportedStructFieldName(name string) string { 354 firstRune, size := utf8.DecodeRuneInString(name) 355 return string([]rune{unicode.ToUpper(firstRune)}) + name[size:] 356 } 357 358 func isList(node Node) bool { 359 logicalType := node.Type().LogicalType() 360 return logicalType != nil && logicalType.List != nil 361 } 362 363 func isMap(node Node) bool { 364 logicalType := node.Type().LogicalType() 365 return logicalType != nil && logicalType.Map != nil 366 } 367 368 func numLeafColumnsOf(node Node) int16 { 369 return makeColumnIndex(numLeafColumns(node, 0)) 370 } 371 372 func numLeafColumns(node Node, columnIndex int) int { 373 if node.Leaf() { 374 return columnIndex + 1 375 } 376 for _, field := range node.Fields() { 377 columnIndex = numLeafColumns(field, columnIndex) 378 } 379 return columnIndex 380 } 381 382 func listElementOf(node Node) Node { 383 if !node.Leaf() { 384 if list := childByName(node, "list"); list != nil { 385 if elem := childByName(list, "element"); elem != nil { 386 return elem 387 } 388 } 389 } 390 panic("node with logical type LIST is not composed of a repeated .list.element") 391 } 392 393 func mapKeyValueOf(node Node) Node { 394 if !node.Leaf() && (node.Required() || node.Optional()) { 395 if keyValue := childByName(node, "key_value"); keyValue != nil && !keyValue.Leaf() && keyValue.Repeated() { 396 k := childByName(keyValue, "key") 397 v := childByName(keyValue, "value") 398 if k != nil && v != nil && k.Required() { 399 return keyValue 400 } 401 } 402 } 403 panic("node with logical type MAP is not composed of a repeated .key_value group with key and value fields") 404 } 405 406 func encodingOf(node Node) encoding.Encoding { 407 encoding := node.Encoding() 408 // The parquet-format documentation states that the 409 // DELTA_LENGTH_BYTE_ARRAY is always preferred to PLAIN when 410 // encoding BYTE_ARRAY values. We apply it as a default if 411 // none were explicitly specified, which gives the application 412 // the opportunity to override this behavior if needed. 413 // 414 // https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-length-byte-array-delta_length_byte_array--6 415 if node.Type().Kind() == ByteArray && encoding == nil { 416 encoding = &DeltaLengthByteArray 417 } 418 if encoding == nil { 419 encoding = &Plain 420 } 421 return encoding 422 } 423 424 func forEachNodeOf(name string, node Node, do func(string, Node)) { 425 do(name, node) 426 427 for _, f := range node.Fields() { 428 forEachNodeOf(f.Name(), f, do) 429 } 430 } 431 432 func childByName(node Node, name string) Node { 433 for _, f := range node.Fields() { 434 if f.Name() == name { 435 return f 436 } 437 } 438 return nil 439 } 440 441 func nodesAreEqual(node1, node2 Node) bool { 442 if node1.Leaf() { 443 return node2.Leaf() && leafNodesAreEqual(node1, node2) 444 } else { 445 return !node2.Leaf() && groupNodesAreEqual(node1, node2) 446 } 447 } 448 449 func typesAreEqual(node1, node2 Node) bool { 450 return node1.Type().Kind() == node2.Type().Kind() 451 } 452 453 func repetitionsAreEqual(node1, node2 Node) bool { 454 return node1.Optional() == node2.Optional() && node1.Repeated() == node2.Repeated() 455 } 456 457 func leafNodesAreEqual(node1, node2 Node) bool { 458 return typesAreEqual(node1, node2) && repetitionsAreEqual(node1, node2) 459 } 460 461 func groupNodesAreEqual(node1, node2 Node) bool { 462 fields1 := node1.Fields() 463 fields2 := node2.Fields() 464 465 if len(fields1) != len(fields2) { 466 return false 467 } 468 469 for i := range fields1 { 470 f1 := fields1[i] 471 f2 := fields2[i] 472 473 if f1.Name() != f2.Name() { 474 return false 475 } 476 477 if !nodesAreEqual(f1, f2) { 478 return false 479 } 480 } 481 482 return true 483 }