github.com/apache/arrow/go/v14@v14.0.1/parquet/schema/schema.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 // Package schema provides types and functions for manipulating and building parquet 18 // file schemas. 19 // 20 // Some of the utilities provided include building a schema using Struct Tags 21 // on a struct type, getting Column Paths from a node, and dealing with the 22 // converted and logical types for Parquet. 23 // 24 // Logical types specify ways to interpret the primitive types allowing the 25 // number of primitive types to be smaller and reuse efficient encodings. 26 // For instance a "string" is just a ByteArray column with a UTF-8 annotation 27 // or "String Logical Type". 28 // 29 // For more information about Logical and Converted Types, check: 30 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md 31 package schema 32 33 import ( 34 "fmt" 35 "io" 36 "strings" 37 38 "github.com/apache/arrow/go/v14/parquet" 39 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 40 "golang.org/x/xerrors" 41 ) 42 43 // Schema is the container for the converted Parquet schema with a computed 44 // information from the schema analysis needed for file reading 45 // 46 // * Column index to Node 47 // 48 // * Max repetition / definition levels for each primitive node 49 // 50 // The ColumnDescriptor objects produced by this class can be used to assist in 51 // the reconstruction of fully materialized data structures from the 52 // repetition-definition level encoding of nested data 53 type Schema struct { 54 root Node 55 56 leaves []*Column 57 nodeToLeaf map[*PrimitiveNode]int 58 leafToBase map[int]Node 59 leafToIndex strIntMultimap 60 } 61 62 // FromParquet converts a slice of thrift Schema Elements to the correct node type 63 func FromParquet(elems []*format.SchemaElement) (Node, error) { 64 if len(elems) == 0 { 65 return nil, xerrors.New("parquet: empty schema (no root)") 66 } 67 68 if elems[0].GetNumChildren() == 0 { 69 if len(elems) > 1 { 70 return nil, xerrors.New("parquet: schema had multiple nodes but root had no children") 71 } 72 // parquet file with no columns 73 return GroupNodeFromThrift(elems[0], []Node{}) 74 } 75 76 // We don't check that the root node is repeated since this is not 77 // consistently set by implementations 78 var ( 79 pos = 0 80 nextNode func() (Node, error) 81 ) 82 83 nextNode = func() (Node, error) { 84 if pos == len(elems) { 85 return nil, xerrors.New("parquet: malformed schema: not enough elements") 86 } 87 88 elem := elems[pos] 89 pos++ 90 91 if elem.GetNumChildren() == 0 { 92 return PrimitiveNodeFromThrift(elem) 93 } 94 95 fields := make([]Node, 0, elem.GetNumChildren()) 96 for i := 0; i < int(elem.GetNumChildren()); i++ { 97 n, err := nextNode() 98 if err != nil { 99 return nil, err 100 } 101 fields = append(fields, n) 102 } 103 104 return GroupNodeFromThrift(elem, fields) 105 } 106 107 return nextNode() 108 } 109 110 // Root returns the group node that is the root of this schema 111 func (s *Schema) Root() *GroupNode { 112 return s.root.(*GroupNode) 113 } 114 115 // NumColumns returns the number of leaf nodes that are the actual primitive 116 // columns in this schema. 117 func (s *Schema) NumColumns() int { 118 return len(s.leaves) 119 } 120 121 // Equals returns true as long as the leaf columns are equal, doesn't take 122 // into account the groups and only checks whether the schemas are compatible 123 // at the physical storage level. 124 func (s *Schema) Equals(rhs *Schema) bool { 125 if s.NumColumns() != rhs.NumColumns() { 126 return false 127 } 128 129 for idx, c := range s.leaves { 130 if !c.Equals(rhs.Column(idx)) { 131 return false 132 } 133 } 134 return true 135 } 136 137 func (s *Schema) buildTree(n Node, maxDefLvl, maxRepLvl int16, base Node) { 138 switch n.RepetitionType() { 139 case parquet.Repetitions.Repeated: 140 maxRepLvl++ 141 fallthrough 142 case parquet.Repetitions.Optional: 143 maxDefLvl++ 144 } 145 146 switch n := n.(type) { 147 case *GroupNode: 148 for _, f := range n.fields { 149 s.buildTree(f, maxDefLvl, maxRepLvl, base) 150 } 151 case *PrimitiveNode: 152 s.nodeToLeaf[n] = len(s.leaves) 153 s.leaves = append(s.leaves, NewColumn(n, maxDefLvl, maxRepLvl)) 154 s.leafToBase[len(s.leaves)-1] = base 155 s.leafToIndex.Add(n.Path(), len(s.leaves)-1) 156 } 157 } 158 159 // Column returns the (0-indexed) column of the provided index. 160 func (s *Schema) Column(i int) *Column { 161 return s.leaves[i] 162 } 163 164 // ColumnIndexByName looks up the column by it's full dot separated 165 // node path. If there are multiple columns that match, it returns the first one. 166 // 167 // Returns -1 if not found. 168 func (s *Schema) ColumnIndexByName(nodePath string) int { 169 if search, ok := s.leafToIndex[nodePath]; ok { 170 return search[0] 171 } 172 return -1 173 } 174 175 // ColumnIndexByNode returns the index of the column represented by this node. 176 // 177 // Returns -1 if not found. 178 func (s *Schema) ColumnIndexByNode(n Node) int { 179 if search, ok := s.leafToIndex[n.Path()]; ok { 180 for _, idx := range search { 181 if n == s.Column(idx).SchemaNode() { 182 return idx 183 } 184 } 185 } 186 return -1 187 } 188 189 // ColumnRoot returns the root node of a given column if it is under a 190 // nested group node, providing that root group node. 191 func (s *Schema) ColumnRoot(i int) Node { 192 return s.leafToBase[i] 193 } 194 195 // HasRepeatedFields returns true if any node in the schema has a repeated field type. 196 func (s *Schema) HasRepeatedFields() bool { 197 return s.root.(*GroupNode).HasRepeatedFields() 198 } 199 200 // UpdateColumnOrders must get a slice that is the same length as the number of leaf columns 201 // and is used to update the schema metadata Column Orders. len(orders) must equal s.NumColumns() 202 func (s *Schema) UpdateColumnOrders(orders []parquet.ColumnOrder) error { 203 if len(orders) != s.NumColumns() { 204 return xerrors.New("parquet: malformed schema: not enough ColumnOrder values") 205 } 206 207 visitor := schemaColumnOrderUpdater{orders, 0} 208 s.root.Visit(&visitor) 209 return nil 210 } 211 212 func (s *Schema) String() string { 213 var b strings.Builder 214 PrintSchema(s.root, &b, 2) 215 return b.String() 216 } 217 218 // NewSchema constructs a new Schema object from a root group node. 219 // 220 // Any fields with a field-id of -1 will be given an appropriate field number based on their order. 221 func NewSchema(root *GroupNode) *Schema { 222 s := &Schema{ 223 root, 224 make([]*Column, 0), 225 make(map[*PrimitiveNode]int), 226 make(map[int]Node), 227 make(strIntMultimap), 228 } 229 230 for _, f := range root.fields { 231 s.buildTree(f, 0, 0, f) 232 } 233 return s 234 } 235 236 type schemaColumnOrderUpdater struct { 237 colOrders []parquet.ColumnOrder 238 leafCount int 239 } 240 241 func (s *schemaColumnOrderUpdater) VisitPre(n Node) bool { 242 if n.Type() == Primitive { 243 leaf := n.(*PrimitiveNode) 244 leaf.ColumnOrder = s.colOrders[s.leafCount] 245 s.leafCount++ 246 } 247 return true 248 } 249 250 func (s *schemaColumnOrderUpdater) VisitPost(Node) {} 251 252 type toThriftVisitor struct { 253 elements []*format.SchemaElement 254 } 255 256 func (t *toThriftVisitor) VisitPre(n Node) bool { 257 t.elements = append(t.elements, n.toThrift()) 258 return true 259 } 260 261 func (t *toThriftVisitor) VisitPost(Node) {} 262 263 // ToThrift converts a GroupNode to a slice of SchemaElements which is used 264 // for thrift serialization. 265 func ToThrift(schema *GroupNode) []*format.SchemaElement { 266 t := &toThriftVisitor{make([]*format.SchemaElement, 0)} 267 schema.Visit(t) 268 return t.elements 269 } 270 271 type schemaPrinter struct { 272 w io.Writer 273 indent int 274 indentWidth int 275 } 276 277 func (s *schemaPrinter) VisitPre(n Node) bool { 278 fmt.Fprint(s.w, strings.Repeat(" ", s.indent)) 279 if n.Type() == Group { 280 g := n.(*GroupNode) 281 fmt.Fprintf(s.w, "%s group field_id=%d %s", g.RepetitionType(), g.FieldID(), g.Name()) 282 _, invalid := g.logicalType.(UnknownLogicalType) 283 _, none := g.logicalType.(NoLogicalType) 284 285 if g.logicalType != nil && !invalid && !none { 286 fmt.Fprintf(s.w, " (%s)", g.logicalType) 287 } else if g.convertedType != ConvertedTypes.None { 288 fmt.Fprintf(s.w, " (%s)", g.convertedType) 289 } 290 291 fmt.Fprintln(s.w, " {") 292 s.indent += s.indentWidth 293 } else { 294 p := n.(*PrimitiveNode) 295 fmt.Fprintf(s.w, "%s %s field_id=%d %s", p.RepetitionType(), strings.ToLower(p.PhysicalType().String()), p.FieldID(), p.Name()) 296 _, invalid := p.logicalType.(UnknownLogicalType) 297 _, none := p.logicalType.(NoLogicalType) 298 299 if p.logicalType != nil && !invalid && !none { 300 fmt.Fprintf(s.w, " (%s)", p.logicalType) 301 } else if p.convertedType == ConvertedTypes.Decimal { 302 fmt.Fprintf(s.w, " (%s(%d,%d))", p.convertedType, p.DecimalMetadata().Precision, p.DecimalMetadata().Scale) 303 } else if p.convertedType != ConvertedTypes.None { 304 fmt.Fprintf(s.w, " (%s)", p.convertedType) 305 } 306 fmt.Fprintln(s.w, ";") 307 } 308 return true 309 } 310 311 func (s *schemaPrinter) VisitPost(n Node) { 312 if n.Type() == Group { 313 s.indent -= s.indentWidth 314 fmt.Fprint(s.w, strings.Repeat(" ", s.indent)) 315 fmt.Fprintln(s.w, "}") 316 } 317 } 318 319 // PrintSchema writes a string representation of the tree to w using the indent 320 // width provided. 321 func PrintSchema(n Node, w io.Writer, indentWidth int) { 322 n.Visit(&schemaPrinter{w, 0, indentWidth}) 323 } 324 325 type strIntMultimap map[string][]int 326 327 func (f strIntMultimap) Add(key string, val int) bool { 328 if _, ok := f[key]; !ok { 329 f[key] = []int{val} 330 return false 331 } 332 f[key] = append(f[key], val) 333 return true 334 }