storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/schema/tree.go (about) 1 /* 2 * Minio Cloud Storage, (C) 2019 Minio, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package schema 18 19 import ( 20 "fmt" 21 "strings" 22 23 "storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" 24 ) 25 26 func updateMaxDLRL(schemaMap map[string]*Element, maxDL, maxRL int64) { 27 for _, element := range schemaMap { 28 element.MaxDefinitionLevel = maxDL 29 element.MaxRepetitionLevel = maxRL 30 if *element.RepetitionType != parquet.FieldRepetitionType_REQUIRED { 31 element.MaxDefinitionLevel++ 32 if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED { 33 element.MaxRepetitionLevel++ 34 } 35 } 36 37 if element.Children != nil { 38 updateMaxDLRL(element.Children.schemaMap, element.MaxDefinitionLevel, element.MaxRepetitionLevel) 39 } 40 } 41 } 42 43 func toParquetSchema(tree *Tree, treePrefix string, schemaPrefix string, schemaList *[]*parquet.SchemaElement, valueElements *[]*Element) (err error) { 44 tree.Range(func(name string, element *Element) bool { 45 pathInTree := name 46 if treePrefix != "" { 47 pathInTree = treePrefix + "." + name 48 } 49 50 if element.Type == nil && element.ConvertedType == nil && element.Children == nil { 51 err = fmt.Errorf("%v: group element must have children", pathInTree) 52 return false 53 } 54 55 if element.ConvertedType != nil { 56 switch *element.ConvertedType { 57 case parquet.ConvertedType_LIST: 58 // Supported structure. 59 // <REQUIRED|OPTIONAL> group <name> (LIST) { 60 // REPEATED group list { 61 // <REQUIRED|OPTIONAL> <element-type> element; 62 // } 63 // } 64 65 if element.Type != nil { 66 err = fmt.Errorf("%v: type must be nil for LIST ConvertedType", pathInTree) 67 return false 68 } 69 70 if element.Children == nil || element.Children.Length() != 1 { 71 err = fmt.Errorf("%v: children must have one element only for LIST ConvertedType", pathInTree) 72 return false 73 } 74 75 listElement, ok := element.Children.Get("list") 76 if !ok { 77 err = fmt.Errorf("%v: missing group element 'list' for LIST ConvertedType", pathInTree) 78 return false 79 } 80 81 if listElement.Name != "list" { 82 err = fmt.Errorf("%v.list: name must be 'list'", pathInTree) 83 return false 84 } 85 86 if *listElement.RepetitionType != parquet.FieldRepetitionType_REPEATED { 87 err = fmt.Errorf("%v.list: repetition type must be REPEATED type", pathInTree) 88 return false 89 } 90 91 if listElement.Type != nil || listElement.ConvertedType != nil { 92 err = fmt.Errorf("%v.list: type and converted type must be nil", pathInTree) 93 return false 94 } 95 96 if listElement.Children == nil || listElement.Children.Length() != 1 { 97 err = fmt.Errorf("%v.list.element: not found", pathInTree) 98 return false 99 } 100 101 valueElement, ok := listElement.Children.Get("element") 102 if !ok { 103 err = fmt.Errorf("%v.list.element: not found", pathInTree) 104 return false 105 } 106 107 if valueElement.Name != "element" { 108 err = fmt.Errorf("%v.list.element: name must be 'element'", pathInTree) 109 return false 110 } 111 112 case parquet.ConvertedType_MAP: 113 // Supported structure: 114 // <REQUIRED|OPTIONAL> group <name> (MAP) { 115 // REPEATED group key_value { 116 // REQUIRED <key-type> key; 117 // <REQUIRED|OPTIONAL> <value-type> value; 118 // } 119 // } 120 121 if element.Type != nil { 122 err = fmt.Errorf("%v: type must be nil for MAP ConvertedType", pathInTree) 123 return false 124 } 125 126 if element.Children == nil || element.Children.Length() != 1 { 127 err = fmt.Errorf("%v: children must have one element only for MAP ConvertedType", pathInTree) 128 return false 129 } 130 131 keyValueElement, ok := element.Children.Get("key_value") 132 if !ok { 133 err = fmt.Errorf("%v: missing group element 'key_value' for MAP ConvertedType", pathInTree) 134 return false 135 } 136 137 if keyValueElement.Name != "key_value" { 138 err = fmt.Errorf("%v.key_value: name must be 'key_value'", pathInTree) 139 return false 140 } 141 142 if *keyValueElement.RepetitionType != parquet.FieldRepetitionType_REPEATED { 143 err = fmt.Errorf("%v.key_value: repetition type must be REPEATED type", pathInTree) 144 return false 145 } 146 147 if keyValueElement.Children == nil || keyValueElement.Children.Length() < 1 || keyValueElement.Children.Length() > 2 { 148 err = fmt.Errorf("%v.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType", pathInTree) 149 return false 150 } 151 152 keyElement, ok := keyValueElement.Children.Get("key") 153 if !ok { 154 err = fmt.Errorf("%v.key_value: missing 'key' element for MAP ConvertedType", pathInTree) 155 return false 156 } 157 158 if keyElement.Name != "key" { 159 err = fmt.Errorf("%v.key_value.key: name must be 'key'", pathInTree) 160 return false 161 } 162 163 if *keyElement.RepetitionType != parquet.FieldRepetitionType_REQUIRED { 164 err = fmt.Errorf("%v.key_value: repetition type must be REQUIRED type", pathInTree) 165 return false 166 } 167 168 if keyValueElement.Children.Length() == 2 { 169 valueElement, ok := keyValueElement.Children.Get("value") 170 if !ok { 171 err = fmt.Errorf("%v.key_value: second element must be 'value' element for MAP ConvertedType", pathInTree) 172 return false 173 } 174 175 if valueElement.Name != "value" { 176 err = fmt.Errorf("%v.key_value.value: name must be 'value'", pathInTree) 177 return false 178 } 179 } 180 181 case parquet.ConvertedType_UTF8, parquet.ConvertedType_UINT_8, parquet.ConvertedType_UINT_16: 182 fallthrough 183 case parquet.ConvertedType_UINT_32, parquet.ConvertedType_UINT_64, parquet.ConvertedType_INT_8: 184 fallthrough 185 case parquet.ConvertedType_INT_16, parquet.ConvertedType_INT_32, parquet.ConvertedType_INT_64: 186 if element.Type == nil { 187 err = fmt.Errorf("%v: ConvertedType %v must have Type value", pathInTree, element.ConvertedType) 188 return false 189 } 190 191 default: 192 err = fmt.Errorf("%v: unsupported ConvertedType %v", pathInTree, element.ConvertedType) 193 return false 194 } 195 } 196 197 element.PathInTree = pathInTree 198 element.PathInSchema = element.Name 199 if schemaPrefix != "" { 200 element.PathInSchema = schemaPrefix + "." + element.Name 201 } 202 203 if element.Type != nil { 204 *valueElements = append(*valueElements, element) 205 } 206 207 *schemaList = append(*schemaList, &element.SchemaElement) 208 if element.Children != nil { 209 element.numChildren = int32(element.Children.Length()) 210 err = toParquetSchema(element.Children, element.PathInTree, element.PathInSchema, schemaList, valueElements) 211 } 212 213 return (err == nil) 214 }) 215 216 return err 217 } 218 219 // Tree - represents tree of schema. Tree preserves order in which elements are added. 220 type Tree struct { 221 schemaMap map[string]*Element 222 keys []string 223 readOnly bool 224 } 225 226 // String - stringify this tree. 227 func (tree *Tree) String() string { 228 var s []string 229 tree.Range(func(name string, element *Element) bool { 230 s = append(s, fmt.Sprintf("%v: %v", name, element)) 231 return true 232 }) 233 234 return "{" + strings.Join(s, ", ") + "}" 235 } 236 237 // Length - returns length of tree. 238 func (tree *Tree) Length() int { 239 return len(tree.keys) 240 } 241 242 func (tree *Tree) travel(pathSegments []string) (pathSegmentIndex int, pathSegment string, currElement *Element, parentTree *Tree, found bool) { 243 parentTree = tree 244 for pathSegmentIndex, pathSegment = range pathSegments { 245 if tree == nil { 246 found = false 247 break 248 } 249 250 var tmpCurrElement *Element 251 if tmpCurrElement, found = tree.schemaMap[pathSegment]; !found { 252 break 253 } 254 currElement = tmpCurrElement 255 256 parentTree = tree 257 tree = currElement.Children 258 } 259 260 return 261 } 262 263 // ReadOnly - returns whether this tree is read only or not. 264 func (tree *Tree) ReadOnly() bool { 265 return tree.readOnly 266 } 267 268 // Get - returns the element stored for name. 269 func (tree *Tree) Get(name string) (element *Element, ok bool) { 270 pathSegments := strings.Split(name, ".") 271 for _, pathSegment := range pathSegments { 272 if tree == nil { 273 element = nil 274 ok = false 275 break 276 } 277 278 if element, ok = tree.schemaMap[pathSegment]; !ok { 279 break 280 } 281 282 tree = element.Children 283 } 284 285 return element, ok 286 } 287 288 // Set - adds or sets element to name. 289 func (tree *Tree) Set(name string, element *Element) error { 290 if tree.readOnly { 291 return fmt.Errorf("read only tree") 292 } 293 294 pathSegments := strings.Split(name, ".") 295 if err := validataPathSegments(pathSegments); err != nil { 296 return err 297 } 298 299 i, pathSegment, currElement, parentTree, found := tree.travel(pathSegments) 300 301 if !found { 302 if i != len(pathSegments)-1 { 303 return fmt.Errorf("parent %v does not exist", strings.Join(pathSegments[:i+1], ".")) 304 } 305 306 if currElement == nil { 307 parentTree = tree 308 } else { 309 if currElement.Type != nil { 310 return fmt.Errorf("parent %v is not group element", strings.Join(pathSegments[:i], ".")) 311 } 312 313 if currElement.Children == nil { 314 currElement.Children = NewTree() 315 } 316 parentTree = currElement.Children 317 } 318 319 parentTree.keys = append(parentTree.keys, pathSegment) 320 } 321 322 parentTree.schemaMap[pathSegment] = element 323 return nil 324 } 325 326 // Delete - deletes name and its element. 327 func (tree *Tree) Delete(name string) { 328 if tree.readOnly { 329 panic(fmt.Errorf("read only tree")) 330 } 331 332 pathSegments := strings.Split(name, ".") 333 334 _, pathSegment, _, parentTree, found := tree.travel(pathSegments) 335 336 if found { 337 for i := range parentTree.keys { 338 if parentTree.keys[i] == pathSegment { 339 copy(parentTree.keys[i:], parentTree.keys[i+1:]) 340 parentTree.keys = parentTree.keys[:len(parentTree.keys)-1] 341 break 342 } 343 } 344 345 delete(parentTree.schemaMap, pathSegment) 346 } 347 } 348 349 // Range - calls f sequentially for each name and its element. If f returns false, range stops the iteration. 350 func (tree *Tree) Range(f func(name string, element *Element) bool) { 351 for _, name := range tree.keys { 352 if !f(name, tree.schemaMap[name]) { 353 break 354 } 355 } 356 } 357 358 // ToParquetSchema - returns list of parquet SchemaElement and list of elements those stores values. 359 func (tree *Tree) ToParquetSchema() (schemaList []*parquet.SchemaElement, valueElements []*Element, err error) { 360 if tree.readOnly { 361 return nil, nil, fmt.Errorf("read only tree") 362 } 363 364 updateMaxDLRL(tree.schemaMap, 0, 0) 365 366 var schemaElements []*parquet.SchemaElement 367 if err = toParquetSchema(tree, "", "", &schemaElements, &valueElements); err != nil { 368 return nil, nil, err 369 } 370 371 tree.readOnly = true 372 373 numChildren := int32(len(tree.keys)) 374 schemaList = append(schemaList, &parquet.SchemaElement{ 375 Name: "schema", 376 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 377 NumChildren: &numChildren, 378 }) 379 schemaList = append(schemaList, schemaElements...) 380 return schemaList, valueElements, nil 381 } 382 383 // NewTree - creates new schema tree. 384 func NewTree() *Tree { 385 return &Tree{ 386 schemaMap: make(map[string]*Element), 387 } 388 }