storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/schema/tree.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2019 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package schema
    18  
    19  import (
    20  	"fmt"
    21  	"strings"
    22  
    23  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    24  )
    25  
    26  func updateMaxDLRL(schemaMap map[string]*Element, maxDL, maxRL int64) {
    27  	for _, element := range schemaMap {
    28  		element.MaxDefinitionLevel = maxDL
    29  		element.MaxRepetitionLevel = maxRL
    30  		if *element.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
    31  			element.MaxDefinitionLevel++
    32  			if *element.RepetitionType == parquet.FieldRepetitionType_REPEATED {
    33  				element.MaxRepetitionLevel++
    34  			}
    35  		}
    36  
    37  		if element.Children != nil {
    38  			updateMaxDLRL(element.Children.schemaMap, element.MaxDefinitionLevel, element.MaxRepetitionLevel)
    39  		}
    40  	}
    41  }
    42  
    43  func toParquetSchema(tree *Tree, treePrefix string, schemaPrefix string, schemaList *[]*parquet.SchemaElement, valueElements *[]*Element) (err error) {
    44  	tree.Range(func(name string, element *Element) bool {
    45  		pathInTree := name
    46  		if treePrefix != "" {
    47  			pathInTree = treePrefix + "." + name
    48  		}
    49  
    50  		if element.Type == nil && element.ConvertedType == nil && element.Children == nil {
    51  			err = fmt.Errorf("%v: group element must have children", pathInTree)
    52  			return false
    53  		}
    54  
    55  		if element.ConvertedType != nil {
    56  			switch *element.ConvertedType {
    57  			case parquet.ConvertedType_LIST:
    58  				// Supported structure.
    59  				// <REQUIRED|OPTIONAL> group <name> (LIST) {
    60  				//   REPEATED group list {
    61  				//     <REQUIRED|OPTIONAL> <element-type> element;
    62  				//   }
    63  				// }
    64  
    65  				if element.Type != nil {
    66  					err = fmt.Errorf("%v: type must be nil for LIST ConvertedType", pathInTree)
    67  					return false
    68  				}
    69  
    70  				if element.Children == nil || element.Children.Length() != 1 {
    71  					err = fmt.Errorf("%v: children must have one element only for LIST ConvertedType", pathInTree)
    72  					return false
    73  				}
    74  
    75  				listElement, ok := element.Children.Get("list")
    76  				if !ok {
    77  					err = fmt.Errorf("%v: missing group element 'list' for LIST ConvertedType", pathInTree)
    78  					return false
    79  				}
    80  
    81  				if listElement.Name != "list" {
    82  					err = fmt.Errorf("%v.list: name must be 'list'", pathInTree)
    83  					return false
    84  				}
    85  
    86  				if *listElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
    87  					err = fmt.Errorf("%v.list: repetition type must be REPEATED type", pathInTree)
    88  					return false
    89  				}
    90  
    91  				if listElement.Type != nil || listElement.ConvertedType != nil {
    92  					err = fmt.Errorf("%v.list: type and converted type must be nil", pathInTree)
    93  					return false
    94  				}
    95  
    96  				if listElement.Children == nil || listElement.Children.Length() != 1 {
    97  					err = fmt.Errorf("%v.list.element: not found", pathInTree)
    98  					return false
    99  				}
   100  
   101  				valueElement, ok := listElement.Children.Get("element")
   102  				if !ok {
   103  					err = fmt.Errorf("%v.list.element: not found", pathInTree)
   104  					return false
   105  				}
   106  
   107  				if valueElement.Name != "element" {
   108  					err = fmt.Errorf("%v.list.element: name must be 'element'", pathInTree)
   109  					return false
   110  				}
   111  
   112  			case parquet.ConvertedType_MAP:
   113  				// Supported structure:
   114  				// <REQUIRED|OPTIONAL> group <name> (MAP) {
   115  				//   REPEATED group key_value {
   116  				//     REQUIRED <key-type> key;
   117  				//     <REQUIRED|OPTIONAL> <value-type> value;
   118  				//   }
   119  				// }
   120  
   121  				if element.Type != nil {
   122  					err = fmt.Errorf("%v: type must be nil for MAP ConvertedType", pathInTree)
   123  					return false
   124  				}
   125  
   126  				if element.Children == nil || element.Children.Length() != 1 {
   127  					err = fmt.Errorf("%v: children must have one element only for MAP ConvertedType", pathInTree)
   128  					return false
   129  				}
   130  
   131  				keyValueElement, ok := element.Children.Get("key_value")
   132  				if !ok {
   133  					err = fmt.Errorf("%v: missing group element 'key_value' for MAP ConvertedType", pathInTree)
   134  					return false
   135  				}
   136  
   137  				if keyValueElement.Name != "key_value" {
   138  					err = fmt.Errorf("%v.key_value: name must be 'key_value'", pathInTree)
   139  					return false
   140  				}
   141  
   142  				if *keyValueElement.RepetitionType != parquet.FieldRepetitionType_REPEATED {
   143  					err = fmt.Errorf("%v.key_value: repetition type must be REPEATED type", pathInTree)
   144  					return false
   145  				}
   146  
   147  				if keyValueElement.Children == nil || keyValueElement.Children.Length() < 1 || keyValueElement.Children.Length() > 2 {
   148  					err = fmt.Errorf("%v.key_value: children must have 'key' and optionally 'value' elements for MAP ConvertedType", pathInTree)
   149  					return false
   150  				}
   151  
   152  				keyElement, ok := keyValueElement.Children.Get("key")
   153  				if !ok {
   154  					err = fmt.Errorf("%v.key_value: missing 'key' element for MAP ConvertedType", pathInTree)
   155  					return false
   156  				}
   157  
   158  				if keyElement.Name != "key" {
   159  					err = fmt.Errorf("%v.key_value.key: name must be 'key'", pathInTree)
   160  					return false
   161  				}
   162  
   163  				if *keyElement.RepetitionType != parquet.FieldRepetitionType_REQUIRED {
   164  					err = fmt.Errorf("%v.key_value: repetition type must be REQUIRED type", pathInTree)
   165  					return false
   166  				}
   167  
   168  				if keyValueElement.Children.Length() == 2 {
   169  					valueElement, ok := keyValueElement.Children.Get("value")
   170  					if !ok {
   171  						err = fmt.Errorf("%v.key_value: second element must be 'value' element for MAP ConvertedType", pathInTree)
   172  						return false
   173  					}
   174  
   175  					if valueElement.Name != "value" {
   176  						err = fmt.Errorf("%v.key_value.value: name must be 'value'", pathInTree)
   177  						return false
   178  					}
   179  				}
   180  
   181  			case parquet.ConvertedType_UTF8, parquet.ConvertedType_UINT_8, parquet.ConvertedType_UINT_16:
   182  				fallthrough
   183  			case parquet.ConvertedType_UINT_32, parquet.ConvertedType_UINT_64, parquet.ConvertedType_INT_8:
   184  				fallthrough
   185  			case parquet.ConvertedType_INT_16, parquet.ConvertedType_INT_32, parquet.ConvertedType_INT_64:
   186  				if element.Type == nil {
   187  					err = fmt.Errorf("%v: ConvertedType %v must have Type value", pathInTree, element.ConvertedType)
   188  					return false
   189  				}
   190  
   191  			default:
   192  				err = fmt.Errorf("%v: unsupported ConvertedType %v", pathInTree, element.ConvertedType)
   193  				return false
   194  			}
   195  		}
   196  
   197  		element.PathInTree = pathInTree
   198  		element.PathInSchema = element.Name
   199  		if schemaPrefix != "" {
   200  			element.PathInSchema = schemaPrefix + "." + element.Name
   201  		}
   202  
   203  		if element.Type != nil {
   204  			*valueElements = append(*valueElements, element)
   205  		}
   206  
   207  		*schemaList = append(*schemaList, &element.SchemaElement)
   208  		if element.Children != nil {
   209  			element.numChildren = int32(element.Children.Length())
   210  			err = toParquetSchema(element.Children, element.PathInTree, element.PathInSchema, schemaList, valueElements)
   211  		}
   212  
   213  		return (err == nil)
   214  	})
   215  
   216  	return err
   217  }
   218  
// Tree - represents tree of schema.  Tree preserves order in which elements are added.
//
// Each Tree holds one level of the schema; nested levels hang off Element.Children.
type Tree struct {
	schemaMap map[string]*Element // elements of this level, keyed by name
	keys      []string            // names in insertion order; Range iterates in this order
	readOnly  bool                // set by a successful ToParquetSchema; Set and Delete refuse to modify a read only tree
}
   225  
   226  // String - stringify this tree.
   227  func (tree *Tree) String() string {
   228  	var s []string
   229  	tree.Range(func(name string, element *Element) bool {
   230  		s = append(s, fmt.Sprintf("%v: %v", name, element))
   231  		return true
   232  	})
   233  
   234  	return "{" + strings.Join(s, ", ") + "}"
   235  }
   236  
   237  // Length - returns length of tree.
   238  func (tree *Tree) Length() int {
   239  	return len(tree.keys)
   240  }
   241  
   242  func (tree *Tree) travel(pathSegments []string) (pathSegmentIndex int, pathSegment string, currElement *Element, parentTree *Tree, found bool) {
   243  	parentTree = tree
   244  	for pathSegmentIndex, pathSegment = range pathSegments {
   245  		if tree == nil {
   246  			found = false
   247  			break
   248  		}
   249  
   250  		var tmpCurrElement *Element
   251  		if tmpCurrElement, found = tree.schemaMap[pathSegment]; !found {
   252  			break
   253  		}
   254  		currElement = tmpCurrElement
   255  
   256  		parentTree = tree
   257  		tree = currElement.Children
   258  	}
   259  
   260  	return
   261  }
   262  
   263  // ReadOnly - returns whether this tree is read only or not.
   264  func (tree *Tree) ReadOnly() bool {
   265  	return tree.readOnly
   266  }
   267  
   268  // Get - returns the element stored for name.
   269  func (tree *Tree) Get(name string) (element *Element, ok bool) {
   270  	pathSegments := strings.Split(name, ".")
   271  	for _, pathSegment := range pathSegments {
   272  		if tree == nil {
   273  			element = nil
   274  			ok = false
   275  			break
   276  		}
   277  
   278  		if element, ok = tree.schemaMap[pathSegment]; !ok {
   279  			break
   280  		}
   281  
   282  		tree = element.Children
   283  	}
   284  
   285  	return element, ok
   286  }
   287  
   288  // Set - adds or sets element to name.
   289  func (tree *Tree) Set(name string, element *Element) error {
   290  	if tree.readOnly {
   291  		return fmt.Errorf("read only tree")
   292  	}
   293  
   294  	pathSegments := strings.Split(name, ".")
   295  	if err := validataPathSegments(pathSegments); err != nil {
   296  		return err
   297  	}
   298  
   299  	i, pathSegment, currElement, parentTree, found := tree.travel(pathSegments)
   300  
   301  	if !found {
   302  		if i != len(pathSegments)-1 {
   303  			return fmt.Errorf("parent %v does not exist", strings.Join(pathSegments[:i+1], "."))
   304  		}
   305  
   306  		if currElement == nil {
   307  			parentTree = tree
   308  		} else {
   309  			if currElement.Type != nil {
   310  				return fmt.Errorf("parent %v is not group element", strings.Join(pathSegments[:i], "."))
   311  			}
   312  
   313  			if currElement.Children == nil {
   314  				currElement.Children = NewTree()
   315  			}
   316  			parentTree = currElement.Children
   317  		}
   318  
   319  		parentTree.keys = append(parentTree.keys, pathSegment)
   320  	}
   321  
   322  	parentTree.schemaMap[pathSegment] = element
   323  	return nil
   324  }
   325  
   326  // Delete - deletes name and its element.
   327  func (tree *Tree) Delete(name string) {
   328  	if tree.readOnly {
   329  		panic(fmt.Errorf("read only tree"))
   330  	}
   331  
   332  	pathSegments := strings.Split(name, ".")
   333  
   334  	_, pathSegment, _, parentTree, found := tree.travel(pathSegments)
   335  
   336  	if found {
   337  		for i := range parentTree.keys {
   338  			if parentTree.keys[i] == pathSegment {
   339  				copy(parentTree.keys[i:], parentTree.keys[i+1:])
   340  				parentTree.keys = parentTree.keys[:len(parentTree.keys)-1]
   341  				break
   342  			}
   343  		}
   344  
   345  		delete(parentTree.schemaMap, pathSegment)
   346  	}
   347  }
   348  
   349  // Range - calls f sequentially for each name and its element. If f returns false, range stops the iteration.
   350  func (tree *Tree) Range(f func(name string, element *Element) bool) {
   351  	for _, name := range tree.keys {
   352  		if !f(name, tree.schemaMap[name]) {
   353  			break
   354  		}
   355  	}
   356  }
   357  
   358  // ToParquetSchema - returns list of parquet SchemaElement and list of elements those stores values.
   359  func (tree *Tree) ToParquetSchema() (schemaList []*parquet.SchemaElement, valueElements []*Element, err error) {
   360  	if tree.readOnly {
   361  		return nil, nil, fmt.Errorf("read only tree")
   362  	}
   363  
   364  	updateMaxDLRL(tree.schemaMap, 0, 0)
   365  
   366  	var schemaElements []*parquet.SchemaElement
   367  	if err = toParquetSchema(tree, "", "", &schemaElements, &valueElements); err != nil {
   368  		return nil, nil, err
   369  	}
   370  
   371  	tree.readOnly = true
   372  
   373  	numChildren := int32(len(tree.keys))
   374  	schemaList = append(schemaList, &parquet.SchemaElement{
   375  		Name:           "schema",
   376  		RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   377  		NumChildren:    &numChildren,
   378  	})
   379  	schemaList = append(schemaList, schemaElements...)
   380  	return schemaList, valueElements, nil
   381  }
   382  
   383  // NewTree - creates new schema tree.
   384  func NewTree() *Tree {
   385  	return &Tree{
   386  		schemaMap: make(map[string]*Element),
   387  	}
   388  }