github.com/fraugster/parquet-go@v0.12.0/parquetschema/schema_def.go (about)

     1  package parquetschema
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"io"
     7  
     8  	"github.com/fraugster/parquet-go/parquet"
     9  )
    10  
// SchemaDefinition represents a valid textual schema definition.
//
// RootColumn is the column at the root of the definition. String renders
// RootColumn.SchemaElement.Name as the message name and RootColumn.Children
// as the message body. A nil RootColumn is treated as "no schema" by the
// String and SchemaElement methods.
type SchemaDefinition struct {
	RootColumn *ColumnDefinition
}
    15  
// ColumnDefinition represents the schema definition of a column and optionally its children.
//
// Children holds the nested column definitions. When the schema is rendered
// as text they are only printed for columns whose SchemaElement.Type is nil,
// i.e. columns that are emitted as a "group".
//
// SchemaElement is the parquet schema element describing the column itself
// (name, repetition type, physical type, annotations, field ID). The printing
// code in this file dereferences it without a nil check.
type ColumnDefinition struct {
	Children      []*ColumnDefinition
	SchemaElement *parquet.SchemaElement
}
    21  
    22  // SchemaDefinitionFromColumnDefinition creates a new schema definition from the provided root column definition.
    23  func SchemaDefinitionFromColumnDefinition(c *ColumnDefinition) *SchemaDefinition {
    24  	if c == nil {
    25  		return nil
    26  	}
    27  
    28  	return &SchemaDefinition{RootColumn: c}
    29  }
    30  
    31  // ParseSchemaDefinition parses a textual schema definition and returns
    32  // a SchemaDefinition object, or an error if parsing has failed. The textual schema definition
    33  // needs to adhere to the following grammar:
    34  //
    35  //	message ::= 'message' <identifier> '{' <message-body> '}'
    36  //	message-body ::= <column-definition>*
    37  //	column-definition ::= <repetition-type> <column-type-definition>
    38  //	repetition-type ::= 'required' | 'repeated' | 'optional'
    39  //	column-type-definition ::= <group-definition> | <field-definition>
    40  //	group-definition ::= 'group' <identifier> <converted-type-annotation>? '{' <message-body> '}'
    41  //	field-definition ::= <type> <identifier> <logical-type-annotation>? <field-id-definition>? ';'
    42  //	type ::= 'binary'
    43  //		| 'float'
    44  //		| 'double'
    45  //		| 'boolean'
    46  //		| 'int32'
    47  //		| 'int64'
    48  //		| 'int96'
    49  //		| 'fixed_len_byte_array' '(' <number> ')'
    50  //	converted-type-annotation ::= '(' <converted-type> ')'
    51  //	converted-type ::= 'UTF8'
    52  //		| 'MAP'
    53  //		| 'MAP_KEY_VALUE'
    54  //		| 'LIST'
    55  //		| 'ENUM'
    56  //		| 'DECIMAL'
    57  //		| 'DATE'
    58  //		| 'TIME_MILLIS'
    59  //		| 'TIME_MICROS'
    60  //		| 'TIMESTAMP_MILLIS'
    61  //		| 'TIMESTAMP_MICROS'
    62  //		| 'UINT_8'
    63  //		| 'UINT_16'
    64  //		| 'UINT_32'
    65  //		| 'UINT_64'
    66  //		| 'INT_8'
    67  //		| 'INT_16'
    68  //		| 'INT_32'
    69  //		| 'INT_64'
    70  //		| 'JSON'
    71  //		| 'BSON'
    72  //		| 'INTERVAL'
    73  //	logical-type-annotation ::= '(' <logical-type> ')'
    74  //	logical-type ::= 'STRING'
    75  //		| 'DATE'
    76  //		| 'TIMESTAMP' '(' <time-unit> ',' <boolean> ')'
    77  //		| 'UUID'
    78  //		| 'ENUM'
    79  //		| 'JSON'
    80  //		| 'BSON'
    81  //		| 'INT' '(' <bit-width> ',' <boolean> ')'
    82  //		| 'DECIMAL' '(' <precision> ',' <scale> ')'
    83  //	field-id-definition ::= '=' <number>
    84  //	number ::= <digit>+
    85  //	digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
    86  //	time-unit ::= 'MILLIS' | 'MICROS' | 'NANOS'
    87  //	boolean ::= 'false' | 'true'
    88  //	identifier ::= <all-characters> - ' ' - ';' - '{' - '}' - '(' - ')' - '=' - ','
    89  //	bit-width ::= '8' | '16' | '32' | '64'
    90  //	precision ::= <number>
    91  //	scale ::= <number>
    92  //	all-characters ::= ? all visible characters ?
    93  // For examples of textual schema definitions, please take a look at schema-files/*.schema.
    94  func ParseSchemaDefinition(schemaText string) (*SchemaDefinition, error) {
    95  	p := newSchemaParser(schemaText)
    96  	if err := p.parse(); err != nil {
    97  		return nil, err
    98  	}
    99  
   100  	return &SchemaDefinition{
   101  		RootColumn: p.root,
   102  	}, nil
   103  }
   104  
   105  // Clone returns a deep copy of the schema definition.
   106  func (sd *SchemaDefinition) Clone() *SchemaDefinition {
   107  	def, err := ParseSchemaDefinition(sd.String())
   108  	if err != nil {
   109  		panic(err) // this should never ever happen and indicates a serious bug.
   110  	}
   111  	return def
   112  }
   113  
   114  // String returns a textual representation of the schema definition. This textual representation
   115  // adheres to the format accepted by the ParseSchemaDefinition function. A textual schema definition
   116  // parsed by ParseSchemaDefinition and turned back into a string by this method repeatedly will
   117  // always remain the same, save for differences in the emitted whitespaces.
   118  func (sd *SchemaDefinition) String() string {
   119  	if sd == nil || sd.RootColumn == nil {
   120  		return "message empty {\n}\n"
   121  	}
   122  
   123  	buf := new(bytes.Buffer)
   124  
   125  	fmt.Fprintf(buf, "message %s {\n", sd.RootColumn.SchemaElement.Name)
   126  
   127  	printCols(buf, sd.RootColumn.Children, 2)
   128  
   129  	fmt.Fprintf(buf, "}\n")
   130  
   131  	return buf.String()
   132  }
   133  
   134  // SubSchema returns the direct child of the current schema definition
   135  // that matches the provided name. If no such child exists, nil is
   136  // returned.
   137  func (sd *SchemaDefinition) SubSchema(name string) *SchemaDefinition {
   138  	if sd == nil {
   139  		return nil
   140  	}
   141  
   142  	for _, c := range sd.RootColumn.Children {
   143  		if c.SchemaElement.Name == name {
   144  			return &SchemaDefinition{
   145  				RootColumn: c,
   146  			}
   147  		}
   148  	}
   149  	return nil
   150  }
   151  
   152  // SchemaElement returns the schema element associated with the current
   153  // schema definition. If no schema element is present, then nil is returned.
   154  func (sd *SchemaDefinition) SchemaElement() *parquet.SchemaElement {
   155  	if sd == nil || sd.RootColumn == nil {
   156  		return nil
   157  	}
   158  
   159  	return sd.RootColumn.SchemaElement
   160  }
   161  
   162  func printCols(w io.Writer, cols []*ColumnDefinition, indent int) {
   163  	for _, col := range cols {
   164  		printIndent(w, indent)
   165  
   166  		elem := col.SchemaElement
   167  
   168  		switch elem.GetRepetitionType() {
   169  		case parquet.FieldRepetitionType_REPEATED:
   170  			fmt.Fprintf(w, "repeated")
   171  		case parquet.FieldRepetitionType_OPTIONAL:
   172  			fmt.Fprintf(w, "optional")
   173  		case parquet.FieldRepetitionType_REQUIRED:
   174  			fmt.Fprintf(w, "required")
   175  		}
   176  		fmt.Fprintf(w, " ")
   177  
   178  		if elem.Type == nil {
   179  			fmt.Fprintf(w, "group %s", elem.GetName())
   180  			if elem.ConvertedType != nil {
   181  				fmt.Fprintf(w, " (%s)", elem.GetConvertedType().String())
   182  			}
   183  			fmt.Fprintf(w, " {\n")
   184  			printCols(w, col.Children, indent+2)
   185  
   186  			printIndent(w, indent)
   187  			fmt.Fprintf(w, "}\n")
   188  		} else {
   189  			typ := getSchemaType(elem)
   190  			fmt.Fprintf(w, "%s %s", typ, elem.GetName())
   191  			if elem.LogicalType != nil {
   192  				fmt.Fprintf(w, " (%s)", getSchemaLogicalType(elem.GetLogicalType()))
   193  			} else if elem.ConvertedType != nil {
   194  				fmt.Fprintf(w, " (%s)", elem.GetConvertedType().String())
   195  			}
   196  			if elem.FieldID != nil {
   197  				fmt.Fprintf(w, " = %d", elem.GetFieldID())
   198  			}
   199  			fmt.Fprintf(w, ";\n")
   200  		}
   201  	}
   202  }
   203  
   204  func printIndent(w io.Writer, indent int) {
   205  	for i := 0; i < indent; i++ {
   206  		fmt.Fprintf(w, " ")
   207  	}
   208  }
   209  
   210  func getSchemaType(elem *parquet.SchemaElement) string {
   211  	switch elem.GetType() {
   212  	case parquet.Type_BYTE_ARRAY:
   213  		return "binary"
   214  	case parquet.Type_FLOAT:
   215  		return "float"
   216  	case parquet.Type_DOUBLE:
   217  		return "double"
   218  	case parquet.Type_BOOLEAN:
   219  		return "boolean"
   220  	case parquet.Type_INT32:
   221  		return "int32"
   222  	case parquet.Type_INT64:
   223  		return "int64"
   224  	case parquet.Type_INT96:
   225  		return "int96"
   226  	case parquet.Type_FIXED_LEN_BYTE_ARRAY:
   227  		return fmt.Sprintf("fixed_len_byte_array(%d)", elem.GetTypeLength())
   228  	}
   229  	return fmt.Sprintf("UT:%s", elem.GetType())
   230  }
   231  
   232  func getTimestampLogicalType(t *parquet.LogicalType) string {
   233  	unit := ""
   234  	switch {
   235  	case t.TIMESTAMP.Unit.IsSetNANOS():
   236  		unit = "NANOS"
   237  	case t.TIMESTAMP.Unit.IsSetMICROS():
   238  		unit = "MICROS"
   239  	case t.TIMESTAMP.Unit.IsSetMILLIS():
   240  		unit = "MILLIS"
   241  	default:
   242  		unit = "BUG_UNKNOWN_TIMESTAMP_UNIT"
   243  	}
   244  	return fmt.Sprintf("TIMESTAMP(%s, %t)", unit, t.TIMESTAMP.IsAdjustedToUTC)
   245  }
   246  
   247  func getTimeLogicalType(t *parquet.LogicalType) string {
   248  	unit := ""
   249  	switch {
   250  	case t.TIME.Unit.IsSetNANOS():
   251  		unit = "NANOS"
   252  	case t.TIME.Unit.IsSetMICROS():
   253  		unit = "MICROS"
   254  	case t.TIME.Unit.IsSetMILLIS():
   255  		unit = "MILLIS"
   256  	default:
   257  		unit = "BUG_UNKNOWN_TIMESTAMP_UNIT"
   258  	}
   259  	return fmt.Sprintf("TIME(%s, %t)", unit, t.TIME.IsAdjustedToUTC)
   260  }
   261  
   262  func getSchemaLogicalType(t *parquet.LogicalType) string {
   263  	switch {
   264  	case t.IsSetSTRING():
   265  		return "STRING"
   266  	case t.IsSetDATE():
   267  		return "DATE"
   268  	case t.IsSetTIMESTAMP():
   269  		return getTimestampLogicalType(t)
   270  	case t.IsSetTIME():
   271  		return getTimeLogicalType(t)
   272  	case t.IsSetUUID():
   273  		return "UUID"
   274  	case t.IsSetENUM():
   275  		return "ENUM"
   276  	case t.IsSetJSON():
   277  		return "JSON"
   278  	case t.IsSetBSON():
   279  		return "BSON"
   280  	case t.IsSetDECIMAL():
   281  		return fmt.Sprintf("DECIMAL(%d, %d)", t.DECIMAL.Precision, t.DECIMAL.Scale)
   282  	case t.IsSetINTEGER():
   283  		return fmt.Sprintf("INT(%d, %t)", t.INTEGER.BitWidth, t.INTEGER.IsSigned)
   284  	default:
   285  		return "BUG(UNKNOWN)"
   286  	}
   287  }