github.com/fraugster/parquet-go@v0.12.0/parquetschema/schema_def.go (about) 1 package parquetschema 2 3 import ( 4 "bytes" 5 "fmt" 6 "io" 7 8 "github.com/fraugster/parquet-go/parquet" 9 ) 10 11 // SchemaDefinition represents a valid textual schema definition. 12 type SchemaDefinition struct { 13 RootColumn *ColumnDefinition 14 } 15 16 // ColumnDefinition represents the schema definition of a column and optionally its children. 17 type ColumnDefinition struct { 18 Children []*ColumnDefinition 19 SchemaElement *parquet.SchemaElement 20 } 21 22 // SchemaDefinitionFromColumnDefinition creates a new schema definition from the provided root column definition. 23 func SchemaDefinitionFromColumnDefinition(c *ColumnDefinition) *SchemaDefinition { 24 if c == nil { 25 return nil 26 } 27 28 return &SchemaDefinition{RootColumn: c} 29 } 30 31 // ParseSchemaDefinition parses a textual schema definition and returns 32 // a SchemaDefinition object, or an error if parsing has failed. The textual schema definition 33 // needs to adhere to the following grammar: 34 // 35 // message ::= 'message' <identifier> '{' <message-body> '}' 36 // message-body ::= <column-definition>* 37 // column-definition ::= <repetition-type> <column-type-definition> 38 // repetition-type ::= 'required' | 'repeated' | 'optional' 39 // column-type-definition ::= <group-definition> | <field-definition> 40 // group-definition ::= 'group' <identifier> <converted-type-annotation>? '{' <message-body> '}' 41 // field-definition ::= <type> <identifier> <logical-type-annotation>? <field-id-definition>? ';' 42 // type ::= 'binary' 43 // | 'float' 44 // | 'double' 45 // | 'boolean' 46 // | 'int32' 47 // | 'int64' 48 // | 'int96' 49 // | 'fixed_len_byte_array' '(' <number> ')' 50 // converted-type-annotation ::= '(' <converted-type> ')' 51 // converted-type ::= 'UTF8' 52 // | 'MAP' 53 // | 'MAP_KEY_VALUE' 54 // | 'LIST' 55 // | 'ENUM' 56 // | 'DECIMAL' 57 // | 'DATE' 58 // | 'TIME_MILLIS' 59 // | 'TIME_MICROS' 60 // | 'TIMESTAMP_MILLIS' 61 // | 'TIMESTAMP_MICROS' 62 // | 'UINT_8' 63 // | 'UINT_16' 64 // | 'UINT_32' 65 // | 'UINT_64' 66 // | 'INT_8' 67 // | 'INT_16' 68 // | 'INT_32' 69 // | 'INT_64' 70 // | 'JSON' 71 // | 'BSON' 72 // | 'INTERVAL' 73 // logical-type-annotation ::= '(' <logical-type> ')' 74 // logical-type ::= 'STRING' 75 // | 'DATE' 76 // | 'TIMESTAMP' '(' <time-unit> ',' <boolean> ')' 77 // | 'UUID' 78 // | 'ENUM' 79 // | 'JSON' 80 // | 'BSON' 81 // | 'INT' '(' <bit-width> ',' <boolean> ')' 82 // | 'DECIMAL' '(' <precision> ',' <scale> ')' 83 // field-id-definition ::= '=' <number> 84 // number ::= <digit>+ 85 // digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' 86 // time-unit ::= 'MILLIS' | 'MICROS' | 'NANOS' 87 // boolean ::= 'false' | 'true' 88 // identifier ::= <all-characters> - ' ' - ';' - '{' - '}' - '(' - ')' - '=' - ',' 89 // bit-width ::= '8' | '16' | '32' | '64' 90 // precision ::= <number> 91 // scale ::= <number> 92 // all-characters ::= ? all visible characters ? 93 // For examples of textual schema definitions, please take a look at schema-files/*.schema. 94 func ParseSchemaDefinition(schemaText string) (*SchemaDefinition, error) { 95 p := newSchemaParser(schemaText) 96 if err := p.parse(); err != nil { 97 return nil, err 98 } 99 100 return &SchemaDefinition{ 101 RootColumn: p.root, 102 }, nil 103 } 104 105 // Clone returns a deep copy of the schema definition. 106 func (sd *SchemaDefinition) Clone() *SchemaDefinition { 107 def, err := ParseSchemaDefinition(sd.String()) 108 if err != nil { 109 panic(err) // this should never ever happen and indicates a serious bug. 110 } 111 return def 112 } 113 114 // String returns a textual representation of the schema definition. This textual representation 115 // adheres to the format accepted by the ParseSchemaDefinition function. A textual schema definition 116 // parsed by ParseSchemaDefinition and turned back into a string by this method repeatedly will 117 // always remain the same, save for differences in the emitted whitespaces. 118 func (sd *SchemaDefinition) String() string { 119 if sd == nil || sd.RootColumn == nil { 120 return "message empty {\n}\n" 121 } 122 123 buf := new(bytes.Buffer) 124 125 fmt.Fprintf(buf, "message %s {\n", sd.RootColumn.SchemaElement.Name) 126 127 printCols(buf, sd.RootColumn.Children, 2) 128 129 fmt.Fprintf(buf, "}\n") 130 131 return buf.String() 132 } 133 134 // SubSchema returns the direct child of the current schema definition 135 // that matches the provided name. If no such child exists, nil is 136 // returned. 137 func (sd *SchemaDefinition) SubSchema(name string) *SchemaDefinition { 138 if sd == nil { 139 return nil 140 } 141 142 for _, c := range sd.RootColumn.Children { 143 if c.SchemaElement.Name == name { 144 return &SchemaDefinition{ 145 RootColumn: c, 146 } 147 } 148 } 149 return nil 150 } 151 152 // SchemaElement returns the schema element associated with the current 153 // schema definition. If no schema element is present, then nil is returned. 154 func (sd *SchemaDefinition) SchemaElement() *parquet.SchemaElement { 155 if sd == nil || sd.RootColumn == nil { 156 return nil 157 } 158 159 return sd.RootColumn.SchemaElement 160 } 161 162 func printCols(w io.Writer, cols []*ColumnDefinition, indent int) { 163 for _, col := range cols { 164 printIndent(w, indent) 165 166 elem := col.SchemaElement 167 168 switch elem.GetRepetitionType() { 169 case parquet.FieldRepetitionType_REPEATED: 170 fmt.Fprintf(w, "repeated") 171 case parquet.FieldRepetitionType_OPTIONAL: 172 fmt.Fprintf(w, "optional") 173 case parquet.FieldRepetitionType_REQUIRED: 174 fmt.Fprintf(w, "required") 175 } 176 fmt.Fprintf(w, " ") 177 178 if elem.Type == nil { 179 fmt.Fprintf(w, "group %s", elem.GetName()) 180 if elem.ConvertedType != nil { 181 fmt.Fprintf(w, " (%s)", elem.GetConvertedType().String()) 182 } 183 fmt.Fprintf(w, " {\n") 184 printCols(w, col.Children, indent+2) 185 186 printIndent(w, indent) 187 fmt.Fprintf(w, "}\n") 188 } else { 189 typ := getSchemaType(elem) 190 fmt.Fprintf(w, "%s %s", typ, elem.GetName()) 191 if elem.LogicalType != nil { 192 fmt.Fprintf(w, " (%s)", getSchemaLogicalType(elem.GetLogicalType())) 193 } else if elem.ConvertedType != nil { 194 fmt.Fprintf(w, " (%s)", elem.GetConvertedType().String()) 195 } 196 if elem.FieldID != nil { 197 fmt.Fprintf(w, " = %d", elem.GetFieldID()) 198 } 199 fmt.Fprintf(w, ";\n") 200 } 201 } 202 } 203 204 func printIndent(w io.Writer, indent int) { 205 for i := 0; i < indent; i++ { 206 fmt.Fprintf(w, " ") 207 } 208 } 209 210 func getSchemaType(elem *parquet.SchemaElement) string { 211 switch elem.GetType() { 212 case parquet.Type_BYTE_ARRAY: 213 return "binary" 214 case parquet.Type_FLOAT: 215 return "float" 216 case parquet.Type_DOUBLE: 217 return "double" 218 case parquet.Type_BOOLEAN: 219 return "boolean" 220 case parquet.Type_INT32: 221 return "int32" 222 case parquet.Type_INT64: 223 return "int64" 224 case parquet.Type_INT96: 225 return "int96" 226 case parquet.Type_FIXED_LEN_BYTE_ARRAY: 227 return fmt.Sprintf("fixed_len_byte_array(%d)", elem.GetTypeLength()) 228 } 229 return fmt.Sprintf("UT:%s", elem.GetType()) 230 } 231 232 func getTimestampLogicalType(t *parquet.LogicalType) string { 233 unit := "" 234 switch { 235 case t.TIMESTAMP.Unit.IsSetNANOS(): 236 unit = "NANOS" 237 case t.TIMESTAMP.Unit.IsSetMICROS(): 238 unit = "MICROS" 239 case t.TIMESTAMP.Unit.IsSetMILLIS(): 240 unit = "MILLIS" 241 default: 242 unit = "BUG_UNKNOWN_TIMESTAMP_UNIT" 243 } 244 return fmt.Sprintf("TIMESTAMP(%s, %t)", unit, t.TIMESTAMP.IsAdjustedToUTC) 245 } 246 247 func getTimeLogicalType(t *parquet.LogicalType) string { 248 unit := "" 249 switch { 250 case t.TIME.Unit.IsSetNANOS(): 251 unit = "NANOS" 252 case t.TIME.Unit.IsSetMICROS(): 253 unit = "MICROS" 254 case t.TIME.Unit.IsSetMILLIS(): 255 unit = "MILLIS" 256 default: 257 unit = "BUG_UNKNOWN_TIMESTAMP_UNIT" 258 } 259 return fmt.Sprintf("TIME(%s, %t)", unit, t.TIME.IsAdjustedToUTC) 260 } 261 262 func getSchemaLogicalType(t *parquet.LogicalType) string { 263 switch { 264 case t.IsSetSTRING(): 265 return "STRING" 266 case t.IsSetDATE(): 267 return "DATE" 268 case t.IsSetTIMESTAMP(): 269 return getTimestampLogicalType(t) 270 case t.IsSetTIME(): 271 return getTimeLogicalType(t) 272 case t.IsSetUUID(): 273 return "UUID" 274 case t.IsSetENUM(): 275 return "ENUM" 276 case t.IsSetJSON(): 277 return "JSON" 278 case t.IsSetBSON(): 279 return "BSON" 280 case t.IsSetDECIMAL(): 281 return fmt.Sprintf("DECIMAL(%d, %d)", t.DECIMAL.Precision, t.DECIMAL.Scale) 282 case t.IsSetINTEGER(): 283 return fmt.Sprintf("INT(%d, %t)", t.INTEGER.BitWidth, t.INTEGER.IsSigned) 284 default: 285 return "BUG(UNKNOWN)" 286 } 287 }