github.com/fraugster/parquet-go@v0.12.0/parquetschema/autoschema/gen.go (about) 1 package autoschema 2 3 import ( 4 "errors" 5 "fmt" 6 "reflect" 7 "strings" 8 "time" 9 10 "github.com/fraugster/parquet-go/parquet" 11 "github.com/fraugster/parquet-go/parquetschema" 12 ) 13 14 // GenerateSchema auto-generates a schema definition for a provided object's type 15 // using reflection. The generated schema is meant to be compatible with 16 // github.com/fraugster/parquet-go/floor's reflection-based marshalling/unmarshalling. 17 func GenerateSchema(obj interface{}) (*parquetschema.SchemaDefinition, error) { 18 valueObj := reflect.ValueOf(obj) 19 columns, err := generateSchema(valueObj.Type()) 20 if err != nil { 21 return nil, fmt.Errorf("can't generate schema: %w", err) 22 } 23 24 return &parquetschema.SchemaDefinition{ 25 RootColumn: &parquetschema.ColumnDefinition{ 26 SchemaElement: &parquet.SchemaElement{ 27 Name: "autogen_schema", 28 }, 29 Children: columns, 30 }, 31 }, nil 32 } 33 34 func generateSchema(objType reflect.Type) ([]*parquetschema.ColumnDefinition, error) { 35 if objType.Kind() == reflect.Ptr { 36 objType = objType.Elem() 37 } 38 39 if objType.Kind() != reflect.Struct { 40 return nil, errors.New("can't generate schema: provided object needs to be of type struct or *struct") 41 } 42 43 columns := []*parquetschema.ColumnDefinition{} 44 45 for i := 0; i < objType.NumField(); i++ { 46 fieldType := objType.Field(i) 47 fieldName := fieldNameToLower(fieldType) 48 49 column, err := generateField(fieldType.Type, fieldName) 50 if err != nil { 51 return nil, err 52 } 53 54 columns = append(columns, column) 55 } 56 57 return columns, nil 58 } 59 60 func generateField(fieldType reflect.Type, fieldName string) (*parquetschema.ColumnDefinition, error) { 61 switch fieldType.Kind() { 62 case reflect.Bool: 63 return &parquetschema.ColumnDefinition{ 64 SchemaElement: &parquet.SchemaElement{ 65 Type: parquet.TypePtr(parquet.Type_BOOLEAN), 66 Name: fieldName, 67 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 68 }, 69 }, nil 70 case reflect.Int: 71 return &parquetschema.ColumnDefinition{ 72 SchemaElement: &parquet.SchemaElement{ 73 Type: parquet.TypePtr(parquet.Type_INT64), 74 Name: fieldName, 75 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 76 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_INT_64), 77 LogicalType: &parquet.LogicalType{ 78 INTEGER: &parquet.IntType{ 79 BitWidth: 64, 80 IsSigned: true, 81 }, 82 }, 83 }, 84 }, nil 85 case reflect.Int8: 86 return &parquetschema.ColumnDefinition{ 87 SchemaElement: &parquet.SchemaElement{ 88 Type: parquet.TypePtr(parquet.Type_INT32), 89 Name: fieldName, 90 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 91 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16), 92 LogicalType: &parquet.LogicalType{ 93 INTEGER: &parquet.IntType{ 94 BitWidth: 8, 95 IsSigned: true, 96 }, 97 }, 98 }, 99 }, nil 100 case reflect.Int16: 101 return &parquetschema.ColumnDefinition{ 102 SchemaElement: &parquet.SchemaElement{ 103 Type: parquet.TypePtr(parquet.Type_INT32), 104 Name: fieldName, 105 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 106 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16), 107 LogicalType: &parquet.LogicalType{ 108 INTEGER: &parquet.IntType{ 109 BitWidth: 16, 110 IsSigned: true, 111 }, 112 }, 113 }, 114 }, nil 115 case reflect.Int32: 116 return &parquetschema.ColumnDefinition{ 117 SchemaElement: &parquet.SchemaElement{ 118 Type: parquet.TypePtr(parquet.Type_INT32), 119 Name: fieldName, 120 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 121 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_INT_32), 122 LogicalType: &parquet.LogicalType{ 123 INTEGER: &parquet.IntType{ 124 BitWidth: 32, 125 IsSigned: true, 126 }, 127 }, 128 }, 129 }, nil 130 case reflect.Int64: 131 return &parquetschema.ColumnDefinition{ 132 SchemaElement: &parquet.SchemaElement{ 133 Type: parquet.TypePtr(parquet.Type_INT64), 134 Name: fieldName, 135 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 136 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_INT_64), 137 LogicalType: &parquet.LogicalType{ 138 INTEGER: &parquet.IntType{ 139 BitWidth: 64, 140 IsSigned: true, 141 }, 142 }, 143 }, 144 }, nil 145 case reflect.Uint: 146 return &parquetschema.ColumnDefinition{ 147 SchemaElement: &parquet.SchemaElement{ 148 Type: parquet.TypePtr(parquet.Type_INT32), 149 Name: fieldName, 150 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 151 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), 152 LogicalType: &parquet.LogicalType{ 153 INTEGER: &parquet.IntType{ 154 BitWidth: 32, 155 IsSigned: false, 156 }, 157 }, 158 }, 159 }, nil 160 case reflect.Uint8: 161 return &parquetschema.ColumnDefinition{ 162 SchemaElement: &parquet.SchemaElement{ 163 Type: parquet.TypePtr(parquet.Type_INT32), 164 Name: fieldName, 165 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 166 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_16), 167 LogicalType: &parquet.LogicalType{ 168 INTEGER: &parquet.IntType{ 169 BitWidth: 8, 170 IsSigned: false, 171 }, 172 }, 173 }, 174 }, nil 175 case reflect.Uint16: 176 return &parquetschema.ColumnDefinition{ 177 SchemaElement: &parquet.SchemaElement{ 178 Type: parquet.TypePtr(parquet.Type_INT32), 179 Name: fieldName, 180 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 181 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_16), 182 LogicalType: &parquet.LogicalType{ 183 INTEGER: &parquet.IntType{ 184 BitWidth: 16, 185 IsSigned: false, 186 }, 187 }, 188 }, 189 }, nil 190 case reflect.Uint32: 191 return &parquetschema.ColumnDefinition{ 192 SchemaElement: &parquet.SchemaElement{ 193 Type: parquet.TypePtr(parquet.Type_INT32), 194 Name: fieldName, 195 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 196 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32), 197 LogicalType: &parquet.LogicalType{ 198 INTEGER: &parquet.IntType{ 199 BitWidth: 32, 200 IsSigned: false, 201 }, 202 }, 203 }, 204 }, nil 205 case reflect.Uint64: 206 return &parquetschema.ColumnDefinition{ 207 SchemaElement: &parquet.SchemaElement{ 208 Type: parquet.TypePtr(parquet.Type_INT64), 209 Name: fieldName, 210 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 211 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_64), 212 LogicalType: &parquet.LogicalType{ 213 INTEGER: &parquet.IntType{ 214 BitWidth: 64, 215 IsSigned: false, 216 }, 217 }, 218 }, 219 }, nil 220 case reflect.Uintptr: 221 return nil, errors.New("unsupported type uintptr") 222 case reflect.Float32: 223 return &parquetschema.ColumnDefinition{ 224 SchemaElement: &parquet.SchemaElement{ 225 Type: parquet.TypePtr(parquet.Type_FLOAT), 226 Name: fieldName, 227 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 228 }, 229 }, nil 230 case reflect.Float64: 231 return &parquetschema.ColumnDefinition{ 232 SchemaElement: &parquet.SchemaElement{ 233 Type: parquet.TypePtr(parquet.Type_DOUBLE), 234 Name: fieldName, 235 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 236 }, 237 }, nil 238 case reflect.Complex64: 239 return nil, errors.New("unsupported type complex64") 240 case reflect.Complex128: 241 return nil, errors.New("unsupported type complex128") 242 case reflect.Chan: 243 return nil, errors.New("unsupported type chan") 244 case reflect.Func: 245 return nil, errors.New("unsupported type func") 246 case reflect.Interface: 247 return nil, errors.New("unsupported type interface") 248 case reflect.Map: 249 keyType, err := generateField(fieldType.Key(), "key") 250 if err != nil { 251 return nil, err 252 } 253 valueType, err := generateField(fieldType.Elem(), "value") 254 if err != nil { 255 return nil, err 256 } 257 return &parquetschema.ColumnDefinition{ 258 SchemaElement: &parquet.SchemaElement{ 259 Name: fieldName, 260 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_OPTIONAL), 261 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_MAP), 262 LogicalType: &parquet.LogicalType{ 263 MAP: &parquet.MapType{}, 264 }, 265 }, 266 Children: []*parquetschema.ColumnDefinition{ 267 { 268 SchemaElement: &parquet.SchemaElement{ 269 Name: "key_value", 270 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REPEATED), 271 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_MAP_KEY_VALUE), 272 }, 273 Children: []*parquetschema.ColumnDefinition{ 274 keyType, 275 valueType, 276 }, 277 }, 278 }, 279 }, nil 280 case reflect.Ptr: 281 colDef, err := generateField(fieldType.Elem(), fieldName) 282 if err != nil { 283 return nil, err 284 } 285 colDef.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_OPTIONAL) 286 return colDef, nil 287 case reflect.Slice, reflect.Array: 288 if fieldType.Elem().Kind() == reflect.Uint8 { 289 switch fieldType.Kind() { 290 case reflect.Slice: 291 // handle special case for []byte 292 return &parquetschema.ColumnDefinition{ 293 SchemaElement: &parquet.SchemaElement{ 294 Type: parquet.TypePtr(parquet.Type_BYTE_ARRAY), 295 Name: fieldName, 296 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 297 }, 298 }, nil 299 case reflect.Array: 300 typeLen := int32(fieldType.Len()) 301 // handle special case for [N]byte 302 return &parquetschema.ColumnDefinition{ 303 SchemaElement: &parquet.SchemaElement{ 304 Type: parquet.TypePtr(parquet.Type_FIXED_LEN_BYTE_ARRAY), 305 Name: fieldName, 306 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 307 TypeLength: &typeLen, 308 }, 309 }, nil 310 } 311 } 312 elementType, err := generateField(fieldType.Elem(), "element") 313 if err != nil { 314 return nil, err 315 } 316 repType := elementType.SchemaElement.RepetitionType 317 elementType.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED) 318 return &parquetschema.ColumnDefinition{ 319 SchemaElement: &parquet.SchemaElement{ 320 Name: fieldName, 321 RepetitionType: repType, 322 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_LIST), 323 LogicalType: &parquet.LogicalType{ 324 LIST: &parquet.ListType{}, 325 }, 326 }, 327 Children: []*parquetschema.ColumnDefinition{ 328 { 329 SchemaElement: &parquet.SchemaElement{ 330 Name: "list", 331 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REPEATED), 332 }, 333 Children: []*parquetschema.ColumnDefinition{ 334 elementType, 335 }, 336 }, 337 }, 338 }, nil 339 case reflect.String: 340 return &parquetschema.ColumnDefinition{ 341 SchemaElement: &parquet.SchemaElement{ 342 Type: parquet.TypePtr(parquet.Type_BYTE_ARRAY), 343 Name: fieldName, 344 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 345 ConvertedType: parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8), 346 LogicalType: &parquet.LogicalType{ 347 STRING: &parquet.StringType{}, 348 }, 349 }, 350 }, nil 351 case reflect.Struct: 352 switch { 353 case fieldType.ConvertibleTo(reflect.TypeOf(time.Time{})): 354 return &parquetschema.ColumnDefinition{ 355 SchemaElement: &parquet.SchemaElement{ 356 Type: parquet.TypePtr(parquet.Type_INT64), 357 Name: fieldName, 358 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 359 LogicalType: &parquet.LogicalType{ 360 TIMESTAMP: &parquet.TimestampType{ 361 IsAdjustedToUTC: true, 362 Unit: &parquet.TimeUnit{ 363 NANOS: parquet.NewNanoSeconds(), 364 }, 365 }, 366 }, 367 }, 368 }, nil 369 default: 370 children, err := generateSchema(fieldType) 371 if err != nil { 372 return nil, err 373 } 374 return &parquetschema.ColumnDefinition{ 375 SchemaElement: &parquet.SchemaElement{ 376 Name: fieldName, 377 RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED), 378 }, 379 Children: children, 380 }, nil 381 } 382 case reflect.UnsafePointer: 383 return nil, errors.New("unsafe.Pointer is unsupported") 384 default: 385 return nil, fmt.Errorf("unknown kind %s is unsupported", fieldType.Kind()) 386 } 387 } 388 389 func fieldNameToLower(field reflect.StructField) string { 390 parquetStructTag, ok := field.Tag.Lookup("parquet") 391 if !ok { 392 return strings.ToLower(field.Name) 393 } 394 395 parquetStructTagFields := strings.Split(parquetStructTag, ",") 396 397 return strings.TrimSpace(parquetStructTagFields[0]) 398 }