github.com/fraugster/parquet-go@v0.12.0/parquetschema/autoschema/gen.go (about)

     1  package autoschema
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"reflect"
     7  	"strings"
     8  	"time"
     9  
    10  	"github.com/fraugster/parquet-go/parquet"
    11  	"github.com/fraugster/parquet-go/parquetschema"
    12  )
    13  
    14  // GenerateSchema auto-generates a schema definition for a provided object's type
    15  // using reflection. The generated schema is meant to be compatible with
    16  // github.com/fraugster/parquet-go/floor's reflection-based marshalling/unmarshalling.
    17  func GenerateSchema(obj interface{}) (*parquetschema.SchemaDefinition, error) {
    18  	valueObj := reflect.ValueOf(obj)
    19  	columns, err := generateSchema(valueObj.Type())
    20  	if err != nil {
    21  		return nil, fmt.Errorf("can't generate schema: %w", err)
    22  	}
    23  
    24  	return &parquetschema.SchemaDefinition{
    25  		RootColumn: &parquetschema.ColumnDefinition{
    26  			SchemaElement: &parquet.SchemaElement{
    27  				Name: "autogen_schema",
    28  			},
    29  			Children: columns,
    30  		},
    31  	}, nil
    32  }
    33  
    34  func generateSchema(objType reflect.Type) ([]*parquetschema.ColumnDefinition, error) {
    35  	if objType.Kind() == reflect.Ptr {
    36  		objType = objType.Elem()
    37  	}
    38  
    39  	if objType.Kind() != reflect.Struct {
    40  		return nil, errors.New("can't generate schema: provided object needs to be of type struct or *struct")
    41  	}
    42  
    43  	columns := []*parquetschema.ColumnDefinition{}
    44  
    45  	for i := 0; i < objType.NumField(); i++ {
    46  		fieldType := objType.Field(i)
    47  		fieldName := fieldNameToLower(fieldType)
    48  
    49  		column, err := generateField(fieldType.Type, fieldName)
    50  		if err != nil {
    51  			return nil, err
    52  		}
    53  
    54  		columns = append(columns, column)
    55  	}
    56  
    57  	return columns, nil
    58  }
    59  
    60  func generateField(fieldType reflect.Type, fieldName string) (*parquetschema.ColumnDefinition, error) {
    61  	switch fieldType.Kind() {
    62  	case reflect.Bool:
    63  		return &parquetschema.ColumnDefinition{
    64  			SchemaElement: &parquet.SchemaElement{
    65  				Type:           parquet.TypePtr(parquet.Type_BOOLEAN),
    66  				Name:           fieldName,
    67  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
    68  			},
    69  		}, nil
    70  	case reflect.Int:
    71  		return &parquetschema.ColumnDefinition{
    72  			SchemaElement: &parquet.SchemaElement{
    73  				Type:           parquet.TypePtr(parquet.Type_INT64),
    74  				Name:           fieldName,
    75  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
    76  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_INT_64),
    77  				LogicalType: &parquet.LogicalType{
    78  					INTEGER: &parquet.IntType{
    79  						BitWidth: 64,
    80  						IsSigned: true,
    81  					},
    82  				},
    83  			},
    84  		}, nil
    85  	case reflect.Int8:
    86  		return &parquetschema.ColumnDefinition{
    87  			SchemaElement: &parquet.SchemaElement{
    88  				Type:           parquet.TypePtr(parquet.Type_INT32),
    89  				Name:           fieldName,
    90  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
    91  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16),
    92  				LogicalType: &parquet.LogicalType{
    93  					INTEGER: &parquet.IntType{
    94  						BitWidth: 8,
    95  						IsSigned: true,
    96  					},
    97  				},
    98  			},
    99  		}, nil
   100  	case reflect.Int16:
   101  		return &parquetschema.ColumnDefinition{
   102  			SchemaElement: &parquet.SchemaElement{
   103  				Type:           parquet.TypePtr(parquet.Type_INT32),
   104  				Name:           fieldName,
   105  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   106  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_INT_16),
   107  				LogicalType: &parquet.LogicalType{
   108  					INTEGER: &parquet.IntType{
   109  						BitWidth: 16,
   110  						IsSigned: true,
   111  					},
   112  				},
   113  			},
   114  		}, nil
   115  	case reflect.Int32:
   116  		return &parquetschema.ColumnDefinition{
   117  			SchemaElement: &parquet.SchemaElement{
   118  				Type:           parquet.TypePtr(parquet.Type_INT32),
   119  				Name:           fieldName,
   120  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   121  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_INT_32),
   122  				LogicalType: &parquet.LogicalType{
   123  					INTEGER: &parquet.IntType{
   124  						BitWidth: 32,
   125  						IsSigned: true,
   126  					},
   127  				},
   128  			},
   129  		}, nil
   130  	case reflect.Int64:
   131  		return &parquetschema.ColumnDefinition{
   132  			SchemaElement: &parquet.SchemaElement{
   133  				Type:           parquet.TypePtr(parquet.Type_INT64),
   134  				Name:           fieldName,
   135  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   136  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_INT_64),
   137  				LogicalType: &parquet.LogicalType{
   138  					INTEGER: &parquet.IntType{
   139  						BitWidth: 64,
   140  						IsSigned: true,
   141  					},
   142  				},
   143  			},
   144  		}, nil
   145  	case reflect.Uint:
   146  		return &parquetschema.ColumnDefinition{
   147  			SchemaElement: &parquet.SchemaElement{
   148  				Type:           parquet.TypePtr(parquet.Type_INT32),
   149  				Name:           fieldName,
   150  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   151  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
   152  				LogicalType: &parquet.LogicalType{
   153  					INTEGER: &parquet.IntType{
   154  						BitWidth: 32,
   155  						IsSigned: false,
   156  					},
   157  				},
   158  			},
   159  		}, nil
   160  	case reflect.Uint8:
   161  		return &parquetschema.ColumnDefinition{
   162  			SchemaElement: &parquet.SchemaElement{
   163  				Type:           parquet.TypePtr(parquet.Type_INT32),
   164  				Name:           fieldName,
   165  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   166  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_16),
   167  				LogicalType: &parquet.LogicalType{
   168  					INTEGER: &parquet.IntType{
   169  						BitWidth: 8,
   170  						IsSigned: false,
   171  					},
   172  				},
   173  			},
   174  		}, nil
   175  	case reflect.Uint16:
   176  		return &parquetschema.ColumnDefinition{
   177  			SchemaElement: &parquet.SchemaElement{
   178  				Type:           parquet.TypePtr(parquet.Type_INT32),
   179  				Name:           fieldName,
   180  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   181  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_16),
   182  				LogicalType: &parquet.LogicalType{
   183  					INTEGER: &parquet.IntType{
   184  						BitWidth: 16,
   185  						IsSigned: false,
   186  					},
   187  				},
   188  			},
   189  		}, nil
   190  	case reflect.Uint32:
   191  		return &parquetschema.ColumnDefinition{
   192  			SchemaElement: &parquet.SchemaElement{
   193  				Type:           parquet.TypePtr(parquet.Type_INT32),
   194  				Name:           fieldName,
   195  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   196  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_32),
   197  				LogicalType: &parquet.LogicalType{
   198  					INTEGER: &parquet.IntType{
   199  						BitWidth: 32,
   200  						IsSigned: false,
   201  					},
   202  				},
   203  			},
   204  		}, nil
   205  	case reflect.Uint64:
   206  		return &parquetschema.ColumnDefinition{
   207  			SchemaElement: &parquet.SchemaElement{
   208  				Type:           parquet.TypePtr(parquet.Type_INT64),
   209  				Name:           fieldName,
   210  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   211  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_UINT_64),
   212  				LogicalType: &parquet.LogicalType{
   213  					INTEGER: &parquet.IntType{
   214  						BitWidth: 64,
   215  						IsSigned: false,
   216  					},
   217  				},
   218  			},
   219  		}, nil
   220  	case reflect.Uintptr:
   221  		return nil, errors.New("unsupported type uintptr")
   222  	case reflect.Float32:
   223  		return &parquetschema.ColumnDefinition{
   224  			SchemaElement: &parquet.SchemaElement{
   225  				Type:           parquet.TypePtr(parquet.Type_FLOAT),
   226  				Name:           fieldName,
   227  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   228  			},
   229  		}, nil
   230  	case reflect.Float64:
   231  		return &parquetschema.ColumnDefinition{
   232  			SchemaElement: &parquet.SchemaElement{
   233  				Type:           parquet.TypePtr(parquet.Type_DOUBLE),
   234  				Name:           fieldName,
   235  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   236  			},
   237  		}, nil
   238  	case reflect.Complex64:
   239  		return nil, errors.New("unsupported type complex64")
   240  	case reflect.Complex128:
   241  		return nil, errors.New("unsupported type complex128")
   242  	case reflect.Chan:
   243  		return nil, errors.New("unsupported type chan")
   244  	case reflect.Func:
   245  		return nil, errors.New("unsupported type func")
   246  	case reflect.Interface:
   247  		return nil, errors.New("unsupported type interface")
   248  	case reflect.Map:
   249  		keyType, err := generateField(fieldType.Key(), "key")
   250  		if err != nil {
   251  			return nil, err
   252  		}
   253  		valueType, err := generateField(fieldType.Elem(), "value")
   254  		if err != nil {
   255  			return nil, err
   256  		}
   257  		return &parquetschema.ColumnDefinition{
   258  			SchemaElement: &parquet.SchemaElement{
   259  				Name:           fieldName,
   260  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_OPTIONAL),
   261  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_MAP),
   262  				LogicalType: &parquet.LogicalType{
   263  					MAP: &parquet.MapType{},
   264  				},
   265  			},
   266  			Children: []*parquetschema.ColumnDefinition{
   267  				{
   268  					SchemaElement: &parquet.SchemaElement{
   269  						Name:           "key_value",
   270  						RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REPEATED),
   271  						ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_MAP_KEY_VALUE),
   272  					},
   273  					Children: []*parquetschema.ColumnDefinition{
   274  						keyType,
   275  						valueType,
   276  					},
   277  				},
   278  			},
   279  		}, nil
   280  	case reflect.Ptr:
   281  		colDef, err := generateField(fieldType.Elem(), fieldName)
   282  		if err != nil {
   283  			return nil, err
   284  		}
   285  		colDef.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_OPTIONAL)
   286  		return colDef, nil
   287  	case reflect.Slice, reflect.Array:
   288  		if fieldType.Elem().Kind() == reflect.Uint8 {
   289  			switch fieldType.Kind() {
   290  			case reflect.Slice:
   291  				// handle special case for []byte
   292  				return &parquetschema.ColumnDefinition{
   293  					SchemaElement: &parquet.SchemaElement{
   294  						Type:           parquet.TypePtr(parquet.Type_BYTE_ARRAY),
   295  						Name:           fieldName,
   296  						RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   297  					},
   298  				}, nil
   299  			case reflect.Array:
   300  				typeLen := int32(fieldType.Len())
   301  				// handle special case for [N]byte
   302  				return &parquetschema.ColumnDefinition{
   303  					SchemaElement: &parquet.SchemaElement{
   304  						Type:           parquet.TypePtr(parquet.Type_FIXED_LEN_BYTE_ARRAY),
   305  						Name:           fieldName,
   306  						RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   307  						TypeLength:     &typeLen,
   308  					},
   309  				}, nil
   310  			}
   311  		}
   312  		elementType, err := generateField(fieldType.Elem(), "element")
   313  		if err != nil {
   314  			return nil, err
   315  		}
   316  		repType := elementType.SchemaElement.RepetitionType
   317  		elementType.SchemaElement.RepetitionType = parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED)
   318  		return &parquetschema.ColumnDefinition{
   319  			SchemaElement: &parquet.SchemaElement{
   320  				Name:           fieldName,
   321  				RepetitionType: repType,
   322  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_LIST),
   323  				LogicalType: &parquet.LogicalType{
   324  					LIST: &parquet.ListType{},
   325  				},
   326  			},
   327  			Children: []*parquetschema.ColumnDefinition{
   328  				{
   329  					SchemaElement: &parquet.SchemaElement{
   330  						Name:           "list",
   331  						RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REPEATED),
   332  					},
   333  					Children: []*parquetschema.ColumnDefinition{
   334  						elementType,
   335  					},
   336  				},
   337  			},
   338  		}, nil
   339  	case reflect.String:
   340  		return &parquetschema.ColumnDefinition{
   341  			SchemaElement: &parquet.SchemaElement{
   342  				Type:           parquet.TypePtr(parquet.Type_BYTE_ARRAY),
   343  				Name:           fieldName,
   344  				RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   345  				ConvertedType:  parquet.ConvertedTypePtr(parquet.ConvertedType_UTF8),
   346  				LogicalType: &parquet.LogicalType{
   347  					STRING: &parquet.StringType{},
   348  				},
   349  			},
   350  		}, nil
   351  	case reflect.Struct:
   352  		switch {
   353  		case fieldType.ConvertibleTo(reflect.TypeOf(time.Time{})):
   354  			return &parquetschema.ColumnDefinition{
   355  				SchemaElement: &parquet.SchemaElement{
   356  					Type:           parquet.TypePtr(parquet.Type_INT64),
   357  					Name:           fieldName,
   358  					RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   359  					LogicalType: &parquet.LogicalType{
   360  						TIMESTAMP: &parquet.TimestampType{
   361  							IsAdjustedToUTC: true,
   362  							Unit: &parquet.TimeUnit{
   363  								NANOS: parquet.NewNanoSeconds(),
   364  							},
   365  						},
   366  					},
   367  				},
   368  			}, nil
   369  		default:
   370  			children, err := generateSchema(fieldType)
   371  			if err != nil {
   372  				return nil, err
   373  			}
   374  			return &parquetschema.ColumnDefinition{
   375  				SchemaElement: &parquet.SchemaElement{
   376  					Name:           fieldName,
   377  					RepetitionType: parquet.FieldRepetitionTypePtr(parquet.FieldRepetitionType_REQUIRED),
   378  				},
   379  				Children: children,
   380  			}, nil
   381  		}
   382  	case reflect.UnsafePointer:
   383  		return nil, errors.New("unsafe.Pointer is unsupported")
   384  	default:
   385  		return nil, fmt.Errorf("unknown kind %s is unsupported", fieldType.Kind())
   386  	}
   387  }
   388  
   389  func fieldNameToLower(field reflect.StructField) string {
   390  	parquetStructTag, ok := field.Tag.Lookup("parquet")
   391  	if !ok {
   392  		return strings.ToLower(field.Name)
   393  	}
   394  
   395  	parquetStructTagFields := strings.Split(parquetStructTag, ",")
   396  
   397  	return strings.TrimSpace(parquetStructTagFields[0])
   398  }