github.com/apache/arrow/go/v14@v14.0.2/parquet/schema/reflection_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package schema_test
    18  
    19  import (
    20  	"log"
    21  	"os"
    22  	"reflect"
    23  	"testing"
    24  
    25  	"github.com/apache/arrow/go/v14/parquet"
    26  	"github.com/apache/arrow/go/v14/parquet/schema"
    27  	"github.com/stretchr/testify/assert"
    28  )
    29  
    30  func ExampleNewSchemaFromStruct_primitives() {
    31  	type Schema struct {
    32  		Bool              bool
    33  		Int8              int8
    34  		Uint16            uint16
    35  		Int32             int32
    36  		Int64             int64
    37  		Int96             parquet.Int96
    38  		Float             float32
    39  		Double            float64
    40  		ByteArray         string
    41  		FixedLenByteArray [10]byte
    42  	}
    43  
    44  	sc, err := schema.NewSchemaFromStruct(Schema{})
    45  	if err != nil {
    46  		log.Fatal(err)
    47  	}
    48  
    49  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
    50  
    51  	// Output:
    52  	// repeated group field_id=-1 Schema {
    53  	//   required boolean field_id=-1 Bool;
    54  	//   required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true));
    55  	//   required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false));
    56  	//   required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
    57  	//   required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true));
    58  	//   required int96 field_id=-1 Int96;
    59  	//   required float field_id=-1 Float;
    60  	//   required double field_id=-1 Double;
    61  	//   required byte_array field_id=-1 ByteArray;
    62  	//   required fixed_len_byte_array field_id=-1 FixedLenByteArray;
    63  	// }
    64  }
    65  
    66  func ExampleNewSchemaFromStruct_convertedtypes() {
    67  	type ConvertedSchema struct {
    68  		Utf8           string        `parquet:"name=utf8, converted=UTF8"`
    69  		Uint32         uint32        `parquet:"converted=INT_32"`
    70  		Date           int32         `parquet:"name=date, converted=date"`
    71  		TimeMilli      int32         `parquet:"name=timemilli, converted=TIME_MILLIS"`
    72  		TimeMicro      int64         `parquet:"name=timemicro, converted=time_micros"`
    73  		TimeStampMilli int64         `parquet:"converted=timestamp_millis"`
    74  		TimeStampMicro int64         `parquet:"converted=timestamp_micros"`
    75  		Interval       parquet.Int96 `parquet:"converted=INTERVAL"`
    76  		Decimal1       int32         `parquet:"converted=decimal, scale=2, precision=9"`
    77  		Decimal2       int64         `parquet:"converted=decimal, scale=2, precision=18"`
    78  		Decimal3       [12]byte      `parquet:"converted=decimal, scale=2, precision=10"`
    79  		Decimal4       string        `parquet:"converted=decimal, scale=2, precision=20"`
    80  	}
    81  
    82  	sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{})
    83  	if err != nil {
    84  		log.Fatal(err)
    85  	}
    86  
    87  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
    88  
    89  	// Output:
    90  	// repeated group field_id=-1 ConvertedSchema {
    91  	//   required byte_array field_id=-1 utf8 (String);
    92  	//   required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true));
    93  	//   required int32 field_id=-1 date (Date);
    94  	//   required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
    95  	//   required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds));
    96  	//   required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false));
    97  	//   required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false));
    98  	//   required int96 field_id=-1 Interval;
    99  	//   required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
   100  	//   required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2));
   101  	//   required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2));
   102  	//   required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2));
   103  	// }
   104  }
   105  
   106  func ExampleNewSchemaFromStruct_repetition() {
   107  	type RepetitionSchema struct {
   108  		List     []int64 `parquet:"fieldid=1"`
   109  		Repeated []int64 `parquet:"repetition=repeated, fieldid=2"`
   110  		Optional *int64  `parquet:"fieldid=3"`
   111  		Required *int64  `parquet:"repetition=REQUIRED, fieldid=4"`
   112  		Opt      int64   `parquet:"repetition=OPTIONAL, fieldid=5"`
   113  	}
   114  
   115  	sc, err := schema.NewSchemaFromStruct(RepetitionSchema{})
   116  	if err != nil {
   117  		log.Fatal(err)
   118  	}
   119  
   120  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   121  
   122  	// Output:
   123  	// repeated group field_id=-1 RepetitionSchema {
   124  	//   required group field_id=1 List (List) {
   125  	//     repeated group field_id=-1 list {
   126  	//       required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true));
   127  	//     }
   128  	//   }
   129  	//   repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true));
   130  	//   optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true));
   131  	//   required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true));
   132  	//   optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true));
   133  	// }
   134  }
   135  
   136  func ExampleNewSchemaFromStruct_logicaltypes() {
   137  	type LogicalTypes struct {
   138  		String                []byte   `parquet:"logical=String"`
   139  		Enum                  string   `parquet:"logical=enum"`
   140  		Date                  int32    `parquet:"logical=date"`
   141  		Decimal1              int32    `parquet:"logical=decimal, precision=9, scale=2"`
   142  		Decimal2              int32    `parquet:"logical=decimal, logical.precision=9, scale=2"`
   143  		Decimal3              int32    `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"`
   144  		TimeMilliUTC          int32    `parquet:"logical=TIME, logical.unit=millis"`
   145  		TimeMilli             int32    `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"`
   146  		TimeMicros            int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"`
   147  		TimeMicrosUTC         int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"`
   148  		TimeNanos             int64    `parquet:"logical=time, logical.unit=nanos"`
   149  		TimestampMilli        int64    `parquet:"logical=timestamp, logical.unit=millis"`
   150  		TimestampMicrosNotUTC int64    `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"`
   151  		TimestampNanos        int64    `parquet:"logical=timestamp, logical.unit=nanos"`
   152  		JSON                  string   `parquet:"logical=json"`
   153  		BSON                  []byte   `parquet:"logical=BSON"`
   154  		UUID                  [16]byte `parquet:"logical=uuid"`
   155  	}
   156  
   157  	sc, err := schema.NewSchemaFromStruct(LogicalTypes{})
   158  	if err != nil {
   159  		log.Fatal(err)
   160  	}
   161  
   162  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   163  
   164  	// Output:
   165  	// repeated group field_id=-1 LogicalTypes {
   166  	//   required byte_array field_id=-1 String (String);
   167  	//   required byte_array field_id=-1 Enum (Enum);
   168  	//   required int32 field_id=-1 Date (Date);
   169  	//   required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
   170  	//   required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2));
   171  	//   required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3));
   172  	//   required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
   173  	//   required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
   174  	//   required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds));
   175  	//   required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds));
   176  	//   required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds));
   177  	//   required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false));
   178  	//   required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
   179  	//   required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
   180  	//   required byte_array field_id=-1 JSON (JSON);
   181  	//   required byte_array field_id=-1 BSON (BSON);
   182  	//   required fixed_len_byte_array field_id=-1 UUID (UUID);
   183  	// }
   184  }
   185  
   186  func ExampleNewSchemaFromStruct_physicaltype() {
   187  	type ChangeTypes struct {
   188  		Int32        int64  `parquet:"type=int32"`
   189  		FixedLen     string `parquet:"type=fixed_len_byte_array, length=10"`
   190  		SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"`
   191  		Int          int    `parquet:"type=int32"`
   192  	}
   193  
   194  	sc, err := schema.NewSchemaFromStruct(ChangeTypes{})
   195  	if err != nil {
   196  		log.Fatal(err)
   197  	}
   198  
   199  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   200  
   201  	// Output:
   202  	// repeated group field_id=-1 ChangeTypes {
   203  	//   required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
   204  	//   required fixed_len_byte_array field_id=-1 FixedLen;
   205  	//   required fixed_len_byte_array field_id=-1 SliceAsFixed;
   206  	//   required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true));
   207  	// }
   208  }
   209  
   210  func ExampleNewSchemaFromStruct_nestedtypes() {
   211  	type Other struct {
   212  		OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"`
   213  	}
   214  
   215  	type MyMap map[int32]string
   216  
   217  	type Nested struct {
   218  		SimpleMap     map[int32]string
   219  		FixedLenMap   map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"`
   220  		DecimalMap    map[int32]string  `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"`
   221  		OtherList     []*Other
   222  		OtherRepeated []Other  `parquet:"repetition=repeated"`
   223  		DateArray     [5]int32 `parquet:"valuelogical=date, logical=list"`
   224  		DateMap       MyMap    `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"`
   225  	}
   226  
   227  	sc, err := schema.NewSchemaFromStruct(Nested{})
   228  	if err != nil {
   229  		log.Fatal(err)
   230  	}
   231  
   232  	schema.PrintSchema(sc.Root(), os.Stdout, 2)
   233  
   234  	// Output:
   235  	// repeated group field_id=-1 Nested {
   236  	//   required group field_id=-1 SimpleMap (Map) {
   237  	//     repeated group field_id=-1 key_value {
   238  	//       required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true));
   239  	//       required byte_array field_id=-1 value;
   240  	//     }
   241  	//   }
   242  	//   required group field_id=-1 FixedLenMap (Map) {
   243  	//     repeated group field_id=-1 key_value {
   244  	//       required fixed_len_byte_array field_id=10 key;
   245  	//       required byte_array field_id=11 value;
   246  	//     }
   247  	//   }
   248  	//   required group field_id=-1 DecimalMap (Map) {
   249  	//     repeated group field_id=-1 key_value {
   250  	//       required int32 field_id=-1 key (Decimal(precision=7, scale=3));
   251  	//       required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2));
   252  	//     }
   253  	//   }
   254  	//   required group field_id=-1 OtherList (List) {
   255  	//     repeated group field_id=-1 list {
   256  	//       optional group field_id=-1 element {
   257  	//         optional group field_id=-1 OptionalMap (Map) {
   258  	//           repeated group field_id=-1 key_value {
   259  	//             required byte_array field_id=-1 key (String);
   260  	//             required byte_array field_id=-1 value (BSON);
   261  	//           }
   262  	//         }
   263  	//       }
   264  	//     }
   265  	//   }
   266  	//   repeated group field_id=-1 OtherRepeated {
   267  	//     optional group field_id=-1 OptionalMap (Map) {
   268  	//       repeated group field_id=-1 key_value {
   269  	//         required byte_array field_id=-1 key (String);
   270  	//         required byte_array field_id=-1 value (BSON);
   271  	//       }
   272  	//     }
   273  	//   }
   274  	//   required group field_id=-1 DateArray (List) {
   275  	//     repeated group field_id=-1 list {
   276  	//       required int32 field_id=-1 element (Date);
   277  	//     }
   278  	//   }
   279  	//   required group field_id=-1 DateMap (Map) {
   280  	//     repeated group field_id=-1 key_value {
   281  	//       required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
   282  	//       required byte_array field_id=-1 value (Enum);
   283  	//     }
   284  	//   }
   285  	// }
   286  }
   287  
   288  func TestStructFromSchema(t *testing.T) {
   289  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
   290  		schema.NewBooleanNode("bool", parquet.Repetitions.Required, -1),
   291  		schema.NewInt32Node("int32", parquet.Repetitions.Optional, -1),
   292  		schema.NewInt64Node("int64", parquet.Repetitions.Repeated, -1),
   293  		schema.NewInt96Node("int96", parquet.Repetitions.Required, -1),
   294  		schema.NewFloat32Node("float", parquet.Repetitions.Required, -1),
   295  		schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1),
   296  		schema.NewFixedLenByteArrayNode("fixedLen", parquet.Repetitions.Required, 10, -1),
   297  	}, -1)
   298  	assert.NoError(t, err)
   299  
   300  	sc := schema.NewSchema(root)
   301  
   302  	typ, err := schema.NewStructFromSchema(sc)
   303  	assert.NoError(t, err)
   304  
   305  	assert.Equal(t, reflect.Struct, typ.Kind())
   306  	assert.Equal(t, "struct { bool bool; int32 *int32; int64 []int64; int96 parquet.Int96; float float32; bytearray parquet.ByteArray; fixedLen parquet.FixedLenByteArray }",
   307  		typ.String())
   308  }
   309  
   310  func TestStructFromSchemaWithNesting(t *testing.T) {
   311  	type Other struct {
   312  		List     *[]*float32
   313  		Excluded int32 `parquet:"-"`
   314  	}
   315  
   316  	type Nested struct {
   317  		Nest         []int32
   318  		OptionalNest []*int64
   319  		Mapped       map[string]float32
   320  		Other        []Other
   321  		Other2       Other
   322  	}
   323  
   324  	sc, err := schema.NewSchemaFromStruct(Nested{})
   325  	assert.NoError(t, err)
   326  
   327  	typ, err := schema.NewStructFromSchema(sc)
   328  	assert.NoError(t, err)
   329  	assert.Equal(t, "struct { Nest []int32; OptionalNest []*int64; Mapped map[string]float32; Other []struct { List *[]*float32 }; Other2 struct { List *[]*float32 } }",
   330  		typ.String())
   331  }
   332  
   333  func TestStructFromSchemaBackwardsCompatList(t *testing.T) {
   334  	tests := []struct {
   335  		name     string
   336  		n        schema.Node
   337  		expected string
   338  	}{
   339  		{"proper list", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required,
   340  			schema.FieldList{
   341  				schema.MustGroup(schema.NewGroupNode("list", parquet.Repetitions.Repeated, schema.FieldList{schema.NewBooleanNode("element", parquet.Repetitions.Optional, -1)}, -1)),
   342  			}, schema.NewListLogicalType(), -1)), "struct { my_list []*bool }"},
   343  		{"backward nullable list nonnull ints", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
   344  			schema.NewInt32Node("element", parquet.Repetitions.Repeated, -1),
   345  		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]int32 }"},
   346  		{"backward nullable list tuple string int", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
   347  			schema.MustGroup(schema.NewGroupNode("element", parquet.Repetitions.Repeated, schema.FieldList{
   348  				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
   349  				schema.NewInt32Node("num", parquet.Repetitions.Required, -1),
   350  			}, -1)),
   351  		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string; num int32 } }"},
   352  		{"list tuple string", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, schema.FieldList{
   353  			schema.MustGroup(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{
   354  				schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1),
   355  			}, -1)),
   356  		}, schema.NewListLogicalType(), -1)), "struct { my_list []struct { str parquet.ByteArray } }"},
   357  		{"list tuple string my_list_tuple", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
   358  			schema.MustGroup(schema.NewGroupNode("my_list_tuple", parquet.Repetitions.Repeated, schema.FieldList{
   359  				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
   360  			}, -1)),
   361  		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string } }"},
   362  	}
   363  
   364  	for _, tt := range tests {
   365  		t.Run(tt.name, func(t *testing.T) {
   366  			typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1))))
   367  			assert.NoError(t, err)
   368  			assert.Equal(t, tt.expected, typ.String())
   369  		})
   370  	}
   371  }
   372  
   373  func TestStructFromSchemaMaps(t *testing.T) {
   374  	tests := []struct {
   375  		name     string
   376  		n        schema.Node
   377  		expected string
   378  	}{
   379  		{"map string int", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Required, schema.FieldList{
   380  			schema.MustGroup(schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, schema.FieldList{
   381  				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("key", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
   382  				schema.NewInt32Node("value", parquet.Repetitions.Optional, -1),
   383  			}, -1)),
   384  		}, schema.MapLogicalType{}, -1)), "struct { my_map map[string]*int32 }"},
   385  		{"nullable map string, int, required values", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Optional, schema.FieldList{
   386  			schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{
   387  				schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1),
   388  				schema.NewInt32Node("num", parquet.Repetitions.Required, -1),
   389  			}, -1)),
   390  		}, schema.MapLogicalType{}, -1)), "struct { my_map *map[string]int32 }"},
   391  		{"map_key_value with missing value", schema.MustGroup(schema.NewGroupNodeConverted("my_map", parquet.Repetitions.Optional, schema.FieldList{
   392  			schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{
   393  				schema.NewByteArrayNode("key", parquet.Repetitions.Required, -1),
   394  			}, -1)),
   395  		}, schema.ConvertedTypes.MapKeyValue, -1)), "struct { my_map *map[string]bool }"},
   396  	}
   397  	for _, tt := range tests {
   398  		t.Run(tt.name, func(t *testing.T) {
   399  			typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1))))
   400  			assert.NoError(t, err)
   401  			assert.Equal(t, tt.expected, typ.String())
   402  		})
   403  	}
   404  }