github.com/apache/arrow/go/v14@v14.0.2/parquet/schema/reflection_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package schema_test 18 19 import ( 20 "log" 21 "os" 22 "reflect" 23 "testing" 24 25 "github.com/apache/arrow/go/v14/parquet" 26 "github.com/apache/arrow/go/v14/parquet/schema" 27 "github.com/stretchr/testify/assert" 28 ) 29 30 func ExampleNewSchemaFromStruct_primitives() { 31 type Schema struct { 32 Bool bool 33 Int8 int8 34 Uint16 uint16 35 Int32 int32 36 Int64 int64 37 Int96 parquet.Int96 38 Float float32 39 Double float64 40 ByteArray string 41 FixedLenByteArray [10]byte 42 } 43 44 sc, err := schema.NewSchemaFromStruct(Schema{}) 45 if err != nil { 46 log.Fatal(err) 47 } 48 49 schema.PrintSchema(sc.Root(), os.Stdout, 2) 50 51 // Output: 52 // repeated group field_id=-1 Schema { 53 // required boolean field_id=-1 Bool; 54 // required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true)); 55 // required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false)); 56 // required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true)); 57 // required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true)); 58 // required int96 field_id=-1 Int96; 59 // required float field_id=-1 Float; 60 // required double field_id=-1 Double; 61 // required byte_array field_id=-1 ByteArray; 62 // required fixed_len_byte_array field_id=-1 FixedLenByteArray; 63 // } 64 } 65 66 func ExampleNewSchemaFromStruct_convertedtypes() { 67 type ConvertedSchema struct { 68 Utf8 string `parquet:"name=utf8, converted=UTF8"` 69 Uint32 uint32 `parquet:"converted=INT_32"` 70 Date int32 `parquet:"name=date, converted=date"` 71 TimeMilli int32 `parquet:"name=timemilli, converted=TIME_MILLIS"` 72 TimeMicro int64 `parquet:"name=timemicro, converted=time_micros"` 73 TimeStampMilli int64 `parquet:"converted=timestamp_millis"` 74 TimeStampMicro int64 `parquet:"converted=timestamp_micros"` 75 Interval parquet.Int96 `parquet:"converted=INTERVAL"` 76 Decimal1 int32 `parquet:"converted=decimal, scale=2, precision=9"` 77 Decimal2 int64 `parquet:"converted=decimal, scale=2, precision=18"` 78 Decimal3 [12]byte `parquet:"converted=decimal, scale=2, precision=10"` 79 Decimal4 string `parquet:"converted=decimal, scale=2, precision=20"` 80 } 81 82 sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{}) 83 if err != nil { 84 log.Fatal(err) 85 } 86 87 schema.PrintSchema(sc.Root(), os.Stdout, 2) 88 89 // Output: 90 // repeated group field_id=-1 ConvertedSchema { 91 // required byte_array field_id=-1 utf8 (String); 92 // required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true)); 93 // required int32 field_id=-1 date (Date); 94 // required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds)); 95 // required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds)); 96 // required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false)); 97 // required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false)); 98 // required int96 field_id=-1 Interval; 99 // required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2)); 100 // required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2)); 101 // required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2)); 102 // required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2)); 103 // } 104 } 105 106 func ExampleNewSchemaFromStruct_repetition() { 107 type RepetitionSchema struct { 108 List []int64 `parquet:"fieldid=1"` 109 Repeated []int64 `parquet:"repetition=repeated, fieldid=2"` 110 Optional *int64 `parquet:"fieldid=3"` 111 Required *int64 `parquet:"repetition=REQUIRED, fieldid=4"` 112 Opt int64 `parquet:"repetition=OPTIONAL, fieldid=5"` 113 } 114 115 sc, err := schema.NewSchemaFromStruct(RepetitionSchema{}) 116 if err != nil { 117 log.Fatal(err) 118 } 119 120 schema.PrintSchema(sc.Root(), os.Stdout, 2) 121 122 // Output: 123 // repeated group field_id=-1 RepetitionSchema { 124 // required group field_id=1 List (List) { 125 // repeated group field_id=-1 list { 126 // required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true)); 127 // } 128 // } 129 // repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true)); 130 // optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true)); 131 // required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true)); 132 // optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true)); 133 // } 134 } 135 136 func ExampleNewSchemaFromStruct_logicaltypes() { 137 type LogicalTypes struct { 138 String []byte `parquet:"logical=String"` 139 Enum string `parquet:"logical=enum"` 140 Date int32 `parquet:"logical=date"` 141 Decimal1 int32 `parquet:"logical=decimal, precision=9, scale=2"` 142 Decimal2 int32 `parquet:"logical=decimal, logical.precision=9, scale=2"` 143 Decimal3 int32 `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"` 144 TimeMilliUTC int32 `parquet:"logical=TIME, logical.unit=millis"` 145 TimeMilli int32 `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"` 146 TimeMicros int64 `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"` 147 TimeMicrosUTC int64 `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"` 148 TimeNanos int64 `parquet:"logical=time, logical.unit=nanos"` 149 TimestampMilli int64 `parquet:"logical=timestamp, logical.unit=millis"` 150 TimestampMicrosNotUTC int64 `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"` 151 TimestampNanos int64 `parquet:"logical=timestamp, logical.unit=nanos"` 152 JSON string `parquet:"logical=json"` 153 BSON []byte `parquet:"logical=BSON"` 154 UUID [16]byte `parquet:"logical=uuid"` 155 } 156 157 sc, err := schema.NewSchemaFromStruct(LogicalTypes{}) 158 if err != nil { 159 log.Fatal(err) 160 } 161 162 schema.PrintSchema(sc.Root(), os.Stdout, 2) 163 164 // Output: 165 // repeated group field_id=-1 LogicalTypes { 166 // required byte_array field_id=-1 String (String); 167 // required byte_array field_id=-1 Enum (Enum); 168 // required int32 field_id=-1 Date (Date); 169 // required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2)); 170 // required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2)); 171 // required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3)); 172 // required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds)); 173 // required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds)); 174 // required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds)); 175 // required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds)); 176 // required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds)); 177 // required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false)); 178 // required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false)); 179 // required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false)); 180 // required byte_array field_id=-1 JSON (JSON); 181 // required byte_array field_id=-1 BSON (BSON); 182 // required fixed_len_byte_array field_id=-1 UUID (UUID); 183 // } 184 } 185 186 func ExampleNewSchemaFromStruct_physicaltype() { 187 type ChangeTypes struct { 188 Int32 int64 `parquet:"type=int32"` 189 FixedLen string `parquet:"type=fixed_len_byte_array, length=10"` 190 SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"` 191 Int int `parquet:"type=int32"` 192 } 193 194 sc, err := schema.NewSchemaFromStruct(ChangeTypes{}) 195 if err != nil { 196 log.Fatal(err) 197 } 198 199 schema.PrintSchema(sc.Root(), os.Stdout, 2) 200 201 // Output: 202 // repeated group field_id=-1 ChangeTypes { 203 // required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true)); 204 // required fixed_len_byte_array field_id=-1 FixedLen; 205 // required fixed_len_byte_array field_id=-1 SliceAsFixed; 206 // required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true)); 207 // } 208 } 209 210 func ExampleNewSchemaFromStruct_nestedtypes() { 211 type Other struct { 212 OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"` 213 } 214 215 type MyMap map[int32]string 216 217 type Nested struct { 218 SimpleMap map[int32]string 219 FixedLenMap map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"` 220 DecimalMap map[int32]string `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"` 221 OtherList []*Other 222 OtherRepeated []Other `parquet:"repetition=repeated"` 223 DateArray [5]int32 `parquet:"valuelogical=date, logical=list"` 224 DateMap MyMap `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"` 225 } 226 227 sc, err := schema.NewSchemaFromStruct(Nested{}) 228 if err != nil { 229 log.Fatal(err) 230 } 231 232 schema.PrintSchema(sc.Root(), os.Stdout, 2) 233 234 // Output: 235 // repeated group field_id=-1 Nested { 236 // required group field_id=-1 SimpleMap (Map) { 237 // repeated group field_id=-1 key_value { 238 // required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true)); 239 // required byte_array field_id=-1 value; 240 // } 241 // } 242 // required group field_id=-1 FixedLenMap (Map) { 243 // repeated group field_id=-1 key_value { 244 // required fixed_len_byte_array field_id=10 key; 245 // required byte_array field_id=11 value; 246 // } 247 // } 248 // required group field_id=-1 DecimalMap (Map) { 249 // repeated group field_id=-1 key_value { 250 // required int32 field_id=-1 key (Decimal(precision=7, scale=3)); 251 // required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2)); 252 // } 253 // } 254 // required group field_id=-1 OtherList (List) { 255 // repeated group field_id=-1 list { 256 // optional group field_id=-1 element { 257 // optional group field_id=-1 OptionalMap (Map) { 258 // repeated group field_id=-1 key_value { 259 // required byte_array field_id=-1 key (String); 260 // required byte_array field_id=-1 value (BSON); 261 // } 262 // } 263 // } 264 // } 265 // } 266 // repeated group field_id=-1 OtherRepeated { 267 // optional group field_id=-1 OptionalMap (Map) { 268 // repeated group field_id=-1 key_value { 269 // required byte_array field_id=-1 key (String); 270 // required byte_array field_id=-1 value (BSON); 271 // } 272 // } 273 // } 274 // required group field_id=-1 DateArray (List) { 275 // repeated group field_id=-1 list { 276 // required int32 field_id=-1 element (Date); 277 // } 278 // } 279 // required group field_id=-1 DateMap (Map) { 280 // repeated group field_id=-1 key_value { 281 // required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds)); 282 // required byte_array field_id=-1 value (Enum); 283 // } 284 // } 285 // } 286 } 287 288 func TestStructFromSchema(t *testing.T) { 289 root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{ 290 schema.NewBooleanNode("bool", parquet.Repetitions.Required, -1), 291 schema.NewInt32Node("int32", parquet.Repetitions.Optional, -1), 292 schema.NewInt64Node("int64", parquet.Repetitions.Repeated, -1), 293 schema.NewInt96Node("int96", parquet.Repetitions.Required, -1), 294 schema.NewFloat32Node("float", parquet.Repetitions.Required, -1), 295 schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 296 schema.NewFixedLenByteArrayNode("fixedLen", parquet.Repetitions.Required, 10, -1), 297 }, -1) 298 assert.NoError(t, err) 299 300 sc := schema.NewSchema(root) 301 302 typ, err := schema.NewStructFromSchema(sc) 303 assert.NoError(t, err) 304 305 assert.Equal(t, reflect.Struct, typ.Kind()) 306 assert.Equal(t, "struct { bool bool; int32 *int32; int64 []int64; int96 parquet.Int96; float float32; bytearray parquet.ByteArray; fixedLen parquet.FixedLenByteArray }", 307 typ.String()) 308 } 309 310 func TestStructFromSchemaWithNesting(t *testing.T) { 311 type Other struct { 312 List *[]*float32 313 Excluded int32 `parquet:"-"` 314 } 315 316 type Nested struct { 317 Nest []int32 318 OptionalNest []*int64 319 Mapped map[string]float32 320 Other []Other 321 Other2 Other 322 } 323 324 sc, err := schema.NewSchemaFromStruct(Nested{}) 325 assert.NoError(t, err) 326 327 typ, err := schema.NewStructFromSchema(sc) 328 assert.NoError(t, err) 329 assert.Equal(t, "struct { Nest []int32; OptionalNest []*int64; Mapped map[string]float32; Other []struct { List *[]*float32 }; Other2 struct { List *[]*float32 } }", 330 typ.String()) 331 } 332 333 func TestStructFromSchemaBackwardsCompatList(t *testing.T) { 334 tests := []struct { 335 name string 336 n schema.Node 337 expected string 338 }{ 339 {"proper list", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, 340 schema.FieldList{ 341 schema.MustGroup(schema.NewGroupNode("list", parquet.Repetitions.Repeated, schema.FieldList{schema.NewBooleanNode("element", parquet.Repetitions.Optional, -1)}, -1)), 342 }, schema.NewListLogicalType(), -1)), "struct { my_list []*bool }"}, 343 {"backward nullable list nonnull ints", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{ 344 schema.NewInt32Node("element", parquet.Repetitions.Repeated, -1), 345 }, schema.NewListLogicalType(), -1)), "struct { my_list *[]int32 }"}, 346 {"backward nullable list tuple string int", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{ 347 schema.MustGroup(schema.NewGroupNode("element", parquet.Repetitions.Repeated, schema.FieldList{ 348 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), 349 schema.NewInt32Node("num", parquet.Repetitions.Required, -1), 350 }, -1)), 351 }, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string; num int32 } }"}, 352 {"list tuple string", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, schema.FieldList{ 353 schema.MustGroup(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{ 354 schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1), 355 }, -1)), 356 }, schema.NewListLogicalType(), -1)), "struct { my_list []struct { str parquet.ByteArray } }"}, 357 {"list tuple string my_list_tuple", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{ 358 schema.MustGroup(schema.NewGroupNode("my_list_tuple", parquet.Repetitions.Repeated, schema.FieldList{ 359 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), 360 }, -1)), 361 }, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string } }"}, 362 } 363 364 for _, tt := range tests { 365 t.Run(tt.name, func(t *testing.T) { 366 typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1)))) 367 assert.NoError(t, err) 368 assert.Equal(t, tt.expected, typ.String()) 369 }) 370 } 371 } 372 373 func TestStructFromSchemaMaps(t *testing.T) { 374 tests := []struct { 375 name string 376 n schema.Node 377 expected string 378 }{ 379 {"map string int", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Required, schema.FieldList{ 380 schema.MustGroup(schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, schema.FieldList{ 381 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("key", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), 382 schema.NewInt32Node("value", parquet.Repetitions.Optional, -1), 383 }, -1)), 384 }, schema.MapLogicalType{}, -1)), "struct { my_map map[string]*int32 }"}, 385 {"nullable map string, int, required values", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Optional, schema.FieldList{ 386 schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{ 387 schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1), 388 schema.NewInt32Node("num", parquet.Repetitions.Required, -1), 389 }, -1)), 390 }, schema.MapLogicalType{}, -1)), "struct { my_map *map[string]int32 }"}, 391 {"map_key_value with missing value", schema.MustGroup(schema.NewGroupNodeConverted("my_map", parquet.Repetitions.Optional, schema.FieldList{ 392 schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{ 393 schema.NewByteArrayNode("key", parquet.Repetitions.Required, -1), 394 }, -1)), 395 }, schema.ConvertedTypes.MapKeyValue, -1)), "struct { my_map *map[string]bool }"}, 396 } 397 for _, tt := range tests { 398 t.Run(tt.name, func(t *testing.T) { 399 typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1)))) 400 assert.NoError(t, err) 401 assert.Equal(t, tt.expected, typ.String()) 402 }) 403 } 404 }