github.com/apache/arrow/go/v15@v15.0.1/parquet/schema/reflection_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package schema_test 18 19 import ( 20 "log" 21 "os" 22 "reflect" 23 "testing" 24 25 "github.com/apache/arrow/go/v15/arrow/float16" 26 "github.com/apache/arrow/go/v15/parquet" 27 "github.com/apache/arrow/go/v15/parquet/schema" 28 "github.com/stretchr/testify/assert" 29 ) 30 31 func ExampleNewSchemaFromStruct_primitives() { 32 type Schema struct { 33 Bool bool 34 Int8 int8 35 Uint16 uint16 36 Int32 int32 37 Int64 int64 38 Int96 parquet.Int96 39 Float float32 40 Double float64 41 ByteArray string 42 FixedLenByteArray [10]byte 43 } 44 45 sc, err := schema.NewSchemaFromStruct(Schema{}) 46 if err != nil { 47 log.Fatal(err) 48 } 49 50 schema.PrintSchema(sc.Root(), os.Stdout, 2) 51 52 // Output: 53 // repeated group field_id=-1 Schema { 54 // required boolean field_id=-1 Bool; 55 // required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true)); 56 // required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false)); 57 // required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true)); 58 // required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true)); 59 // required int96 field_id=-1 Int96; 60 // required float field_id=-1 Float; 61 // required double field_id=-1 Double; 62 // required byte_array field_id=-1 ByteArray; 63 // required fixed_len_byte_array field_id=-1 FixedLenByteArray; 64 // } 65 } 66 67 func ExampleNewSchemaFromStruct_convertedtypes() { 68 type ConvertedSchema struct { 69 Utf8 string `parquet:"name=utf8, converted=UTF8"` 70 Uint32 uint32 `parquet:"converted=INT_32"` 71 Date int32 `parquet:"name=date, converted=date"` 72 TimeMilli int32 `parquet:"name=timemilli, converted=TIME_MILLIS"` 73 TimeMicro int64 `parquet:"name=timemicro, converted=time_micros"` 74 TimeStampMilli int64 `parquet:"converted=timestamp_millis"` 75 TimeStampMicro int64 `parquet:"converted=timestamp_micros"` 76 Interval parquet.Int96 `parquet:"converted=INTERVAL"` 77 Decimal1 int32 `parquet:"converted=decimal, scale=2, precision=9"` 78 Decimal2 int64 `parquet:"converted=decimal, scale=2, precision=18"` 79 Decimal3 [12]byte `parquet:"converted=decimal, scale=2, precision=10"` 80 Decimal4 string `parquet:"converted=decimal, scale=2, precision=20"` 81 } 82 83 sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{}) 84 if err != nil { 85 log.Fatal(err) 86 } 87 88 schema.PrintSchema(sc.Root(), os.Stdout, 2) 89 90 // Output: 91 // repeated group field_id=-1 ConvertedSchema { 92 // required byte_array field_id=-1 utf8 (String); 93 // required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true)); 94 // required int32 field_id=-1 date (Date); 95 // required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds)); 96 // required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds)); 97 // required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false)); 98 // required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false)); 99 // required int96 field_id=-1 Interval; 100 // required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2)); 101 // required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2)); 102 // required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2)); 103 // required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2)); 104 // } 105 } 106 107 func ExampleNewSchemaFromStruct_repetition() { 108 type RepetitionSchema struct { 109 List []int64 `parquet:"fieldid=1"` 110 Repeated []int64 `parquet:"repetition=repeated, fieldid=2"` 111 Optional *int64 `parquet:"fieldid=3"` 112 Required *int64 `parquet:"repetition=REQUIRED, fieldid=4"` 113 Opt int64 `parquet:"repetition=OPTIONAL, fieldid=5"` 114 } 115 116 sc, err := schema.NewSchemaFromStruct(RepetitionSchema{}) 117 if err != nil { 118 log.Fatal(err) 119 } 120 121 schema.PrintSchema(sc.Root(), os.Stdout, 2) 122 123 // Output: 124 // repeated group field_id=-1 RepetitionSchema { 125 // required group field_id=1 List (List) { 126 // repeated group field_id=-1 list { 127 // required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true)); 128 // } 129 // } 130 // repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true)); 131 // optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true)); 132 // required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true)); 133 // optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true)); 134 // } 135 } 136 137 func ExampleNewSchemaFromStruct_logicaltypes() { 138 type LogicalTypes struct { 139 String []byte `parquet:"logical=String"` 140 Enum string `parquet:"logical=enum"` 141 Date int32 `parquet:"logical=date"` 142 Decimal1 int32 `parquet:"logical=decimal, precision=9, scale=2"` 143 Decimal2 int32 `parquet:"logical=decimal, logical.precision=9, scale=2"` 144 Decimal3 int32 `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"` 145 TimeMilliUTC int32 `parquet:"logical=TIME, logical.unit=millis"` 146 TimeMilli int32 `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"` 147 TimeMicros int64 `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"` 148 TimeMicrosUTC int64 `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"` 149 TimeNanos int64 `parquet:"logical=time, logical.unit=nanos"` 150 TimestampMilli int64 `parquet:"logical=timestamp, logical.unit=millis"` 151 TimestampMicrosNotUTC int64 `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"` 152 TimestampNanos int64 `parquet:"logical=timestamp, logical.unit=nanos"` 153 JSON string `parquet:"logical=json"` 154 BSON []byte `parquet:"logical=BSON"` 155 UUID [16]byte `parquet:"logical=uuid"` 156 Float16 [2]byte `parquet:"logical=float16"` 157 Float16Optional *[2]byte `parquet:"logical=float16"` 158 Float16Num float16.Num 159 } 160 161 sc, err := schema.NewSchemaFromStruct(LogicalTypes{}) 162 if err != nil { 163 log.Fatal(err) 164 } 165 166 schema.PrintSchema(sc.Root(), os.Stdout, 2) 167 168 // Output: 169 // repeated group field_id=-1 LogicalTypes { 170 // required byte_array field_id=-1 String (String); 171 // required byte_array field_id=-1 Enum (Enum); 172 // required int32 field_id=-1 Date (Date); 173 // required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2)); 174 // required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2)); 175 // required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3)); 176 // required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds)); 177 // required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds)); 178 // required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds)); 179 // required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds)); 180 // required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds)); 181 // required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false)); 182 // required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false)); 183 // required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false)); 184 // required byte_array field_id=-1 JSON (JSON); 185 // required byte_array field_id=-1 BSON (BSON); 186 // required fixed_len_byte_array field_id=-1 UUID (UUID); 187 // required fixed_len_byte_array field_id=-1 Float16 (Float16); 188 // optional fixed_len_byte_array field_id=-1 Float16Optional (Float16); 189 // required fixed_len_byte_array field_id=-1 Float16Num (Float16); 190 // } 191 } 192 193 func ExampleNewSchemaFromStruct_physicaltype() { 194 type ChangeTypes struct { 195 Int32 int64 `parquet:"type=int32"` 196 FixedLen string `parquet:"type=fixed_len_byte_array, length=10"` 197 SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"` 198 Int int `parquet:"type=int32"` 199 } 200 201 sc, err := schema.NewSchemaFromStruct(ChangeTypes{}) 202 if err != nil { 203 log.Fatal(err) 204 } 205 206 schema.PrintSchema(sc.Root(), os.Stdout, 2) 207 208 // Output: 209 // repeated group field_id=-1 ChangeTypes { 210 // required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true)); 211 // required fixed_len_byte_array field_id=-1 FixedLen; 212 // required fixed_len_byte_array field_id=-1 SliceAsFixed; 213 // required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true)); 214 // } 215 } 216 217 func ExampleNewSchemaFromStruct_nestedtypes() { 218 type Other struct { 219 OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"` 220 } 221 222 type MyMap map[int32]string 223 224 type Nested struct { 225 SimpleMap map[int32]string 226 FixedLenMap map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"` 227 DecimalMap map[int32]string `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"` 228 OtherList []*Other 229 OtherRepeated []Other `parquet:"repetition=repeated"` 230 DateArray [5]int32 `parquet:"valuelogical=date, logical=list"` 231 DateMap MyMap `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"` 232 } 233 234 sc, err := schema.NewSchemaFromStruct(Nested{}) 235 if err != nil { 236 log.Fatal(err) 237 } 238 239 schema.PrintSchema(sc.Root(), os.Stdout, 2) 240 241 // Output: 242 // repeated group field_id=-1 Nested { 243 // required group field_id=-1 SimpleMap (Map) { 244 // repeated group field_id=-1 key_value { 245 // required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true)); 246 // required byte_array field_id=-1 value; 247 // } 248 // } 249 // required group field_id=-1 FixedLenMap (Map) { 250 // repeated group field_id=-1 key_value { 251 // required fixed_len_byte_array field_id=10 key; 252 // required byte_array field_id=11 value; 253 // } 254 // } 255 // required group field_id=-1 DecimalMap (Map) { 256 // repeated group field_id=-1 key_value { 257 // required int32 field_id=-1 key (Decimal(precision=7, scale=3)); 258 // required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2)); 259 // } 260 // } 261 // required group field_id=-1 OtherList (List) { 262 // repeated group field_id=-1 list { 263 // optional group field_id=-1 element { 264 // optional group field_id=-1 OptionalMap (Map) { 265 // repeated group field_id=-1 key_value { 266 // required byte_array field_id=-1 key (String); 267 // required byte_array field_id=-1 value (BSON); 268 // } 269 // } 270 // } 271 // } 272 // } 273 // repeated group field_id=-1 OtherRepeated { 274 // optional group field_id=-1 OptionalMap (Map) { 275 // repeated group field_id=-1 key_value { 276 // required byte_array field_id=-1 key (String); 277 // required byte_array field_id=-1 value (BSON); 278 // } 279 // } 280 // } 281 // required group field_id=-1 DateArray (List) { 282 // repeated group field_id=-1 list { 283 // required int32 field_id=-1 element (Date); 284 // } 285 // } 286 // required group field_id=-1 DateMap (Map) { 287 // repeated group field_id=-1 key_value { 288 // required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds)); 289 // required byte_array field_id=-1 value (Enum); 290 // } 291 // } 292 // } 293 } 294 295 func TestStructFromSchema(t *testing.T) { 296 root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{ 297 schema.NewBooleanNode("bool", parquet.Repetitions.Required, -1), 298 schema.NewInt32Node("int32", parquet.Repetitions.Optional, -1), 299 schema.NewInt64Node("int64", parquet.Repetitions.Repeated, -1), 300 schema.NewInt96Node("int96", parquet.Repetitions.Required, -1), 301 schema.NewFloat32Node("float", parquet.Repetitions.Required, -1), 302 schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 303 schema.NewFixedLenByteArrayNode("fixedLen", parquet.Repetitions.Required, 10, -1), 304 }, -1) 305 assert.NoError(t, err) 306 307 sc := schema.NewSchema(root) 308 309 typ, err := schema.NewStructFromSchema(sc) 310 assert.NoError(t, err) 311 312 assert.Equal(t, reflect.Struct, typ.Kind()) 313 assert.Equal(t, "struct { bool bool; int32 *int32; int64 []int64; int96 parquet.Int96; float float32; bytearray parquet.ByteArray; fixedLen parquet.FixedLenByteArray }", 314 typ.String()) 315 } 316 317 func TestStructFromSchemaWithNesting(t *testing.T) { 318 type Other struct { 319 List *[]*float32 320 Excluded int32 `parquet:"-"` 321 } 322 323 type Nested struct { 324 Nest []int32 325 OptionalNest []*int64 326 Mapped map[string]float32 327 Other []Other 328 Other2 Other 329 } 330 331 sc, err := schema.NewSchemaFromStruct(Nested{}) 332 assert.NoError(t, err) 333 334 typ, err := schema.NewStructFromSchema(sc) 335 assert.NoError(t, err) 336 assert.Equal(t, "struct { Nest []int32; OptionalNest []*int64; Mapped map[string]float32; Other []struct { List *[]*float32 }; Other2 struct { List *[]*float32 } }", 337 typ.String()) 338 } 339 340 func TestStructFromSchemaBackwardsCompatList(t *testing.T) { 341 tests := []struct { 342 name string 343 n schema.Node 344 expected string 345 }{ 346 {"proper list", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, 347 schema.FieldList{ 348 schema.MustGroup(schema.NewGroupNode("list", parquet.Repetitions.Repeated, schema.FieldList{schema.NewBooleanNode("element", parquet.Repetitions.Optional, -1)}, -1)), 349 }, schema.NewListLogicalType(), -1)), "struct { my_list []*bool }"}, 350 {"backward nullable list nonnull ints", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{ 351 schema.NewInt32Node("element", parquet.Repetitions.Repeated, -1), 352 }, schema.NewListLogicalType(), -1)), "struct { my_list *[]int32 }"}, 353 {"backward nullable list tuple string int", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{ 354 schema.MustGroup(schema.NewGroupNode("element", parquet.Repetitions.Repeated, schema.FieldList{ 355 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), 356 schema.NewInt32Node("num", parquet.Repetitions.Required, -1), 357 }, -1)), 358 }, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string; num int32 } }"}, 359 {"list tuple string", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, schema.FieldList{ 360 schema.MustGroup(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{ 361 schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1), 362 }, -1)), 363 }, schema.NewListLogicalType(), -1)), "struct { my_list []struct { str parquet.ByteArray } }"}, 364 {"list tuple string my_list_tuple", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{ 365 schema.MustGroup(schema.NewGroupNode("my_list_tuple", parquet.Repetitions.Repeated, schema.FieldList{ 366 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), 367 }, -1)), 368 }, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string } }"}, 369 } 370 371 for _, tt := range tests { 372 t.Run(tt.name, func(t *testing.T) { 373 typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1)))) 374 assert.NoError(t, err) 375 assert.Equal(t, tt.expected, typ.String()) 376 }) 377 } 378 } 379 380 func TestStructFromSchemaMaps(t *testing.T) { 381 tests := []struct { 382 name string 383 n schema.Node 384 expected string 385 }{ 386 {"map string int", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Required, schema.FieldList{ 387 schema.MustGroup(schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, schema.FieldList{ 388 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("key", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), 389 schema.NewInt32Node("value", parquet.Repetitions.Optional, -1), 390 }, -1)), 391 }, schema.MapLogicalType{}, -1)), "struct { my_map map[string]*int32 }"}, 392 {"nullable map string, int, required values", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Optional, schema.FieldList{ 393 schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{ 394 schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1), 395 schema.NewInt32Node("num", parquet.Repetitions.Required, -1), 396 }, -1)), 397 }, schema.MapLogicalType{}, -1)), "struct { my_map *map[string]int32 }"}, 398 {"map_key_value with missing value", schema.MustGroup(schema.NewGroupNodeConverted("my_map", parquet.Repetitions.Optional, schema.FieldList{ 399 schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{ 400 schema.NewByteArrayNode("key", parquet.Repetitions.Required, -1), 401 }, -1)), 402 }, schema.ConvertedTypes.MapKeyValue, -1)), "struct { my_map *map[string]bool }"}, 403 } 404 for _, tt := range tests { 405 t.Run(tt.name, func(t *testing.T) { 406 typ, err := schema.NewStructFromSchema(schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1)))) 407 assert.NoError(t, err) 408 assert.Equal(t, tt.expected, typ.String()) 409 }) 410 } 411 }