github.com/apache/arrow/go/v14@v14.0.2/parquet/pqarrow/schema_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow_test

import (
	"encoding/base64"
	"testing"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/flight"
	"github.com/apache/arrow/go/v14/arrow/ipc"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/internal/types"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/metadata"
	"github.com/apache/arrow/go/v14/parquet/pqarrow"
	"github.com/apache/arrow/go/v14/parquet/schema"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestGetOriginSchemaBase64(t *testing.T) {
	uuidType := types.NewUUIDType()
	md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
	extMd := arrow.NewMetadata([]string{ipc.ExtensionMetadataKeyName, ipc.ExtensionTypeKeyName, "PARQUET:field_id"},
		[]string{uuidType.Serialize(), uuidType.ExtensionName(), "-1"})
	origArrSc := arrow.NewSchema([]arrow.Field{
		{Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
		{Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
		{Name: "uuid", Type: uuidType, Metadata: extMd},
	}, nil)

	arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator)
	if err := arrow.RegisterExtensionType(uuidType); err != nil {
		t.Fatal(err)
	}
	defer arrow.UnregisterExtensionType(uuidType.ExtensionName())
	pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps())
	require.NoError(t, err)

	tests := []struct {
		name string
		enc  *base64.Encoding
	}{
		{"raw", base64.RawStdEncoding},
		{"std", base64.StdEncoding},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			kv := metadata.NewKeyValueMetadata()
			kv.Append("ARROW:schema", tt.enc.EncodeToString(arrSerializedSc))
			arrsc, err := pqarrow.FromParquet(pqschema, nil, kv)
			assert.NoError(t, err)
			assert.True(t, origArrSc.Equal(arrsc))
		})
	}
}

func TestGetOriginSchemaUnregisteredExtension(t *testing.T) {
	uuidType := types.NewUUIDType()
	if err := arrow.RegisterExtensionType(uuidType); err != nil {
		t.Fatal(err)
	}

	md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
	origArrSc := arrow.NewSchema([]arrow.Field{
		{Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
		{Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
		{Name: "uuid", Type: uuidType, Metadata: md},
	}, nil)
	pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps())
	require.NoError(t, err)

	arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator)
	kv := metadata.NewKeyValueMetadata()
	kv.Append("ARROW:schema", base64.StdEncoding.EncodeToString(arrSerializedSc))

	arrow.UnregisterExtensionType(uuidType.ExtensionName())
	arrsc, err := pqarrow.FromParquet(pqschema, nil, kv)
	require.NoError(t, err)

	extMd := arrow.NewMetadata([]string{ipc.ExtensionMetadataKeyName, ipc.ExtensionTypeKeyName, "PARQUET:field_id"},
		[]string{uuidType.Serialize(), uuidType.ExtensionName(), "-1"})
	expArrSc := arrow.NewSchema([]arrow.Field{
		{Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
		{Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
		{Name: "uuid", Type: uuidType.StorageType(), Metadata: extMd},
	}, nil)

	assert.Truef(t, expArrSc.Equal(arrsc), "expected: %s\ngot: %s", expArrSc, arrsc)
}

func TestToParquetWriterConfig(t *testing.T) {
	origSc := arrow.NewSchema([]arrow.Field{
		{Name: "f1", Type: arrow.BinaryTypes.String},
		{Name: "f2", Type: arrow.PrimitiveTypes.Int64},
	}, nil)

	tests := []struct {
		name           string
		rootRepetition parquet.Repetition
	}{
		{"test1", parquet.Repetitions.Required},
		{"test2", parquet.Repetitions.Repeated},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pqschema, err := pqarrow.ToParquet(origSc,
				parquet.NewWriterProperties(
					parquet.WithRootName(tt.name),
					parquet.WithRootRepetition(tt.rootRepetition),
				),
				pqarrow.DefaultWriterProps())
			require.NoError(t, err)

			assert.Equal(t, tt.name, pqschema.Root().Name())
			assert.Equal(t, tt.rootRepetition, pqschema.Root().RepetitionType())
		})
	}
}

func TestConvertArrowFlatPrimitives(t *testing.T) {
	parquetFields := make(schema.FieldList, 0)
	arrowFields := make([]arrow.Field, 0)

	parquetFields = append(parquetFields, schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1))
	arrowFields = append(arrowFields, arrow.Field{Name: "boolean", Type: arrow.FixedWidthTypes.Boolean, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int8", parquet.Repetitions.Required,
		schema.NewIntLogicalType(8, true), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "int8", Type: arrow.PrimitiveTypes.Int8, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint8", parquet.Repetitions.Required,
		schema.NewIntLogicalType(8, false), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "uint8", Type: arrow.PrimitiveTypes.Uint8, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int16", parquet.Repetitions.Required,
		schema.NewIntLogicalType(16, true), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "int16", Type: arrow.PrimitiveTypes.Int16, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Required,
		schema.NewIntLogicalType(16, false), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "uint16", Type: arrow.PrimitiveTypes.Uint16, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int32", parquet.Repetitions.Required,
		schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint32", parquet.Repetitions.Required,
		schema.NewIntLogicalType(32, false), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "uint32", Type: arrow.PrimitiveTypes.Uint32, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required,
		schema.NewIntLogicalType(64, true), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Required,
		schema.NewIntLogicalType(64, false), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "uint64", Type: arrow.PrimitiveTypes.Uint64, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp", parquet.Repetitions.Required,
		parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0, 0, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp[us]", parquet.Repetitions.Required,
		parquet.Types.Int64, schema.ConvertedTypes.TimestampMicros, 0, 0, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[us]", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date", parquet.Repetitions.Required,
		schema.DateLogicalType{}, parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date64", parquet.Repetitions.Required,
		schema.NewTimestampLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "date64", Type: arrow.FixedWidthTypes.Date64, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time32", parquet.Repetitions.Required,
		schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "time32", Type: arrow.FixedWidthTypes.Time32ms, Nullable: false})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time64", parquet.Repetitions.Required,
		schema.NewTimeLogicalType(true, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "time64", Type: arrow.FixedWidthTypes.Time64us, Nullable: false})

	parquetFields = append(parquetFields, schema.NewInt96Node("timestamp96", parquet.Repetitions.Required, -1))
	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp96", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: false})

	parquetFields = append(parquetFields, schema.NewFloat32Node("float", parquet.Repetitions.Optional, -1))
	arrowFields = append(arrowFields, arrow.Field{Name: "float", Type: arrow.PrimitiveTypes.Float32, Nullable: true})

	parquetFields = append(parquetFields, schema.NewFloat64Node("double", parquet.Repetitions.Optional, -1))
	arrowFields = append(arrowFields, arrow.Field{Name: "double", Type: arrow.PrimitiveTypes.Float64, Nullable: true})

	parquetFields = append(parquetFields, schema.NewByteArrayNode("binary", parquet.Repetitions.Optional, -1))
	arrowFields = append(arrowFields, arrow.Field{Name: "binary", Type: arrow.BinaryTypes.Binary, Nullable: true})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("string", parquet.Repetitions.Optional,
		schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "string", Type: arrow.BinaryTypes.String, Nullable: true})

	parquetFields = append(parquetFields, schema.NewFixedLenByteArrayNode("flba-binary", parquet.Repetitions.Optional, 12, -1))
	arrowFields = append(arrowFields, arrow.Field{Name: "flba-binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, Nullable: true})

	arrowSchema := arrow.NewSchema(arrowFields, nil)
	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))

	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
	assert.NoError(t, err)
	assert.True(t, parquetSchema.Equals(result))
	for i := 0; i < parquetSchema.NumColumns(); i++ {
		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
	}
}

func TestConvertArrowParquetLists(t *testing.T) {
	parquetFields := make(schema.FieldList, 0)
	arrowFields := make([]arrow.Field, 0)

	parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list",
		parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Required, -1)))

	arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String)})

	parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list",
		parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Optional, -1)))

	arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String), Nullable: true})

	arrowSchema := arrow.NewSchema(arrowFields, nil)
	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))

	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
	assert.NoError(t, err)
	assert.True(t, parquetSchema.Equals(result), parquetSchema.String(), result.String())
	for i := 0; i < parquetSchema.NumColumns(); i++ {
		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
	}
}

func TestConvertArrowDecimals(t *testing.T) {
	parquetFields := make(schema.FieldList, 0)
	arrowFields := make([]arrow.Field, 0)

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_8_4", parquet.Repetitions.Required,
		schema.NewDecimalLogicalType(8, 4), parquet.Types.FixedLenByteArray, 4, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "decimal_8_4", Type: &arrow.Decimal128Type{Precision: 8, Scale: 4}})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_20_4", parquet.Repetitions.Required,
		schema.NewDecimalLogicalType(20, 4), parquet.Types.FixedLenByteArray, 9, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "decimal_20_4", Type: &arrow.Decimal128Type{Precision: 20, Scale: 4}})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_77_4", parquet.Repetitions.Required,
		schema.NewDecimalLogicalType(77, 4), parquet.Types.FixedLenByteArray, 34, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "decimal_77_4", Type: &arrow.Decimal128Type{Precision: 77, Scale: 4}})

	arrowSchema := arrow.NewSchema(arrowFields, nil)
	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))

	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
	assert.NoError(t, err)
	assert.True(t, parquetSchema.Equals(result))
	for i := 0; i < parquetSchema.NumColumns(); i++ {
		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
	}
}

func TestCoerceTImestampV1(t *testing.T) {
	parquetFields := make(schema.FieldList, 0)
	arrowFields := make([]arrow.Field, 0)

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required,
		schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}})

	arrowSchema := arrow.NewSchema(arrowFields, nil)
	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))

	result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties(pqarrow.WithCoerceTimestamps(arrow.Microsecond)))
	assert.NoError(t, err)
	assert.True(t, parquetSchema.Equals(result))
	for i := 0; i < parquetSchema.NumColumns(); i++ {
		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
	}
}

func TestAutoCoerceTImestampV1(t *testing.T) {
	parquetFields := make(schema.FieldList, 0)
	arrowFields := make([]arrow.Field, 0)

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required,
		schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "EST"}})

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp[ms]", parquet.Repetitions.Required,
		schema.NewTimestampLogicalTypeForce(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1)))
	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[ms]", Type: &arrow.TimestampType{Unit: arrow.Second}})

	arrowSchema := arrow.NewSchema(arrowFields, nil)
	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))

	result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties())
	assert.NoError(t, err)
	assert.True(t, parquetSchema.Equals(result))
	for i := 0; i < parquetSchema.NumColumns(); i++ {
		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
	}
}

func TestConvertArrowStruct(t *testing.T) {
	parquetFields := make(schema.FieldList, 0)
	arrowFields := make([]arrow.Field, 0)

	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("leaf1", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)))
	parquetFields = append(parquetFields, schema.Must(schema.NewGroupNode("outerGroup", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.NewPrimitiveNodeLogical("leaf2", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)),
		schema.Must(schema.NewGroupNode("innerGroup", parquet.Repetitions.Required, schema.FieldList{
			schema.Must(schema.NewPrimitiveNodeLogical("leaf3", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)),
		}, -1)),
	}, -1)))

	arrowFields = append(arrowFields, arrow.Field{Name: "leaf1", Type: arrow.PrimitiveTypes.Int32, Nullable: true})
	arrowFields = append(arrowFields, arrow.Field{Name: "outerGroup", Type: arrow.StructOf(
		arrow.Field{Name: "leaf2", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
		arrow.Field{Name: "innerGroup", Type: arrow.StructOf(
			arrow.Field{Name: "leaf3", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
		)},
	)})

	arrowSchema := arrow.NewSchema(arrowFields, nil)
	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))

	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties())
	assert.NoError(t, err)
	assert.True(t, parquetSchema.Equals(result))
	for i := 0; i < parquetSchema.NumColumns(); i++ {
		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
	}
}

func TestListStructBackwardCompatible(t *testing.T) {
	// Set up old construction for list of struct, not using
	// the 3-level encoding. Schema looks like:
	//
	//   required group field_id=-1 root {
	//     optional group field_id=-1 answers (List) {
	//       repeated group field_id=-1 array {
	//         optional byte_array field_id=-1 type (String);
	//         optional byte_array field_id=-1 rdata (String);
	//         optional byte_array field_id=-1 class (String);
	//       }
	//     }
	//   }
	//
	// Instead of the proper 3-level encoding, which would be:
	//
	//   repeated group field_id=-1 schema {
	//     optional group field_id=-1 answers (List) {
	//       repeated group field_id=-1 list {
	//         optional group field_id=-1 element {
	//           optional byte_array field_id=-1 type (String);
	//           optional byte_array field_id=-1 rdata (String);
	//           optional byte_array field_id=-1 class (String);
	//         }
	//       }
	//     }
	//   }
	//
	pqSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("root", parquet.Repetitions.Required, schema.FieldList{
		schema.Must(schema.NewGroupNodeLogical("answers", parquet.Repetitions.Optional, schema.FieldList{
			schema.Must(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{
				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("type", parquet.Repetitions.Optional,
					schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("rdata", parquet.Repetitions.Optional,
					schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("class", parquet.Repetitions.Optional,
					schema.StringLogicalType{}, parquet.Types.ByteArray, -1, -1)),
			}, -1)),
		}, schema.NewListLogicalType(), -1)),
	}, -1)))

	meta := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
	// desired equivalent arrow schema would be list<item: struct<type: utf8, rdata: utf8, class: utf8>>
	arrowSchema := arrow.NewSchema(
		[]arrow.Field{
			{Name: "answers", Type: arrow.ListOfField(arrow.Field{
				Name: "array", Type: arrow.StructOf(
					arrow.Field{Name: "type", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
					arrow.Field{Name: "rdata", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
					arrow.Field{Name: "class", Type: arrow.BinaryTypes.String, Nullable: true, Metadata: meta},
				), Nullable: true}), Nullable: true, Metadata: meta},
		}, nil)

	arrsc, err := pqarrow.FromParquet(pqSchema, nil, metadata.KeyValueMetadata{})
	assert.NoError(t, err)
	assert.True(t, arrowSchema.Equal(arrsc))
}

// TestUnsupportedTypes tests the error message for unsupported types. This test should be updated
// when support for these types is added.
func TestUnsupportedTypes(t *testing.T) {
	unsupportedTypes := []struct {
		typ arrow.DataType
	}{
		// Non-exhaustive list of unsupported types
		{typ: &arrow.Float16Type{}},
		{typ: &arrow.DurationType{}},
		{typ: &arrow.DayTimeIntervalType{}},
		{typ: &arrow.MonthIntervalType{}},
		{typ: &arrow.MonthDayNanoIntervalType{}},
		{typ: &arrow.DenseUnionType{}},
		{typ: &arrow.SparseUnionType{}},
	}
	for _, tc := range unsupportedTypes {
		t.Run(tc.typ.ID().String(), func(t *testing.T) {
			arrowFields := make([]arrow.Field, 0)
			arrowFields = append(arrowFields, arrow.Field{Name: "unsupported", Type: tc.typ, Nullable: true})
			arrowSchema := arrow.NewSchema(arrowFields, nil)
			_, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties())
			assert.ErrorIs(t, err, arrow.ErrNotImplemented)
			assert.ErrorContains(t, err, "support for "+tc.typ.ID().String())
		})
	}
}
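
// roundTripSchema is a small illustrative sketch, not part of the upstream
// test suite: it shows the conversion round trip that the tests above exercise
// piecemeal, converting an Arrow schema to a Parquet schema with ToParquet and
// recovering an Arrow schema from it with FromParquet. The helper name and its
// use of empty key-value metadata (so no stored ARROW:schema entry is
// consulted) are assumptions made for illustration only.
func roundTripSchema(t *testing.T, sc *arrow.Schema) *arrow.Schema {
	t.Helper()

	// Arrow -> Parquet using the default Arrow writer properties, as the
	// origin-schema tests above do.
	pqSchema, err := pqarrow.ToParquet(sc, nil, pqarrow.DefaultWriterProps())
	require.NoError(t, err)

	// Parquet -> Arrow; without an ARROW:schema entry the recovered fields
	// carry PARQUET:field_id metadata, which is why the expected schemas in
	// the tests above attach that metadata to each field.
	out, err := pqarrow.FromParquet(pqSchema, nil, metadata.KeyValueMetadata{})
	require.NoError(t, err)
	return out
}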