github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/schema_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow_test 18 19 import ( 20 "encoding/base64" 21 "testing" 22 23 "github.com/apache/arrow/go/v10/arrow" 24 "github.com/apache/arrow/go/v10/arrow/flight" 25 "github.com/apache/arrow/go/v10/arrow/memory" 26 "github.com/apache/arrow/go/v10/parquet" 27 "github.com/apache/arrow/go/v10/parquet/metadata" 28 "github.com/apache/arrow/go/v10/parquet/pqarrow" 29 "github.com/apache/arrow/go/v10/parquet/schema" 30 "github.com/stretchr/testify/assert" 31 "github.com/stretchr/testify/require" 32 ) 33 34 func TestGetOriginSchemaBase64(t *testing.T) { 35 md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}) 36 origArrSc := arrow.NewSchema([]arrow.Field{ 37 {Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md}, 38 {Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md}, 39 }, nil) 40 41 arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator) 42 pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps()) 43 require.NoError(t, err) 44 45 tests := []struct { 46 name string 47 enc *base64.Encoding 48 }{ 49 {"raw", base64.RawStdEncoding}, 50 {"std", base64.StdEncoding}, 51 } 52 53 for _, tt := range tests { 54 t.Run(tt.name, func(t *testing.T) { 55 kv := metadata.NewKeyValueMetadata() 56 kv.Append("ARROW:schema", tt.enc.EncodeToString(arrSerializedSc)) 57 arrsc, err := pqarrow.FromParquet(pqschema, nil, kv) 58 assert.NoError(t, err) 59 assert.True(t, origArrSc.Equal(arrsc)) 60 }) 61 } 62 } 63 64 func TestToParquetWriterConfig(t *testing.T) { 65 origSc := arrow.NewSchema([]arrow.Field{ 66 {Name: "f1", Type: arrow.BinaryTypes.String}, 67 {Name: "f2", Type: arrow.PrimitiveTypes.Int64}, 68 }, nil) 69 70 tests := []struct { 71 name string 72 rootRepetition parquet.Repetition 73 }{ 74 {"test1", parquet.Repetitions.Required}, 75 {"test2", parquet.Repetitions.Repeated}, 76 } 77 78 for _, tt := range tests { 79 t.Run(tt.name, func(t *testing.T) { 80 81 pqschema, err := pqarrow.ToParquet(origSc, 82 parquet.NewWriterProperties( 83 parquet.WithRootName(tt.name), 84 parquet.WithRootRepetition(tt.rootRepetition), 85 ), 86 pqarrow.DefaultWriterProps()) 87 require.NoError(t, err) 88 89 assert.Equal(t, tt.name, pqschema.Root().Name()) 90 assert.Equal(t, tt.rootRepetition, pqschema.Root().RepetitionType()) 91 }) 92 } 93 } 94 95 func TestConvertArrowFlatPrimitives(t *testing.T) { 96 parquetFields := make(schema.FieldList, 0) 97 arrowFields := make([]arrow.Field, 0) 98 99 parquetFields = append(parquetFields, schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1)) 100 arrowFields = append(arrowFields, arrow.Field{Name: "boolean", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}) 101 102 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int8", parquet.Repetitions.Required, 103 schema.NewIntLogicalType(8, true), parquet.Types.Int32, 0, -1))) 104 arrowFields = append(arrowFields, arrow.Field{Name: "int8", Type: arrow.PrimitiveTypes.Int8, Nullable: false}) 105 106 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint8", parquet.Repetitions.Required, 107 schema.NewIntLogicalType(8, false), parquet.Types.Int32, 0, -1))) 108 arrowFields = append(arrowFields, arrow.Field{Name: "uint8", Type: arrow.PrimitiveTypes.Uint8, Nullable: false}) 109 110 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int16", parquet.Repetitions.Required, 111 schema.NewIntLogicalType(16, true), parquet.Types.Int32, 0, -1))) 112 arrowFields = append(arrowFields, arrow.Field{Name: "int16", Type: arrow.PrimitiveTypes.Int16, Nullable: false}) 113 114 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Required, 115 schema.NewIntLogicalType(16, false), parquet.Types.Int32, 0, -1))) 116 arrowFields = append(arrowFields, arrow.Field{Name: "uint16", Type: arrow.PrimitiveTypes.Uint16, Nullable: false}) 117 118 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int32", parquet.Repetitions.Required, 119 schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1))) 120 arrowFields = append(arrowFields, arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: false}) 121 122 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint32", parquet.Repetitions.Required, 123 schema.NewIntLogicalType(32, false), parquet.Types.Int32, 0, -1))) 124 arrowFields = append(arrowFields, arrow.Field{Name: "uint32", Type: arrow.PrimitiveTypes.Uint32, Nullable: false}) 125 126 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required, 127 schema.NewIntLogicalType(64, true), parquet.Types.Int64, 0, -1))) 128 arrowFields = append(arrowFields, arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64, Nullable: false}) 129 130 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Required, 131 schema.NewIntLogicalType(64, false), parquet.Types.Int64, 0, -1))) 132 arrowFields = append(arrowFields, arrow.Field{Name: "uint64", Type: arrow.PrimitiveTypes.Uint64, Nullable: false}) 133 134 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp", parquet.Repetitions.Required, 135 parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0, 0, 0, -1))) 136 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: false}) 137 138 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp[us]", parquet.Repetitions.Required, 139 parquet.Types.Int64, schema.ConvertedTypes.TimestampMicros, 0, 0, 0, -1))) 140 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[us]", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: false}) 141 142 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date", parquet.Repetitions.Required, 143 schema.DateLogicalType{}, parquet.Types.Int32, 0, -1))) 144 arrowFields = append(arrowFields, arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: false}) 145 146 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date64", parquet.Repetitions.Required, 147 schema.NewTimestampLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1))) 148 arrowFields = append(arrowFields, arrow.Field{Name: "date64", Type: arrow.FixedWidthTypes.Date64, Nullable: false}) 149 150 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time32", parquet.Repetitions.Required, 151 schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1))) 152 arrowFields = append(arrowFields, arrow.Field{Name: "time32", Type: arrow.FixedWidthTypes.Time32ms, Nullable: false}) 153 154 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time64", parquet.Repetitions.Required, 155 schema.NewTimeLogicalType(true, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1))) 156 arrowFields = append(arrowFields, arrow.Field{Name: "time64", Type: arrow.FixedWidthTypes.Time64us, Nullable: false}) 157 158 parquetFields = append(parquetFields, schema.NewInt96Node("timestamp96", parquet.Repetitions.Required, -1)) 159 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp96", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: false}) 160 161 parquetFields = append(parquetFields, schema.NewFloat32Node("float", parquet.Repetitions.Optional, -1)) 162 arrowFields = append(arrowFields, arrow.Field{Name: "float", Type: arrow.PrimitiveTypes.Float32, Nullable: true}) 163 164 parquetFields = append(parquetFields, schema.NewFloat64Node("double", parquet.Repetitions.Optional, -1)) 165 arrowFields = append(arrowFields, arrow.Field{Name: "double", Type: arrow.PrimitiveTypes.Float64, Nullable: true}) 166 167 parquetFields = append(parquetFields, schema.NewByteArrayNode("binary", parquet.Repetitions.Optional, -1)) 168 arrowFields = append(arrowFields, arrow.Field{Name: "binary", Type: arrow.BinaryTypes.Binary, Nullable: true}) 169 170 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("string", parquet.Repetitions.Optional, 171 schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1))) 172 arrowFields = append(arrowFields, arrow.Field{Name: "string", Type: arrow.BinaryTypes.String, Nullable: true}) 173 174 parquetFields = append(parquetFields, schema.NewFixedLenByteArrayNode("flba-binary", parquet.Repetitions.Optional, 12, -1)) 175 arrowFields = append(arrowFields, arrow.Field{Name: "flba-binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, Nullable: true}) 176 177 arrowSchema := arrow.NewSchema(arrowFields, nil) 178 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 179 180 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true))) 181 assert.NoError(t, err) 182 assert.True(t, parquetSchema.Equals(result)) 183 for i := 0; i < parquetSchema.NumColumns(); i++ { 184 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 185 } 186 } 187 188 func TestConvertArrowParquetLists(t *testing.T) { 189 parquetFields := make(schema.FieldList, 0) 190 arrowFields := make([]arrow.Field, 0) 191 192 parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list", 193 parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Required, -1))) 194 195 arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String)}) 196 197 parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list", 198 parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Optional, -1))) 199 200 arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String), Nullable: true}) 201 202 arrowSchema := arrow.NewSchema(arrowFields, nil) 203 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 204 205 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true))) 206 assert.NoError(t, err) 207 assert.True(t, parquetSchema.Equals(result), parquetSchema.String(), result.String()) 208 for i := 0; i < parquetSchema.NumColumns(); i++ { 209 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 210 } 211 } 212 213 func TestConvertArrowDecimals(t *testing.T) { 214 parquetFields := make(schema.FieldList, 0) 215 arrowFields := make([]arrow.Field, 0) 216 217 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_8_4", parquet.Repetitions.Required, 218 schema.NewDecimalLogicalType(8, 4), parquet.Types.FixedLenByteArray, 4, -1))) 219 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_8_4", Type: &arrow.Decimal128Type{Precision: 8, Scale: 4}}) 220 221 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_20_4", parquet.Repetitions.Required, 222 schema.NewDecimalLogicalType(20, 4), parquet.Types.FixedLenByteArray, 9, -1))) 223 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_20_4", Type: &arrow.Decimal128Type{Precision: 20, Scale: 4}}) 224 225 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_77_4", parquet.Repetitions.Required, 226 schema.NewDecimalLogicalType(77, 4), parquet.Types.FixedLenByteArray, 34, -1))) 227 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_77_4", Type: &arrow.Decimal128Type{Precision: 77, Scale: 4}}) 228 229 arrowSchema := arrow.NewSchema(arrowFields, nil) 230 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 231 232 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true))) 233 assert.NoError(t, err) 234 assert.True(t, parquetSchema.Equals(result)) 235 for i := 0; i < parquetSchema.NumColumns(); i++ { 236 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 237 } 238 } 239 240 func TestCoerceTImestampV1(t *testing.T) { 241 parquetFields := make(schema.FieldList, 0) 242 arrowFields := make([]arrow.Field, 0) 243 244 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required, 245 schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1))) 246 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}}) 247 248 arrowSchema := arrow.NewSchema(arrowFields, nil) 249 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 250 251 result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties(pqarrow.WithCoerceTimestamps(arrow.Microsecond))) 252 assert.NoError(t, err) 253 assert.True(t, parquetSchema.Equals(result)) 254 for i := 0; i < parquetSchema.NumColumns(); i++ { 255 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 256 } 257 } 258 259 func TestAutoCoerceTImestampV1(t *testing.T) { 260 parquetFields := make(schema.FieldList, 0) 261 arrowFields := make([]arrow.Field, 0) 262 263 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required, 264 schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1))) 265 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "EST"}}) 266 267 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp[ms]", parquet.Repetitions.Required, 268 schema.NewTimestampLogicalTypeForce(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1))) 269 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[ms]", Type: &arrow.TimestampType{Unit: arrow.Second}}) 270 271 arrowSchema := arrow.NewSchema(arrowFields, nil) 272 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 273 274 result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties()) 275 assert.NoError(t, err) 276 assert.True(t, parquetSchema.Equals(result)) 277 for i := 0; i < parquetSchema.NumColumns(); i++ { 278 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 279 } 280 } 281 282 func TestConvertArrowStruct(t *testing.T) { 283 parquetFields := make(schema.FieldList, 0) 284 arrowFields := make([]arrow.Field, 0) 285 286 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("leaf1", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1))) 287 parquetFields = append(parquetFields, schema.Must(schema.NewGroupNode("outerGroup", parquet.Repetitions.Required, schema.FieldList{ 288 schema.Must(schema.NewPrimitiveNodeLogical("leaf2", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)), 289 schema.Must(schema.NewGroupNode("innerGroup", parquet.Repetitions.Required, schema.FieldList{ 290 schema.Must(schema.NewPrimitiveNodeLogical("leaf3", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)), 291 }, -1)), 292 }, -1))) 293 294 arrowFields = append(arrowFields, arrow.Field{Name: "leaf1", Type: arrow.PrimitiveTypes.Int32, Nullable: true}) 295 arrowFields = append(arrowFields, arrow.Field{Name: "outerGroup", Type: arrow.StructOf( 296 arrow.Field{Name: "leaf2", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 297 arrow.Field{Name: "innerGroup", Type: arrow.StructOf( 298 arrow.Field{Name: "leaf3", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 299 )}, 300 )}) 301 302 arrowSchema := arrow.NewSchema(arrowFields, nil) 303 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 304 305 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties()) 306 assert.NoError(t, err) 307 assert.True(t, parquetSchema.Equals(result)) 308 for i := 0; i < parquetSchema.NumColumns(); i++ { 309 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 310 } 311 }