github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/schema_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow_test 18 19 import ( 20 "testing" 21 22 "github.com/apache/arrow/go/v7/arrow" 23 "github.com/apache/arrow/go/v7/parquet" 24 "github.com/apache/arrow/go/v7/parquet/pqarrow" 25 "github.com/apache/arrow/go/v7/parquet/schema" 26 "github.com/stretchr/testify/assert" 27 ) 28 29 func TestConvertArrowFlatPrimitives(t *testing.T) { 30 parquetFields := make(schema.FieldList, 0) 31 arrowFields := make([]arrow.Field, 0) 32 33 parquetFields = append(parquetFields, schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1)) 34 arrowFields = append(arrowFields, arrow.Field{Name: "boolean", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}) 35 36 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int8", parquet.Repetitions.Required, 37 schema.NewIntLogicalType(8, true), parquet.Types.Int32, 0, -1))) 38 arrowFields = append(arrowFields, arrow.Field{Name: "int8", Type: arrow.PrimitiveTypes.Int8, Nullable: false}) 39 40 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint8", parquet.Repetitions.Required, 41 schema.NewIntLogicalType(8, false), parquet.Types.Int32, 0, -1))) 42 arrowFields = append(arrowFields, arrow.Field{Name: "uint8", Type: arrow.PrimitiveTypes.Uint8, Nullable: false}) 43 44 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int16", parquet.Repetitions.Required, 45 schema.NewIntLogicalType(16, true), parquet.Types.Int32, 0, -1))) 46 arrowFields = append(arrowFields, arrow.Field{Name: "int16", Type: arrow.PrimitiveTypes.Int16, Nullable: false}) 47 48 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Required, 49 schema.NewIntLogicalType(16, false), parquet.Types.Int32, 0, -1))) 50 arrowFields = append(arrowFields, arrow.Field{Name: "uint16", Type: arrow.PrimitiveTypes.Uint16, Nullable: false}) 51 52 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int32", parquet.Repetitions.Required, 53 schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1))) 54 arrowFields = append(arrowFields, arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: false}) 55 56 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint32", parquet.Repetitions.Required, 57 schema.NewIntLogicalType(32, false), parquet.Types.Int32, 0, -1))) 58 arrowFields = append(arrowFields, arrow.Field{Name: "uint32", Type: arrow.PrimitiveTypes.Uint32, Nullable: false}) 59 60 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required, 61 schema.NewIntLogicalType(64, true), parquet.Types.Int64, 0, -1))) 62 arrowFields = append(arrowFields, arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64, Nullable: false}) 63 64 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Required, 65 schema.NewIntLogicalType(64, false), parquet.Types.Int64, 0, -1))) 66 arrowFields = append(arrowFields, arrow.Field{Name: "uint64", Type: arrow.PrimitiveTypes.Uint64, Nullable: false}) 67 68 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp", parquet.Repetitions.Required, 69 parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0, 0, 0, -1))) 70 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: false}) 71 72 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp[us]", parquet.Repetitions.Required, 73 parquet.Types.Int64, schema.ConvertedTypes.TimestampMicros, 0, 0, 0, -1))) 74 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[us]", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: false}) 75 76 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date", parquet.Repetitions.Required, 77 schema.DateLogicalType{}, parquet.Types.Int32, 0, -1))) 78 arrowFields = append(arrowFields, arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: false}) 79 80 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date64", parquet.Repetitions.Required, 81 schema.NewTimestampLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1))) 82 arrowFields = append(arrowFields, arrow.Field{Name: "date64", Type: arrow.FixedWidthTypes.Date64, Nullable: false}) 83 84 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time32", parquet.Repetitions.Required, 85 schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1))) 86 arrowFields = append(arrowFields, arrow.Field{Name: "time32", Type: arrow.FixedWidthTypes.Time32ms, Nullable: false}) 87 88 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time64", parquet.Repetitions.Required, 89 schema.NewTimeLogicalType(true, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1))) 90 arrowFields = append(arrowFields, arrow.Field{Name: "time64", Type: arrow.FixedWidthTypes.Time64us, Nullable: false}) 91 92 parquetFields = append(parquetFields, schema.NewInt96Node("timestamp96", parquet.Repetitions.Required, -1)) 93 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp96", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: false}) 94 95 parquetFields = append(parquetFields, schema.NewFloat32Node("float", parquet.Repetitions.Optional, -1)) 96 arrowFields = append(arrowFields, arrow.Field{Name: "float", Type: arrow.PrimitiveTypes.Float32, Nullable: true}) 97 98 parquetFields = append(parquetFields, schema.NewFloat64Node("double", parquet.Repetitions.Optional, -1)) 99 arrowFields = append(arrowFields, arrow.Field{Name: "double", Type: arrow.PrimitiveTypes.Float64, Nullable: true}) 100 101 parquetFields = append(parquetFields, schema.NewByteArrayNode("binary", parquet.Repetitions.Optional, -1)) 102 arrowFields = append(arrowFields, arrow.Field{Name: "binary", Type: arrow.BinaryTypes.Binary, Nullable: true}) 103 104 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("string", parquet.Repetitions.Optional, 105 schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1))) 106 arrowFields = append(arrowFields, arrow.Field{Name: "string", Type: arrow.BinaryTypes.String, Nullable: true}) 107 108 parquetFields = append(parquetFields, schema.NewFixedLenByteArrayNode("flba-binary", parquet.Repetitions.Optional, 12, -1)) 109 arrowFields = append(arrowFields, arrow.Field{Name: "flba-binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, Nullable: true}) 110 111 arrowSchema := arrow.NewSchema(arrowFields, nil) 112 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 113 114 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true))) 115 assert.NoError(t, err) 116 assert.True(t, parquetSchema.Equals(result)) 117 for i := 0; i < parquetSchema.NumColumns(); i++ { 118 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 119 } 120 } 121 122 func TestConvertArrowParquetLists(t *testing.T) { 123 parquetFields := make(schema.FieldList, 0) 124 arrowFields := make([]arrow.Field, 0) 125 126 parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list", 127 parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Required, -1))) 128 129 arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String)}) 130 131 parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list", 132 parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Optional, -1))) 133 134 arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String), Nullable: true}) 135 136 arrowSchema := arrow.NewSchema(arrowFields, nil) 137 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 138 139 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true))) 140 assert.NoError(t, err) 141 assert.True(t, parquetSchema.Equals(result), parquetSchema.String(), result.String()) 142 for i := 0; i < parquetSchema.NumColumns(); i++ { 143 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 144 } 145 } 146 147 func TestConvertArrowDecimals(t *testing.T) { 148 parquetFields := make(schema.FieldList, 0) 149 arrowFields := make([]arrow.Field, 0) 150 151 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_8_4", parquet.Repetitions.Required, 152 schema.NewDecimalLogicalType(8, 4), parquet.Types.FixedLenByteArray, 4, -1))) 153 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_8_4", Type: &arrow.Decimal128Type{Precision: 8, Scale: 4}}) 154 155 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_20_4", parquet.Repetitions.Required, 156 schema.NewDecimalLogicalType(20, 4), parquet.Types.FixedLenByteArray, 9, -1))) 157 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_20_4", Type: &arrow.Decimal128Type{Precision: 20, Scale: 4}}) 158 159 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_77_4", parquet.Repetitions.Required, 160 schema.NewDecimalLogicalType(77, 4), parquet.Types.FixedLenByteArray, 34, -1))) 161 arrowFields = append(arrowFields, arrow.Field{Name: "decimal_77_4", Type: &arrow.Decimal128Type{Precision: 77, Scale: 4}}) 162 163 arrowSchema := arrow.NewSchema(arrowFields, nil) 164 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 165 166 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true))) 167 assert.NoError(t, err) 168 assert.True(t, parquetSchema.Equals(result)) 169 for i := 0; i < parquetSchema.NumColumns(); i++ { 170 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 171 } 172 } 173 174 func TestCoerceTImestampV1(t *testing.T) { 175 parquetFields := make(schema.FieldList, 0) 176 arrowFields := make([]arrow.Field, 0) 177 178 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required, 179 schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1))) 180 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}}) 181 182 arrowSchema := arrow.NewSchema(arrowFields, nil) 183 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 184 185 result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties(pqarrow.WithCoerceTimestamps(arrow.Microsecond))) 186 assert.NoError(t, err) 187 assert.True(t, parquetSchema.Equals(result)) 188 for i := 0; i < parquetSchema.NumColumns(); i++ { 189 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 190 } 191 } 192 193 func TestAutoCoerceTImestampV1(t *testing.T) { 194 parquetFields := make(schema.FieldList, 0) 195 arrowFields := make([]arrow.Field, 0) 196 197 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required, 198 schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1))) 199 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "EST"}}) 200 201 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp[ms]", parquet.Repetitions.Required, 202 schema.NewTimestampLogicalTypeForce(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1))) 203 arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[ms]", Type: &arrow.TimestampType{Unit: arrow.Second}}) 204 205 arrowSchema := arrow.NewSchema(arrowFields, nil) 206 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 207 208 result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties()) 209 assert.NoError(t, err) 210 assert.True(t, parquetSchema.Equals(result)) 211 for i := 0; i < parquetSchema.NumColumns(); i++ { 212 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 213 } 214 } 215 216 func TestConvertArrowStruct(t *testing.T) { 217 parquetFields := make(schema.FieldList, 0) 218 arrowFields := make([]arrow.Field, 0) 219 220 parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("leaf1", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1))) 221 parquetFields = append(parquetFields, schema.Must(schema.NewGroupNode("outerGroup", parquet.Repetitions.Required, schema.FieldList{ 222 schema.Must(schema.NewPrimitiveNodeLogical("leaf2", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)), 223 schema.Must(schema.NewGroupNode("innerGroup", parquet.Repetitions.Required, schema.FieldList{ 224 schema.Must(schema.NewPrimitiveNodeLogical("leaf3", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)), 225 }, -1)), 226 }, -1))) 227 228 arrowFields = append(arrowFields, arrow.Field{Name: "leaf1", Type: arrow.PrimitiveTypes.Int32, Nullable: true}) 229 arrowFields = append(arrowFields, arrow.Field{Name: "outerGroup", Type: arrow.StructOf( 230 arrow.Field{Name: "leaf2", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 231 arrow.Field{Name: "innerGroup", Type: arrow.StructOf( 232 arrow.Field{Name: "leaf3", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, 233 )}, 234 )}) 235 236 arrowSchema := arrow.NewSchema(arrowFields, nil) 237 parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1))) 238 239 result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties()) 240 assert.NoError(t, err) 241 assert.True(t, parquetSchema.Equals(result)) 242 for i := 0; i < parquetSchema.NumColumns(); i++ { 243 assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name()) 244 } 245 }