github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/schema_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow_test
    18  
    19  import (
    20  	"encoding/base64"
    21  	"testing"
    22  
    23  	"github.com/apache/arrow/go/v10/arrow"
    24  	"github.com/apache/arrow/go/v10/arrow/flight"
    25  	"github.com/apache/arrow/go/v10/arrow/memory"
    26  	"github.com/apache/arrow/go/v10/parquet"
    27  	"github.com/apache/arrow/go/v10/parquet/metadata"
    28  	"github.com/apache/arrow/go/v10/parquet/pqarrow"
    29  	"github.com/apache/arrow/go/v10/parquet/schema"
    30  	"github.com/stretchr/testify/assert"
    31  	"github.com/stretchr/testify/require"
    32  )
    33  
    34  func TestGetOriginSchemaBase64(t *testing.T) {
    35  	md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})
    36  	origArrSc := arrow.NewSchema([]arrow.Field{
    37  		{Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md},
    38  		{Name: "f2", Type: arrow.PrimitiveTypes.Int64, Metadata: md},
    39  	}, nil)
    40  
    41  	arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator)
    42  	pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps())
    43  	require.NoError(t, err)
    44  
    45  	tests := []struct {
    46  		name string
    47  		enc  *base64.Encoding
    48  	}{
    49  		{"raw", base64.RawStdEncoding},
    50  		{"std", base64.StdEncoding},
    51  	}
    52  
    53  	for _, tt := range tests {
    54  		t.Run(tt.name, func(t *testing.T) {
    55  			kv := metadata.NewKeyValueMetadata()
    56  			kv.Append("ARROW:schema", tt.enc.EncodeToString(arrSerializedSc))
    57  			arrsc, err := pqarrow.FromParquet(pqschema, nil, kv)
    58  			assert.NoError(t, err)
    59  			assert.True(t, origArrSc.Equal(arrsc))
    60  		})
    61  	}
    62  }
    63  
    64  func TestToParquetWriterConfig(t *testing.T) {
    65  	origSc := arrow.NewSchema([]arrow.Field{
    66  		{Name: "f1", Type: arrow.BinaryTypes.String},
    67  		{Name: "f2", Type: arrow.PrimitiveTypes.Int64},
    68  	}, nil)
    69  
    70  	tests := []struct {
    71  		name           string
    72  		rootRepetition parquet.Repetition
    73  	}{
    74  		{"test1", parquet.Repetitions.Required},
    75  		{"test2", parquet.Repetitions.Repeated},
    76  	}
    77  
    78  	for _, tt := range tests {
    79  		t.Run(tt.name, func(t *testing.T) {
    80  
    81  			pqschema, err := pqarrow.ToParquet(origSc,
    82  				parquet.NewWriterProperties(
    83  					parquet.WithRootName(tt.name),
    84  					parquet.WithRootRepetition(tt.rootRepetition),
    85  				),
    86  				pqarrow.DefaultWriterProps())
    87  			require.NoError(t, err)
    88  
    89  			assert.Equal(t, tt.name, pqschema.Root().Name())
    90  			assert.Equal(t, tt.rootRepetition, pqschema.Root().RepetitionType())
    91  		})
    92  	}
    93  }
    94  
    95  func TestConvertArrowFlatPrimitives(t *testing.T) {
    96  	parquetFields := make(schema.FieldList, 0)
    97  	arrowFields := make([]arrow.Field, 0)
    98  
    99  	parquetFields = append(parquetFields, schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1))
   100  	arrowFields = append(arrowFields, arrow.Field{Name: "boolean", Type: arrow.FixedWidthTypes.Boolean, Nullable: false})
   101  
   102  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int8", parquet.Repetitions.Required,
   103  		schema.NewIntLogicalType(8, true), parquet.Types.Int32, 0, -1)))
   104  	arrowFields = append(arrowFields, arrow.Field{Name: "int8", Type: arrow.PrimitiveTypes.Int8, Nullable: false})
   105  
   106  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint8", parquet.Repetitions.Required,
   107  		schema.NewIntLogicalType(8, false), parquet.Types.Int32, 0, -1)))
   108  	arrowFields = append(arrowFields, arrow.Field{Name: "uint8", Type: arrow.PrimitiveTypes.Uint8, Nullable: false})
   109  
   110  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int16", parquet.Repetitions.Required,
   111  		schema.NewIntLogicalType(16, true), parquet.Types.Int32, 0, -1)))
   112  	arrowFields = append(arrowFields, arrow.Field{Name: "int16", Type: arrow.PrimitiveTypes.Int16, Nullable: false})
   113  
   114  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint16", parquet.Repetitions.Required,
   115  		schema.NewIntLogicalType(16, false), parquet.Types.Int32, 0, -1)))
   116  	arrowFields = append(arrowFields, arrow.Field{Name: "uint16", Type: arrow.PrimitiveTypes.Uint16, Nullable: false})
   117  
   118  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int32", parquet.Repetitions.Required,
   119  		schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)))
   120  	arrowFields = append(arrowFields, arrow.Field{Name: "int32", Type: arrow.PrimitiveTypes.Int32, Nullable: false})
   121  
   122  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint32", parquet.Repetitions.Required,
   123  		schema.NewIntLogicalType(32, false), parquet.Types.Int32, 0, -1)))
   124  	arrowFields = append(arrowFields, arrow.Field{Name: "uint32", Type: arrow.PrimitiveTypes.Uint32, Nullable: false})
   125  
   126  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required,
   127  		schema.NewIntLogicalType(64, true), parquet.Types.Int64, 0, -1)))
   128  	arrowFields = append(arrowFields, arrow.Field{Name: "int64", Type: arrow.PrimitiveTypes.Int64, Nullable: false})
   129  
   130  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("uint64", parquet.Repetitions.Required,
   131  		schema.NewIntLogicalType(64, false), parquet.Types.Int64, 0, -1)))
   132  	arrowFields = append(arrowFields, arrow.Field{Name: "uint64", Type: arrow.PrimitiveTypes.Uint64, Nullable: false})
   133  
   134  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp", parquet.Repetitions.Required,
   135  		parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0, 0, 0, -1)))
   136  	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: false})
   137  
   138  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeConverted("timestamp[us]", parquet.Repetitions.Required,
   139  		parquet.Types.Int64, schema.ConvertedTypes.TimestampMicros, 0, 0, 0, -1)))
   140  	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[us]", Type: arrow.FixedWidthTypes.Timestamp_us, Nullable: false})
   141  
   142  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date", parquet.Repetitions.Required,
   143  		schema.DateLogicalType{}, parquet.Types.Int32, 0, -1)))
   144  	arrowFields = append(arrowFields, arrow.Field{Name: "date", Type: arrow.FixedWidthTypes.Date32, Nullable: false})
   145  
   146  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("date64", parquet.Repetitions.Required,
   147  		schema.NewTimestampLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1)))
   148  	arrowFields = append(arrowFields, arrow.Field{Name: "date64", Type: arrow.FixedWidthTypes.Date64, Nullable: false})
   149  
   150  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time32", parquet.Repetitions.Required,
   151  		schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1)))
   152  	arrowFields = append(arrowFields, arrow.Field{Name: "time32", Type: arrow.FixedWidthTypes.Time32ms, Nullable: false})
   153  
   154  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("time64", parquet.Repetitions.Required,
   155  		schema.NewTimeLogicalType(true, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
   156  	arrowFields = append(arrowFields, arrow.Field{Name: "time64", Type: arrow.FixedWidthTypes.Time64us, Nullable: false})
   157  
   158  	parquetFields = append(parquetFields, schema.NewInt96Node("timestamp96", parquet.Repetitions.Required, -1))
   159  	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp96", Type: arrow.FixedWidthTypes.Timestamp_ns, Nullable: false})
   160  
   161  	parquetFields = append(parquetFields, schema.NewFloat32Node("float", parquet.Repetitions.Optional, -1))
   162  	arrowFields = append(arrowFields, arrow.Field{Name: "float", Type: arrow.PrimitiveTypes.Float32, Nullable: true})
   163  
   164  	parquetFields = append(parquetFields, schema.NewFloat64Node("double", parquet.Repetitions.Optional, -1))
   165  	arrowFields = append(arrowFields, arrow.Field{Name: "double", Type: arrow.PrimitiveTypes.Float64, Nullable: true})
   166  
   167  	parquetFields = append(parquetFields, schema.NewByteArrayNode("binary", parquet.Repetitions.Optional, -1))
   168  	arrowFields = append(arrowFields, arrow.Field{Name: "binary", Type: arrow.BinaryTypes.Binary, Nullable: true})
   169  
   170  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("string", parquet.Repetitions.Optional,
   171  		schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)))
   172  	arrowFields = append(arrowFields, arrow.Field{Name: "string", Type: arrow.BinaryTypes.String, Nullable: true})
   173  
   174  	parquetFields = append(parquetFields, schema.NewFixedLenByteArrayNode("flba-binary", parquet.Repetitions.Optional, 12, -1))
   175  	arrowFields = append(arrowFields, arrow.Field{Name: "flba-binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 12}, Nullable: true})
   176  
   177  	arrowSchema := arrow.NewSchema(arrowFields, nil)
   178  	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
   179  
   180  	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
   181  	assert.NoError(t, err)
   182  	assert.True(t, parquetSchema.Equals(result))
   183  	for i := 0; i < parquetSchema.NumColumns(); i++ {
   184  		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
   185  	}
   186  }
   187  
   188  func TestConvertArrowParquetLists(t *testing.T) {
   189  	parquetFields := make(schema.FieldList, 0)
   190  	arrowFields := make([]arrow.Field, 0)
   191  
   192  	parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list",
   193  		parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Required, -1)))
   194  
   195  	arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String)})
   196  
   197  	parquetFields = append(parquetFields, schema.MustGroup(schema.ListOf(schema.Must(schema.NewPrimitiveNodeLogical("my_list",
   198  		parquet.Repetitions.Optional, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)), parquet.Repetitions.Optional, -1)))
   199  
   200  	arrowFields = append(arrowFields, arrow.Field{Name: "my_list", Type: arrow.ListOf(arrow.BinaryTypes.String), Nullable: true})
   201  
   202  	arrowSchema := arrow.NewSchema(arrowFields, nil)
   203  	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
   204  
   205  	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
   206  	assert.NoError(t, err)
   207  	assert.True(t, parquetSchema.Equals(result), parquetSchema.String(), result.String())
   208  	for i := 0; i < parquetSchema.NumColumns(); i++ {
   209  		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
   210  	}
   211  }
   212  
   213  func TestConvertArrowDecimals(t *testing.T) {
   214  	parquetFields := make(schema.FieldList, 0)
   215  	arrowFields := make([]arrow.Field, 0)
   216  
   217  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_8_4", parquet.Repetitions.Required,
   218  		schema.NewDecimalLogicalType(8, 4), parquet.Types.FixedLenByteArray, 4, -1)))
   219  	arrowFields = append(arrowFields, arrow.Field{Name: "decimal_8_4", Type: &arrow.Decimal128Type{Precision: 8, Scale: 4}})
   220  
   221  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_20_4", parquet.Repetitions.Required,
   222  		schema.NewDecimalLogicalType(20, 4), parquet.Types.FixedLenByteArray, 9, -1)))
   223  	arrowFields = append(arrowFields, arrow.Field{Name: "decimal_20_4", Type: &arrow.Decimal128Type{Precision: 20, Scale: 4}})
   224  
   225  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("decimal_77_4", parquet.Repetitions.Required,
   226  		schema.NewDecimalLogicalType(77, 4), parquet.Types.FixedLenByteArray, 34, -1)))
   227  	arrowFields = append(arrowFields, arrow.Field{Name: "decimal_77_4", Type: &arrow.Decimal128Type{Precision: 77, Scale: 4}})
   228  
   229  	arrowSchema := arrow.NewSchema(arrowFields, nil)
   230  	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
   231  
   232  	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties(pqarrow.WithDeprecatedInt96Timestamps(true)))
   233  	assert.NoError(t, err)
   234  	assert.True(t, parquetSchema.Equals(result))
   235  	for i := 0; i < parquetSchema.NumColumns(); i++ {
   236  		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
   237  	}
   238  }
   239  
   240  func TestCoerceTImestampV1(t *testing.T) {
   241  	parquetFields := make(schema.FieldList, 0)
   242  	arrowFields := make([]arrow.Field, 0)
   243  
   244  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required,
   245  		schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
   246  	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Millisecond, TimeZone: "EST"}})
   247  
   248  	arrowSchema := arrow.NewSchema(arrowFields, nil)
   249  	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
   250  
   251  	result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties(pqarrow.WithCoerceTimestamps(arrow.Microsecond)))
   252  	assert.NoError(t, err)
   253  	assert.True(t, parquetSchema.Equals(result))
   254  	for i := 0; i < parquetSchema.NumColumns(); i++ {
   255  		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
   256  	}
   257  }
   258  
   259  func TestAutoCoerceTImestampV1(t *testing.T) {
   260  	parquetFields := make(schema.FieldList, 0)
   261  	arrowFields := make([]arrow.Field, 0)
   262  
   263  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp", parquet.Repetitions.Required,
   264  		schema.NewTimestampLogicalTypeForce(false, schema.TimeUnitMicros), parquet.Types.Int64, 0, -1)))
   265  	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp", Type: &arrow.TimestampType{Unit: arrow.Nanosecond, TimeZone: "EST"}})
   266  
   267  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("timestamp[ms]", parquet.Repetitions.Required,
   268  		schema.NewTimestampLogicalTypeForce(true, schema.TimeUnitMillis), parquet.Types.Int64, 0, -1)))
   269  	arrowFields = append(arrowFields, arrow.Field{Name: "timestamp[ms]", Type: &arrow.TimestampType{Unit: arrow.Second}})
   270  
   271  	arrowSchema := arrow.NewSchema(arrowFields, nil)
   272  	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
   273  
   274  	result, err := pqarrow.ToParquet(arrowSchema, parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0)), pqarrow.NewArrowWriterProperties())
   275  	assert.NoError(t, err)
   276  	assert.True(t, parquetSchema.Equals(result))
   277  	for i := 0; i < parquetSchema.NumColumns(); i++ {
   278  		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
   279  	}
   280  }
   281  
   282  func TestConvertArrowStruct(t *testing.T) {
   283  	parquetFields := make(schema.FieldList, 0)
   284  	arrowFields := make([]arrow.Field, 0)
   285  
   286  	parquetFields = append(parquetFields, schema.Must(schema.NewPrimitiveNodeLogical("leaf1", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)))
   287  	parquetFields = append(parquetFields, schema.Must(schema.NewGroupNode("outerGroup", parquet.Repetitions.Required, schema.FieldList{
   288  		schema.Must(schema.NewPrimitiveNodeLogical("leaf2", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)),
   289  		schema.Must(schema.NewGroupNode("innerGroup", parquet.Repetitions.Required, schema.FieldList{
   290  			schema.Must(schema.NewPrimitiveNodeLogical("leaf3", parquet.Repetitions.Optional, schema.NewIntLogicalType(32, true), parquet.Types.Int32, 0, -1)),
   291  		}, -1)),
   292  	}, -1)))
   293  
   294  	arrowFields = append(arrowFields, arrow.Field{Name: "leaf1", Type: arrow.PrimitiveTypes.Int32, Nullable: true})
   295  	arrowFields = append(arrowFields, arrow.Field{Name: "outerGroup", Type: arrow.StructOf(
   296  		arrow.Field{Name: "leaf2", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
   297  		arrow.Field{Name: "innerGroup", Type: arrow.StructOf(
   298  			arrow.Field{Name: "leaf3", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
   299  		)},
   300  	)})
   301  
   302  	arrowSchema := arrow.NewSchema(arrowFields, nil)
   303  	parquetSchema := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, parquetFields, -1)))
   304  
   305  	result, err := pqarrow.ToParquet(arrowSchema, nil, pqarrow.NewArrowWriterProperties())
   306  	assert.NoError(t, err)
   307  	assert.True(t, parquetSchema.Equals(result))
   308  	for i := 0; i < parquetSchema.NumColumns(); i++ {
   309  		assert.Truef(t, parquetSchema.Column(i).Equals(result.Column(i)), "Column %d didn't match: %s", i, parquetSchema.Column(i).Name())
   310  	}
   311  }