github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/metadata_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata_test
    18  
    19  import (
    20  	"context"
    21  	"testing"
    22  	"unsafe"
    23  
    24  	"github.com/apache/arrow/go/v14/parquet"
    25  	"github.com/apache/arrow/go/v14/parquet/metadata"
    26  	"github.com/apache/arrow/go/v14/parquet/schema"
    27  	"github.com/stretchr/testify/assert"
    28  	"github.com/stretchr/testify/require"
    29  )
    30  
    31  func generateTableMetaData(schema *schema.Schema, props *parquet.WriterProperties, nrows int, statsInt, statsFloat metadata.EncodedStatistics) (*metadata.FileMetaData, error) {
    32  	fbuilder := metadata.NewFileMetadataBuilder(schema, props, nil)
    33  	rg1Builder := fbuilder.AppendRowGroup()
    34  	// metadata
    35  	// row group 1
    36  	col1Builder := rg1Builder.NextColumnChunk()
    37  	col2Builder := rg1Builder.NextColumnChunk()
    38  	// column metadata
    39  	dictEncodingStats := map[parquet.Encoding]int32{parquet.Encodings.RLEDict: 1}
    40  	dataEncodingStats := map[parquet.Encoding]int32{parquet.Encodings.Plain: 1, parquet.Encodings.RLE: 1}
    41  	statsInt.Signed = true
    42  	col1Builder.SetStats(statsInt)
    43  	statsFloat.Signed = true
    44  	col2Builder.SetStats(statsFloat)
    45  
    46  	col1Builder.Finish(metadata.ChunkMetaInfo{int64(nrows) / 2, 4, 0, 10, 512, 600}, true, false, metadata.EncodingStats{dictEncodingStats, dataEncodingStats}, nil)
    47  	col2Builder.Finish(metadata.ChunkMetaInfo{int64(nrows) / 2, 24, 0, 30, 512, 600}, true, false, metadata.EncodingStats{dictEncodingStats, dataEncodingStats}, nil)
    48  
    49  	rg1Builder.SetNumRows(nrows / 2)
    50  	rg1Builder.Finish(1024, -1)
    51  
    52  	// rowgroup2 metadata
    53  	rg2Builder := fbuilder.AppendRowGroup()
    54  	col1Builder = rg2Builder.NextColumnChunk()
    55  	col2Builder = rg2Builder.NextColumnChunk()
    56  	// column metadata
    57  	col1Builder.SetStats(statsInt)
    58  	col2Builder.SetStats(statsFloat)
    59  	dictEncodingStats = make(map[parquet.Encoding]int32)
    60  	col1Builder.Finish(metadata.ChunkMetaInfo{int64(nrows) / 2, 0 /*dictionary page offset*/, 0, 10, 512, 600}, false /* has dictionary */, false, metadata.EncodingStats{dictEncodingStats, dataEncodingStats}, nil)
    61  	col2Builder.Finish(metadata.ChunkMetaInfo{int64(nrows) / 2, 16, 0, 26, 512, 600}, true, false, metadata.EncodingStats{dictEncodingStats, dataEncodingStats}, nil)
    62  
    63  	rg2Builder.SetNumRows(nrows / 2)
    64  	rg2Builder.Finish(1024, -1)
    65  
    66  	return fbuilder.Finish()
    67  }
    68  
    69  func assertStatsSet(t *testing.T, m *metadata.ColumnChunkMetaData) {
    70  	ok, err := m.StatsSet()
    71  	assert.NoError(t, err)
    72  	assert.True(t, ok)
    73  }
    74  
    75  func assertStats(t *testing.T, m *metadata.ColumnChunkMetaData) metadata.TypedStatistics {
    76  	s, err := m.Statistics()
    77  	assert.NoError(t, err)
    78  	assert.NotNil(t, s)
    79  	return s
    80  }
    81  
    82  func TestBuildAccess(t *testing.T) {
    83  	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST))
    84  
    85  	fields := schema.FieldList{
    86  		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1),
    87  		schema.NewFloat32Node("float_col", parquet.Repetitions.Required, -1),
    88  	}
    89  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, fields, -1)
    90  	require.NoError(t, err)
    91  	schema := schema.NewSchema(root)
    92  
    93  	var (
    94  		nrows      int64   = 1000
    95  		intMin     int32   = 100
    96  		intMax     int32   = 200
    97  		floatMin   float32 = 100.100
    98  		floatMax   float32 = 200.200
    99  		statsInt   metadata.EncodedStatistics
   100  		statsFloat metadata.EncodedStatistics
   101  	)
   102  
   103  	statsInt.SetNullCount(0).
   104  		SetDistinctCount(nrows).
   105  		SetMin((*(*[4]byte)(unsafe.Pointer(&intMin)))[:]).
   106  		SetMax((*(*[4]byte)(unsafe.Pointer(&intMax)))[:])
   107  
   108  	statsFloat.SetNullCount(0).
   109  		SetDistinctCount(nrows).
   110  		SetMin((*(*[4]byte)(unsafe.Pointer(&floatMin)))[:]).
   111  		SetMax((*(*[4]byte)(unsafe.Pointer(&floatMax)))[:])
   112  
   113  	faccessor, err := generateTableMetaData(schema, props, int(nrows), statsInt, statsFloat)
   114  	require.NoError(t, err)
   115  	serialized, err := faccessor.SerializeString(context.Background())
   116  	assert.NoError(t, err)
   117  	faccessorCopy, err := metadata.NewFileMetaData([]byte(serialized), nil)
   118  	assert.NoError(t, err)
   119  
   120  	for _, accessor := range []*metadata.FileMetaData{faccessor, faccessorCopy} {
   121  		// file metadata
   122  		assert.Equal(t, nrows, accessor.NumRows)
   123  		assert.Len(t, accessor.RowGroups, 2)
   124  		assert.EqualValues(t, parquet.V2_LATEST, accessor.Version())
   125  		assert.Equal(t, parquet.DefaultCreatedBy, accessor.GetCreatedBy())
   126  		assert.Equal(t, 3, accessor.NumSchemaElements())
   127  
   128  		// row group 1 metadata
   129  		rg1Access := accessor.RowGroup(0)
   130  		assert.Equal(t, 2, rg1Access.NumColumns())
   131  		assert.Equal(t, nrows/2, rg1Access.NumRows())
   132  		assert.Equal(t, int64(1024), rg1Access.TotalByteSize())
   133  		assert.Equal(t, int64(1024), rg1Access.TotalCompressedSize())
   134  
   135  		rg1Col1, err := rg1Access.ColumnChunk(0)
   136  		assert.NoError(t, err)
   137  		assert.Equal(t, rg1Access.FileOffset(), rg1Col1.DictionaryPageOffset())
   138  
   139  		rg1Col2, err := rg1Access.ColumnChunk(1)
   140  		assert.NoError(t, err)
   141  		assertStatsSet(t, rg1Col1)
   142  		assertStatsSet(t, rg1Col2)
   143  		assert.Equal(t, statsInt.Min, assertStats(t, rg1Col1).EncodeMin())
   144  		assert.Equal(t, statsInt.Max, assertStats(t, rg1Col1).EncodeMax())
   145  		assert.Equal(t, statsFloat.Min, assertStats(t, rg1Col2).EncodeMin())
   146  		assert.Equal(t, statsFloat.Max, assertStats(t, rg1Col2).EncodeMax())
   147  		assert.Zero(t, assertStats(t, rg1Col1).NullCount())
   148  		assert.Zero(t, assertStats(t, rg1Col2).NullCount())
   149  		assert.Equal(t, nrows, assertStats(t, rg1Col1).DistinctCount())
   150  		assert.Equal(t, nrows, assertStats(t, rg1Col2).DistinctCount())
   151  		assert.Equal(t, metadata.DefaultCompressionType, rg1Col1.Compression())
   152  		assert.Equal(t, metadata.DefaultCompressionType, rg1Col2.Compression())
   153  		assert.Equal(t, nrows/2, rg1Col1.NumValues())
   154  		assert.Equal(t, nrows/2, rg1Col2.NumValues())
   155  		assert.Len(t, rg1Col1.Encodings(), 3)
   156  		assert.Len(t, rg1Col2.Encodings(), 3)
   157  		assert.EqualValues(t, 512, rg1Col1.TotalCompressedSize())
   158  		assert.EqualValues(t, 512, rg1Col2.TotalCompressedSize())
   159  		assert.EqualValues(t, 600, rg1Col1.TotalUncompressedSize())
   160  		assert.EqualValues(t, 600, rg1Col2.TotalUncompressedSize())
   161  		assert.EqualValues(t, 4, rg1Col1.DictionaryPageOffset())
   162  		assert.EqualValues(t, 24, rg1Col2.DictionaryPageOffset())
   163  		assert.EqualValues(t, 10, rg1Col1.DataPageOffset())
   164  		assert.EqualValues(t, 30, rg1Col2.DataPageOffset())
   165  		assert.Len(t, rg1Col1.EncodingStats(), 3)
   166  		assert.Len(t, rg1Col2.EncodingStats(), 3)
   167  
   168  		// row group 2 metadata
   169  		rg2Access := accessor.RowGroup(1)
   170  		assert.Equal(t, 2, rg2Access.NumColumns())
   171  		assert.Equal(t, nrows/2, rg2Access.NumRows())
   172  		assert.EqualValues(t, 1024, rg2Access.TotalByteSize())
   173  		assert.EqualValues(t, 1024, rg2Access.TotalCompressedSize())
   174  
   175  		rg2Col1, err := rg2Access.ColumnChunk(0)
   176  		assert.NoError(t, err)
   177  		assert.Equal(t, rg2Access.FileOffset(), rg2Col1.DataPageOffset())
   178  
   179  		rg2Col2, err := rg2Access.ColumnChunk(1)
   180  		assert.NoError(t, err)
   181  		assertStatsSet(t, rg1Col1)
   182  		assertStatsSet(t, rg1Col2)
   183  		assert.Equal(t, statsInt.Min, assertStats(t, rg1Col1).EncodeMin())
   184  		assert.Equal(t, statsInt.Max, assertStats(t, rg1Col1).EncodeMax())
   185  		assert.Equal(t, statsFloat.Min, assertStats(t, rg1Col2).EncodeMin())
   186  		assert.Equal(t, statsFloat.Max, assertStats(t, rg1Col2).EncodeMax())
   187  		assert.Zero(t, assertStats(t, rg1Col1).NullCount())
   188  		assert.Zero(t, assertStats(t, rg1Col2).NullCount())
   189  		assert.Equal(t, nrows, assertStats(t, rg1Col1).DistinctCount())
   190  		assert.Equal(t, nrows, assertStats(t, rg1Col2).DistinctCount())
   191  		assert.Equal(t, metadata.DefaultCompressionType, rg2Col1.Compression())
   192  		assert.Equal(t, metadata.DefaultCompressionType, rg2Col2.Compression())
   193  		assert.Equal(t, nrows/2, rg2Col1.NumValues())
   194  		assert.Equal(t, nrows/2, rg2Col2.NumValues())
   195  		assert.Len(t, rg2Col1.Encodings(), 2)
   196  		assert.Len(t, rg2Col2.Encodings(), 3)
   197  		assert.EqualValues(t, 512, rg2Col1.TotalCompressedSize())
   198  		assert.EqualValues(t, 512, rg2Col2.TotalCompressedSize())
   199  		assert.EqualValues(t, 600, rg2Col1.TotalUncompressedSize())
   200  		assert.EqualValues(t, 600, rg2Col2.TotalUncompressedSize())
   201  		assert.EqualValues(t, 0, rg2Col1.DictionaryPageOffset())
   202  		assert.EqualValues(t, 16, rg2Col2.DictionaryPageOffset())
   203  		assert.EqualValues(t, 10, rg2Col1.DataPageOffset())
   204  		assert.EqualValues(t, 26, rg2Col2.DataPageOffset())
   205  		assert.Len(t, rg2Col1.EncodingStats(), 2)
   206  		assert.Len(t, rg2Col2.EncodingStats(), 2)
   207  
   208  		assert.Empty(t, rg2Col1.FilePath())
   209  		accessor.SetFilePath("/foo/bar/bar.parquet")
   210  		assert.Equal(t, "/foo/bar/bar.parquet", rg2Col1.FilePath())
   211  	}
   212  
   213  	faccessor2, err := generateTableMetaData(schema, props, int(nrows), statsInt, statsFloat)
   214  	require.NoError(t, err)
   215  	faccessor.AppendRowGroups(faccessor2)
   216  	assert.Len(t, faccessor.RowGroups, 4)
   217  	assert.Equal(t, nrows*2, faccessor.NumRows)
   218  	assert.EqualValues(t, parquet.V2_LATEST, faccessor.Version())
   219  	assert.Equal(t, parquet.DefaultCreatedBy, faccessor.GetCreatedBy())
   220  	assert.Equal(t, 3, faccessor.NumSchemaElements())
   221  
   222  	faccessor1, err := faccessor.Subset([]int{2, 3})
   223  	require.NoError(t, err)
   224  	assert.True(t, faccessor1.Equals(faccessor2))
   225  
   226  	faccessor1, err = faccessor2.Subset([]int{0})
   227  	require.NoError(t, err)
   228  
   229  	next, err := faccessor.Subset([]int{0})
   230  	require.NoError(t, err)
   231  	faccessor1.AppendRowGroups(next)
   232  
   233  	sub, err := faccessor.Subset([]int{2, 0})
   234  	require.NoError(t, err)
   235  	assert.True(t, faccessor1.Equals(sub))
   236  }
   237  
   238  func TestV1VersionMetadata(t *testing.T) {
   239  	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0))
   240  
   241  	fields := schema.FieldList{
   242  		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1),
   243  		schema.NewFloat32Node("float_col", parquet.Repetitions.Required, -1),
   244  	}
   245  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, fields, -1)
   246  	require.NoError(t, err)
   247  	schema := schema.NewSchema(root)
   248  
   249  	fbuilder := metadata.NewFileMetadataBuilder(schema, props, nil)
   250  	faccessor, err := fbuilder.Finish()
   251  	require.NoError(t, err)
   252  	assert.EqualValues(t, parquet.V1_0, faccessor.Version())
   253  }
   254  
   255  func TestKeyValueMetadata(t *testing.T) {
   256  	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0))
   257  
   258  	fields := schema.FieldList{
   259  		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1),
   260  		schema.NewFloat32Node("float_col", parquet.Repetitions.Required, -1),
   261  	}
   262  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, fields, -1)
   263  	require.NoError(t, err)
   264  	schema := schema.NewSchema(root)
   265  	kvmeta := metadata.NewKeyValueMetadata()
   266  	kvmeta.Append("test_key", "test_value")
   267  
   268  	fbuilder := metadata.NewFileMetadataBuilder(schema, props, kvmeta)
   269  	faccessor, err := fbuilder.Finish()
   270  	require.NoError(t, err)
   271  
   272  	assert.True(t, faccessor.KeyValueMetadata().Equals(kvmeta))
   273  }
   274  
   275  func TestKeyValueMetadataAppend(t *testing.T) {
   276  	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V1_0))
   277  
   278  	fields := schema.FieldList{
   279  		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1),
   280  		schema.NewFloat32Node("float_col", parquet.Repetitions.Required, -1),
   281  	}
   282  	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, fields, -1)
   283  	require.NoError(t, err)
   284  	schema := schema.NewSchema(root)
   285  
   286  	kvmeta := metadata.NewKeyValueMetadata()
   287  	key1 := "test_key1"
   288  	value1 := "test_value1"
   289  	require.NoError(t, kvmeta.Append(key1, value1))
   290  
   291  	fbuilder := metadata.NewFileMetadataBuilder(schema, props, kvmeta)
   292  
   293  	key2 := "test_key2"
   294  	value2 := "test_value2"
   295  	require.NoError(t, fbuilder.AppendKeyValueMetadata(key2, value2))
   296  	faccessor, err := fbuilder.Finish()
   297  	require.NoError(t, err)
   298  
   299  	kv := faccessor.KeyValueMetadata()
   300  
   301  	got1 := kv.FindValue(key1)
   302  	require.NotNil(t, got1)
   303  	assert.Equal(t, value1, *got1)
   304  
   305  	got2 := kv.FindValue(key2)
   306  	require.NotNil(t, got2)
   307  	assert.Equal(t, value2, *got2)
   308  }
   309  
   310  func TestApplicationVersion(t *testing.T) {
   311  	version := metadata.NewAppVersion("parquet-mr version 1.7.9")
   312  	version1 := metadata.NewAppVersion("parquet-mr version 1.8.0")
   313  	version2 := metadata.NewAppVersion("parquet-cpp version 1.0.0")
   314  	version3 := metadata.NewAppVersion("")
   315  	version4 := metadata.NewAppVersion("parquet-mr version 1.5.0ab-cdh5.5.0+cd (build abcd)")
   316  	version5 := metadata.NewAppVersion("parquet-mr")
   317  
   318  	assert.Equal(t, "parquet-mr", version.App)
   319  	assert.Equal(t, 1, version.Version.Major)
   320  	assert.Equal(t, 7, version.Version.Minor)
   321  	assert.Equal(t, 9, version.Version.Patch)
   322  
   323  	assert.Equal(t, "parquet-cpp", version2.App)
   324  	assert.Equal(t, 1, version2.Version.Major)
   325  	assert.Equal(t, 0, version2.Version.Minor)
   326  	assert.Equal(t, 0, version2.Version.Patch)
   327  
   328  	assert.Equal(t, "parquet-mr", version4.App)
   329  	assert.Equal(t, "abcd", version4.Build)
   330  	assert.Equal(t, 1, version4.Version.Major)
   331  	assert.Equal(t, 5, version4.Version.Minor)
   332  	assert.Equal(t, 0, version4.Version.Patch)
   333  	assert.Equal(t, "ab", version4.Version.Unknown)
   334  	assert.Equal(t, "cdh5.5.0", version4.Version.PreRelease)
   335  	assert.Equal(t, "cd", version4.Version.BuildInfo)
   336  
   337  	assert.Equal(t, "parquet-mr", version5.App)
   338  	assert.Equal(t, 0, version5.Version.Major)
   339  	assert.Equal(t, 0, version5.Version.Minor)
   340  	assert.Equal(t, 0, version5.Version.Patch)
   341  
   342  	assert.True(t, version.LessThan(version1))
   343  
   344  	var stats metadata.EncodedStatistics
   345  	assert.False(t, version1.HasCorrectStatistics(parquet.Types.Int96, schema.NoLogicalType{}, stats, schema.SortUNKNOWN))
   346  	assert.True(t, version.HasCorrectStatistics(parquet.Types.Int32, schema.NoLogicalType{}, stats, schema.SortSIGNED))
   347  	assert.False(t, version.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, stats, schema.SortSIGNED))
   348  	assert.True(t, version1.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, stats, schema.SortSIGNED))
   349  	assert.False(t, version1.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, stats, schema.SortUNSIGNED))
   350  	assert.True(t, version3.HasCorrectStatistics(parquet.Types.FixedLenByteArray, schema.NoLogicalType{}, stats, schema.SortSIGNED))
   351  
   352  	// check that the old stats are correct if min and max are the same regardless of sort order
   353  	var statsStr metadata.EncodedStatistics
   354  	statsStr.SetMin([]byte("a")).SetMax([]byte("b"))
   355  	assert.False(t, version1.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, statsStr, schema.SortUNSIGNED))
   356  	statsStr.SetMax([]byte("a"))
   357  	assert.True(t, version1.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, statsStr, schema.SortUNSIGNED))
   358  
   359  	// check that the same holds true for ints
   360  	var (
   361  		intMin int32 = 100
   362  		intMax int32 = 200
   363  	)
   364  	var statsInt metadata.EncodedStatistics
   365  	statsInt.SetMin((*(*[4]byte)(unsafe.Pointer(&intMin)))[:])
   366  	statsInt.SetMax((*(*[4]byte)(unsafe.Pointer(&intMax)))[:])
   367  	assert.False(t, version1.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, statsInt, schema.SortUNSIGNED))
   368  	statsInt.SetMax((*(*[4]byte)(unsafe.Pointer(&intMin)))[:])
   369  	assert.True(t, version1.HasCorrectStatistics(parquet.Types.ByteArray, schema.NoLogicalType{}, statsInt, schema.SortUNSIGNED))
   370  }
   371  
   372  func TestCheckBadDecimalStats(t *testing.T) {
   373  	version1 := metadata.NewAppVersion("parquet-cpp version 3.0.0")
   374  	version2 := metadata.NewAppVersion("parquet-cpp-arrow version 3.0.0")
   375  	version3 := metadata.NewAppVersion("parquet-cpp-arrow version 4.0.0")
   376  
   377  	var stats metadata.EncodedStatistics
   378  	assert.False(t, version1.HasCorrectStatistics(parquet.Types.FixedLenByteArray, schema.NewDecimalLogicalType(5, 0), stats, schema.SortSIGNED))
   379  	assert.False(t, version2.HasCorrectStatistics(parquet.Types.FixedLenByteArray, schema.NewDecimalLogicalType(5, 0), stats, schema.SortSIGNED))
   380  	assert.True(t, version3.HasCorrectStatistics(parquet.Types.FixedLenByteArray, schema.NewDecimalLogicalType(5, 0), stats, schema.SortSIGNED))
   381  }