github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_reader_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file_test
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"io"
    23  	"math/rand"
    24  	"testing"
    25  
    26  	"github.com/apache/arrow/go/v7/arrow/memory"
    27  	"github.com/apache/arrow/go/v7/parquet/compress"
    28  	"github.com/apache/arrow/go/v7/parquet/file"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
    30  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v7/parquet/internal/thrift"
    32  	"github.com/apache/arrow/go/v7/parquet/metadata"
    33  	libthrift "github.com/apache/thrift/lib/go/thrift"
    34  	"github.com/stretchr/testify/assert"
    35  	"github.com/stretchr/testify/suite"
    36  )
    37  
    38  func getDummyStats(statSize int, fillAll bool) *format.Statistics {
    39  	statBytes := make([]byte, statSize)
    40  	memory.Set(statBytes, 1)
    41  
    42  	ret := format.NewStatistics()
    43  	ret.Max = statBytes
    44  	if fillAll {
    45  		ret.Min = statBytes
    46  		ret.NullCount = libthrift.Int64Ptr(42)
    47  		ret.DistinctCount = libthrift.Int64Ptr(1)
    48  	}
    49  	return ret
    50  }
    51  
    52  func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
    53  	if stats.IsSetMax() {
    54  		assert.Equal(t, stats.Max, actual.Max)
    55  	}
    56  	if stats.IsSetMin() {
    57  		assert.Equal(t, stats.Min, actual.Min)
    58  	}
    59  	if stats.IsSetNullCount() {
    60  		assert.Equal(t, stats.GetNullCount(), actual.NullCount)
    61  	}
    62  	if stats.IsSetDistinctCount() {
    63  		assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
    64  	}
    65  }
    66  
// PageSerdeSuite exercises serializing page headers into a buffer and
// reading them back through a file.PageReader.
type PageSerdeSuite struct {
	suite.Suite

	// sink receives serialized page headers; buffer holds the finished bytes
	// once EndStream is called.
	sink   *encoding.BufferWriter
	buffer *memory.Buffer

	// prototype headers that individual tests mutate before serializing
	pageHdr       format.PageHeader
	dataPageHdr   format.DataPageHeader
	dataPageHdrV2 format.DataPageHeaderV2

	// pageReader is rebuilt by InitSerializedPageReader for each scenario
	pageReader file.PageReader
}
    79  
    80  func TestFileDeserializing(t *testing.T) {
    81  	t.Parallel()
    82  	suite.Run(t, new(PageSerdeSuite))
    83  }
    84  
// ResetStream replaces the sink with a fresh, empty buffer writer so a new
// set of page headers can be serialized from scratch.
func (p *PageSerdeSuite) ResetStream() {
	p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}
    88  
// EndStream finalizes the sink, capturing everything written so far into
// p.buffer for consumption by a page reader.
func (p *PageSerdeSuite) EndStream() {
	p.buffer = p.sink.Finish()
}
    92  
    93  func (p *PageSerdeSuite) SetupTest() {
    94  	p.dataPageHdr.Encoding = format.Encoding_PLAIN
    95  	p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
    96  	p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE
    97  
    98  	p.ResetStream()
    99  }
   100  
   101  func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
   102  	p.EndStream()
   103  
   104  	p.pageReader, _ = file.NewPageReader(bytes.NewReader(p.buffer.Bytes()), nrows, codec, memory.DefaultAllocator, nil)
   105  }
   106  
   107  func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
   108  	// simplifying writing serialized data page headers which may or may
   109  	// not have meaningful data associated with them
   110  
   111  	p.pageHdr.DataPageHeader = &p.dataPageHdr
   112  	p.pageHdr.UncompressedPageSize = uncompressed
   113  	p.pageHdr.CompressedPageSize = compressed
   114  	p.pageHdr.Type = format.PageType_DATA_PAGE
   115  
   116  	serializer := thrift.NewThriftSerializer()
   117  	p.NotPanics(func() {
   118  		serializer.Serialize(&p.pageHdr, p.sink, nil)
   119  	})
   120  }
   121  
   122  func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
   123  	p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
   124  	p.pageHdr.UncompressedPageSize = uncompressed
   125  	p.pageHdr.CompressedPageSize = compressed
   126  	p.pageHdr.Type = format.PageType_DATA_PAGE_V2
   127  
   128  	serializer := thrift.NewThriftSerializer()
   129  	p.NotPanics(func() {
   130  		serializer.Serialize(&p.pageHdr, p.sink, nil)
   131  	})
   132  }
   133  
   134  func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
   135  	p.Equal(format.PageType_DATA_PAGE, page.Type())
   136  
   137  	p.IsType(&file.DataPageV1{}, page)
   138  	p.Equal(expected.NumValues, page.NumValues())
   139  	p.Equal(expected.Encoding, page.Encoding())
   140  	p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
   141  	p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
   142  	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
   143  }
   144  
   145  func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
   146  	p.Equal(format.PageType_DATA_PAGE_V2, page.Type())
   147  
   148  	p.IsType(&file.DataPageV2{}, page)
   149  	p.Equal(expected.NumValues, page.NumValues())
   150  	p.Equal(expected.Encoding, page.Encoding())
   151  	p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
   152  	p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
   153  	p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
   154  	p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
   155  	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
   156  }
   157  
   158  func (p *PageSerdeSuite) TestDataPageV1() {
   159  	const (
   160  		statsSize = 512
   161  		nrows     = 4444
   162  	)
   163  	p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
   164  	p.dataPageHdr.NumValues = nrows
   165  
   166  	p.WriteDataPageHeader(1024, 0, 0)
   167  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   168  	p.True(p.pageReader.Next())
   169  	currentPage := p.pageReader.Page()
   170  	p.CheckDataPageHeader(p.dataPageHdr, currentPage)
   171  }
   172  
   173  func (p *PageSerdeSuite) TestDataPageV2() {
   174  	const (
   175  		statsSize = 512
   176  		nrows     = 4444
   177  	)
   178  	p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
   179  	p.dataPageHdrV2.NumValues = nrows
   180  	p.WriteDataPageHeaderV2(1024, 0, 0)
   181  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   182  	p.True(p.pageReader.Next())
   183  	p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
   184  }
   185  
   186  func (p *PageSerdeSuite) TestLargePageHeaders() {
   187  	const (
   188  		statsSize     = 256 * 1024 // 256KB
   189  		nrows         = 4141
   190  		maxHeaderSize = 512 * 1024 // 512KB
   191  	)
   192  
   193  	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
   194  	p.dataPageHdr.NumValues = nrows
   195  	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
   196  	pos, err := p.sink.Seek(0, io.SeekCurrent)
   197  	p.NoError(err)
   198  	p.GreaterOrEqual(maxHeaderSize, int(pos))
   199  	p.LessOrEqual(statsSize, int(pos))
   200  	p.GreaterOrEqual(16*1024*1024, int(pos))
   201  
   202  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   203  	p.True(p.pageReader.Next())
   204  	p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
   205  }
   206  
   207  func (p *PageSerdeSuite) TestFailLargePageHeaders() {
   208  	const (
   209  		statsSize      = 256 * 1024 // 256KB
   210  		nrows          = 1337       // dummy value
   211  		maxHeaderSize  = 512 * 1024 // 512 KB
   212  		smallerMaxSize = 128 * 1024 // 128KB
   213  	)
   214  	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
   215  	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
   216  	pos, err := p.sink.Seek(0, io.SeekCurrent)
   217  	p.NoError(err)
   218  	p.GreaterOrEqual(maxHeaderSize, int(pos))
   219  
   220  	p.LessOrEqual(smallerMaxSize, int(pos))
   221  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   222  	p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
   223  	p.NotPanics(func() { p.False(p.pageReader.Next()) })
   224  	p.Error(p.pageReader.Err())
   225  }
   226  
   227  func (p *PageSerdeSuite) TestCompression() {
   228  	codecs := []compress.Compression{
   229  		compress.Codecs.Snappy,
   230  		compress.Codecs.Brotli,
   231  		compress.Codecs.Gzip,
   232  		// compress.Codecs.Lz4, // not yet implemented
   233  		compress.Codecs.Zstd,
   234  	}
   235  
   236  	const (
   237  		nrows  = 32 // dummy value
   238  		npages = 10
   239  	)
   240  	p.dataPageHdr.NumValues = nrows
   241  
   242  	fauxData := make([][]byte, npages)
   243  	for idx := range fauxData {
   244  		// each page is larger
   245  		fauxData[idx] = make([]byte, (idx+1)*64)
   246  		rand.Read(fauxData[idx])
   247  	}
   248  	for _, c := range codecs {
   249  		p.Run(c.String(), func() {
   250  			codec, _ := compress.GetCodec(c)
   251  			for _, data := range fauxData {
   252  				maxCompressed := codec.CompressBound(int64(len(data)))
   253  				buffer := make([]byte, maxCompressed)
   254  				buffer = codec.Encode(buffer, data)
   255  				p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
   256  				_, err := p.sink.Write(buffer)
   257  				p.NoError(err)
   258  			}
   259  
   260  			p.InitSerializedPageReader(nrows*npages, c)
   261  
   262  			for _, data := range fauxData {
   263  				p.True(p.pageReader.Next())
   264  				page := p.pageReader.Page()
   265  				p.IsType(&file.DataPageV1{}, page)
   266  				p.Equal(data, page.Data())
   267  			}
   268  			p.ResetStream()
   269  		})
   270  	}
   271  }
   272  
   273  func TestInvalidHeaders(t *testing.T) {
   274  	badHeader := []byte("PAR2")
   275  	_, err := file.NewParquetReader(bytes.NewReader(badHeader))
   276  	assert.Error(t, err)
   277  }
   278  
   279  func TestInvalidFooter(t *testing.T) {
   280  	// file is smaller than FOOTER_SIZE
   281  	badFile := []byte("PAR1PAR")
   282  	_, err := file.NewParquetReader(bytes.NewReader(badFile))
   283  	assert.Error(t, err)
   284  
   285  	// Magic Number Incorrect
   286  	badFile2 := []byte("PAR1PAR2")
   287  	_, err = file.NewParquetReader(bytes.NewReader(badFile2))
   288  	assert.Error(t, err)
   289  }
   290  
   291  func TestIncompleteMetadata(t *testing.T) {
   292  	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
   293  	magic := []byte("PAR1")
   294  
   295  	sink.Write(magic)
   296  	sink.Write(make([]byte, 10))
   297  	const metadataLen = 24
   298  	binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
   299  	sink.Write(magic)
   300  	buf := sink.Finish()
   301  	defer buf.Release()
   302  	_, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
   303  	assert.Error(t, err)
   304  }