github.com/apache/arrow/go/v10@v10.0.1/parquet/file/file_reader_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file_test
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"io"
    23  	"math/rand"
    24  	"testing"
    25  
    26  	"github.com/apache/arrow/go/v10/arrow/memory"
    27  	"github.com/apache/arrow/go/v10/internal/utils"
    28  	"github.com/apache/arrow/go/v10/parquet/compress"
    29  	"github.com/apache/arrow/go/v10/parquet/file"
    30  	"github.com/apache/arrow/go/v10/parquet/internal/encoding"
    31  	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
    32  	"github.com/apache/arrow/go/v10/parquet/internal/thrift"
    33  	"github.com/apache/arrow/go/v10/parquet/metadata"
    34  	libthrift "github.com/apache/thrift/lib/go/thrift"
    35  	"github.com/stretchr/testify/assert"
    36  	"github.com/stretchr/testify/suite"
    37  )
    38  
    39  func getDummyStats(statSize int, fillAll bool) *format.Statistics {
    40  	statBytes := make([]byte, statSize)
    41  	memory.Set(statBytes, 1)
    42  
    43  	ret := format.NewStatistics()
    44  	ret.Max = statBytes
    45  	if fillAll {
    46  		ret.Min = statBytes
    47  		ret.NullCount = libthrift.Int64Ptr(42)
    48  		ret.DistinctCount = libthrift.Int64Ptr(1)
    49  	}
    50  	return ret
    51  }
    52  
    53  func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
    54  	if stats.IsSetMax() {
    55  		assert.Equal(t, stats.Max, actual.Max)
    56  	}
    57  	if stats.IsSetMin() {
    58  		assert.Equal(t, stats.Min, actual.Min)
    59  	}
    60  	if stats.IsSetNullCount() {
    61  		assert.Equal(t, stats.GetNullCount(), actual.NullCount)
    62  	}
    63  	if stats.IsSetDistinctCount() {
    64  		assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
    65  	}
    66  }
    67  
// PageSerdeSuite round-trips serialized page headers (and compressed page
// payloads) through a file.PageReader to verify deserialization.
type PageSerdeSuite struct {
	suite.Suite

	sink   *encoding.BufferWriter // destination for serialized headers/pages
	buffer *memory.Buffer         // finished contents of sink, set by EndStream

	pageHdr       format.PageHeader       // outer header written for each page
	dataPageHdr   format.DataPageHeader   // v1 payload header, reset per test
	dataPageHdrV2 format.DataPageHeaderV2 // v2 payload header

	pageReader file.PageReader // reader under test, built by InitSerializedPageReader
}
    80  
    81  func TestFileDeserializing(t *testing.T) {
    82  	t.Parallel()
    83  	suite.Run(t, new(PageSerdeSuite))
    84  }
    85  
// ResetStream replaces the sink with a fresh, empty buffer writer.
func (p *PageSerdeSuite) ResetStream() {
	p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}
    89  
// EndStream finalizes the sink and captures its bytes into p.buffer.
func (p *PageSerdeSuite) EndStream() {
	p.buffer = p.sink.Finish()
}
    93  
// SetupTest gives the v1 data page header its default encodings and resets
// the output stream before each test in the suite.
func (p *PageSerdeSuite) SetupTest() {
	p.dataPageHdr.Encoding = format.Encoding_PLAIN
	p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
	p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE

	p.ResetStream()
}
   101  
   102  func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
   103  	p.EndStream()
   104  
   105  	p.pageReader, _ = file.NewPageReader(utils.NewBufferedReader(bytes.NewReader(p.buffer.Bytes()), p.buffer.Len()), nrows, codec, memory.DefaultAllocator, nil)
   106  }
   107  
   108  func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
   109  	// simplifying writing serialized data page headers which may or may
   110  	// not have meaningful data associated with them
   111  
   112  	p.pageHdr.DataPageHeader = &p.dataPageHdr
   113  	p.pageHdr.UncompressedPageSize = uncompressed
   114  	p.pageHdr.CompressedPageSize = compressed
   115  	p.pageHdr.Type = format.PageType_DATA_PAGE
   116  
   117  	serializer := thrift.NewThriftSerializer()
   118  	p.NotPanics(func() {
   119  		serializer.Serialize(&p.pageHdr, p.sink, nil)
   120  	})
   121  }
   122  
   123  func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
   124  	p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
   125  	p.pageHdr.UncompressedPageSize = uncompressed
   126  	p.pageHdr.CompressedPageSize = compressed
   127  	p.pageHdr.Type = format.PageType_DATA_PAGE_V2
   128  
   129  	serializer := thrift.NewThriftSerializer()
   130  	p.NotPanics(func() {
   131  		serializer.Serialize(&p.pageHdr, p.sink, nil)
   132  	})
   133  }
   134  
   135  func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
   136  	p.Equal(format.PageType_DATA_PAGE, page.Type())
   137  
   138  	p.IsType(&file.DataPageV1{}, page)
   139  	p.Equal(expected.NumValues, page.NumValues())
   140  	p.Equal(expected.Encoding, page.Encoding())
   141  	p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
   142  	p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
   143  	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
   144  }
   145  
   146  func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
   147  	p.Equal(format.PageType_DATA_PAGE_V2, page.Type())
   148  
   149  	p.IsType(&file.DataPageV2{}, page)
   150  	p.Equal(expected.NumValues, page.NumValues())
   151  	p.Equal(expected.Encoding, page.Encoding())
   152  	p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
   153  	p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
   154  	p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
   155  	p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
   156  	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
   157  }
   158  
   159  func (p *PageSerdeSuite) TestDataPageV1() {
   160  	const (
   161  		statsSize = 512
   162  		nrows     = 4444
   163  	)
   164  	p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
   165  	p.dataPageHdr.NumValues = nrows
   166  
   167  	p.WriteDataPageHeader(1024, 0, 0)
   168  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   169  	p.True(p.pageReader.Next())
   170  	currentPage := p.pageReader.Page()
   171  	p.CheckDataPageHeader(p.dataPageHdr, currentPage)
   172  }
   173  
   174  func (p *PageSerdeSuite) TestDataPageV2() {
   175  	const (
   176  		statsSize = 512
   177  		nrows     = 4444
   178  	)
   179  	p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
   180  	p.dataPageHdrV2.NumValues = nrows
   181  	p.WriteDataPageHeaderV2(1024, 0, 0)
   182  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   183  	p.True(p.pageReader.Next())
   184  	p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
   185  }
   186  
   187  func (p *PageSerdeSuite) TestLargePageHeaders() {
   188  	const (
   189  		statsSize     = 256 * 1024 // 256KB
   190  		nrows         = 4141
   191  		maxHeaderSize = 512 * 1024 // 512KB
   192  	)
   193  
   194  	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
   195  	p.dataPageHdr.NumValues = nrows
   196  	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
   197  	pos, err := p.sink.Seek(0, io.SeekCurrent)
   198  	p.NoError(err)
   199  	p.GreaterOrEqual(maxHeaderSize, int(pos))
   200  	p.LessOrEqual(statsSize, int(pos))
   201  	p.GreaterOrEqual(16*1024*1024, int(pos))
   202  
   203  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   204  	p.True(p.pageReader.Next())
   205  	p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
   206  }
   207  
   208  func (p *PageSerdeSuite) TestFailLargePageHeaders() {
   209  	const (
   210  		statsSize      = 256 * 1024 // 256KB
   211  		nrows          = 1337       // dummy value
   212  		maxHeaderSize  = 512 * 1024 // 512 KB
   213  		smallerMaxSize = 128 * 1024 // 128KB
   214  	)
   215  	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
   216  	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
   217  	pos, err := p.sink.Seek(0, io.SeekCurrent)
   218  	p.NoError(err)
   219  	p.GreaterOrEqual(maxHeaderSize, int(pos))
   220  
   221  	p.LessOrEqual(smallerMaxSize, int(pos))
   222  	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
   223  	p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
   224  	p.NotPanics(func() { p.False(p.pageReader.Next()) })
   225  	p.Error(p.pageReader.Err())
   226  }
   227  
   228  func (p *PageSerdeSuite) TestCompression() {
   229  	codecs := []compress.Compression{
   230  		compress.Codecs.Snappy,
   231  		compress.Codecs.Brotli,
   232  		compress.Codecs.Gzip,
   233  		// compress.Codecs.Lz4, // not yet implemented
   234  		compress.Codecs.Zstd,
   235  	}
   236  
   237  	const (
   238  		nrows  = 32 // dummy value
   239  		npages = 10
   240  	)
   241  	p.dataPageHdr.NumValues = nrows
   242  
   243  	fauxData := make([][]byte, npages)
   244  	for idx := range fauxData {
   245  		// each page is larger
   246  		fauxData[idx] = make([]byte, (idx+1)*64)
   247  		rand.Read(fauxData[idx])
   248  	}
   249  	for _, c := range codecs {
   250  		p.Run(c.String(), func() {
   251  			codec, _ := compress.GetCodec(c)
   252  			for _, data := range fauxData {
   253  				maxCompressed := codec.CompressBound(int64(len(data)))
   254  				buffer := make([]byte, maxCompressed)
   255  				buffer = codec.Encode(buffer, data)
   256  				p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
   257  				_, err := p.sink.Write(buffer)
   258  				p.NoError(err)
   259  			}
   260  
   261  			p.InitSerializedPageReader(nrows*npages, c)
   262  
   263  			for _, data := range fauxData {
   264  				p.True(p.pageReader.Next())
   265  				page := p.pageReader.Page()
   266  				p.IsType(&file.DataPageV1{}, page)
   267  				p.Equal(data, page.Data())
   268  			}
   269  			p.ResetStream()
   270  		})
   271  	}
   272  }
   273  
   274  func TestInvalidHeaders(t *testing.T) {
   275  	badHeader := []byte("PAR2")
   276  	_, err := file.NewParquetReader(bytes.NewReader(badHeader))
   277  	assert.Error(t, err)
   278  }
   279  
   280  func TestInvalidFooter(t *testing.T) {
   281  	// file is smaller than FOOTER_SIZE
   282  	badFile := []byte("PAR1PAR")
   283  	_, err := file.NewParquetReader(bytes.NewReader(badFile))
   284  	assert.Error(t, err)
   285  
   286  	// Magic Number Incorrect
   287  	badFile2 := []byte("PAR1PAR2")
   288  	_, err = file.NewParquetReader(bytes.NewReader(badFile2))
   289  	assert.Error(t, err)
   290  }
   291  
   292  func TestIncompleteMetadata(t *testing.T) {
   293  	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
   294  	magic := []byte("PAR1")
   295  
   296  	sink.Write(magic)
   297  	sink.Write(make([]byte, 10))
   298  	const metadataLen = 24
   299  	binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
   300  	sink.Write(magic)
   301  	buf := sink.Finish()
   302  	defer buf.Release()
   303  	_, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
   304  	assert.Error(t, err)
   305  }