github.com/apache/arrow/go/v14@v14.0.1/parquet/file/file_reader_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"crypto/rand"
	"encoding/binary"
	"io"
	"testing"

	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/internal/utils"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/compress"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v14/parquet/internal/thrift"
	"github.com/apache/arrow/go/v14/parquet/metadata"
	"github.com/apache/arrow/go/v14/parquet/schema"
	libthrift "github.com/apache/thrift/lib/go/thrift"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/stretchr/testify/suite"
)

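// getDummyStats builds a format.Statistics whose Max field holds statSize
// placeholder bytes; with fillAll it also populates Min, NullCount, and
// DistinctCount.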
func getDummyStats(statSize int, fillAll bool) *format.Statistics {
	statBytes := make([]byte, statSize)
	memory.Set(statBytes, 1)

	ret := format.NewStatistics()
	ret.Max = statBytes
	if fillAll {
		ret.Min = statBytes
		ret.NullCount = libthrift.Int64Ptr(42)
		ret.DistinctCount = libthrift.Int64Ptr(1)
	}
	return ret
}

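// checkStatistics asserts that every field set on the expected thrift
// statistics survived the round trip into the decoded EncodedStatistics.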
func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
	if stats.IsSetMax() {
		assert.Equal(t, stats.Max, actual.Max)
	}
	if stats.IsSetMin() {
		assert.Equal(t, stats.Min, actual.Min)
	}
	if stats.IsSetNullCount() {
		assert.Equal(t, stats.GetNullCount(), actual.NullCount)
	}
	if stats.IsSetDistinctCount() {
		assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
	}
}

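// testReader wraps a bytes.Reader so that ReadAt reports io.EOF together
// with the data whenever a read ends exactly at the end of the input, a
// behavior io.ReaderAt implementations are permitted to have.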
type testReader struct {
	*bytes.Reader
}

// ReadAt returns io.EOF alongside the data when the read ends exactly at the
// end of the underlying input source; callers must treat that as a full read.
func (tr testReader) ReadAt(b []byte, off int64) (int, error) {
	n, err := tr.Reader.ReadAt(b, off)
	if err == nil && (int64(n)+off == tr.Size()) {
		return n, io.EOF
	}
	return n, err
}

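// PageSerdeSuite round-trips data page headers: each test serializes thrift
// page headers into an in-memory sink, then reads them back through a
// file.PageReader and checks the decoded fields.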
type PageSerdeSuite struct {
	suite.Suite

	sink   *encoding.BufferWriter
	buffer *memory.Buffer

	pageHdr       format.PageHeader
	dataPageHdr   format.DataPageHeader
	dataPageHdrV2 format.DataPageHeaderV2

	pageReader file.PageReader
}

func TestFileDeserializing(t *testing.T) {
	t.Parallel()
	suite.Run(t, new(PageSerdeSuite))
}

func (p *PageSerdeSuite) ResetStream() {
	p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}

func (p *PageSerdeSuite) EndStream() {
	p.buffer = p.sink.Finish()
}

func (p *PageSerdeSuite) SetupTest() {
	p.dataPageHdr.Encoding = format.Encoding_PLAIN
	p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
	p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE

	p.ResetStream()
}

func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
	p.EndStream()

	var err error
	p.pageReader, err = file.NewPageReader(utils.NewBufferedReader(bytes.NewReader(p.buffer.Bytes()), p.buffer.Len()), nrows, codec, memory.DefaultAllocator, nil)
	p.Require().NoError(err)
}

// WriteDataPageHeader simplifies writing serialized data page headers that
// may or may not have meaningful data associated with them. maxSerialized is
// currently unused; callers assert against their size limits separately.
func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
	p.pageHdr.DataPageHeader = &p.dataPageHdr
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

// WriteDataPageHeaderV2 does the same for a V2 data page header.
func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
	p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE_V2

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE, page.Type())

	p.IsType(&file.DataPageV1{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
	p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE_V2, page.Type())

	p.IsType(&file.DataPageV2{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
	p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
	p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
	p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

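// TestDataPageV1 round-trips a V1 data page header carrying full statistics.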
func (p *PageSerdeSuite) TestDataPageV1() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdr.NumValues = nrows

	p.WriteDataPageHeader(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	currentPage := p.pageReader.Page()
	p.CheckDataPageHeader(p.dataPageHdr, currentPage)
}

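// TestDataPageV2 performs the same round trip for a V2 data page header.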
func (p *PageSerdeSuite) TestDataPageV2() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdrV2.NumValues = nrows
	p.WriteDataPageHeaderV2(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
}

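// TestLargePageHeaders checks that a header inflated by 256KB of statistics
// still serializes within the limits and can be read back.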
func (p *PageSerdeSuite) TestLargePageHeaders() {
	const (
		statsSize     = 256 * 1024 // 256KB
		nrows         = 4141
		maxHeaderSize = 512 * 1024 // 512KB
	)

	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.dataPageHdr.NumValues = nrows
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))
	p.LessOrEqual(statsSize, int(pos))
	// sanity check: still well below the reader's 16MB default header limit
	p.GreaterOrEqual(16*1024*1024, int(pos))

	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
}

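// TestFailLargePageHeaders verifies that a header exceeding the reader's
// configured maximum size causes Next to fail with an error instead of
// panicking.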
func (p *PageSerdeSuite) TestFailLargePageHeaders() {
	const (
		statsSize      = 256 * 1024 // 256KB
		nrows          = 1337       // dummy value
		maxHeaderSize  = 512 * 1024 // 512KB
		smallerMaxSize = 128 * 1024 // 128KB
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))

	p.LessOrEqual(smallerMaxSize, int(pos))
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
	p.NotPanics(func() { p.False(p.pageReader.Next()) })
	p.Error(p.pageReader.Err())
}

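// TestCompression round-trips pages of random data through each supported
// codec, confirming the page reader decompresses back to the original bytes.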
func (p *PageSerdeSuite) TestCompression() {
	codecs := []compress.Compression{
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		// compress.Codecs.Lz4, // not yet implemented
		compress.Codecs.Zstd,
	}

	const (
		nrows  = 32 // dummy value
		npages = 10
	)
	p.dataPageHdr.NumValues = nrows

	fauxData := make([][]byte, npages)
	for idx := range fauxData {
		// each page is larger than the last
		fauxData[idx] = make([]byte, (idx+1)*64)
		rand.Read(fauxData[idx])
	}
	for _, c := range codecs {
		p.Run(c.String(), func() {
			codec, err := compress.GetCodec(c)
			p.Require().NoError(err)
			for _, data := range fauxData {
				maxCompressed := codec.CompressBound(int64(len(data)))
				buffer := make([]byte, maxCompressed)
				buffer = codec.Encode(buffer, data)
				p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
				_, err := p.sink.Write(buffer)
				p.NoError(err)
			}

			p.InitSerializedPageReader(nrows*npages, c)

			for _, data := range fauxData {
				p.True(p.pageReader.Next())
				page := p.pageReader.Page()
				p.IsType(&file.DataPageV1{}, page)
				p.Equal(data, page.Data())
			}
			p.ResetStream()
		})
	}
}

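// TestWithEOFReader ensures NewParquetReader tolerates a ReaderAt that
// returns io.EOF alongside the final bytes of the file.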
func TestWithEOFReader(t *testing.T) {
	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1)}, -1)
	require.NoError(t, err)
	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST))

	var buf bytes.Buffer
	wr := file.NewParquetWriter(&buf, root, file.WithWriterProps(props))
	require.NoError(t, wr.Close())

	r := bytes.NewReader(buf.Bytes())
	_, err = file.NewParquetReader(testReader{Reader: r})
	assert.NoError(t, err)
}

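// TestInvalidHeaders confirms that a file without the leading PAR1 magic is
// rejected.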
func TestInvalidHeaders(t *testing.T) {
	badHeader := []byte("PAR2")
	_, err := file.NewParquetReader(bytes.NewReader(badHeader))
	assert.Error(t, err)
}

func TestInvalidFooter(t *testing.T) {
	// file is smaller than the 8-byte footer (length + magic)
	badFile := []byte("PAR1PAR")
	_, err := file.NewParquetReader(bytes.NewReader(badFile))
	assert.Error(t, err)

	// footer magic number is incorrect
	badFile2 := []byte("PAR1PAR2")
	_, err = file.NewParquetReader(bytes.NewReader(badFile2))
	assert.Error(t, err)
}

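// TestIncompleteMetadata confirms that a footer declaring more metadata
// bytes than the file contains is rejected.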
func TestIncompleteMetadata(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	magic := []byte("PAR1")

	sink.Write(magic)
	sink.Write(make([]byte, 10))
	// declare a metadata length larger than the bytes actually in the file
	const metadataLen = 24
	binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
	sink.Write(magic)
	buf := sink.Finish()
	defer buf.Release()
	_, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.Error(t, err)
}

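// TestDeltaLengthByteArrayPackingWithNulls writes a mostly-null
// DELTA_LENGTH_BYTE_ARRAY column and verifies the single non-null value is
// read back intact.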
func TestDeltaLengthByteArrayPackingWithNulls(t *testing.T) {
	// produce a file using DeltaLengthByteArray encoding where the values
	// are mostly null, with a single actual value.
	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewByteArrayNode("byte_array_col", parquet.Repetitions.Optional, -1),
	}, -1)
	require.NoError(t, err)
	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithEncoding(parquet.Encodings.DeltaLengthByteArray), parquet.WithDictionaryDefault(false))
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	writer := file.NewParquetWriter(sink, root, file.WithWriterProps(props))
	rgw := writer.AppendRowGroup()
	ccw, err := rgw.NextColumn()
	assert.NoError(t, err)
	const elements = 500
	data := make([]parquet.ByteArray, elements)
	data[0] = parquet.ByteArray{1, 2, 3, 4, 5, 6, 7, 8}

	defLvls := make([]int16, elements)
	repLvls := make([]int16, elements)
	defLvls[0] = 1

	_, err = ccw.(*file.ByteArrayColumnChunkWriter).WriteBatch(data, defLvls, repLvls)
	assert.NoError(t, err)
	assert.NoError(t, ccw.Close())
	assert.NoError(t, rgw.Close())
	assert.NoError(t, writer.Close())
	buf := sink.Finish()
	defer buf.Release()

	// read the file back in
	reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.NoError(t, err)
	defer reader.Close()
	ccr, err := reader.RowGroup(0).Column(0)
	assert.NoError(t, err)
	const batchSize = 500

	for ccr.HasNext() {
		readData := make([]parquet.ByteArray, batchSize)
		readDefLvls := make([]int16, batchSize)
		readRepLvls := make([]int16, batchSize)
		cr := ccr.(*file.ByteArrayColumnChunkReader)

		// total counts levels (values plus nulls); read counts only the
		// non-null values actually decoded.
		total, read, err := cr.ReadBatch(batchSize, readData, readDefLvls, readRepLvls)
		assert.NoError(t, err)
		assert.Equal(t, int64(batchSize), total)
		assert.Equal(t, 1, read)
		assert.Equal(t, data[0], readData[0])
		assert.NotNil(t, readData[0])
	}
}