github.com/apache/arrow/go/v16@v16.1.0/parquet/file/file_reader_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"crypto/rand"
	"encoding/binary"
	"io"
	"os"
	"path"
	"testing"

	"github.com/apache/arrow/go/v16/arrow/memory"
	"github.com/apache/arrow/go/v16/internal/utils"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/compress"
	"github.com/apache/arrow/go/v16/parquet/file"
	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v16/parquet/internal/thrift"
	"github.com/apache/arrow/go/v16/parquet/metadata"
	"github.com/apache/arrow/go/v16/parquet/schema"
	libthrift "github.com/apache/thrift/lib/go/thrift"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/stretchr/testify/suite"
)

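// getDummyStats returns a thrift Statistics struct whose Max field (and,
// when fillAll is true, Min, NullCount, and DistinctCount as well) is
// populated with statSize bytes of placeholder data.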
func getDummyStats(statSize int, fillAll bool) *format.Statistics {
	statBytes := make([]byte, statSize)
	memory.Set(statBytes, 1)

	ret := format.NewStatistics()
	ret.Max = statBytes
	if fillAll {
		ret.Min = statBytes
		ret.NullCount = libthrift.Int64Ptr(42)
		ret.DistinctCount = libthrift.Int64Ptr(1)
	}
	return ret
}

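// checkStatistics asserts that every field set on the thrift Statistics
// struct survived the round trip into the decoded EncodedStatistics.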
func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
	if stats.IsSetMax() {
		assert.Equal(t, stats.Max, actual.Max)
	}
	if stats.IsSetMin() {
		assert.Equal(t, stats.Min, actual.Min)
	}
	if stats.IsSetNullCount() {
		assert.Equal(t, stats.GetNullCount(), actual.NullCount)
	}
	if stats.IsSetDistinctCount() {
		assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
	}
}

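// testReader wraps bytes.Reader to simulate a ReaderAt that reports io.EOF
// together with the final successful read, which io.ReaderAt permits.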
type testReader struct {
	*bytes.Reader
}

// ReadAt for testReader returns io.EOF when off + len(b) is exactly the length of the underlying input source.
func (tr testReader) ReadAt(b []byte, off int64) (int, error) {
	n, err := tr.Reader.ReadAt(b, off)
	if err == nil && (int64(n)+off == tr.Size()) {
		return n, io.EOF
	}
	return n, err
}

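// PageSerdeSuite round-trips page headers: they are serialized into an
// in-memory sink and then decoded back through a file.PageReader.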
type PageSerdeSuite struct {
	suite.Suite

	sink   *encoding.BufferWriter
	buffer *memory.Buffer

	pageHdr       format.PageHeader
	dataPageHdr   format.DataPageHeader
	dataPageHdrV2 format.DataPageHeaderV2

	pageReader file.PageReader
}

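// TestFileDeserializing runs the page serialization/deserialization suite.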
func TestFileDeserializing(t *testing.T) {
	t.Parallel()
	suite.Run(t, new(PageSerdeSuite))
}

func (p *PageSerdeSuite) ResetStream() {
	p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}

func (p *PageSerdeSuite) EndStream() {
	p.buffer = p.sink.Finish()
}

func (p *PageSerdeSuite) SetupTest() {
	p.dataPageHdr.Encoding = format.Encoding_PLAIN
	p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
	p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE

	p.ResetStream()
}

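// InitSerializedPageReader finalizes the sink and constructs a PageReader
// over the serialized bytes with the given row count and codec.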
func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
	p.EndStream()

	p.pageReader, _ = file.NewPageReader(utils.NewBufferedReader(bytes.NewReader(p.buffer.Bytes()), p.buffer.Len()), nrows, codec, memory.DefaultAllocator, nil)
}

func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
	// helper for writing serialized data page headers, which may or may not
	// have meaningful page data associated with them; maxSerialized is unused

	p.pageHdr.DataPageHeader = &p.dataPageHdr
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

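// WriteDataPageHeaderV2 serializes a v2 data page header to the sink; as in
// the v1 variant, the maxSerialized argument is unused.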
func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
	p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE_V2

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

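// CheckDataPageHeader verifies a decoded v1 data page against the header it
// was serialized from, including its encoded statistics.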
func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE, page.Type())

	p.IsType(&file.DataPageV1{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
	p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

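// CheckDataPageHeaderV2 performs the same verification for v2 data pages,
// additionally checking null counts, level byte lengths, and compression.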
func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE_V2, page.Type())

	p.IsType(&file.DataPageV2{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
	p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
	p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
	p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

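// TestDataPageV1 round-trips a v1 data page header carrying full statistics.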
func (p *PageSerdeSuite) TestDataPageV1() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdr.NumValues = nrows

	p.WriteDataPageHeader(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	currentPage := p.pageReader.Page()
	p.CheckDataPageHeader(p.dataPageHdr, currentPage)
}

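// TestDataPageV2 round-trips a v2 data page header carrying full statistics.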
func (p *PageSerdeSuite) TestDataPageV2() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdrV2.NumValues = nrows
	p.WriteDataPageHeaderV2(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
}

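// TestLargePageHeaders checks that a header inflated by 256KB of dummy
// statistics still serializes within the limit and reads back intact.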
func (p *PageSerdeSuite) TestLargePageHeaders() {
	const (
		statsSize     = 256 * 1024 // 256KB
		nrows         = 4141
		maxHeaderSize = 512 * 1024 // 512KB
	)

	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.dataPageHdr.NumValues = nrows
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))
	p.LessOrEqual(statsSize, int(pos))
	p.GreaterOrEqual(16*1024*1024, int(pos))

	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
}

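// TestFailLargePageHeaders lowers the reader's maximum page header size below
// the size of the serialized header and expects Next to fail with an error.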
func (p *PageSerdeSuite) TestFailLargePageHeaders() {
	const (
		statsSize      = 256 * 1024 // 256KB
		nrows          = 1337       // dummy value
		maxHeaderSize  = 512 * 1024 // 512KB
		smallerMaxSize = 128 * 1024 // 128KB
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))

	p.LessOrEqual(smallerMaxSize, int(pos))
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
	p.NotPanics(func() { p.False(p.pageReader.Next()) })
	p.Error(p.pageReader.Err())
}

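// TestCompression round-trips pages of random data through each supported
// codec and verifies the decompressed page contents match the originals.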
func (p *PageSerdeSuite) TestCompression() {
	codecs := []compress.Compression{
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		// compress.Codecs.Lz4, // not yet implemented
		compress.Codecs.Zstd,
	}

	const (
		nrows  = 32 // dummy value
		npages = 10
	)
	p.dataPageHdr.NumValues = nrows

	fauxData := make([][]byte, npages)
	for idx := range fauxData {
		// each page is larger than the last
		fauxData[idx] = make([]byte, (idx+1)*64)
		rand.Read(fauxData[idx])
	}
	for _, c := range codecs {
		p.Run(c.String(), func() {
			codec, _ := compress.GetCodec(c)
			for _, data := range fauxData {
				maxCompressed := codec.CompressBound(int64(len(data)))
				buffer := make([]byte, maxCompressed)
				buffer = codec.Encode(buffer, data)
				p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
				_, err := p.sink.Write(buffer)
				p.NoError(err)
			}

			p.InitSerializedPageReader(nrows*npages, c)

			for _, data := range fauxData {
				p.True(p.pageReader.Next())
				page := p.pageReader.Page()
				p.IsType(&file.DataPageV1{}, page)
				p.Equal(data, page.Data())
			}
			p.ResetStream()
		})
	}
}

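// TestWithEOFReader verifies that NewParquetReader tolerates a ReaderAt that
// returns io.EOF alongside a read ending exactly at the end of the file.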
func TestWithEOFReader(t *testing.T) {
	root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1)}, -1)
	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST))

	var buf bytes.Buffer
	wr := file.NewParquetWriter(&buf, root, file.WithWriterProps(props))
	require.NoError(t, wr.Close())

	r := bytes.NewReader(buf.Bytes())
	_, err := file.NewParquetReader(testReader{Reader: r})
	assert.NoError(t, err)
}

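// TestInvalidHeaders expects an error when the leading magic bytes are wrong.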
func TestInvalidHeaders(t *testing.T) {
	badHeader := []byte("PAR2")
	_, err := file.NewParquetReader(bytes.NewReader(badHeader))
	assert.Error(t, err)
}

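// TestInvalidFooter expects an error when the file is shorter than the footer
// or its trailing magic number is incorrect.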
func TestInvalidFooter(t *testing.T) {
	// file is smaller than FOOTER_SIZE
	badFile := []byte("PAR1PAR")
	_, err := file.NewParquetReader(bytes.NewReader(badFile))
	assert.Error(t, err)

	// magic number is incorrect
	badFile2 := []byte("PAR1PAR2")
	_, err = file.NewParquetReader(bytes.NewReader(badFile2))
	assert.Error(t, err)
}

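// TestIncompleteMetadata writes a footer whose recorded metadata length
// exceeds the bytes actually preceding it, which must fail to parse.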
func TestIncompleteMetadata(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	magic := []byte("PAR1")

	sink.Write(magic)
	sink.Write(make([]byte, 10))
	const metadataLen = 24
	binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
	sink.Write(magic)
	buf := sink.Finish()
	defer buf.Release()
	_, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.Error(t, err)
}

func TestDeltaLengthByteArrayPackingWithNulls(t *testing.T) {
	// produce a file using DeltaLengthByteArray encoding with mostly null
	// values but one actual value.
	root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewByteArrayNode("byte_array_col", parquet.Repetitions.Optional, -1),
	}, -1)
	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithEncoding(parquet.Encodings.DeltaLengthByteArray), parquet.WithDictionaryDefault(false))
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	writer := file.NewParquetWriter(sink, root, file.WithWriterProps(props))
	rgw := writer.AppendRowGroup()
	ccw, err := rgw.NextColumn()
	assert.NoError(t, err)
	const elements = 500
	data := make([]parquet.ByteArray, elements)
	data[0] = parquet.ByteArray{1, 2, 3, 4, 5, 6, 7, 8}

	defLvls := make([]int16, elements)
	repLvls := make([]int16, elements)
	defLvls[0] = 1

	_, err = ccw.(*file.ByteArrayColumnChunkWriter).WriteBatch(data, defLvls, repLvls)
	assert.NoError(t, err)
	assert.NoError(t, ccw.Close())
	assert.NoError(t, rgw.Close())
	assert.NoError(t, writer.Close())
	buf := sink.Finish()
	defer buf.Release()

	// read the file back in
	reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.NoError(t, err)
	defer reader.Close()
	ccr, err := reader.RowGroup(0).Column(0)
	assert.NoError(t, err)
	const batchSize = 500

	for ccr.HasNext() {
		readData := make([]parquet.ByteArray, batchSize)
		readDefLvls := make([]int16, batchSize)
		readRepLvls := make([]int16, batchSize)
		cr := ccr.(*file.ByteArrayColumnChunkReader)

		total, read, err := cr.ReadBatch(batchSize, readData, readDefLvls, readRepLvls)
		assert.NoError(t, err)
		assert.Equal(t, int64(batchSize), total)
		assert.Equal(t, 1, read)
		assert.Equal(t, data[0], readData[0])
		assert.NotNil(t, readData[0])
	}
}

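// TestRleBooleanEncodingFileRead decodes the rle_boolean_encoding.parquet
// golden file and checks values and definition levels against expectations.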
func TestRleBooleanEncodingFileRead(t *testing.T) {
	dir := os.Getenv("PARQUET_TEST_DATA")
	if dir == "" {
		t.Skip("no path supplied with PARQUET_TEST_DATA")
	}
	assert.DirExists(t, dir)

	props := parquet.NewReaderProperties(memory.DefaultAllocator)
	fileReader, err := file.OpenParquetFile(path.Join(dir, "rle_boolean_encoding.parquet"),
		false, file.WithReadProps(props))
	require.NoError(t, err)
	defer fileReader.Close()

	assert.Equal(t, 1, fileReader.NumRowGroups())
	rgr := fileReader.RowGroup(0)
	assert.EqualValues(t, 68, rgr.NumRows())

	rdr, err := rgr.Column(0)
	require.NoError(t, err)
	brdr := rdr.(*file.BooleanColumnChunkReader)

	values := make([]bool, 68)
	defLvls, repLvls := make([]int16, 68), make([]int16, 68)
	total, read, err := brdr.ReadBatch(68, values, defLvls, repLvls)
	require.NoError(t, err)

	assert.EqualValues(t, 68, total)
	md, err := rgr.MetaData().ColumnChunk(0)
	require.NoError(t, err)
	stats, err := md.Statistics()
	require.NoError(t, err)
	assert.EqualValues(t, total-stats.NullCount(), read)

	expected := []bool{
		true, false, true, true, false, false,
		true, true, true, false, false, true, true,
		false, true, true, false, false, true, true,
		false, true, true, false, false, true, true,
		true, false, false, false, false, true, true,
		false, true, true, false, false, true, true,
		true, false, false, true, true, false, false,
		true, true, true, false, true, true, false,
		true, true, false, false, true, true, true,
	}
	expectedNulls := []int{2, 15, 23, 38, 48, 60}

	expectedNullIdx := 0
	for i, v := range defLvls {
		if expectedNullIdx < len(expectedNulls) && i == expectedNulls[expectedNullIdx] {
			assert.Zero(t, v)
			expectedNullIdx++
		} else {
			assert.EqualValues(t, 1, v)
		}
	}

	assert.Equal(t, expected, values[:len(expected)])
}