github.com/apache/arrow/go/v10@v10.0.1/parquet/file/file_reader_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file_test 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "io" 23 "math/rand" 24 "testing" 25 26 "github.com/apache/arrow/go/v10/arrow/memory" 27 "github.com/apache/arrow/go/v10/internal/utils" 28 "github.com/apache/arrow/go/v10/parquet/compress" 29 "github.com/apache/arrow/go/v10/parquet/file" 30 "github.com/apache/arrow/go/v10/parquet/internal/encoding" 31 format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet" 32 "github.com/apache/arrow/go/v10/parquet/internal/thrift" 33 "github.com/apache/arrow/go/v10/parquet/metadata" 34 libthrift "github.com/apache/thrift/lib/go/thrift" 35 "github.com/stretchr/testify/assert" 36 "github.com/stretchr/testify/suite" 37 ) 38 39 func getDummyStats(statSize int, fillAll bool) *format.Statistics { 40 statBytes := make([]byte, statSize) 41 memory.Set(statBytes, 1) 42 43 ret := format.NewStatistics() 44 ret.Max = statBytes 45 if fillAll { 46 ret.Min = statBytes 47 ret.NullCount = libthrift.Int64Ptr(42) 48 ret.DistinctCount = libthrift.Int64Ptr(1) 49 } 50 return ret 51 } 52 53 func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) { 54 if stats.IsSetMax() { 55 assert.Equal(t, stats.Max, actual.Max) 56 } 57 if stats.IsSetMin() { 58 assert.Equal(t, stats.Min, actual.Min) 59 } 60 if stats.IsSetNullCount() { 61 assert.Equal(t, stats.GetNullCount(), actual.NullCount) 62 } 63 if stats.IsSetDistinctCount() { 64 assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount) 65 } 66 } 67 68 type PageSerdeSuite struct { 69 suite.Suite 70 71 sink *encoding.BufferWriter 72 buffer *memory.Buffer 73 74 pageHdr format.PageHeader 75 dataPageHdr format.DataPageHeader 76 dataPageHdrV2 format.DataPageHeaderV2 77 78 pageReader file.PageReader 79 } 80 81 func TestFileDeserializing(t *testing.T) { 82 t.Parallel() 83 suite.Run(t, new(PageSerdeSuite)) 84 } 85 86 func (p *PageSerdeSuite) ResetStream() { 87 p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator) 88 } 89 90 func (p *PageSerdeSuite) EndStream() { 91 p.buffer = p.sink.Finish() 92 } 93 94 func (p *PageSerdeSuite) SetupTest() { 95 p.dataPageHdr.Encoding = format.Encoding_PLAIN 96 p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE 97 p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE 98 99 p.ResetStream() 100 } 101 102 func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) { 103 p.EndStream() 104 105 p.pageReader, _ = file.NewPageReader(utils.NewBufferedReader(bytes.NewReader(p.buffer.Bytes()), p.buffer.Len()), nrows, codec, memory.DefaultAllocator, nil) 106 } 107 108 func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) { 109 // simplifying writing serialized data page headers which may or may 110 // not have meaningful data associated with them 111 112 p.pageHdr.DataPageHeader = &p.dataPageHdr 113 p.pageHdr.UncompressedPageSize = uncompressed 114 p.pageHdr.CompressedPageSize = compressed 115 p.pageHdr.Type = format.PageType_DATA_PAGE 116 117 serializer := thrift.NewThriftSerializer() 118 p.NotPanics(func() { 119 serializer.Serialize(&p.pageHdr, p.sink, nil) 120 }) 121 } 122 123 func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) { 124 p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2 125 p.pageHdr.UncompressedPageSize = uncompressed 126 p.pageHdr.CompressedPageSize = compressed 127 p.pageHdr.Type = format.PageType_DATA_PAGE_V2 128 129 serializer := thrift.NewThriftSerializer() 130 p.NotPanics(func() { 131 serializer.Serialize(&p.pageHdr, p.sink, nil) 132 }) 133 } 134 135 func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) { 136 p.Equal(format.PageType_DATA_PAGE, page.Type()) 137 138 p.IsType(&file.DataPageV1{}, page) 139 p.Equal(expected.NumValues, page.NumValues()) 140 p.Equal(expected.Encoding, page.Encoding()) 141 p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding()) 142 p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding()) 143 checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics()) 144 } 145 146 func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) { 147 p.Equal(format.PageType_DATA_PAGE_V2, page.Type()) 148 149 p.IsType(&file.DataPageV2{}, page) 150 p.Equal(expected.NumValues, page.NumValues()) 151 p.Equal(expected.Encoding, page.Encoding()) 152 p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls()) 153 p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen()) 154 p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen()) 155 p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed()) 156 checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics()) 157 } 158 159 func (p *PageSerdeSuite) TestDataPageV1() { 160 const ( 161 statsSize = 512 162 nrows = 4444 163 ) 164 p.dataPageHdr.Statistics = getDummyStats(statsSize, true) 165 p.dataPageHdr.NumValues = nrows 166 167 p.WriteDataPageHeader(1024, 0, 0) 168 p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) 169 p.True(p.pageReader.Next()) 170 currentPage := p.pageReader.Page() 171 p.CheckDataPageHeader(p.dataPageHdr, currentPage) 172 } 173 174 func (p *PageSerdeSuite) TestDataPageV2() { 175 const ( 176 statsSize = 512 177 nrows = 4444 178 ) 179 p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true) 180 p.dataPageHdrV2.NumValues = nrows 181 p.WriteDataPageHeaderV2(1024, 0, 0) 182 p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) 183 p.True(p.pageReader.Next()) 184 p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page()) 185 } 186 187 func (p *PageSerdeSuite) TestLargePageHeaders() { 188 const ( 189 statsSize = 256 * 1024 // 256KB 190 nrows = 4141 191 maxHeaderSize = 512 * 1024 // 512KB 192 ) 193 194 p.dataPageHdr.Statistics = getDummyStats(statsSize, false) 195 p.dataPageHdr.NumValues = nrows 196 p.WriteDataPageHeader(maxHeaderSize, 0, 0) 197 pos, err := p.sink.Seek(0, io.SeekCurrent) 198 p.NoError(err) 199 p.GreaterOrEqual(maxHeaderSize, int(pos)) 200 p.LessOrEqual(statsSize, int(pos)) 201 p.GreaterOrEqual(16*1024*1024, int(pos)) 202 203 p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) 204 p.True(p.pageReader.Next()) 205 p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page()) 206 } 207 208 func (p *PageSerdeSuite) TestFailLargePageHeaders() { 209 const ( 210 statsSize = 256 * 1024 // 256KB 211 nrows = 1337 // dummy value 212 maxHeaderSize = 512 * 1024 // 512 KB 213 smallerMaxSize = 128 * 1024 // 128KB 214 ) 215 p.dataPageHdr.Statistics = getDummyStats(statsSize, false) 216 p.WriteDataPageHeader(maxHeaderSize, 0, 0) 217 pos, err := p.sink.Seek(0, io.SeekCurrent) 218 p.NoError(err) 219 p.GreaterOrEqual(maxHeaderSize, int(pos)) 220 221 p.LessOrEqual(smallerMaxSize, int(pos)) 222 p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed) 223 p.pageReader.SetMaxPageHeaderSize(smallerMaxSize) 224 p.NotPanics(func() { p.False(p.pageReader.Next()) }) 225 p.Error(p.pageReader.Err()) 226 } 227 228 func (p *PageSerdeSuite) TestCompression() { 229 codecs := []compress.Compression{ 230 compress.Codecs.Snappy, 231 compress.Codecs.Brotli, 232 compress.Codecs.Gzip, 233 // compress.Codecs.Lz4, // not yet implemented 234 compress.Codecs.Zstd, 235 } 236 237 const ( 238 nrows = 32 // dummy value 239 npages = 10 240 ) 241 p.dataPageHdr.NumValues = nrows 242 243 fauxData := make([][]byte, npages) 244 for idx := range fauxData { 245 // each page is larger 246 fauxData[idx] = make([]byte, (idx+1)*64) 247 rand.Read(fauxData[idx]) 248 } 249 for _, c := range codecs { 250 p.Run(c.String(), func() { 251 codec, _ := compress.GetCodec(c) 252 for _, data := range fauxData { 253 maxCompressed := codec.CompressBound(int64(len(data))) 254 buffer := make([]byte, maxCompressed) 255 buffer = codec.Encode(buffer, data) 256 p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer))) 257 _, err := p.sink.Write(buffer) 258 p.NoError(err) 259 } 260 261 p.InitSerializedPageReader(nrows*npages, c) 262 263 for _, data := range fauxData { 264 p.True(p.pageReader.Next()) 265 page := p.pageReader.Page() 266 p.IsType(&file.DataPageV1{}, page) 267 p.Equal(data, page.Data()) 268 } 269 p.ResetStream() 270 }) 271 } 272 } 273 274 func TestInvalidHeaders(t *testing.T) { 275 badHeader := []byte("PAR2") 276 _, err := file.NewParquetReader(bytes.NewReader(badHeader)) 277 assert.Error(t, err) 278 } 279 280 func TestInvalidFooter(t *testing.T) { 281 // file is smaller than FOOTER_SIZE 282 badFile := []byte("PAR1PAR") 283 _, err := file.NewParquetReader(bytes.NewReader(badFile)) 284 assert.Error(t, err) 285 286 // Magic Number Incorrect 287 badFile2 := []byte("PAR1PAR2") 288 _, err = file.NewParquetReader(bytes.NewReader(badFile2)) 289 assert.Error(t, err) 290 } 291 292 func TestIncompleteMetadata(t *testing.T) { 293 sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) 294 magic := []byte("PAR1") 295 296 sink.Write(magic) 297 sink.Write(make([]byte, 10)) 298 const metadataLen = 24 299 binary.Write(sink, binary.LittleEndian, uint32(metadataLen)) 300 sink.Write(magic) 301 buf := sink.Finish() 302 defer buf.Release() 303 _, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) 304 assert.Error(t, err) 305 }