github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_reader_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"encoding/binary"
	"io"
	"math/rand"
	"testing"

	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet/compress"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/thrift"
	"github.com/apache/arrow/go/v7/parquet/metadata"
	libthrift "github.com/apache/thrift/lib/go/thrift"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/suite"
)

// getDummyStats builds a thrift Statistics struct whose Max (and, when
// fillAll is true, Min, NullCount, and DistinctCount) fields are filled
// with placeholder data of the requested size.
func getDummyStats(statSize int, fillAll bool) *format.Statistics {
	statBytes := make([]byte, statSize)
	memory.Set(statBytes, 1)

	ret := format.NewStatistics()
	ret.Max = statBytes
	if fillAll {
		ret.Min = statBytes
		ret.NullCount = libthrift.Int64Ptr(42)
		ret.DistinctCount = libthrift.Int64Ptr(1)
	}
	return ret
}

// checkStatistics asserts that every field set on the expected thrift
// statistics matches the statistics decoded from the page header.
func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
	if stats.IsSetMax() {
		assert.Equal(t, stats.Max, actual.Max)
	}
	if stats.IsSetMin() {
		assert.Equal(t, stats.Min, actual.Min)
	}
	if stats.IsSetNullCount() {
		assert.Equal(t, stats.GetNullCount(), actual.NullCount)
	}
	if stats.IsSetDistinctCount() {
		assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
	}
}

type PageSerdeSuite struct {
	suite.Suite

	sink   *encoding.BufferWriter
	buffer *memory.Buffer

	pageHdr       format.PageHeader
	dataPageHdr   format.DataPageHeader
	dataPageHdrV2 format.DataPageHeaderV2

	pageReader file.PageReader
}

func TestFileDeserializing(t *testing.T) {
	t.Parallel()
	suite.Run(t, new(PageSerdeSuite))
}

func (p *PageSerdeSuite) ResetStream() {
	p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}

func (p *PageSerdeSuite) EndStream() {
	p.buffer = p.sink.Finish()
}

func (p *PageSerdeSuite) SetupTest() {
	p.dataPageHdr.Encoding = format.Encoding_PLAIN
	p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
	p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE

	p.ResetStream()
}

func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
	p.EndStream()

	p.pageReader, _ = file.NewPageReader(bytes.NewReader(p.buffer.Bytes()), nrows, codec, memory.DefaultAllocator, nil)
}
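// NOTE: the helper below is an illustrative sketch added for this writeup and
// is not part of the original test suite. It shows the byte layout that
// InitSerializedPageReader consumes: a thrift-serialized PageHeader followed
// immediately by the (possibly compressed) page payload. The helper name is
// hypothetical; the calls mirror WriteDataPageHeader below and TestCompression
// further down.
func writePageToSink(sink *encoding.BufferWriter, hdr *format.PageHeader, payload []byte) error {
	// serialize the header into the sink, then append the raw page bytes
	serializer := thrift.NewThriftSerializer()
	serializer.Serialize(hdr, sink, nil)
	_, err := sink.Write(payload)
	return err
}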
// WriteDataPageHeader simplifies writing serialized data page headers,
// which may or may not have meaningful data associated with them.
func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
	p.pageHdr.DataPageHeader = &p.dataPageHdr
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
	p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE_V2

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE, page.Type())

	p.IsType(&file.DataPageV1{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
	p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE_V2, page.Type())

	p.IsType(&file.DataPageV2{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
	p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
	p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
	p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

func (p *PageSerdeSuite) TestDataPageV1() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdr.NumValues = nrows

	p.WriteDataPageHeader(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	currentPage := p.pageReader.Page()
	p.CheckDataPageHeader(p.dataPageHdr, currentPage)
}

func (p *PageSerdeSuite) TestDataPageV2() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdrV2.NumValues = nrows
	p.WriteDataPageHeaderV2(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
}
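// NOTE: illustrative sketch, not part of the original file. This is the
// canonical read loop over the file.PageReader interface exercised by the
// tests above: Next advances to the next page, Page returns the current one,
// and Err reports any terminal failure once Next returns false. The helper
// name is hypothetical.
func drainPages(rdr file.PageReader) ([]file.Page, error) {
	var pages []file.Page
	for rdr.Next() {
		pages = append(pages, rdr.Page())
	}
	// Err is non-nil if Next stopped because of a failure rather than
	// because the stream was exhausted
	return pages, rdr.Err()
}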
func (p *PageSerdeSuite) TestLargePageHeaders() {
	const (
		statsSize     = 256 * 1024 // 256KB
		nrows         = 4141
		maxHeaderSize = 512 * 1024 // 512KB
	)

	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.dataPageHdr.NumValues = nrows
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))
	p.LessOrEqual(statsSize, int(pos))
	p.GreaterOrEqual(16*1024*1024, int(pos))

	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
}

func (p *PageSerdeSuite) TestFailLargePageHeaders() {
	const (
		statsSize      = 256 * 1024 // 256KB
		nrows          = 1337       // dummy value
		maxHeaderSize  = 512 * 1024 // 512KB
		smallerMaxSize = 128 * 1024 // 128KB
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))

	p.LessOrEqual(smallerMaxSize, int(pos))
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
	p.NotPanics(func() { p.False(p.pageReader.Next()) })
	p.Error(p.pageReader.Err())
}

func (p *PageSerdeSuite) TestCompression() {
	codecs := []compress.Compression{
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		// compress.Codecs.Lz4, // not yet implemented
		compress.Codecs.Zstd,
	}

	const (
		nrows  = 32 // dummy value
		npages = 10
	)
	p.dataPageHdr.NumValues = nrows

	fauxData := make([][]byte, npages)
	for idx := range fauxData {
		// each page is larger
		fauxData[idx] = make([]byte, (idx+1)*64)
		rand.Read(fauxData[idx])
	}
	for _, c := range codecs {
		p.Run(c.String(), func() {
			codec, _ := compress.GetCodec(c)
			for _, data := range fauxData {
				maxCompressed := codec.CompressBound(int64(len(data)))
				buffer := make([]byte, maxCompressed)
				buffer = codec.Encode(buffer, data)
				p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
				_, err := p.sink.Write(buffer)
				p.NoError(err)
			}

			p.InitSerializedPageReader(nrows*npages, c)

			for _, data := range fauxData {
				p.True(p.pageReader.Next())
				page := p.pageReader.Page()
				p.IsType(&file.DataPageV1{}, page)
				p.Equal(data, page.Data())
			}
			p.ResetStream()
		})
	}
}

func TestInvalidHeaders(t *testing.T) {
	badHeader := []byte("PAR2")
	_, err := file.NewParquetReader(bytes.NewReader(badHeader))
	assert.Error(t, err)
}

func TestInvalidFooter(t *testing.T) {
	// file is smaller than FOOTER_SIZE
	badFile := []byte("PAR1PAR")
	_, err := file.NewParquetReader(bytes.NewReader(badFile))
	assert.Error(t, err)

	// magic number incorrect
	badFile2 := []byte("PAR1PAR2")
	_, err = file.NewParquetReader(bytes.NewReader(badFile2))
	assert.Error(t, err)
}

func TestIncompleteMetadata(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	magic := []byte("PAR1")

	sink.Write(magic)
	sink.Write(make([]byte, 10))
	const metadataLen = 24
	binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
	sink.Write(magic)
	buf := sink.Finish()
	defer buf.Release()
	_, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.Error(t, err)
}
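// NOTE: illustrative sketch, not part of the original file. TestIncompleteMetadata
// above hand-writes a malformed footer; for reference, a well-formed parquet file
// ends with the serialized file metadata, a 4-byte little-endian length of that
// metadata, and the "PAR1" magic. writeFooter is a hypothetical helper showing
// that layout, assuming serializedMeta already holds thrift-serialized metadata.
func writeFooter(sink *encoding.BufferWriter, serializedMeta []byte) {
	sink.Write(serializedMeta)
	binary.Write(sink, binary.LittleEndian, uint32(len(serializedMeta)))
	sink.Write([]byte("PAR1"))
}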