github.com/apache/arrow/go/v14@v14.0.1/parquet/file/file_reader_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
	"bytes"
	"crypto/rand"
	"encoding/binary"
	"io"
	"testing"

	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/internal/utils"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/compress"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v14/parquet/internal/thrift"
	"github.com/apache/arrow/go/v14/parquet/metadata"
	"github.com/apache/arrow/go/v14/parquet/schema"
	libthrift "github.com/apache/thrift/lib/go/thrift"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/stretchr/testify/suite"
)

func getDummyStats(statSize int, fillAll bool) *format.Statistics {
	statBytes := make([]byte, statSize)
	memory.Set(statBytes, 1)

	ret := format.NewStatistics()
	ret.Max = statBytes
	if fillAll {
		ret.Min = statBytes
		ret.NullCount = libthrift.Int64Ptr(42)
		ret.DistinctCount = libthrift.Int64Ptr(1)
	}
	return ret
}

func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
	if stats.IsSetMax() {
		assert.Equal(t, stats.Max, actual.Max)
	}
	if stats.IsSetMin() {
		assert.Equal(t, stats.Min, actual.Min)
	}
	if stats.IsSetNullCount() {
		assert.Equal(t, stats.GetNullCount(), actual.NullCount)
	}
	if stats.IsSetDistinctCount() {
		assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
	}
}

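// testReader wraps a bytes.Reader so that ReadAt reports io.EOF when a read
// ends exactly at the end of the underlying data, mimicking readers that
// surface EOF together with the final bytes.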
type testReader struct {
	*bytes.Reader
}

// ReadAt for testReader returns io.EOF when off + len(b) is exactly the length of the underlying input source.
func (tr testReader) ReadAt(b []byte, off int64) (int, error) {
	n, err := tr.Reader.ReadAt(b, off)
	if err == nil && (int64(n)+off == tr.Size()) {
		return n, io.EOF
	}
	return n, err
}

type PageSerdeSuite struct {
	suite.Suite

	sink   *encoding.BufferWriter
	buffer *memory.Buffer

	pageHdr       format.PageHeader
	dataPageHdr   format.DataPageHeader
	dataPageHdrV2 format.DataPageHeaderV2

	pageReader file.PageReader
}

func TestFileDeserializing(t *testing.T) {
	t.Parallel()
	suite.Run(t, new(PageSerdeSuite))
}

func (p *PageSerdeSuite) ResetStream() {
	p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}

func (p *PageSerdeSuite) EndStream() {
	p.buffer = p.sink.Finish()
}

func (p *PageSerdeSuite) SetupTest() {
	p.dataPageHdr.Encoding = format.Encoding_PLAIN
	p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
	p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE

	p.ResetStream()
}

func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
	p.EndStream()

	p.pageReader, _ = file.NewPageReader(utils.NewBufferedReader(bytes.NewReader(p.buffer.Bytes()), p.buffer.Len()), nrows, codec, memory.DefaultAllocator, nil)
}

func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
	// simplifying writing serialized data page headers which may or may
	// not have meaningful data associated with them

	p.pageHdr.DataPageHeader = &p.dataPageHdr
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
	p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
	p.pageHdr.UncompressedPageSize = uncompressed
	p.pageHdr.CompressedPageSize = compressed
	p.pageHdr.Type = format.PageType_DATA_PAGE_V2

	serializer := thrift.NewThriftSerializer()
	p.NotPanics(func() {
		serializer.Serialize(&p.pageHdr, p.sink, nil)
	})
}

func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE, page.Type())

	p.IsType(&file.DataPageV1{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
	p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

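// CheckDataPageHeaderV2 verifies that a deserialized V2 data page matches the
// expected thrift header, including null count, level byte lengths, the
// compression flag, and encoded statistics.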
func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
	p.Equal(format.PageType_DATA_PAGE_V2, page.Type())

	p.IsType(&file.DataPageV2{}, page)
	p.Equal(expected.NumValues, page.NumValues())
	p.Equal(expected.Encoding, page.Encoding())
	p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
	p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
	p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
	p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
	checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

func (p *PageSerdeSuite) TestDataPageV1() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdr.NumValues = nrows

	p.WriteDataPageHeader(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	currentPage := p.pageReader.Page()
	p.CheckDataPageHeader(p.dataPageHdr, currentPage)
}

func (p *PageSerdeSuite) TestDataPageV2() {
	const (
		statsSize = 512
		nrows     = 4444
	)
	p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
	p.dataPageHdrV2.NumValues = nrows
	p.WriteDataPageHeaderV2(1024, 0, 0)
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
}

func (p *PageSerdeSuite) TestLargePageHeaders() {
	const (
		statsSize     = 256 * 1024 // 256KB
		nrows         = 4141
		maxHeaderSize = 512 * 1024 // 512KB
	)

	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.dataPageHdr.NumValues = nrows
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))
	p.LessOrEqual(statsSize, int(pos))
	p.GreaterOrEqual(16*1024*1024, int(pos))

	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.True(p.pageReader.Next())
	p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
}

func (p *PageSerdeSuite) TestFailLargePageHeaders() {
	const (
		statsSize      = 256 * 1024 // 256KB
		nrows          = 1337       // dummy value
		maxHeaderSize  = 512 * 1024 // 512KB
		smallerMaxSize = 128 * 1024 // 128KB
	)
	p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
	p.WriteDataPageHeader(maxHeaderSize, 0, 0)
	pos, err := p.sink.Seek(0, io.SeekCurrent)
	p.NoError(err)
	p.GreaterOrEqual(maxHeaderSize, int(pos))

	p.LessOrEqual(smallerMaxSize, int(pos))
	p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
	p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
	p.NotPanics(func() { p.False(p.pageReader.Next()) })
	p.Error(p.pageReader.Err())
}

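// TestCompression round-trips data pages of increasing size through each
// supported codec and verifies the decompressed page bytes match the input.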
func (p *PageSerdeSuite) TestCompression() {
	codecs := []compress.Compression{
		compress.Codecs.Snappy,
		compress.Codecs.Brotli,
		compress.Codecs.Gzip,
		// compress.Codecs.Lz4, // not yet implemented
		compress.Codecs.Zstd,
	}

	const (
		nrows  = 32 // dummy value
		npages = 10
	)
	p.dataPageHdr.NumValues = nrows

	fauxData := make([][]byte, npages)
	for idx := range fauxData {
		// each page is larger
		fauxData[idx] = make([]byte, (idx+1)*64)
		rand.Read(fauxData[idx])
	}
	for _, c := range codecs {
		p.Run(c.String(), func() {
			codec, _ := compress.GetCodec(c)
			for _, data := range fauxData {
				maxCompressed := codec.CompressBound(int64(len(data)))
				buffer := make([]byte, maxCompressed)
				buffer = codec.Encode(buffer, data)
				p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
				_, err := p.sink.Write(buffer)
				p.NoError(err)
			}

			p.InitSerializedPageReader(nrows*npages, c)

			for _, data := range fauxData {
				p.True(p.pageReader.Next())
				page := p.pageReader.Page()
				p.IsType(&file.DataPageV1{}, page)
				p.Equal(data, page.Data())
			}
			p.ResetStream()
		})
	}
}

func TestWithEOFReader(t *testing.T) {
	root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1)}, -1)
	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST))

	var buf bytes.Buffer
	wr := file.NewParquetWriter(&buf, root, file.WithWriterProps(props))
	require.NoError(t, wr.Close())

	r := bytes.NewReader(buf.Bytes())
	_, err := file.NewParquetReader(testReader{Reader: r})
	assert.NoError(t, err)
}

func TestInvalidHeaders(t *testing.T) {
	badHeader := []byte("PAR2")
	_, err := file.NewParquetReader(bytes.NewReader(badHeader))
	assert.Error(t, err)
}

func TestInvalidFooter(t *testing.T) {
	// file is smaller than FOOTER_SIZE
	badFile := []byte("PAR1PAR")
	_, err := file.NewParquetReader(bytes.NewReader(badFile))
	assert.Error(t, err)

	// Magic Number Incorrect
	badFile2 := []byte("PAR1PAR2")
	_, err = file.NewParquetReader(bytes.NewReader(badFile2))
	assert.Error(t, err)
}

func TestIncompleteMetadata(t *testing.T) {
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
	magic := []byte("PAR1")

	sink.Write(magic)
	sink.Write(make([]byte, 10))
	const metadataLen = 24
	binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
	sink.Write(magic)
	buf := sink.Finish()
	defer buf.Release()
	_, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.Error(t, err)
}

func TestDeltaLengthByteArrayPackingWithNulls(t *testing.T) {
	// produce file with DeltaLengthByteArray Encoding with mostly null values but one actual value.
	root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewByteArrayNode("byte_array_col", parquet.Repetitions.Optional, -1),
	}, -1)
	props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST),
		parquet.WithEncoding(parquet.Encodings.DeltaLengthByteArray), parquet.WithDictionaryDefault(false))
	sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

	writer := file.NewParquetWriter(sink, root, file.WithWriterProps(props))
	rgw := writer.AppendRowGroup()
	ccw, err := rgw.NextColumn()
	assert.NoError(t, err)
	const elements = 500
	data := make([]parquet.ByteArray, elements)
	data[0] = parquet.ByteArray{1, 2, 3, 4, 5, 6, 7, 8}

	defLvls := make([]int16, elements)
	repLvls := make([]int16, elements)
	defLvls[0] = 1

	_, err = ccw.(*file.ByteArrayColumnChunkWriter).WriteBatch(data, defLvls, repLvls)
	assert.NoError(t, err)
	assert.NoError(t, ccw.Close())
	assert.NoError(t, rgw.Close())
	assert.NoError(t, writer.Close())
	buf := sink.Finish()
	defer buf.Release()

	// read file back in
	reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	assert.NoError(t, err)
	defer reader.Close()
	ccr, err := reader.RowGroup(0).Column(0)
	assert.NoError(t, err)
	const batchSize = 500

	for ccr.HasNext() {
		readData := make([]parquet.ByteArray, batchSize)
		readdevLvls := make([]int16, batchSize)
		readrepLvls := make([]int16, batchSize)
		cr := ccr.(*file.ByteArrayColumnChunkReader)

		total, read, err := cr.ReadBatch(batchSize, readData, readdevLvls, readrepLvls)
		assert.NoError(t, err)
		assert.Equal(t, int64(batchSize), total)
		assert.Equal(t, 1, read)
		assert.Equal(t, data[0], readData[0])
		assert.NotNil(t, readData[0])
	}
}
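
// The sketch below is illustrative only and is not part of the upstream test
// file; it condenses the read-side API exercised above (NewParquetReader,
// RowGroup, Column, HasNext, ReadBatch) into one place. It assumes `buf` holds
// the bytes of a previously written parquet file, as in the tests above, and
// the variable names are placeholders.
//
//	reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
//	if err != nil {
//		// handle error
//	}
//	defer reader.Close()
//
//	ccr, err := reader.RowGroup(0).Column(0)
//	if err != nil {
//		// handle error
//	}
//	cr := ccr.(*file.ByteArrayColumnChunkReader)
//
//	vals := make([]parquet.ByteArray, 128)
//	defLvls := make([]int16, 128)
//	repLvls := make([]int16, 128)
//	for ccr.HasNext() {
//		// total counts levels read; valuesRead counts non-null values decoded
//		total, valuesRead, err := cr.ReadBatch(128, vals, defLvls, repLvls)
//		_, _, _ = total, valuesRead, err
//	}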