github.com/apache/arrow/go/v16@v16.1.0/parquet/file/file_reader_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file_test

import (
    "bytes"
    "crypto/rand"
    "encoding/binary"
    "io"
    "os"
    "path"
    "testing"

    "github.com/apache/arrow/go/v16/arrow/memory"
    "github.com/apache/arrow/go/v16/internal/utils"
    "github.com/apache/arrow/go/v16/parquet"
    "github.com/apache/arrow/go/v16/parquet/compress"
    "github.com/apache/arrow/go/v16/parquet/file"
    "github.com/apache/arrow/go/v16/parquet/internal/encoding"
    format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
    "github.com/apache/arrow/go/v16/parquet/internal/thrift"
    "github.com/apache/arrow/go/v16/parquet/metadata"
    "github.com/apache/arrow/go/v16/parquet/schema"
    libthrift "github.com/apache/thrift/lib/go/thrift"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
    "github.com/stretchr/testify/suite"
)

func getDummyStats(statSize int, fillAll bool) *format.Statistics {
    statBytes := make([]byte, statSize)
    memory.Set(statBytes, 1)

    ret := format.NewStatistics()
    ret.Max = statBytes
    if fillAll {
        ret.Min = statBytes
        ret.NullCount = libthrift.Int64Ptr(42)
        ret.DistinctCount = libthrift.Int64Ptr(1)
    }
    return ret
}

func checkStatistics(t *testing.T, stats format.Statistics, actual metadata.EncodedStatistics) {
    if stats.IsSetMax() {
        assert.Equal(t, stats.Max, actual.Max)
    }
    if stats.IsSetMin() {
        assert.Equal(t, stats.Min, actual.Min)
    }
    if stats.IsSetNullCount() {
        assert.Equal(t, stats.GetNullCount(), actual.NullCount)
    }
    if stats.IsSetDistinctCount() {
        assert.Equal(t, stats.GetDistinctCount(), actual.DistinctCount)
    }
}
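
// testReader wraps a bytes.Reader so that ReadAt reports io.EOF together
// with a successful read ending exactly at the end of the input. The
// io.ReaderAt contract permits either err == nil or err == io.EOF in that
// case, so the Parquet reader must tolerate both.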
type testReader struct {
    *bytes.Reader
}

// ReadAt for testReader returns io.EOF when off + len(b) is exactly the
// length of the underlying input source.
func (tr testReader) ReadAt(b []byte, off int64) (int, error) {
    n, err := tr.Reader.ReadAt(b, off)
    if err == nil && (int64(n)+off == tr.Size()) {
        return n, io.EOF
    }
    return n, err
}

type PageSerdeSuite struct {
    suite.Suite

    sink   *encoding.BufferWriter
    buffer *memory.Buffer

    pageHdr       format.PageHeader
    dataPageHdr   format.DataPageHeader
    dataPageHdrV2 format.DataPageHeaderV2

    pageReader file.PageReader
}

func TestFileDeserializing(t *testing.T) {
    t.Parallel()
    suite.Run(t, new(PageSerdeSuite))
}

func (p *PageSerdeSuite) ResetStream() {
    p.sink = encoding.NewBufferWriter(0, memory.DefaultAllocator)
}

func (p *PageSerdeSuite) EndStream() {
    p.buffer = p.sink.Finish()
}

func (p *PageSerdeSuite) SetupTest() {
    p.dataPageHdr.Encoding = format.Encoding_PLAIN
    p.dataPageHdr.DefinitionLevelEncoding = format.Encoding_RLE
    p.dataPageHdr.RepetitionLevelEncoding = format.Encoding_RLE

    p.ResetStream()
}

func (p *PageSerdeSuite) InitSerializedPageReader(nrows int64, codec compress.Compression) {
    p.EndStream()

    p.pageReader, _ = file.NewPageReader(utils.NewBufferedReader(bytes.NewReader(p.buffer.Bytes()), p.buffer.Len()), nrows, codec, memory.DefaultAllocator, nil)
}

// WriteDataPageHeader simplifies writing serialized data page headers, which
// may or may not have meaningful page data associated with them.
func (p *PageSerdeSuite) WriteDataPageHeader(maxSerialized int, uncompressed, compressed int32) {
    p.pageHdr.DataPageHeader = &p.dataPageHdr
    p.pageHdr.UncompressedPageSize = uncompressed
    p.pageHdr.CompressedPageSize = compressed
    p.pageHdr.Type = format.PageType_DATA_PAGE

    serializer := thrift.NewThriftSerializer()
    p.NotPanics(func() {
        serializer.Serialize(&p.pageHdr, p.sink, nil)
    })
}

func (p *PageSerdeSuite) WriteDataPageHeaderV2(maxSerialized int, uncompressed, compressed int32) {
    p.pageHdr.DataPageHeaderV2 = &p.dataPageHdrV2
    p.pageHdr.UncompressedPageSize = uncompressed
    p.pageHdr.CompressedPageSize = compressed
    p.pageHdr.Type = format.PageType_DATA_PAGE_V2

    serializer := thrift.NewThriftSerializer()
    p.NotPanics(func() {
        serializer.Serialize(&p.pageHdr, p.sink, nil)
    })
}
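
// CheckDataPageHeader asserts that a page decoded by the page reader matches
// the values, encodings, and statistics of the V1 data page header that was
// serialized into the stream.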
func (p *PageSerdeSuite) CheckDataPageHeader(expected format.DataPageHeader, page file.Page) {
    p.Equal(format.PageType_DATA_PAGE, page.Type())

    p.IsType(&file.DataPageV1{}, page)
    p.Equal(expected.NumValues, page.NumValues())
    p.Equal(expected.Encoding, page.Encoding())
    p.EqualValues(expected.DefinitionLevelEncoding, page.(*file.DataPageV1).DefinitionLevelEncoding())
    p.EqualValues(expected.RepetitionLevelEncoding, page.(*file.DataPageV1).RepetitionLevelEncoding())
    checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

func (p *PageSerdeSuite) CheckDataPageHeaderV2(expected format.DataPageHeaderV2, page file.Page) {
    p.Equal(format.PageType_DATA_PAGE_V2, page.Type())

    p.IsType(&file.DataPageV2{}, page)
    p.Equal(expected.NumValues, page.NumValues())
    p.Equal(expected.Encoding, page.Encoding())
    p.Equal(expected.NumNulls, page.(*file.DataPageV2).NumNulls())
    p.Equal(expected.DefinitionLevelsByteLength, page.(*file.DataPageV2).DefinitionLevelByteLen())
    p.Equal(expected.RepetitionLevelsByteLength, page.(*file.DataPageV2).RepetitionLevelByteLen())
    p.Equal(expected.IsCompressed, page.(*file.DataPageV2).IsCompressed())
    checkStatistics(p.T(), *expected.Statistics, page.(file.DataPage).Statistics())
}

func (p *PageSerdeSuite) TestDataPageV1() {
    const (
        statsSize = 512
        nrows     = 4444
    )
    p.dataPageHdr.Statistics = getDummyStats(statsSize, true)
    p.dataPageHdr.NumValues = nrows

    p.WriteDataPageHeader(1024, 0, 0)
    p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
    p.True(p.pageReader.Next())
    currentPage := p.pageReader.Page()
    p.CheckDataPageHeader(p.dataPageHdr, currentPage)
}

func (p *PageSerdeSuite) TestDataPageV2() {
    const (
        statsSize = 512
        nrows     = 4444
    )
    p.dataPageHdrV2.Statistics = getDummyStats(statsSize, true)
    p.dataPageHdrV2.NumValues = nrows
    p.WriteDataPageHeaderV2(1024, 0, 0)
    p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
    p.True(p.pageReader.Next())
    p.CheckDataPageHeaderV2(p.dataPageHdrV2, p.pageReader.Page())
}

func (p *PageSerdeSuite) TestLargePageHeaders() {
    const (
        statsSize     = 256 * 1024 // 256KB
        nrows         = 4141
        maxHeaderSize = 512 * 1024 // 512KB
    )

    p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
    p.dataPageHdr.NumValues = nrows
    p.WriteDataPageHeader(maxHeaderSize, 0, 0)
    pos, err := p.sink.Seek(0, io.SeekCurrent)
    p.NoError(err)
    p.GreaterOrEqual(maxHeaderSize, int(pos))
    p.LessOrEqual(statsSize, int(pos))
    p.GreaterOrEqual(16*1024*1024, int(pos))

    p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
    p.True(p.pageReader.Next())
    p.CheckDataPageHeader(p.dataPageHdr, p.pageReader.Page())
}

func (p *PageSerdeSuite) TestFailLargePageHeaders() {
    const (
        statsSize      = 256 * 1024 // 256KB
        nrows          = 1337       // dummy value
        maxHeaderSize  = 512 * 1024 // 512KB
        smallerMaxSize = 128 * 1024 // 128KB
    )
    p.dataPageHdr.Statistics = getDummyStats(statsSize, false)
    p.WriteDataPageHeader(maxHeaderSize, 0, 0)
    pos, err := p.sink.Seek(0, io.SeekCurrent)
    p.NoError(err)
    p.GreaterOrEqual(maxHeaderSize, int(pos))

    p.LessOrEqual(smallerMaxSize, int(pos))
    p.InitSerializedPageReader(nrows, compress.Codecs.Uncompressed)
    p.pageReader.SetMaxPageHeaderSize(smallerMaxSize)
    p.NotPanics(func() { p.False(p.pageReader.Next()) })
    p.Error(p.pageReader.Err())
}
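
// TestCompression round-trips pages of random bytes through each supported
// codec: every page body is compressed and written after its serialized
// header, and the page reader must hand back the original uncompressed bytes.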
func (p *PageSerdeSuite) TestCompression() {
    codecs := []compress.Compression{
        compress.Codecs.Snappy,
        compress.Codecs.Brotli,
        compress.Codecs.Gzip,
        // compress.Codecs.Lz4, // not yet implemented
        compress.Codecs.Zstd,
    }

    const (
        nrows  = 32 // dummy value
        npages = 10
    )
    p.dataPageHdr.NumValues = nrows

    fauxData := make([][]byte, npages)
    for idx := range fauxData {
        // each page is larger than the last
        fauxData[idx] = make([]byte, (idx+1)*64)
        rand.Read(fauxData[idx])
    }
    for _, c := range codecs {
        p.Run(c.String(), func() {
            codec, _ := compress.GetCodec(c)
            for _, data := range fauxData {
                maxCompressed := codec.CompressBound(int64(len(data)))
                buffer := make([]byte, maxCompressed)
                buffer = codec.Encode(buffer, data)
                p.WriteDataPageHeader(1024, int32(len(data)), int32(len(buffer)))
                _, err := p.sink.Write(buffer)
                p.NoError(err)
            }

            p.InitSerializedPageReader(nrows*npages, c)

            for _, data := range fauxData {
                p.True(p.pageReader.Next())
                page := p.pageReader.Page()
                p.IsType(&file.DataPageV1{}, page)
                p.Equal(data, page.Data())
            }
            p.ResetStream()
        })
    }
}

func TestWithEOFReader(t *testing.T) {
    root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
        schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1)}, -1)
    props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST))

    var buf bytes.Buffer
    wr := file.NewParquetWriter(&buf, root, file.WithWriterProps(props))
    require.NoError(t, wr.Close())

    r := bytes.NewReader(buf.Bytes())
    _, err := file.NewParquetReader(testReader{Reader: r})
    assert.NoError(t, err)
}

func TestInvalidHeaders(t *testing.T) {
    badHeader := []byte("PAR2")
    _, err := file.NewParquetReader(bytes.NewReader(badHeader))
    assert.Error(t, err)
}

func TestInvalidFooter(t *testing.T) {
    // file is smaller than FOOTER_SIZE
    badFile := []byte("PAR1PAR")
    _, err := file.NewParquetReader(bytes.NewReader(badFile))
    assert.Error(t, err)

    // magic number is incorrect
    badFile2 := []byte("PAR1PAR2")
    _, err = file.NewParquetReader(bytes.NewReader(badFile2))
    assert.Error(t, err)
}

func TestIncompleteMetadata(t *testing.T) {
    sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)
    magic := []byte("PAR1")

    // a Parquet footer ends with the serialized metadata, a 4-byte
    // little-endian metadata length, and the "PAR1" magic. Declaring a
    // 24-byte metadata length when only 10 bytes of placeholder "metadata"
    // were written makes the footer unreadable, so opening must fail.
    sink.Write(magic)
    sink.Write(make([]byte, 10))
    const metadataLen = 24
    binary.Write(sink, binary.LittleEndian, uint32(metadataLen))
    sink.Write(magic)
    buf := sink.Finish()
    defer buf.Release()
    _, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
    assert.Error(t, err)
}

func TestDeltaLengthByteArrayPackingWithNulls(t *testing.T) {
    // produce a file with DeltaLengthByteArray encoding and mostly null
    // values, but one actual value.
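    // dictionary encoding is disabled below so that the column actually uses
    // DELTA_LENGTH_BYTE_ARRAY rather than falling back to dictionary pages.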
    root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
        schema.NewByteArrayNode("byte_array_col", parquet.Repetitions.Optional, -1),
    }, -1)
    props := parquet.NewWriterProperties(parquet.WithVersion(parquet.V2_LATEST),
        parquet.WithEncoding(parquet.Encodings.DeltaLengthByteArray), parquet.WithDictionaryDefault(false))
    sink := encoding.NewBufferWriter(0, memory.DefaultAllocator)

    writer := file.NewParquetWriter(sink, root, file.WithWriterProps(props))
    rgw := writer.AppendRowGroup()
    ccw, err := rgw.NextColumn()
    assert.NoError(t, err)
    const elements = 500
    data := make([]parquet.ByteArray, elements)
    data[0] = parquet.ByteArray{1, 2, 3, 4, 5, 6, 7, 8}

    defLvls := make([]int16, elements)
    repLvls := make([]int16, elements)
    defLvls[0] = 1

    _, err = ccw.(*file.ByteArrayColumnChunkWriter).WriteBatch(data, defLvls, repLvls)
    assert.NoError(t, err)
    assert.NoError(t, ccw.Close())
    assert.NoError(t, rgw.Close())
    assert.NoError(t, writer.Close())
    buf := sink.Finish()
    defer buf.Release()

    // read the file back in
    reader, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
    assert.NoError(t, err)
    defer reader.Close()
    ccr, err := reader.RowGroup(0).Column(0)
    assert.NoError(t, err)
    const batchSize = 500

    for ccr.HasNext() {
        readData := make([]parquet.ByteArray, batchSize)
        readDefLvls := make([]int16, batchSize)
        readRepLvls := make([]int16, batchSize)
        cr := ccr.(*file.ByteArrayColumnChunkReader)

        total, read, err := cr.ReadBatch(batchSize, readData, readDefLvls, readRepLvls)
        assert.NoError(t, err)
        assert.Equal(t, int64(batchSize), total)
        assert.Equal(t, 1, read)
        assert.Equal(t, data[0], readData[0])
        assert.NotNil(t, readData[0])
    }
}

func TestRleBooleanEncodingFileRead(t *testing.T) {
    dir := os.Getenv("PARQUET_TEST_DATA")
    if dir == "" {
        t.Skip("no path supplied with PARQUET_TEST_DATA")
    }
    assert.DirExists(t, dir)

    props := parquet.NewReaderProperties(memory.DefaultAllocator)
    fileReader, err := file.OpenParquetFile(path.Join(dir, "rle_boolean_encoding.parquet"),
        false, file.WithReadProps(props))
    require.NoError(t, err)
    defer fileReader.Close()

    assert.Equal(t, 1, fileReader.NumRowGroups())
    rgr := fileReader.RowGroup(0)
    assert.EqualValues(t, 68, rgr.NumRows())

    rdr, err := rgr.Column(0)
    require.NoError(t, err)
    brdr := rdr.(*file.BooleanColumnChunkReader)

    values := make([]bool, 68)
    defLvls, repLvls := make([]int16, 68), make([]int16, 68)
    total, read, err := brdr.ReadBatch(68, values, defLvls, repLvls)
    require.NoError(t, err)

    assert.EqualValues(t, 68, total)
    md, err := rgr.MetaData().ColumnChunk(0)
    require.NoError(t, err)
    stats, err := md.Statistics()
    require.NoError(t, err)
    assert.EqualValues(t, total-stats.NullCount(), read)

    expected := []bool{
        true, false, true, true, false, false,
        true, true, true, false, false, true, true,
        false, true, true, false, false, true, true,
        false, true, true, false, false, true, true,
        true, false, false, false, false, true, true,
        false, true, true, false, false, true, true,
        true, false, false, true, true, false, false,
        true, true, true, false, true, true, false,
        true, true, false, false, true, true, true,
    }
    expectedNulls := []int{2, 15, 23, 38, 48, 60}

    expectedNullIdx := 0
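
    // a definition level of 0 marks a null slot; every other slot must be 1
    // (value present).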
    for i, v := range defLvls {
        if expectedNullIdx < len(expectedNulls) && i == expectedNulls[expectedNullIdx] {
            assert.Zero(t, v)
            expectedNullIdx++
        } else {
            assert.EqualValues(t, 1, v)
        }
    }

    assert.Equal(t, expected, values[:len(expected)])
}
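
// What follows is an illustrative sketch, not part of the upstream test file:
// it spells out the minimal write-then-read round trip that tests such as
// TestDeltaLengthByteArrayPackingWithNulls build on, using a required int32
// column so no definition or repetition levels are needed. The function name
// is hypothetical, and nil level slices are assumed to be accepted for a
// required column.
func TestRoundTripInt32Sketch(t *testing.T) {
    root, _ := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
        schema.NewInt32Node("int_col", parquet.Repetitions.Required, -1)}, -1)

    // write three values into a single row group in an in-memory buffer
    var buf bytes.Buffer
    wr := file.NewParquetWriter(&buf, root)
    rgw := wr.AppendRowGroup()
    cw, err := rgw.NextColumn()
    require.NoError(t, err)

    values := []int32{1, 2, 3}
    // a required column carries no definition/repetition levels, so nil is
    // passed for both (assumption noted above)
    _, err = cw.(*file.Int32ColumnChunkWriter).WriteBatch(values, nil, nil)
    require.NoError(t, err)
    require.NoError(t, cw.Close())
    require.NoError(t, rgw.Close())
    require.NoError(t, wr.Close())

    // read the column chunk back and check the values survived the round trip
    rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
    require.NoError(t, err)
    defer rdr.Close()

    ccr, err := rdr.RowGroup(0).Column(0)
    require.NoError(t, err)

    out := make([]int32, len(values))
    total, read, err := ccr.(*file.Int32ColumnChunkReader).ReadBatch(int64(len(values)), out, nil, nil)
    require.NoError(t, err)
    assert.EqualValues(t, len(values), total)
    assert.Equal(t, len(values), read)
    assert.Equal(t, values, out)
}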