github.com/apache/arrow/go/v14@v14.0.2/parquet/file/page_reader.go

github.com/apache/arrow/go/v14@v14.0.2/parquet/file/page_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"io"
    23  	"sync"
    24  
    25  	"github.com/JohnCGriffin/overflow"
    26  	"github.com/apache/arrow/go/v14/arrow/memory"
    27  	"github.com/apache/arrow/go/v14/parquet"
    28  	"github.com/apache/arrow/go/v14/parquet/compress"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    30  	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v14/parquet/internal/thrift"
    32  	"github.com/apache/arrow/go/v14/parquet/metadata"
    33  	"golang.org/x/xerrors"
    34  )
    35  
// PageReader is the interface used by the columnreader in order to read
// and handle DataPages and loop through them.
//
// The intended usage is to call Next repeatedly, fetching each page with
// Page, and to consult Err once Next has returned false.
type PageReader interface {
	// SetMaxPageHeaderSize sets the maximum page header size allowed to be read.
	SetMaxPageHeaderSize(int)
	// Page returns the current page, or nil if there are no more.
	Page() Page
	// Next fetches the next page; it returns false if there are no more pages.
	Next() bool
	// Err explains why Next returned false: it is the error encountered, or
	// nil if there was no error and the end of the pages was simply reached.
	Err() error
	// Reset allows reusing a page reader with a new source reader, row count,
	// compression type and crypto context.
	Reset(r parquet.BufferedReader, nrows int64, compressType compress.Compression, ctx *CryptoContext)
}
    51  
// Page is an interface for handling DataPages or Dictionary Pages.
type Page interface {
	// Type returns which kind of page this is.
	Type() format.PageType
	// Data returns the raw bytes of this page.
	Data() []byte
	// Encoding returns the encoding used for this page, Plain/RLE, etc.
	Encoding() format.Encoding
	// NumValues returns the number of values in this page.
	NumValues() int32
	// Release returns this page object to the page pool for re-use; the page
	// must not be used after calling it.
	Release()
}
    65  
// page holds the state shared by every concrete page type in this file; it is
// embedded by DataPageV1, DataPageV2 and DictionaryPage.
type page struct {
	buf *memory.Buffer  // buffer whose bytes are returned by Data
	typ format.PageType // kind of page, returned by Type

	nvals    int32           // number of values, returned by NumValues
	encoding format.Encoding // value encoding, returned by Encoding
}
    73  
    74  func (p *page) Type() format.PageType     { return p.typ }
    75  func (p *page) Data() []byte              { return p.buf.Bytes() }
    76  func (p *page) NumValues() int32          { return p.nvals }
    77  func (p *page) Encoding() format.Encoding { return p.encoding }
    78  
// DataPage is the base interface for both DataPageV1 and DataPageV2 of the
// parquet spec.
type DataPage interface {
	Page
	// UncompressedSize returns the size of the page data when uncompressed.
	UncompressedSize() int32
	// Statistics returns the encoded statistics attached to the page.
	Statistics() metadata.EncodedStatistics
}
    86  
    87  // Create some pools to use for reusing the data page objects themselves so that
    88  // we can avoid tight loops that are creating and destroying tons of individual
    89  // objects. This combined with a Release function on the pages themselves
    90  // which will put them back into the pool yields significant memory reduction
    91  // and performance benefits
    92  
    93  var dataPageV1Pool = sync.Pool{
    94  	New: func() interface{} { return (*DataPageV1)(nil) },
    95  }
    96  
    97  var dataPageV2Pool = sync.Pool{
    98  	New: func() interface{} { return (*DataPageV2)(nil) },
    99  }
   100  
   101  var dictPagePool = sync.Pool{
   102  	New: func() interface{} { return (*DictionaryPage)(nil) },
   103  }
   104  
// DataPageV1 represents a DataPage version 1 from the parquet.thrift file
type DataPageV1 struct {
	page

	defLvlEncoding   format.Encoding            // encoding of the definition levels
	repLvlEncoding   format.Encoding            // encoding of the repetition levels
	uncompressedSize int32                      // size of the page data when uncompressed
	statistics       metadata.EncodedStatistics // encoded statistics for the page
}
   114  
   115  // NewDataPageV1 returns a V1 data page with the given buffer as its data and the specified encoding information
   116  //
   117  // Will utilize objects that have been released back into the data page pool and
   118  // re-use them if available as opposed to creating new objects. Calling Release on the
   119  // data page object will release it back to the pool for re-use.
   120  func NewDataPageV1(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int32) *DataPageV1 {
   121  	dp := dataPageV1Pool.Get().(*DataPageV1)
   122  	if dp == nil {
   123  		return &DataPageV1{
   124  			page:             page{buf: buffer, typ: format.PageType_DATA_PAGE, nvals: num, encoding: format.Encoding(encoding)},
   125  			defLvlEncoding:   format.Encoding(defEncoding),
   126  			repLvlEncoding:   format.Encoding(repEncoding),
   127  			uncompressedSize: uncompressedSize,
   128  		}
   129  	}
   130  
   131  	dp.buf, dp.nvals = buffer, num
   132  	dp.encoding = format.Encoding(encoding)
   133  	dp.defLvlEncoding, dp.repLvlEncoding = format.Encoding(defEncoding), format.Encoding(repEncoding)
   134  	dp.statistics.HasMax, dp.statistics.HasMin = false, false
   135  	dp.statistics.HasNullCount, dp.statistics.HasDistinctCount = false, false
   136  	dp.uncompressedSize = uncompressedSize
   137  	return dp
   138  }
   139  
   140  // NewDataPageV1WithStats is the same as NewDataPageV1, but also allows adding the stat info into the created page
   141  func NewDataPageV1WithStats(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int32, stats metadata.EncodedStatistics) *DataPageV1 {
   142  	ret := NewDataPageV1(buffer, num, encoding, defEncoding, repEncoding, uncompressedSize)
   143  	ret.statistics = stats
   144  	return ret
   145  }
   146  
   147  // Release this page back into the DataPage object pool so that it can be reused.
   148  //
   149  // After calling this function, the object should not be utilized anymore, otherwise
   150  // conflicts can arise.
   151  func (d *DataPageV1) Release() {
   152  	d.buf.Release()
   153  	d.buf = nil
   154  	dataPageV1Pool.Put(d)
   155  }
   156  
   157  // UncompressedSize returns the size of the data in this data page when uncompressed
   158  func (d *DataPageV1) UncompressedSize() int32 { return d.uncompressedSize }
   159  
   160  // Statistics returns the encoded statistics on this data page
   161  func (d *DataPageV1) Statistics() metadata.EncodedStatistics { return d.statistics }
   162  
   163  // DefinitionLevelEncoding returns the encoding utilized for the Definition Levels
   164  func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding {
   165  	return parquet.Encoding(d.defLvlEncoding)
   166  }
   167  
   168  // RepetitionLevelEncoding returns the encoding utilized for the Repetition Levels
   169  func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding {
   170  	return parquet.Encoding(d.repLvlEncoding)
   171  }
   172  
// DataPageV2 is the representation of the V2 data page from the parquet.thrift spec
type DataPageV2 struct {
	page

	nulls            int32                      // reported number of nulls in the page
	nrows            int32                      // number of rows recorded in the page header
	defLvlByteLen    int32                      // bytes of the buffer holding definition levels
	repLvlByteLen    int32                      // bytes of the buffer holding repetition levels
	compressed       bool                       // whether the page data is compressed
	uncompressedSize int32                      // size of the raw page when uncompressed
	statistics       metadata.EncodedStatistics // encoded statistics for the page
}
   185  
   186  // NewDataPageV2 constructs a new V2 data page with the provided information and a buffer of the raw data.
   187  func NewDataPageV2(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen, uncompressed int32, isCompressed bool) *DataPageV2 {
   188  	dp := dataPageV2Pool.Get().(*DataPageV2)
   189  	if dp == nil {
   190  		return &DataPageV2{
   191  			page:             page{buf: buffer, typ: format.PageType_DATA_PAGE_V2, nvals: numValues, encoding: format.Encoding(encoding)},
   192  			nulls:            numNulls,
   193  			nrows:            numRows,
   194  			defLvlByteLen:    defLvlsByteLen,
   195  			repLvlByteLen:    repLvlsByteLen,
   196  			compressed:       isCompressed,
   197  			uncompressedSize: uncompressed,
   198  		}
   199  	}
   200  
   201  	dp.buf, dp.nvals = buffer, numValues
   202  	dp.encoding = format.Encoding(encoding)
   203  	dp.nulls, dp.nrows = numNulls, numRows
   204  	dp.defLvlByteLen, dp.repLvlByteLen = defLvlsByteLen, repLvlsByteLen
   205  	dp.compressed, dp.uncompressedSize = isCompressed, uncompressed
   206  	dp.statistics.HasMax, dp.statistics.HasMin = false, false
   207  	dp.statistics.HasNullCount, dp.statistics.HasDistinctCount = false, false
   208  	return dp
   209  }
   210  
   211  // NewDataPageV2WithStats is the same as NewDataPageV2 but allows providing the encoded stats with the page.
   212  func NewDataPageV2WithStats(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen, uncompressed int32, isCompressed bool, stats metadata.EncodedStatistics) *DataPageV2 {
   213  	ret := NewDataPageV2(buffer, numValues, numNulls, numRows, encoding, defLvlsByteLen, repLvlsByteLen, uncompressed, isCompressed)
   214  	ret.statistics = stats
   215  	return ret
   216  }
   217  
   218  // Release this page back into the DataPage object pool so that it can be reused.
   219  //
   220  // After calling this function, the object should not be utilized anymore, otherwise
   221  // conflicts can arise.
   222  func (d *DataPageV2) Release() {
   223  	d.buf.Release()
   224  	d.buf = nil
   225  	dataPageV2Pool.Put(d)
   226  }
   227  
   228  // UncompressedSize is the size of the raw page when uncompressed. If `IsCompressed` is true, then
   229  // the raw data in the buffer is expected to be compressed.
   230  func (d *DataPageV2) UncompressedSize() int32 { return d.uncompressedSize }
   231  
   232  // Statistics are the encoded statistics in the data page
   233  func (d *DataPageV2) Statistics() metadata.EncodedStatistics { return d.statistics }
   234  
   235  // NumNulls is the reported number of nulls in this datapage
   236  func (d *DataPageV2) NumNulls() int32 { return d.nulls }
   237  
   238  // NumRows is the number of rows recorded in the page header
   239  func (d *DataPageV2) NumRows() int32 { return d.nrows }
   240  
   241  // DefinitionLevelByteLen is the number of bytes in the buffer that are used to represent the definition levels
   242  func (d *DataPageV2) DefinitionLevelByteLen() int32 { return d.defLvlByteLen }
   243  
   244  // RepetitionLevelByteLen is the number of bytes in the buffer which are used to represent the repetition Levels
   245  func (d *DataPageV2) RepetitionLevelByteLen() int32 { return d.repLvlByteLen }
   246  
   247  // IsCompressed returns true if the data of this page is compressed
   248  func (d *DataPageV2) IsCompressed() bool { return d.compressed }
   249  
// DictionaryPage represents a page of data that uses dictionary encoding
type DictionaryPage struct {
	page

	sorted bool // whether the dictionary itself is sorted; see IsSorted
}
   256  
   257  // NewDictionaryPage constructs a new dictionary page with the provided data buffer and number of values.
   258  func NewDictionaryPage(buffer *memory.Buffer, nvals int32, encoding parquet.Encoding) *DictionaryPage {
   259  	dp := dictPagePool.Get().(*DictionaryPage)
   260  	if dp == nil {
   261  		return &DictionaryPage{
   262  			page: page{
   263  				buf:      buffer,
   264  				typ:      format.PageType_DICTIONARY_PAGE,
   265  				nvals:    nvals,
   266  				encoding: format.Encoding(encoding),
   267  			},
   268  		}
   269  	}
   270  
   271  	dp.buf = buffer
   272  	dp.nvals = nvals
   273  	dp.encoding = format.Encoding(encoding)
   274  	dp.sorted = false
   275  	return dp
   276  }
   277  
   278  // Release this page back into the DataPage object pool so that it can be reused.
   279  //
   280  // After calling this function, the object should not be utilized anymore, otherwise
   281  // conflicts can arise.
   282  func (d *DictionaryPage) Release() {
   283  	d.buf.Release()
   284  	d.buf = nil
   285  	dictPagePool.Put(d)
   286  }
   287  
   288  // IsSorted returns whether the dictionary itself is sorted
   289  func (d *DictionaryPage) IsSorted() bool { return d.sorted }
   290  
// serializedPageReader is the PageReader implementation returned by
// NewPageReader. It pulls page headers and page bodies from a
// parquet.BufferedReader, optionally decrypting and decompressing them.
type serializedPageReader struct {
	r        parquet.BufferedReader // source of the serialized pages
	nrows    int64                  // Next stops once rowsSeen reaches this count
	rowsSeen int64                  // running total of NumValues over the data pages read so far
	mem      memory.Allocator       // allocator used for page buffers
	codec    compress.Codec         // codec used to decode page bodies

	curPageHdr        *format.PageHeader // header of the page being read
	pageOrd           int16              // incremented for every data page; used when updating the page AAD
	maxPageHeaderSize int                // upper bound on how many bytes are peeked for one header

	curPage           Page          // page returned by Page; replaced on every Next
	cryptoCtx         CryptoContext // decryption settings; zero value when no context was given
	dataPageAad       string        // module AAD built for data pages by initDecryption
	dataPageHeaderAad string        // module AAD built for data page headers by initDecryption

	decompressBuffer bytes.Buffer // scratch space holding the raw page body read in decompress
	err              error        // last error, reported by Err
}
   310  
   311  // NewPageReader returns a page reader for the data which can be read from the provided reader and compression.
   312  func NewPageReader(r parquet.BufferedReader, nrows int64, compressType compress.Compression, mem memory.Allocator, ctx *CryptoContext) (PageReader, error) {
   313  	if mem == nil {
   314  		mem = memory.NewGoAllocator()
   315  	}
   316  
   317  	codec, err := compress.GetCodec(compressType)
   318  	if err != nil {
   319  		return nil, err
   320  	}
   321  
   322  	rdr := &serializedPageReader{
   323  		r:                 r,
   324  		maxPageHeaderSize: defaultMaxPageHeaderSize,
   325  		nrows:             nrows,
   326  		mem:               mem,
   327  		codec:             codec,
   328  	}
   329  	rdr.decompressBuffer.Grow(defaultPageHeaderSize)
   330  	if ctx != nil {
   331  		rdr.cryptoCtx = *ctx
   332  		rdr.initDecryption()
   333  	}
   334  	return rdr, nil
   335  }
   336  
   337  func (p *serializedPageReader) Reset(r parquet.BufferedReader, nrows int64, compressType compress.Compression, ctx *CryptoContext) {
   338  	p.rowsSeen, p.pageOrd, p.nrows = 0, 0, nrows
   339  	p.curPageHdr, p.curPage, p.err = nil, nil, nil
   340  	p.r = r
   341  
   342  	p.codec, p.err = compress.GetCodec(compressType)
   343  	if p.err != nil {
   344  		return
   345  	}
   346  	p.decompressBuffer.Reset()
   347  	if ctx != nil {
   348  		p.cryptoCtx = *ctx
   349  		p.initDecryption()
   350  	} else {
   351  		p.cryptoCtx = CryptoContext{}
   352  		p.dataPageAad = ""
   353  		p.dataPageHeaderAad = ""
   354  	}
   355  }
   356  
   357  func (p *serializedPageReader) Err() error { return p.err }
   358  
   359  func (p *serializedPageReader) SetMaxPageHeaderSize(sz int) {
   360  	p.maxPageHeaderSize = sz
   361  }
   362  
   363  func (p *serializedPageReader) initDecryption() {
   364  	if p.cryptoCtx.DataDecryptor != nil {
   365  		p.dataPageAad = encryption.CreateModuleAad(p.cryptoCtx.DataDecryptor.FileAad(), encryption.DataPageModule,
   366  			p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1)
   367  	}
   368  	if p.cryptoCtx.MetaDecryptor != nil {
   369  		p.dataPageHeaderAad = encryption.CreateModuleAad(p.cryptoCtx.MetaDecryptor.FileAad(), encryption.DataPageHeaderModule,
   370  			p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1)
   371  	}
   372  }
   373  
   374  func (p *serializedPageReader) updateDecryption(decrypt encryption.Decryptor, moduleType int8, pageAad string) {
   375  	if p.cryptoCtx.StartDecryptWithDictionaryPage {
   376  		aad := encryption.CreateModuleAad(decrypt.FileAad(), moduleType, p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1)
   377  		decrypt.UpdateAad(aad)
   378  	} else {
   379  		pageaad := []byte(pageAad)
   380  		encryption.QuickUpdatePageAad(pageaad, p.pageOrd)
   381  		decrypt.UpdateAad(string(pageaad))
   382  	}
   383  }
   384  
   385  func (p *serializedPageReader) Page() Page {
   386  	return p.curPage
   387  }
   388  
   389  func (p *serializedPageReader) decompress(lenCompressed int, buf []byte) ([]byte, error) {
   390  	p.decompressBuffer.Grow(lenCompressed)
   391  	if _, err := io.CopyN(&p.decompressBuffer, p.r, int64(lenCompressed)); err != nil {
   392  		return nil, err
   393  	}
   394  
   395  	data := p.decompressBuffer.Bytes()
   396  	if p.cryptoCtx.DataDecryptor != nil {
   397  		data = p.cryptoCtx.DataDecryptor.Decrypt(p.decompressBuffer.Bytes())
   398  	}
   399  
   400  	return p.codec.Decode(buf, data), nil
   401  }
   402  
// dataheader is the subset of a data page header that extractStats needs; it
// lets the V1 and V2 data page headers be handled by the same code.
type dataheader interface {
	// IsSetStatistics reports whether the header carries statistics.
	IsSetStatistics() bool
	// GetStatistics returns the statistics stored in the header.
	GetStatistics() *format.Statistics
}
   407  
   408  func extractStats(dataHeader dataheader) (pageStats metadata.EncodedStatistics) {
   409  	if dataHeader.IsSetStatistics() {
   410  		stats := dataHeader.GetStatistics()
   411  		if stats.IsSetMaxValue() {
   412  			pageStats.SetMax(stats.GetMaxValue())
   413  		} else if stats.IsSetMax() {
   414  			pageStats.SetMax(stats.GetMax())
   415  		}
   416  		if stats.IsSetMinValue() {
   417  			pageStats.SetMin(stats.GetMinValue())
   418  		} else if stats.IsSetMin() {
   419  			pageStats.SetMin(stats.GetMin())
   420  		}
   421  
   422  		if stats.IsSetNullCount() {
   423  			pageStats.SetNullCount(stats.GetNullCount())
   424  		}
   425  		if stats.IsSetDistinctCount() {
   426  			pageStats.SetDistinctCount(stats.GetDistinctCount())
   427  		}
   428  	}
   429  	return
   430  }
   431  
   432  func (p *serializedPageReader) Next() bool {
   433  	// Loop here because there may be unhandled page types that we skip until
   434  	// finding a page that we do know what to do with
   435  	if p.curPage != nil {
   436  		p.curPage.Release()
   437  	}
   438  	p.curPage = nil
   439  	p.curPageHdr = format.NewPageHeader()
   440  	p.err = nil
   441  
   442  	for p.rowsSeen < p.nrows {
   443  		allowedPgSz := defaultPageHeaderSize
   444  		p.decompressBuffer.Reset()
   445  		for {
   446  			view, err := p.r.Peek(allowedPgSz)
   447  			if err != nil && err != io.EOF {
   448  				p.err = err
   449  				return false
   450  			}
   451  
   452  			if len(view) == 0 {
   453  				return false
   454  			}
   455  
   456  			extra := 0
   457  			if p.cryptoCtx.MetaDecryptor != nil {
   458  				p.updateDecryption(p.cryptoCtx.MetaDecryptor, encryption.DictPageHeaderModule, p.dataPageHeaderAad)
   459  				view = p.cryptoCtx.MetaDecryptor.Decrypt(view)
   460  				extra = p.cryptoCtx.MetaDecryptor.CiphertextSizeDelta()
   461  			}
   462  
   463  			remaining, err := thrift.DeserializeThrift(p.curPageHdr, view)
   464  			if err != nil {
   465  				allowedPgSz *= 2
   466  				if allowedPgSz > p.maxPageHeaderSize {
   467  					p.err = xerrors.New("parquet: deserializing page header failed")
   468  					return false
   469  				}
   470  				continue
   471  			}
   472  
   473  			p.r.Discard(len(view) - int(remaining) + extra)
   474  			break
   475  		}
   476  
   477  		lenCompressed := int(p.curPageHdr.GetCompressedPageSize())
   478  		lenUncompressed := int(p.curPageHdr.GetUncompressedPageSize())
   479  		if lenCompressed < 0 || lenUncompressed < 0 {
   480  			p.err = xerrors.New("parquet: invalid page header")
   481  			return false
   482  		}
   483  
   484  		if p.cryptoCtx.DataDecryptor != nil {
   485  			p.updateDecryption(p.cryptoCtx.DataDecryptor, encryption.DictPageModule, p.dataPageAad)
   486  		}
   487  
   488  		buf := memory.NewResizableBuffer(p.mem)
   489  		defer buf.Release()
   490  		buf.ResizeNoShrink(lenUncompressed)
   491  
   492  		switch p.curPageHdr.GetType() {
   493  		case format.PageType_DICTIONARY_PAGE:
   494  			p.cryptoCtx.StartDecryptWithDictionaryPage = false
   495  			dictHeader := p.curPageHdr.GetDictionaryPageHeader()
   496  			if dictHeader.GetNumValues() < 0 {
   497  				p.err = xerrors.New("parquet: invalid page header (negative number of values)")
   498  				return false
   499  			}
   500  
   501  			data, err := p.decompress(lenCompressed, buf.Bytes())
   502  			if err != nil {
   503  				p.err = err
   504  				return false
   505  			}
   506  			if len(data) != lenUncompressed {
   507  				p.err = fmt.Errorf("parquet: metadata said %d bytes uncompressed dictionary page, got %d bytes", lenUncompressed, len(data))
   508  				return false
   509  			}
   510  
   511  			// make dictionary page
   512  			p.curPage = &DictionaryPage{
   513  				page: page{
   514  					buf:      memory.NewBufferBytes(data),
   515  					typ:      p.curPageHdr.Type,
   516  					nvals:    dictHeader.GetNumValues(),
   517  					encoding: dictHeader.GetEncoding(),
   518  				},
   519  				sorted: dictHeader.IsSetIsSorted() && dictHeader.GetIsSorted(),
   520  			}
   521  
   522  		case format.PageType_DATA_PAGE:
   523  			p.pageOrd++
   524  			dataHeader := p.curPageHdr.GetDataPageHeader()
   525  			if dataHeader.GetNumValues() < 0 {
   526  				p.err = xerrors.New("parquet: invalid page header (negative number of values)")
   527  				return false
   528  			}
   529  
   530  			p.rowsSeen += int64(dataHeader.GetNumValues())
   531  			data, err := p.decompress(lenCompressed, buf.Bytes())
   532  			if err != nil {
   533  				p.err = err
   534  				return false
   535  			}
   536  			if len(data) != lenUncompressed {
   537  				p.err = fmt.Errorf("parquet: metadata said %d bytes uncompressed data page, got %d bytes", lenUncompressed, len(data))
   538  				return false
   539  			}
   540  
   541  			// make datapagev1
   542  			p.curPage = &DataPageV1{
   543  				page: page{
   544  					buf:      memory.NewBufferBytes(data),
   545  					typ:      p.curPageHdr.Type,
   546  					nvals:    dataHeader.GetNumValues(),
   547  					encoding: dataHeader.GetEncoding(),
   548  				},
   549  				defLvlEncoding:   dataHeader.GetDefinitionLevelEncoding(),
   550  				repLvlEncoding:   dataHeader.GetRepetitionLevelEncoding(),
   551  				uncompressedSize: int32(lenUncompressed),
   552  				statistics:       extractStats(dataHeader),
   553  			}
   554  		case format.PageType_DATA_PAGE_V2:
   555  			p.pageOrd++
   556  			dataHeader := p.curPageHdr.GetDataPageHeaderV2()
   557  			if dataHeader.GetNumValues() < 0 {
   558  				p.err = xerrors.New("parquet: invalid page header (negative number of values)")
   559  				return false
   560  			}
   561  
   562  			if dataHeader.GetDefinitionLevelsByteLength() < 0 || dataHeader.GetRepetitionLevelsByteLength() < 0 {
   563  				p.err = xerrors.New("parquet: invalid page header (negative levels byte length)")
   564  				return false
   565  			}
   566  
   567  			compressed := dataHeader.GetIsCompressed()
   568  			// extract stats
   569  			p.rowsSeen += int64(dataHeader.GetNumValues())
   570  			levelsBytelen, ok := overflow.Add(int(dataHeader.GetDefinitionLevelsByteLength()), int(dataHeader.GetRepetitionLevelsByteLength()))
   571  			if !ok {
   572  				p.err = xerrors.New("parquet: levels size too large (corrupt file?)")
   573  				return false
   574  			}
   575  
   576  			if compressed {
   577  				if levelsBytelen > 0 {
   578  					io.ReadFull(p.r, buf.Bytes()[:levelsBytelen])
   579  				}
   580  				if _, p.err = p.decompress(lenCompressed-levelsBytelen, buf.Bytes()[levelsBytelen:]); p.err != nil {
   581  					return false
   582  				}
   583  			} else {
   584  				io.ReadFull(p.r, buf.Bytes())
   585  			}
   586  			buf.Retain()
   587  
   588  			if buf.Len() != lenUncompressed {
   589  				p.err = fmt.Errorf("parquet: metadata said %d bytes uncompressed data page, got %d bytes", lenUncompressed, buf.Len())
   590  				return false
   591  			}
   592  
   593  			// make datapage v2
   594  			p.curPage = &DataPageV2{
   595  				page: page{
   596  					buf:      buf,
   597  					typ:      p.curPageHdr.Type,
   598  					nvals:    dataHeader.GetNumValues(),
   599  					encoding: dataHeader.GetEncoding(),
   600  				},
   601  				nulls:            dataHeader.GetNumNulls(),
   602  				nrows:            dataHeader.GetNumRows(),
   603  				defLvlByteLen:    dataHeader.GetDefinitionLevelsByteLength(),
   604  				repLvlByteLen:    dataHeader.GetRepetitionLevelsByteLength(),
   605  				compressed:       compressed,
   606  				uncompressedSize: int32(lenUncompressed),
   607  				statistics:       extractStats(dataHeader),
   608  			}
   609  		default:
   610  			// we don't know this page type, we're allowed to skip non-data pages
   611  			continue
   612  		}
   613  		return true
   614  	}
   615  
   616  	return false
   617  }