github.com/apache/arrow/go/v7@v7.0.1/parquet/file/page_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"bytes"
    21  	"io"
    22  	"sync"
    23  
    24  	"github.com/JohnCGriffin/overflow"
    25  	"github.com/apache/arrow/go/v7/arrow/ipc"
    26  	"github.com/apache/arrow/go/v7/arrow/memory"
    27  	"github.com/apache/arrow/go/v7/parquet"
    28  	"github.com/apache/arrow/go/v7/parquet/compress"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
    30  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v7/parquet/internal/thrift"
    32  	"github.com/apache/arrow/go/v7/parquet/metadata"
    33  	"golang.org/x/xerrors"
    34  )
    35  
// PageReader is the interface used by the columnreader in order to read
// and handle DataPages and loop through them.
//
// The methods are meant to be used together: Next advances to a page, Page
// returns it, and Err explains why Next returned false.
type PageReader interface {
	// SetMaxPageHeaderSize sets the maximum page header size allowed to be read.
	SetMaxPageHeaderSize(int)
	// Page returns the current page, or nil if there are no more.
	Page() Page
	// Next fetches the next page; it returns false if there are no more pages.
	Next() bool
	// Err is meaningful once Next has returned false: it returns the error
	// that was encountered, or nil if there was no error and the reader
	// simply hit the end of the pages.
	Err() error
	// Reset allows reusing a page reader for a new source, row count,
	// compression type and (optional) crypto context.
	Reset(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, ctx *CryptoContext)
}
    51  
// Page is an interface for handling DataPages or Dictionary Pages.
type Page interface {
	// Type returns which kind of page this is.
	Type() format.PageType
	// Data returns the raw bytes of this page.
	Data() []byte
	// Encoding returns the encoding used for this page, Plain/RLE, etc.
	Encoding() format.Encoding
	// NumValues returns the number of values in this page.
	NumValues() int32
	// Release returns this page object to the page pool for re-use; the
	// page should not be used after it has been released.
	Release()
}
    65  
// page holds the state shared by every concrete page type in this file and
// supplies the common accessor methods of the Page interface.
type page struct {
	// buf holds the page's bytes; Data returns buf.Bytes().
	buf *memory.Buffer
	// typ is the page type reported by Type.
	typ format.PageType

	// nvals is the value count reported by NumValues.
	nvals int32
	// encoding is the encoding reported by Encoding.
	encoding format.Encoding
}
    73  
    74  func (p *page) Type() format.PageType     { return p.typ }
    75  func (p *page) Data() []byte              { return p.buf.Bytes() }
    76  func (p *page) NumValues() int32          { return p.nvals }
    77  func (p *page) Encoding() format.Encoding { return p.encoding }
    78  
// DataPage is the base interface for both DataPageV1 and DataPageV2 of the
// parquet spec.
type DataPage interface {
	Page
	// UncompressedSize returns the size of the page's data when uncompressed.
	UncompressedSize() int32
	// Statistics returns the encoded statistics carried by the page.
	Statistics() metadata.EncodedStatistics
}
    86  
    87  // Create some pools to use for reusing the data page objects themselves so that
    88  // we can avoid tight loops that are creating and destroying tons of individual
    89  // objects. This combined with a Release function on the pages themselves
    90  // which will put them back into the pool yields significant memory reduction
    91  // and performance benefits
    92  
    93  var dataPageV1Pool = sync.Pool{
    94  	New: func() interface{} { return (*DataPageV1)(nil) },
    95  }
    96  
    97  var dataPageV2Pool = sync.Pool{
    98  	New: func() interface{} { return (*DataPageV2)(nil) },
    99  }
   100  
   101  var dictPagePool = sync.Pool{
   102  	New: func() interface{} { return (*DictionaryPage)(nil) },
   103  }
   104  
// DataPageV1 represents a DataPage version 1 from the parquet.thrift file
type DataPageV1 struct {
	page

	// defLvlEncoding is the encoding reported by DefinitionLevelEncoding.
	defLvlEncoding format.Encoding
	// repLvlEncoding is the encoding reported by RepetitionLevelEncoding.
	repLvlEncoding format.Encoding
	// uncompressedSize is the size reported by UncompressedSize.
	uncompressedSize int32
	// statistics are the encoded statistics reported by Statistics.
	statistics metadata.EncodedStatistics
}
   114  
   115  // NewDataPageV1 returns a V1 data page with the given buffer as its data and the specified encoding information
   116  //
   117  // Will utilize objects that have been released back into the data page pool and
   118  // re-use them if available as opposed to creating new objects. Calling Release on the
   119  // data page object will release it back to the pool for re-use.
   120  func NewDataPageV1(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int32) *DataPageV1 {
   121  	dp := dataPageV1Pool.Get().(*DataPageV1)
   122  	if dp == nil {
   123  		return &DataPageV1{
   124  			page:             page{buf: buffer, typ: format.PageType_DATA_PAGE, nvals: num, encoding: format.Encoding(encoding)},
   125  			defLvlEncoding:   format.Encoding(defEncoding),
   126  			repLvlEncoding:   format.Encoding(repEncoding),
   127  			uncompressedSize: uncompressedSize,
   128  		}
   129  	}
   130  
   131  	dp.buf, dp.nvals = buffer, num
   132  	dp.encoding = format.Encoding(encoding)
   133  	dp.defLvlEncoding, dp.repLvlEncoding = format.Encoding(defEncoding), format.Encoding(repEncoding)
   134  	dp.statistics.HasMax, dp.statistics.HasMin = false, false
   135  	dp.statistics.HasNullCount, dp.statistics.HasDistinctCount = false, false
   136  	dp.uncompressedSize = uncompressedSize
   137  	return dp
   138  }
   139  
   140  // NewDataPageV1WithStats is the same as NewDataPageV1, but also allows adding the stat info into the created page
   141  func NewDataPageV1WithStats(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int32, stats metadata.EncodedStatistics) *DataPageV1 {
   142  	ret := NewDataPageV1(buffer, num, encoding, defEncoding, repEncoding, uncompressedSize)
   143  	ret.statistics = stats
   144  	return ret
   145  }
   146  
// Release this page back into the DataPage object pool so that it can be reused.
//
// After calling this function, the object should not be utilized anymore, otherwise
// conflicts can arise.
func (d *DataPageV1) Release() {
	// Release the page's buffer first, then drop the reference so the pooled
	// object does not keep pointing at it.
	// NOTE(review): a second Release would call Release on a nil buffer;
	// whether that is safe depends on memory.Buffer — confirm.
	d.buf.Release()
	d.buf = nil
	// Hand the (now buffer-less) object back for NewDataPageV1 to re-use.
	dataPageV1Pool.Put(d)
}
   156  
   157  // UncompressedSize returns the size of the data in this data page when uncompressed
   158  func (d *DataPageV1) UncompressedSize() int32 { return d.uncompressedSize }
   159  
   160  // Statistics returns the encoded statistics on this data page
   161  func (d *DataPageV1) Statistics() metadata.EncodedStatistics { return d.statistics }
   162  
   163  // DefinitionLevelEncoding returns the encoding utilized for the Definition Levels
   164  func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding {
   165  	return parquet.Encoding(d.defLvlEncoding)
   166  }
   167  
   168  // RepetitionLevelEncoding returns the encoding utilized for the Repetition Levels
   169  func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding {
   170  	return parquet.Encoding(d.repLvlEncoding)
   171  }
   172  
// DataPageV2 is the representation of the V2 data page from the parquet.thrift spec
type DataPageV2 struct {
	page

	// nulls is the null count reported by NumNulls.
	nulls int32
	// nrows is the row count reported by NumRows.
	nrows int32
	// defLvlByteLen is the byte length reported by DefinitionLevelByteLen.
	defLvlByteLen int32
	// repLvlByteLen is the byte length reported by RepetitionLevelByteLen.
	repLvlByteLen int32
	// compressed is the flag reported by IsCompressed.
	compressed bool
	// uncompressedSize is the size reported by UncompressedSize.
	uncompressedSize int32
	// statistics are the encoded statistics reported by Statistics.
	statistics metadata.EncodedStatistics
}
   185  
   186  // NewDataPageV2 constructs a new V2 data page with the provided information and a buffer of the raw data.
   187  func NewDataPageV2(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen, uncompressed int32, isCompressed bool) *DataPageV2 {
   188  	dp := dataPageV2Pool.Get().(*DataPageV2)
   189  	if dp == nil {
   190  		return &DataPageV2{
   191  			page:             page{buf: buffer, typ: format.PageType_DATA_PAGE_V2, nvals: numValues, encoding: format.Encoding(encoding)},
   192  			nulls:            numNulls,
   193  			nrows:            numRows,
   194  			defLvlByteLen:    defLvlsByteLen,
   195  			repLvlByteLen:    repLvlsByteLen,
   196  			compressed:       isCompressed,
   197  			uncompressedSize: uncompressed,
   198  		}
   199  	}
   200  
   201  	dp.buf, dp.nvals = buffer, numValues
   202  	dp.encoding = format.Encoding(encoding)
   203  	dp.nulls, dp.nrows = numNulls, numRows
   204  	dp.defLvlByteLen, dp.repLvlByteLen = defLvlsByteLen, repLvlsByteLen
   205  	dp.compressed, dp.uncompressedSize = isCompressed, uncompressed
   206  	dp.statistics.HasMax, dp.statistics.HasMin = false, false
   207  	dp.statistics.HasNullCount, dp.statistics.HasDistinctCount = false, false
   208  	return dp
   209  }
   210  
   211  // NewDataPageV2WithStats is the same as NewDataPageV2 but allows providing the encoded stats with the page.
   212  func NewDataPageV2WithStats(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen, uncompressed int32, isCompressed bool, stats metadata.EncodedStatistics) *DataPageV2 {
   213  	ret := NewDataPageV2(buffer, numValues, numNulls, numRows, encoding, defLvlsByteLen, repLvlsByteLen, uncompressed, isCompressed)
   214  	ret.statistics = stats
   215  	return ret
   216  }
   217  
// Release this page back into the DataPage object pool so that it can be reused.
//
// After calling this function, the object should not be utilized anymore, otherwise
// conflicts can arise.
func (d *DataPageV2) Release() {
	// Release the page's buffer first, then drop the reference so the pooled
	// object does not keep pointing at it.
	// NOTE(review): a second Release would call Release on a nil buffer;
	// whether that is safe depends on memory.Buffer — confirm.
	d.buf.Release()
	d.buf = nil
	// Hand the (now buffer-less) object back for NewDataPageV2 to re-use.
	dataPageV2Pool.Put(d)
}
   227  
   228  // UncompressedSize is the size of the raw page when uncompressed. If `IsCompressed` is true, then
   229  // the raw data in the buffer is expected to be compressed.
   230  func (d *DataPageV2) UncompressedSize() int32 { return d.uncompressedSize }
   231  
   232  // Statistics are the encoded statistics in the data page
   233  func (d *DataPageV2) Statistics() metadata.EncodedStatistics { return d.statistics }
   234  
   235  // NumNulls is the reported number of nulls in this datapage
   236  func (d *DataPageV2) NumNulls() int32 { return d.nulls }
   237  
   238  // NumRows is the number of rows recorded in the page header
   239  func (d *DataPageV2) NumRows() int32 { return d.nrows }
   240  
   241  // DefinitionLevelByteLen is the number of bytes in the buffer that are used to represent the definition levels
   242  func (d *DataPageV2) DefinitionLevelByteLen() int32 { return d.defLvlByteLen }
   243  
   244  // RepetitionLevelByteLen is the number of bytes in the buffer which are used to represent the repetition Levels
   245  func (d *DataPageV2) RepetitionLevelByteLen() int32 { return d.repLvlByteLen }
   246  
   247  // IsCompressed returns true if the data of this page is compressed
   248  func (d *DataPageV2) IsCompressed() bool { return d.compressed }
   249  
// DictionaryPage represents a page of data that uses dictionary encoding
type DictionaryPage struct {
	page

	// sorted is the flag reported by IsSorted.
	sorted bool
}
   256  
   257  // NewDictionaryPage constructs a new dictionary page with the provided data buffer and number of values.
   258  func NewDictionaryPage(buffer *memory.Buffer, nvals int32, encoding parquet.Encoding) *DictionaryPage {
   259  	dp := dictPagePool.Get().(*DictionaryPage)
   260  	if dp == nil {
   261  		return &DictionaryPage{
   262  			page: page{
   263  				buf:      buffer,
   264  				typ:      format.PageType_DICTIONARY_PAGE,
   265  				nvals:    nvals,
   266  				encoding: format.Encoding(encoding),
   267  			},
   268  		}
   269  	}
   270  
   271  	dp.buf = buffer
   272  	dp.nvals = nvals
   273  	dp.encoding = format.Encoding(encoding)
   274  	dp.sorted = false
   275  	return dp
   276  }
   277  
// Release this page back into the dictionary page object pool so that it can be reused.
//
// After calling this function, the object should not be utilized anymore, otherwise
// conflicts can arise.
func (d *DictionaryPage) Release() {
	// Release the page's buffer first, then drop the reference so the pooled
	// object does not keep pointing at it.
	// NOTE(review): a second Release would call Release on a nil buffer;
	// whether that is safe depends on memory.Buffer — confirm.
	d.buf.Release()
	d.buf = nil
	// Hand the (now buffer-less) object back for NewDictionaryPage to re-use.
	dictPagePool.Put(d)
}
   287  
   288  // IsSorted returns whether the dictionary itself is sorted
   289  func (d *DictionaryPage) IsSorted() bool { return d.sorted }
   290  
// serializedPageReader is the PageReader implementation returned by
// NewPageReader. It reads page headers and page bodies sequentially from r.
type serializedPageReader struct {
	// r is the source the pages are read from.
	r ipc.ReadAtSeeker
	// nrows bounds the read loop: Next keeps going while rowsSeen < nrows.
	nrows int64
	// rowsSeen is advanced by the value count of every data page read.
	rowsSeen int64
	// mem is the allocator used for the reader's scratch buffer.
	mem memory.Allocator
	// codec decodes page bodies.
	codec compress.Codec

	// curPageHdr is the header most recently deserialized by Next.
	curPageHdr *format.PageHeader
	// buf is the scratch buffer the current page is decoded into; Next
	// replaces it with a fresh buffer after each page it returns.
	buf *memory.Buffer
	// pageOrd counts the data pages read so far; it is used when updating
	// the page AAD for decryption.
	pageOrd int16
	// maxPageHeaderSize caps how far Next grows its header read window.
	maxPageHeaderSize int

	// curPage is the page returned by Page.
	curPage Page
	// cryptoCtx holds the decryptors and ordinals used for encrypted columns.
	cryptoCtx CryptoContext
	// dataPageAad and dataPageHeaderAad are the AAD strings prepared by
	// initDecryption for page bodies and page headers respectively.
	dataPageAad       string
	dataPageHeaderAad string

	// decompressBuffer receives the raw bytes read from r, both for page
	// headers and for page bodies prior to decoding.
	decompressBuffer bytes.Buffer
	// err is the error reported by Err.
	err error
}
   311  
   312  // NewPageReader returns a page reader for the data which can be read from the provided reader and compression.
   313  func NewPageReader(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, mem memory.Allocator, ctx *CryptoContext) (PageReader, error) {
   314  	if mem == nil {
   315  		mem = memory.NewGoAllocator()
   316  	}
   317  
   318  	codec, err := compress.GetCodec(compressType)
   319  	if err != nil {
   320  		return nil, err
   321  	}
   322  
   323  	rdr := &serializedPageReader{
   324  		r:                 r,
   325  		maxPageHeaderSize: defaultMaxPageHeaderSize,
   326  		nrows:             nrows,
   327  		mem:               mem,
   328  		codec:             codec,
   329  		buf:               memory.NewResizableBuffer(mem),
   330  	}
   331  	rdr.decompressBuffer.Grow(defaultPageHeaderSize)
   332  	if ctx != nil {
   333  		rdr.cryptoCtx = *ctx
   334  		rdr.initDecryption()
   335  	}
   336  	return rdr, nil
   337  }
   338  
   339  func (p *serializedPageReader) Reset(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, ctx *CryptoContext) {
   340  	p.rowsSeen, p.pageOrd = 0, 0
   341  	p.curPageHdr, p.curPage, p.err = nil, nil, nil
   342  	p.r, p.nrows = r, nrows
   343  
   344  	p.codec, p.err = compress.GetCodec(compressType)
   345  	if p.err != nil {
   346  		return
   347  	}
   348  	p.buf.ResizeNoShrink(0)
   349  	p.decompressBuffer.Reset()
   350  	if ctx != nil {
   351  		p.cryptoCtx = *ctx
   352  		p.initDecryption()
   353  	} else {
   354  		p.cryptoCtx = CryptoContext{}
   355  		p.dataPageAad = ""
   356  		p.dataPageHeaderAad = ""
   357  	}
   358  }
   359  
   360  func (p *serializedPageReader) Err() error { return p.err }
   361  
   362  func (p *serializedPageReader) SetMaxPageHeaderSize(sz int) {
   363  	p.maxPageHeaderSize = sz
   364  }
   365  
   366  func (p *serializedPageReader) initDecryption() {
   367  	if p.cryptoCtx.DataDecryptor != nil {
   368  		p.dataPageAad = encryption.CreateModuleAad(p.cryptoCtx.DataDecryptor.FileAad(), encryption.DataPageModule,
   369  			p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1)
   370  	}
   371  	if p.cryptoCtx.MetaDecryptor != nil {
   372  		p.dataPageHeaderAad = encryption.CreateModuleAad(p.cryptoCtx.MetaDecryptor.FileAad(), encryption.DataPageHeaderModule,
   373  			p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1)
   374  	}
   375  }
   376  
   377  func (p *serializedPageReader) updateDecryption(decrypt encryption.Decryptor, moduleType int8, pageAad string) {
   378  	if p.cryptoCtx.StartDecryptWithDictionaryPage {
   379  		aad := encryption.CreateModuleAad(decrypt.FileAad(), moduleType, p.cryptoCtx.RowGroupOrdinal, p.cryptoCtx.ColumnOrdinal, -1)
   380  		decrypt.UpdateAad(aad)
   381  	} else {
   382  		pageaad := []byte(pageAad)
   383  		encryption.QuickUpdatePageAad(pageaad, p.pageOrd)
   384  		decrypt.UpdateAad(string(pageaad))
   385  	}
   386  }
   387  
   388  func (p *serializedPageReader) Page() Page {
   389  	return p.curPage
   390  }
   391  
   392  func (p *serializedPageReader) decompress(lenCompressed int, buf []byte) ([]byte, error) {
   393  	p.decompressBuffer.Reset()
   394  	p.decompressBuffer.Grow(lenCompressed)
   395  	if _, err := io.CopyN(&p.decompressBuffer, p.r, int64(lenCompressed)); err != nil {
   396  		return nil, err
   397  	}
   398  
   399  	data := p.decompressBuffer.Bytes()
   400  	if p.cryptoCtx.DataDecryptor != nil {
   401  		data = p.cryptoCtx.DataDecryptor.Decrypt(p.decompressBuffer.Bytes())
   402  	}
   403  
   404  	return p.codec.Decode(buf, data), nil
   405  }
   406  
// dataheader is the subset of a data page header that extractStats needs: a
// way to ask whether statistics are present and a way to fetch them.
type dataheader interface {
	// IsSetStatistics reports whether the header carries statistics.
	IsSetStatistics() bool
	// GetStatistics returns the header's statistics.
	GetStatistics() *format.Statistics
}
   411  
   412  func extractStats(dataHeader dataheader) (pageStats metadata.EncodedStatistics) {
   413  	if dataHeader.IsSetStatistics() {
   414  		stats := dataHeader.GetStatistics()
   415  		if stats.IsSetMaxValue() {
   416  			pageStats.SetMax(stats.GetMaxValue())
   417  		} else if stats.IsSetMax() {
   418  			pageStats.SetMax(stats.GetMax())
   419  		}
   420  		if stats.IsSetMinValue() {
   421  			pageStats.SetMin(stats.GetMinValue())
   422  		} else if stats.IsSetMin() {
   423  			pageStats.SetMin(stats.GetMin())
   424  		}
   425  
   426  		if stats.IsSetNullCount() {
   427  			pageStats.SetNullCount(stats.GetNullCount())
   428  		}
   429  		if stats.IsSetDistinctCount() {
   430  			pageStats.SetDistinctCount(stats.GetDistinctCount())
   431  		}
   432  	}
   433  	return
   434  }
   435  
   436  func (p *serializedPageReader) Next() bool {
   437  	// Loop here because there may be unhandled page types that we skip until
   438  	// finding a page that we do know what to do with
   439  	if p.curPage != nil {
   440  		p.curPage.Release()
   441  	}
   442  	p.curPage = nil
   443  	p.curPageHdr = format.NewPageHeader()
   444  	p.err = nil
   445  
   446  	for p.rowsSeen < p.nrows {
   447  		// headerSize := 0
   448  		allowedPgSz := defaultPageHeaderSize
   449  
   450  		start, _ := p.r.Seek(0, io.SeekCurrent)
   451  		p.decompressBuffer.Reset()
   452  		// Page headers can be very large because of page statistics
   453  		// We try to deserialize a larger buffer progressively
   454  		// until a maximum allowed header limit
   455  		for {
   456  			n, err := io.CopyN(&p.decompressBuffer, p.r, int64(allowedPgSz))
   457  			// view, err := p.r.Peek(allowedPgSz)
   458  			if err != nil && err != io.EOF {
   459  				p.err = err
   460  				return false
   461  			}
   462  
   463  			if n == 0 {
   464  				return false
   465  			}
   466  
   467  			view := p.decompressBuffer.Bytes()
   468  
   469  			extra := 0
   470  			if p.cryptoCtx.MetaDecryptor != nil {
   471  				p.updateDecryption(p.cryptoCtx.MetaDecryptor, encryption.DictPageHeaderModule, p.dataPageHeaderAad)
   472  				view = p.cryptoCtx.MetaDecryptor.Decrypt(view)
   473  				extra = p.cryptoCtx.MetaDecryptor.CiphertextSizeDelta()
   474  			}
   475  
   476  			remaining, err := thrift.DeserializeThrift(p.curPageHdr, view)
   477  			if err != nil {
   478  				allowedPgSz *= 2
   479  				if allowedPgSz > p.maxPageHeaderSize {
   480  					p.err = xerrors.New("parquet: deserializing page header failed")
   481  					return false
   482  				}
   483  				continue
   484  			}
   485  
   486  			p.r.Seek(start+int64(len(view)-int(remaining)+extra), io.SeekStart)
   487  			break
   488  		}
   489  
   490  		lenCompressed := int(p.curPageHdr.GetCompressedPageSize())
   491  		lenUncompressed := int(p.curPageHdr.GetUncompressedPageSize())
   492  		if lenCompressed < 0 || lenUncompressed < 0 {
   493  			p.err = xerrors.New("parquet: invalid page header")
   494  			return false
   495  		}
   496  
   497  		if p.cryptoCtx.DataDecryptor != nil {
   498  			p.updateDecryption(p.cryptoCtx.DataDecryptor, encryption.DictPageModule, p.dataPageAad)
   499  		}
   500  
   501  		p.buf.ResizeNoShrink(lenUncompressed)
   502  
   503  		switch p.curPageHdr.GetType() {
   504  		case format.PageType_DICTIONARY_PAGE:
   505  			p.cryptoCtx.StartDecryptWithDictionaryPage = false
   506  			dictHeader := p.curPageHdr.GetDictionaryPageHeader()
   507  			if dictHeader.GetNumValues() < 0 {
   508  				p.err = xerrors.New("parquet: invalid page header (negative number of values)")
   509  				return false
   510  			}
   511  
   512  			data, err := p.decompress(lenCompressed, p.buf.Bytes())
   513  			if err != nil {
   514  				p.err = err
   515  				return false
   516  			}
   517  			if len(data) != lenUncompressed {
   518  				p.err = xerrors.Errorf("parquet: metadata said %d bytes uncompressed dictionary page, got %d bytes", lenUncompressed, len(data))
   519  				return false
   520  			}
   521  
   522  			// p.buf.Resize(lenUncompressed)
   523  			// make dictionary page
   524  			p.curPage = &DictionaryPage{
   525  				page: page{
   526  					buf:      memory.NewBufferBytes(data),
   527  					typ:      p.curPageHdr.Type,
   528  					nvals:    dictHeader.GetNumValues(),
   529  					encoding: dictHeader.GetEncoding(),
   530  				},
   531  				sorted: dictHeader.IsSetIsSorted() && dictHeader.GetIsSorted(),
   532  			}
   533  
   534  		case format.PageType_DATA_PAGE:
   535  			p.pageOrd++
   536  			dataHeader := p.curPageHdr.GetDataPageHeader()
   537  			if dataHeader.GetNumValues() < 0 {
   538  				p.err = xerrors.New("parquet: invalid page header (negative number of values)")
   539  				return false
   540  			}
   541  
   542  			p.rowsSeen += int64(dataHeader.GetNumValues())
   543  			data, err := p.decompress(lenCompressed, p.buf.Bytes())
   544  			if err != nil {
   545  				p.err = err
   546  				return false
   547  			}
   548  			if len(data) != lenUncompressed {
   549  				p.err = xerrors.Errorf("parquet: metadata said %d bytes uncompressed data page, got %d bytes", lenUncompressed, len(data))
   550  				return false
   551  			}
   552  
   553  			// make datapagev1
   554  			p.curPage = &DataPageV1{
   555  				page: page{
   556  					buf:      memory.NewBufferBytes(data),
   557  					typ:      p.curPageHdr.Type,
   558  					nvals:    dataHeader.GetNumValues(),
   559  					encoding: dataHeader.GetEncoding(),
   560  				},
   561  				defLvlEncoding:   dataHeader.GetDefinitionLevelEncoding(),
   562  				repLvlEncoding:   dataHeader.GetRepetitionLevelEncoding(),
   563  				uncompressedSize: int32(lenUncompressed),
   564  				statistics:       extractStats(dataHeader),
   565  			}
   566  		case format.PageType_DATA_PAGE_V2:
   567  			p.pageOrd++
   568  			dataHeader := p.curPageHdr.GetDataPageHeaderV2()
   569  			if dataHeader.GetNumValues() < 0 {
   570  				p.err = xerrors.New("parquet: invalid page header (negative number of values)")
   571  				return false
   572  			}
   573  
   574  			if dataHeader.GetDefinitionLevelsByteLength() < 0 || dataHeader.GetRepetitionLevelsByteLength() < 0 {
   575  				p.err = xerrors.New("parquet: invalid page header (negative levels byte length)")
   576  				return false
   577  			}
   578  
   579  			compressed := dataHeader.GetIsCompressed()
   580  			// extract stats
   581  			p.rowsSeen += int64(dataHeader.GetNumValues())
   582  			levelsBytelen, ok := overflow.Add(int(dataHeader.GetDefinitionLevelsByteLength()), int(dataHeader.GetRepetitionLevelsByteLength()))
   583  			if !ok {
   584  				p.err = xerrors.New("parquet: levels size too large (corrupt file?)")
   585  				return false
   586  			}
   587  
   588  			var data []byte
   589  			if compressed {
   590  				if levelsBytelen > 0 {
   591  					io.ReadFull(p.r, p.buf.Bytes()[:levelsBytelen])
   592  				}
   593  				if data, p.err = p.decompress(lenCompressed-levelsBytelen, p.buf.Bytes()[levelsBytelen:]); p.err != nil {
   594  					return false
   595  				}
   596  			} else {
   597  				io.ReadFull(p.r, p.buf.Bytes())
   598  				data = p.buf.Bytes()
   599  			}
   600  			if len(data) != lenUncompressed {
   601  				p.err = xerrors.Errorf("parquet: metadata said %d bytes uncompressed data page, got %d bytes", lenUncompressed, len(data))
   602  				return false
   603  			}
   604  
   605  			// make datapage v2
   606  			p.curPage = &DataPageV2{
   607  				page: page{
   608  					buf:      memory.NewBufferBytes(data),
   609  					typ:      p.curPageHdr.Type,
   610  					nvals:    dataHeader.GetNumValues(),
   611  					encoding: dataHeader.GetEncoding(),
   612  				},
   613  				nulls:            dataHeader.GetNumNulls(),
   614  				nrows:            dataHeader.GetNumRows(),
   615  				defLvlByteLen:    dataHeader.GetDefinitionLevelsByteLength(),
   616  				repLvlByteLen:    dataHeader.GetRepetitionLevelsByteLength(),
   617  				compressed:       compressed,
   618  				uncompressedSize: int32(lenUncompressed),
   619  				statistics:       extractStats(dataHeader),
   620  			}
   621  		default:
   622  			// we don't know this page type, we're allowed to skip non-data pages
   623  			continue
   624  		}
   625  
   626  		p.buf = memory.NewResizableBuffer(p.mem)
   627  		return true
   628  	}
   629  
   630  	return false
   631  }