github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/file.go (about)

     1  package parquet
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/binary"
     6  	"fmt"
     7  	"hash/crc32"
     8  	"io"
     9  	"sort"
    10  	"sync"
    11  
    12  	"github.com/segmentio/encoding/thrift"
    13  	"github.com/vc42/parquet-go/format"
    14  )
    15  
// Default buffer sizes (in bytes). Going by their names, these size the
// scratch buffers used when decoding dictionaries, reading pages, and
// decoding repetition/definition levels — usage sites are elsewhere in the
// package.
const (
	defaultDictBufferSize  = 8192
	defaultReadBufferSize  = 4096
	defaultLevelBufferSize = 1024
)
    21  
// File represents a parquet file. The layout of a Parquet file can be found
// here: https://github.com/apache/parquet-format#file-format
type File struct {
	metadata      format.FileMetaData    // footer metadata, decoded from the thrift payload
	protocol      thrift.CompactProtocol // protocol reused when decoding thrift sections
	reader        io.ReaderAt            // underlying reader the file content is read from
	size          int64                  // total size (in bytes) of the file
	schema        *Schema                // schema constructed from the root column
	root          *Column                // root of the column tree
	columnIndexes []format.ColumnIndex   // page index data; nil when SkipPageIndex was used or absent
	offsetIndexes []format.OffsetIndex   // page index data; nil when SkipPageIndex was used or absent
	rowGroups     []RowGroup             // wrappers over metadata.RowGroups
}
    35  
    36  // OpenFile opens a parquet file and reads the content between offset 0 and the given
    37  // size in r.
    38  //
    39  // Only the parquet magic bytes and footer are read, column chunks and other
    40  // parts of the file are left untouched; this means that successfully opening
    41  // a file does not validate that the pages have valid checksums.
    42  func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) {
    43  	b := make([]byte, 8)
    44  	f := &File{reader: r, size: size}
    45  	c, err := NewFileConfig(options...)
    46  	if err != nil {
    47  		return nil, err
    48  	}
    49  
    50  	if _, err := r.ReadAt(b[:4], 0); err != nil {
    51  		return nil, fmt.Errorf("reading magic header of parquet file: %w", err)
    52  	}
    53  	if string(b[:4]) != "PAR1" {
    54  		return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4])
    55  	}
    56  
    57  	if cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok {
    58  		cast.SetMagicFooterSection(size-8, 8)
    59  	}
    60  	if _, err := r.ReadAt(b[:8], size-8); err != nil {
    61  		return nil, fmt.Errorf("reading magic footer of parquet file: %w", err)
    62  	}
    63  	if string(b[4:8]) != "PAR1" {
    64  		return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:8])
    65  	}
    66  
    67  	footerSize := int64(binary.LittleEndian.Uint32(b[:4]))
    68  	footerData := make([]byte, footerSize)
    69  
    70  	if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok {
    71  		cast.SetFooterSection(size-(footerSize+8), footerSize)
    72  	}
    73  	if _, err := f.reader.ReadAt(footerData, size-(footerSize+8)); err != nil {
    74  		return nil, fmt.Errorf("reading footer of parquet file: %w", err)
    75  	}
    76  	if err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil {
    77  		return nil, fmt.Errorf("reading parquet file metadata: %w", err)
    78  	}
    79  	if len(f.metadata.Schema) == 0 {
    80  		return nil, ErrMissingRootColumn
    81  	}
    82  
    83  	if !c.SkipPageIndex {
    84  		if f.columnIndexes, f.offsetIndexes, err = f.ReadPageIndex(); err != nil {
    85  			return nil, fmt.Errorf("reading page index of parquet file: %w", err)
    86  		}
    87  	}
    88  
    89  	if f.root, err = openColumns(f); err != nil {
    90  		return nil, fmt.Errorf("opening columns of parquet file: %w", err)
    91  	}
    92  
    93  	schema := NewSchema(f.root.Name(), f.root)
    94  	columns := make([]*Column, 0, numLeafColumnsOf(f.root))
    95  	f.schema = schema
    96  	f.root.forEachLeaf(func(c *Column) { columns = append(columns, c) })
    97  
    98  	rowGroups := make([]fileRowGroup, len(f.metadata.RowGroups))
    99  	for i := range rowGroups {
   100  		rowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i])
   101  	}
   102  	f.rowGroups = make([]RowGroup, len(rowGroups))
   103  	for i := range rowGroups {
   104  		f.rowGroups[i] = &rowGroups[i]
   105  	}
   106  
   107  	if !c.SkipBloomFilters {
   108  		h := format.BloomFilterHeader{}
   109  		p := thrift.CompactProtocol{}
   110  		s := io.NewSectionReader(r, 0, size)
   111  		d := thrift.NewDecoder(p.NewReader(s))
   112  
   113  		for i := range rowGroups {
   114  			g := &rowGroups[i]
   115  
   116  			for j := range g.columns {
   117  				c := g.columns[j].(*fileColumnChunk)
   118  
   119  				if offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 {
   120  					s.Seek(offset, io.SeekStart)
   121  					h = format.BloomFilterHeader{}
   122  					if err := d.Decode(&h); err != nil {
   123  						return nil, err
   124  					}
   125  					offset, _ = s.Seek(0, io.SeekCurrent)
   126  					if cast, ok := r.(interface{ SetBloomFilterSection(offset, length int64) }); ok {
   127  						bloomFilterOffset := c.chunk.MetaData.BloomFilterOffset
   128  						bloomFilterLength := (offset - bloomFilterOffset) + int64(h.NumBytes)
   129  						cast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength)
   130  					}
   131  
   132  					c.bloomFilter = newBloomFilter(r, offset, &h)
   133  				}
   134  			}
   135  		}
   136  	}
   137  
   138  	sortKeyValueMetadata(f.metadata.KeyValueMetadata)
   139  	return f, nil
   140  }
   141  
   142  // ReadPageIndex reads the page index section of the parquet file f.
   143  //
   144  // If the file did not contain a page index, the method returns two empty slices
   145  // and a nil error.
   146  //
   147  // Only leaf columns have indexes, the returned indexes are arranged using the
   148  // following layout:
   149  //
   150  //	+ -------------- +
   151  //	| col 0: chunk 0 |
   152  //	+ -------------- +
   153  //	| col 1: chunk 0 |
   154  //	+ -------------- +
   155  //	| ...            |
   156  //	+ -------------- +
   157  //	| col 0: chunk 1 |
   158  //	+ -------------- +
   159  //	| col 1: chunk 1 |
   160  //	+ -------------- +
   161  //	| ...            |
   162  //	+ -------------- +
   163  //
   164  // This method is useful in combination with the SkipPageIndex option to delay
   165  // reading the page index section until after the file was opened. Note that in
   166  // this case the page index is not cached within the file, programs are expected
   167  // to make use of independently from the parquet package.
   168  func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) {
   169  	columnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset
   170  	offsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset
   171  	columnIndexLength := int64(0)
   172  	offsetIndexLength := int64(0)
   173  
   174  	if columnIndexOffset == 0 || offsetIndexOffset == 0 {
   175  		return nil, nil, nil
   176  	}
   177  
   178  	forEachColumnChunk := func(do func(int, int, *format.ColumnChunk) error) error {
   179  		for i := range f.metadata.RowGroups {
   180  			for j := range f.metadata.RowGroups[i].Columns {
   181  				c := &f.metadata.RowGroups[i].Columns[j]
   182  				if err := do(i, j, c); err != nil {
   183  					return err
   184  				}
   185  			}
   186  		}
   187  		return nil
   188  	}
   189  
   190  	forEachColumnChunk(func(_, _ int, c *format.ColumnChunk) error {
   191  		columnIndexLength += int64(c.ColumnIndexLength)
   192  		offsetIndexLength += int64(c.OffsetIndexLength)
   193  		return nil
   194  	})
   195  
   196  	numRowGroups := len(f.metadata.RowGroups)
   197  	numColumns := len(f.metadata.RowGroups[0].Columns)
   198  	numColumnChunks := numRowGroups * numColumns
   199  
   200  	columnIndexes := make([]format.ColumnIndex, numColumnChunks)
   201  	offsetIndexes := make([]format.OffsetIndex, numColumnChunks)
   202  	indexBuffer := make([]byte, max(int(columnIndexLength), int(offsetIndexLength)))
   203  
   204  	if columnIndexOffset > 0 {
   205  		columnIndexData := indexBuffer[:columnIndexLength]
   206  
   207  		if cast, ok := f.reader.(interface{ SetColumnIndexSection(offset, length int64) }); ok {
   208  			cast.SetColumnIndexSection(columnIndexOffset, columnIndexLength)
   209  		}
   210  		if _, err := f.reader.ReadAt(columnIndexData, columnIndexOffset); err != nil {
   211  			return nil, nil, fmt.Errorf("reading %d bytes column index at offset %d: %w", columnIndexLength, columnIndexOffset, err)
   212  		}
   213  
   214  		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
   215  			offset := c.ColumnIndexOffset - columnIndexOffset
   216  			length := int64(c.ColumnIndexLength)
   217  			buffer := columnIndexData[offset : offset+length]
   218  			if err := thrift.Unmarshal(&f.protocol, buffer, &columnIndexes[(i*numColumns)+j]); err != nil {
   219  				return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
   220  			}
   221  			return nil
   222  		})
   223  		if err != nil {
   224  			return nil, nil, err
   225  		}
   226  	}
   227  
   228  	if offsetIndexOffset > 0 {
   229  		offsetIndexData := indexBuffer[:offsetIndexLength]
   230  
   231  		if cast, ok := f.reader.(interface{ SetOffsetIndexSection(offset, length int64) }); ok {
   232  			cast.SetOffsetIndexSection(offsetIndexOffset, offsetIndexLength)
   233  		}
   234  		if _, err := f.reader.ReadAt(offsetIndexData, offsetIndexOffset); err != nil {
   235  			return nil, nil, fmt.Errorf("reading %d bytes offset index at offset %d: %w", offsetIndexLength, offsetIndexOffset, err)
   236  		}
   237  
   238  		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
   239  			offset := c.OffsetIndexOffset - offsetIndexOffset
   240  			length := int64(c.OffsetIndexLength)
   241  			buffer := offsetIndexData[offset : offset+length]
   242  			if err := thrift.Unmarshal(&f.protocol, buffer, &offsetIndexes[(i*numColumns)+j]); err != nil {
   243  				return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
   244  			}
   245  			return nil
   246  		})
   247  		if err != nil {
   248  			return nil, nil, err
   249  		}
   250  	}
   251  
   252  	return columnIndexes, offsetIndexes, nil
   253  }
   254  
// NumRows returns the number of rows in the file, as recorded in the footer
// metadata.
func (f *File) NumRows() int64 { return f.metadata.NumRows }

// RowGroups returns the list of row groups in the file.
func (f *File) RowGroups() []RowGroup { return f.rowGroups }

// Root returns the root column of f.
func (f *File) Root() *Column { return f.root }

// Schema returns the schema of f.
func (f *File) Schema() *Schema { return f.schema }

// Metadata returns the metadata of f.
//
// The returned pointer shares memory with f; mutating the value mutates the
// file's view of its metadata.
func (f *File) Metadata() *format.FileMetaData { return &f.metadata }

// Size returns the size of f (in bytes).
func (f *File) Size() int64 { return f.size }
   272  
   273  // ReadAt reads bytes into b from f at the given offset.
   274  //
   275  // The method satisfies the io.ReaderAt interface.
   276  func (f *File) ReadAt(b []byte, off int64) (int, error) {
   277  	if off < 0 || off >= f.size {
   278  		return 0, io.EOF
   279  	}
   280  
   281  	if limit := f.size - off; limit < int64(len(b)) {
   282  		n, err := f.reader.ReadAt(b[:limit], off)
   283  		if err == nil {
   284  			err = io.EOF
   285  		}
   286  		return n, err
   287  	}
   288  
   289  	return f.reader.ReadAt(b, off)
   290  }
   291  
// ColumnIndexes returns the page index of the parquet file f.
//
// If the file did not contain a column index, the method returns an empty slice
// and nil error.
func (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes }

// OffsetIndexes returns the page index of the parquet file f.
//
// If the file did not contain an offset index, the method returns an empty
// slice and nil error.
func (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes }

// Lookup returns the value associated with the given key in the file key/value
// metadata.
//
// The ok boolean will be true if the key was found, false otherwise.
//
// The key/value metadata is sorted when the file is opened, which lets this
// method use a binary search.
func (f *File) Lookup(key string) (value string, ok bool) {
	return lookupKeyValueMetadata(f.metadata.KeyValueMetadata, key)
}

// hasIndexes reports whether both the column and offset indexes were loaded
// (i.e. the file was opened without SkipPageIndex and a page index was found).
func (f *File) hasIndexes() bool {
	return f.columnIndexes != nil && f.offsetIndexes != nil
}

// Compile-time check that *File satisfies io.ReaderAt.
var _ io.ReaderAt = (*File)(nil)
   317  
   318  func sortKeyValueMetadata(keyValueMetadata []format.KeyValue) {
   319  	sort.Slice(keyValueMetadata, func(i, j int) bool {
   320  		switch {
   321  		case keyValueMetadata[i].Key < keyValueMetadata[j].Key:
   322  			return true
   323  		case keyValueMetadata[i].Key > keyValueMetadata[j].Key:
   324  			return false
   325  		default:
   326  			return keyValueMetadata[i].Value < keyValueMetadata[j].Value
   327  		}
   328  	})
   329  }
   330  
   331  func lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) {
   332  	i := sort.Search(len(keyValueMetadata), func(i int) bool {
   333  		return keyValueMetadata[i].Key >= key
   334  	})
   335  	if i == len(keyValueMetadata) || keyValueMetadata[i].Key != key {
   336  		return "", false
   337  	}
   338  	return keyValueMetadata[i].Value, true
   339  }
   340  
// fileRowGroup is the RowGroup implementation backed by a row group of a
// parquet file.
type fileRowGroup struct {
	schema   *Schema          // schema of the parent file
	rowGroup *format.RowGroup // raw row group metadata from the file footer
	columns  []ColumnChunk    // one chunk per leaf column of the row group
	sorting  []SortingColumn  // sorting columns declared in the row group metadata
}
   347  
   348  func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) {
   349  	g.schema = schema
   350  	g.rowGroup = rowGroup
   351  	g.columns = make([]ColumnChunk, len(rowGroup.Columns))
   352  	g.sorting = make([]SortingColumn, len(rowGroup.SortingColumns))
   353  	fileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns))
   354  
   355  	for i := range g.columns {
   356  		fileColumnChunks[i] = fileColumnChunk{
   357  			file:     file,
   358  			column:   columns[i],
   359  			rowGroup: rowGroup,
   360  			chunk:    &rowGroup.Columns[i],
   361  		}
   362  
   363  		if file.hasIndexes() {
   364  			j := (int(rowGroup.Ordinal) * len(columns)) + i
   365  			fileColumnChunks[i].columnIndex = &file.columnIndexes[j]
   366  			fileColumnChunks[i].offsetIndex = &file.offsetIndexes[j]
   367  		}
   368  
   369  		g.columns[i] = &fileColumnChunks[i]
   370  	}
   371  
   372  	for i := range g.sorting {
   373  		g.sorting[i] = &fileSortingColumn{
   374  			column:     columns[rowGroup.SortingColumns[i].ColumnIdx],
   375  			descending: rowGroup.SortingColumns[i].Descending,
   376  			nullsFirst: rowGroup.SortingColumns[i].NullsFirst,
   377  		}
   378  	}
   379  }
   380  
// Accessors implementing the RowGroup interface by delegating to the state
// assembled in init.
func (g *fileRowGroup) Schema() *Schema                 { return g.schema }
func (g *fileRowGroup) NumRows() int64                  { return g.rowGroup.NumRows }
func (g *fileRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }
func (g *fileRowGroup) SortingColumns() []SortingColumn { return g.sorting }
func (g *fileRowGroup) Rows() Rows                      { return &rowGroupRows{rowGroup: g} }
   386  
// fileSortingColumn is the SortingColumn implementation for sorting columns
// declared in a row group's metadata.
type fileSortingColumn struct {
	column     *Column // column the row group is sorted by
	descending bool    // true when the sort order is descending
	nullsFirst bool    // true when nulls sort before non-null values
}

func (s *fileSortingColumn) Path() []string   { return s.column.Path() }
func (s *fileSortingColumn) Descending() bool { return s.descending }
func (s *fileSortingColumn) NullsFirst() bool { return s.nullsFirst }
   396  
// fileColumnChunk is the ColumnChunk implementation backed by a column chunk
// of a row group of a parquet file.
type fileColumnChunk struct {
	file        *File               // file the chunk belongs to
	column      *Column             // leaf column the chunk holds values for
	bloomFilter *bloomFilter        // nil when the chunk has no bloom filter or filters were skipped
	rowGroup    *format.RowGroup    // raw metadata of the enclosing row group
	columnIndex *format.ColumnIndex // nil when the page index was not loaded
	offsetIndex *format.OffsetIndex // nil when the page index was not loaded
	chunk       *format.ColumnChunk // raw column chunk metadata from the footer
}
   406  
// Type returns the parquet type of the column the chunk belongs to.
func (c *fileColumnChunk) Type() Type {
	return c.column.Type()
}

// Column returns the index of the leaf column within the file schema.
func (c *fileColumnChunk) Column() int {
	return int(c.column.Index())
}

// Pages returns a new iterator over the pages of the column chunk; closing
// the returned Pages releases its pooled buffers.
func (c *fileColumnChunk) Pages() Pages {
	r := new(filePages)
	r.init(c)
	return r
}

// ColumnIndex returns the column index of the chunk, or nil when the page
// index was not loaded.
//
// Returning a literal nil (instead of wrapping c.columnIndex unconditionally)
// avoids handing callers a non-nil interface holding a nil pointer.
func (c *fileColumnChunk) ColumnIndex() ColumnIndex {
	if c.columnIndex == nil {
		return nil
	}
	return fileColumnIndex{c}
}

// OffsetIndex returns the offset index of the chunk, or nil when the page
// index was not loaded.
func (c *fileColumnChunk) OffsetIndex() OffsetIndex {
	if c.offsetIndex == nil {
		return nil
	}
	return (*fileOffsetIndex)(c.offsetIndex)
}

// BloomFilter returns the bloom filter of the chunk, or nil when the chunk
// has none (or bloom filters were skipped when opening the file).
func (c *fileColumnChunk) BloomFilter() BloomFilter {
	if c.bloomFilter == nil {
		return nil
	}
	return c.bloomFilter
}

// NumValues returns the number of values in the chunk, as recorded in its
// metadata.
func (c *fileColumnChunk) NumValues() int64 {
	return c.chunk.MetaData.NumValues
}
   445  
// filePages is the Pages implementation used to iterate over the pages of a
// column chunk read from a parquet file.
type filePages struct {
	chunk    *fileColumnChunk // chunk the pages are read from; nil after Close
	dictPage *dictPage        // dictionary page state, lazily populated
	dataPage *dataPage        // scratch state reused across pages
	rbuf     *bufio.Reader    // buffered reader over section, from the shared pool
	section  io.SectionReader // section of the file covering the column chunk

	protocol thrift.CompactProtocol
	decoder  thrift.Decoder // decodes page headers from rbuf

	baseOffset int64 // file offset where the chunk starts (dictionary page when present)
	dataOffset int64 // file offset of the first data page
	dictOffset int64 // file offset of the dictionary page; 0 when none is recorded
	index      int   // index of the next page to be read
	skip       int64 // number of rows to skip before returning a page
}
   462  
// init prepares f to iterate over the pages of column chunk c, positioning
// the buffered reader at the start of the chunk: the dictionary page when one
// is recorded in the metadata, the first data page otherwise.
func (f *filePages) init(c *fileColumnChunk) {
	f.dataPage = acquireDataPage()
	f.chunk = c
	f.baseOffset = c.chunk.MetaData.DataPageOffset
	f.dataOffset = f.baseOffset

	// When a dictionary page offset is recorded, the chunk begins at the
	// dictionary page rather than at the first data page.
	if c.chunk.MetaData.DictionaryPageOffset != 0 {
		f.baseOffset = c.chunk.MetaData.DictionaryPageOffset
		f.dictOffset = f.baseOffset
	}

	f.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize)
	f.rbuf = acquireReadBuffer(&f.section)
	f.decoder.Reset(f.protocol.NewReader(f.rbuf))
}
   478  
// ReadPage reads and returns the next page of the column chunk, or io.EOF
// once the underlying section is exhausted (the decoder surfaces the EOF) or
// after Close was called.
//
// Dictionary pages encountered in the stream are decoded and retained but not
// returned; the loop moves on to the next page. When a row skip is pending
// (set by SeekToRow), pages entirely before the target row are discarded and
// the first partially-skipped page is sliced before being returned.
func (f *filePages) ReadPage() (Page, error) {
	if f.chunk == nil {
		return nil, io.EOF
	}

	for {
		header := new(format.PageHeader)
		if err := f.decoder.Decode(header); err != nil {
			return nil, err
		}
		// Load the raw (compressed) page bytes into f.dataPage before
		// dispatching on the page type.
		if err := f.readPage(header, f.dataPage, f.rbuf); err != nil {
			return nil, err
		}

		var page Page
		var err error

		switch header.Type {
		case format.DataPageV2:
			page, err = f.readDataPageV2(header)
		case format.DataPage:
			page, err = f.readDataPageV1(header)
		case format.DictionaryPage:
			// Sometimes parquet files do not have the dictionary page offset
			// recorded in the column metadata. We account for this by lazily
			// reading dictionary pages when we encounter them.
			err = f.readDictionaryPage(header, f.dataPage)
		default:
			err = fmt.Errorf("cannot read values of type %s from page", header.Type)
		}

		if err != nil {
			return nil, fmt.Errorf("decoding page %d of column %q: %w", f.index, f.columnPath(), err)
		}

		// page is nil for dictionary pages; keep looping until a data page
		// is produced.
		if page != nil {
			f.index++
			if f.skip == 0 {
				return page, nil
			}

			// TODO: what about pages that don't embed the number of rows?
			// (data page v1 with no offset index in the column chunk).
			numRows := page.NumRows()
			if numRows > f.skip {
				// The target row falls inside this page: slice off the rows
				// that precede it and clear the pending skip.
				seek := f.skip
				f.skip = 0
				if seek > 0 {
					page = page.Buffer().Slice(seek, numRows)
				}
				return page, nil
			}

			// The whole page precedes the target row; drop it and keep going.
			f.skip -= numRows
		}
	}
}
   536  
// readDictionary reads and decodes the dictionary page found at the start of
// the column chunk. It is used when a data page references a dictionary that
// has not been decoded yet (e.g. after seeking past the start of the chunk).
//
// A separate section reader and buffer are used so the position of f.section
// and f.rbuf is left untouched.
func (f *filePages) readDictionary() error {
	chunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize)
	rbuf := acquireReadBuffer(chunk)
	defer releaseReadBuffer(rbuf)

	decoder := thrift.NewDecoder(f.protocol.NewReader(rbuf))
	header := new(format.PageHeader)

	if err := decoder.Decode(header); err != nil {
		return err
	}

	// Use a scratch page so f.dataPage (which may hold a data page being
	// processed) is not clobbered.
	page := acquireDataPage()
	defer releaseDataPage(page)

	if err := f.readPage(header, page, rbuf); err != nil {
		return err
	}

	return f.readDictionaryPage(header, page)
}
   558  
   559  func (f *filePages) readDictionaryPage(header *format.PageHeader, page *dataPage) (err error) {
   560  	if header.DictionaryPageHeader == nil {
   561  		return ErrMissingPageHeader
   562  	}
   563  	f.dictPage, _ = dictPagePool.Get().(*dictPage)
   564  	if f.dictPage == nil {
   565  		f.dictPage = new(dictPage)
   566  	}
   567  	f.dataPage.dictionary, err = f.chunk.column.decodeDictionary(
   568  		DictionaryPageHeader{header.DictionaryPageHeader},
   569  		page,
   570  		f.dictPage,
   571  	)
   572  	return err
   573  }
   574  
// readDataPageV1 decodes a v1 data page from the raw bytes previously loaded
// into f.dataPage, lazily reading the chunk's dictionary page first when the
// page uses a dictionary encoding and none has been decoded yet.
func (f *filePages) readDataPageV1(header *format.PageHeader) (Page, error) {
	if header.DataPageHeader == nil {
		return nil, ErrMissingPageHeader
	}
	if isDictionaryFormat(header.DataPageHeader.Encoding) && f.dataPage.dictionary == nil {
		if err := f.readDictionary(); err != nil {
			return nil, err
		}
	}
	return f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, f.dataPage)
}
   586  
// readDataPageV2 decodes a v2 data page from the raw bytes previously loaded
// into f.dataPage.
func (f *filePages) readDataPageV2(header *format.PageHeader) (Page, error) {
	if header.DataPageHeaderV2 == nil {
		return nil, ErrMissingPageHeader
	}
	if isDictionaryFormat(header.DataPageHeaderV2.Encoding) && f.dataPage.dictionary == nil {
		// If the program seeked to a row past the first page, the dictionary
		// page may not have been seen, in which case we have to lazily load it
		// from the beginning of column chunk.
		if err := f.readDictionary(); err != nil {
			return nil, err
		}
	}
	return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, f.dataPage)
}
   601  
// readPage reads the compressed body of the page described by header from
// reader into page.data (growing the scratch buffers only when needed) and
// verifies the CRC32 checksum when the header carries one.
func (f *filePages) readPage(header *format.PageHeader, page *dataPage, reader *bufio.Reader) error {
	compressedPageSize, uncompressedPageSize := int(header.CompressedPageSize), int(header.UncompressedPageSize)

	// Reuse existing capacity when possible; only allocate on growth.
	if cap(page.data) < compressedPageSize {
		page.data = make([]byte, compressedPageSize)
	} else {
		page.data = page.data[:compressedPageSize]
	}
	// Pre-size the uncompressed buffer so later decoding does not regrow it.
	if cap(page.values) < uncompressedPageSize {
		page.values = make([]byte, 0, uncompressedPageSize)
	}

	if _, err := io.ReadFull(reader, page.data); err != nil {
		return err
	}

	// A zero CRC is treated as "no checksum recorded" and is not verified.
	if header.CRC != 0 {
		headerChecksum := uint32(header.CRC)
		bufferChecksum := crc32.ChecksumIEEE(page.data)

		if headerChecksum != bufferChecksum {
			// The parquet specs indicate that corruption errors could be
			// handled gracefully by skipping pages, tho this may not always
			// be practical. Depending on how the pages are consumed,
			// missing rows may cause unpredictable behaviors in algorithms.
			//
			// For now, we assume these errors to be fatal, but we may
			// revisit later and improve error handling to be more resilient
			// to data corruption.
			return fmt.Errorf("crc32 checksum mismatch in page of column %q: want=0x%08X got=0x%08X: %w",
				f.columnPath(),
				headerChecksum,
				bufferChecksum,
				ErrCorrupted,
			)
		}
	}

	return nil
}
   642  
// SeekToRow positions the page iterator so the next ReadPage call returns the
// page containing rowIndex, recording in f.skip the number of rows ReadPage
// must still discard from the front of that page.
//
// Without an offset index the only option is to rewind to the first data page
// and skip rows from there; with an offset index the target page is located
// by binary search over the recorded first-row indexes.
func (f *filePages) SeekToRow(rowIndex int64) (err error) {
	if f.chunk == nil {
		return io.ErrClosedPipe
	}
	if f.chunk.offsetIndex == nil {
		_, err = f.section.Seek(f.dataOffset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex
		f.index = 0
		// Account for the dictionary page preceding the data pages when one
		// is recorded in the column metadata.
		if f.dictOffset > 0 {
			f.index = 1
		}
	} else {
		pages := f.chunk.offsetIndex.PageLocations
		// Find the last page whose first row index is <= rowIndex.
		index := sort.Search(len(pages), func(i int) bool {
			return pages[i].FirstRowIndex > rowIndex
		}) - 1
		if index < 0 {
			return ErrSeekOutOfRange
		}
		_, err = f.section.Seek(pages[index].Offset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex - pages[index].FirstRowIndex
		f.index = index
	}
	// Drop any buffered bytes so reads resume from the new section position.
	f.rbuf.Reset(&f.section)
	return err
}
   669  
   670  func (f *filePages) Close() error {
   671  	releaseDictPage(f.dictPage)
   672  	releaseDataPage(f.dataPage)
   673  	releaseReadBuffer(f.rbuf)
   674  	f.chunk = nil
   675  	f.dictPage = nil
   676  	f.dataPage = nil
   677  	f.section = io.SectionReader{}
   678  	f.rbuf = nil
   679  	f.baseOffset = 0
   680  	f.dataOffset = 0
   681  	f.dictOffset = 0
   682  	f.index = 0
   683  	f.skip = 0
   684  	return nil
   685  }
   686  
// columnPath returns the path of the column the pages belong to; used when
// formatting error messages.
func (f *filePages) columnPath() columnPath {
	return columnPath(f.chunk.column.Path())
}
   690  
// Pools of reusable objects shared across filePages instances to reduce
// allocations when iterating over many column chunks.
var (
	dictPagePool   sync.Pool // *dictPage
	dataPagePool   sync.Pool // *dataPage
	readBufferPool sync.Pool // *bufio.Reader
)
   696  
   697  func acquireDictPage() *dictPage {
   698  	p, _ := dictPagePool.Get().(*dictPage)
   699  	if p == nil {
   700  		p = new(dictPage)
   701  	}
   702  	return p
   703  }
   704  
   705  func releaseDictPage(p *dictPage) {
   706  	if p != nil {
   707  		p.reset()
   708  		dictPagePool.Put(p)
   709  	}
   710  }
   711  
   712  func acquireDataPage() *dataPage {
   713  	p, _ := dataPagePool.Get().(*dataPage)
   714  	if p == nil {
   715  		p = new(dataPage)
   716  	}
   717  	return p
   718  }
   719  
   720  func releaseDataPage(p *dataPage) {
   721  	if p != nil {
   722  		p.reset()
   723  		dataPagePool.Put(p)
   724  	}
   725  }
   726  
   727  func acquireReadBuffer(r io.Reader) *bufio.Reader {
   728  	b, _ := readBufferPool.Get().(*bufio.Reader)
   729  	if b == nil {
   730  		b = bufio.NewReaderSize(r, defaultReadBufferSize)
   731  	} else {
   732  		b.Reset(r)
   733  	}
   734  	return b
   735  }
   736  
   737  func releaseReadBuffer(b *bufio.Reader) {
   738  	if b != nil {
   739  		b.Reset(nil)
   740  		readBufferPool.Put(b)
   741  	}
   742  }