github.com/parquet-go/parquet-go@v0.20.0/file.go

package parquet

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"hash/crc32"
	"io"
	"sort"
	"strings"
	"sync"

	"github.com/segmentio/encoding/thrift"

	"github.com/parquet-go/parquet-go/format"
)

const (
	defaultDictBufferSize = 8192
	defaultReadBufferSize = 4096
)

// File represents a parquet file. The layout of a Parquet file can be found
// here: https://github.com/apache/parquet-format#file-format
type File struct {
	metadata      format.FileMetaData
	protocol      thrift.CompactProtocol
	reader        io.ReaderAt
	size          int64
	schema        *Schema
	root          *Column
	columnIndexes []format.ColumnIndex
	offsetIndexes []format.OffsetIndex
	rowGroups     []RowGroup
	config        *FileConfig
}

// OpenFile opens a parquet file and reads the content between offset 0 and the given
// size in r.
//
// Only the parquet magic bytes and footer are read; column chunks and other
// parts of the file are left untouched. This means that successfully opening
// a file does not validate that the pages have valid checksums.
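//
// A minimal usage sketch (assuming the data lives in a local file opened with
// the standard library os package; only OpenFile itself is defined here):
//
//	osFile, err := os.Open("example.parquet")
//	if err != nil {
//		// handle error
//	}
//	defer osFile.Close()
//	stat, err := osFile.Stat()
//	if err != nil {
//		// handle error
//	}
//	pqFile, err := parquet.OpenFile(osFile, stat.Size())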
func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) {
	b := make([]byte, 8)
	c, err := NewFileConfig(options...)
	if err != nil {
		return nil, err
	}
	f := &File{reader: r, size: size, config: c}

	if _, err := readAt(r, b[:4], 0); err != nil {
		return nil, fmt.Errorf("reading magic header of parquet file: %w", err)
	}
	if string(b[:4]) != "PAR1" {
		return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4])
	}

	if cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok {
		cast.SetMagicFooterSection(size-8, 8)
	}
	if n, err := r.ReadAt(b[:8], size-8); n != 8 {
		return nil, fmt.Errorf("reading magic footer of parquet file: %w", err)
	}
	if string(b[4:8]) != "PAR1" {
		return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:8])
	}

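	// The last 8 bytes of the file hold a 4-byte little-endian length of the
	// thrift-encoded file metadata, followed by the "PAR1" magic; the
	// metadata itself sits immediately before those 8 bytes.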
	footerSize := int64(binary.LittleEndian.Uint32(b[:4]))
	footerData := make([]byte, footerSize)

	if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok {
		cast.SetFooterSection(size-(footerSize+8), footerSize)
	}
	if _, err := f.readAt(footerData, size-(footerSize+8)); err != nil {
		return nil, fmt.Errorf("reading footer of parquet file: %w", err)
	}
	if err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil {
		return nil, fmt.Errorf("reading parquet file metadata: %w", err)
	}
	if len(f.metadata.Schema) == 0 {
		return nil, ErrMissingRootColumn
	}

	if !c.SkipPageIndex {
		if f.columnIndexes, f.offsetIndexes, err = f.ReadPageIndex(); err != nil {
			return nil, fmt.Errorf("reading page index of parquet file: %w", err)
		}
	}

	if f.root, err = openColumns(f); err != nil {
		return nil, fmt.Errorf("opening columns of parquet file: %w", err)
	}

	var schema *Schema
	if c.Schema != nil {
		schema = c.Schema
	} else {
		schema = NewSchema(f.root.Name(), f.root)
	}
	columns := make([]*Column, 0, numLeafColumnsOf(f.root))
	f.schema = schema
	f.root.forEachLeaf(func(c *Column) { columns = append(columns, c) })

	rowGroups := make([]fileRowGroup, len(f.metadata.RowGroups))
	for i := range rowGroups {
		rowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i])
	}
	f.rowGroups = make([]RowGroup, len(rowGroups))
	for i := range rowGroups {
		f.rowGroups[i] = &rowGroups[i]
	}

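	// Unless disabled, eagerly decode the bloom filter header of every column
	// chunk so that later lookups only need to read the filter bitsets. The
	// headers are decoded from a pooled buffered reader over the file.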
	if !c.SkipBloomFilters {
		section := io.NewSectionReader(r, 0, size)
		rbuf, rbufpool := getBufioReader(section, c.ReadBufferSize)
		defer putBufioReader(rbuf, rbufpool)

		header := format.BloomFilterHeader{}
		compact := thrift.CompactProtocol{}
		decoder := thrift.NewDecoder(compact.NewReader(rbuf))

		for i := range rowGroups {
			g := &rowGroups[i]

			for j := range g.columns {
				c := g.columns[j].(*fileColumnChunk)

				if offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 {
					section.Seek(offset, io.SeekStart)
					rbuf.Reset(section)

					header = format.BloomFilterHeader{}
					if err := decoder.Decode(&header); err != nil {
						return nil, fmt.Errorf("decoding bloom filter header: %w", err)
					}

					offset, _ = section.Seek(0, io.SeekCurrent)
					offset -= int64(rbuf.Buffered())

					if cast, ok := r.(interface{ SetBloomFilterSection(offset, length int64) }); ok {
						bloomFilterOffset := c.chunk.MetaData.BloomFilterOffset
						bloomFilterLength := (offset - bloomFilterOffset) + int64(header.NumBytes)
						cast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength)
					}

					c.bloomFilter = newBloomFilter(r, offset, &header)
				}
			}
		}
	}

	sortKeyValueMetadata(f.metadata.KeyValueMetadata)
	return f, nil
}

// ReadPageIndex reads the page index section of the parquet file f.
//
// If the file did not contain a page index, the method returns two empty slices
// and a nil error.
//
// Only leaf columns have indexes; the returned indexes are arranged using the
// following layout:
//
//	------------------
//	| col 0: chunk 0 |
//	------------------
//	| col 1: chunk 0 |
//	------------------
//	| ...            |
//	------------------
//	| col 0: chunk 1 |
//	------------------
//	| col 1: chunk 1 |
//	------------------
//	| ...            |
//	------------------
//
// This method is useful in combination with the SkipPageIndex option to delay
// reading the page index section until after the file was opened. Note that in
// this case the page index is not cached within the file; programs are expected
// to make use of it independently of the parquet package.
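//
// For example, the indexes of the chunk of leaf column j in row group i live
// at flat position (i * numColumns) + j in the returned slices (a sketch;
// i, j, and numColumns are assumed to be in range for the file):
//
//	columnIndexes, offsetIndexes, err := f.ReadPageIndex()
//	if err != nil {
//		// handle error
//	}
//	k := (i * numColumns) + j
//	colIdx, offIdx := columnIndexes[k], offsetIndexes[k]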
func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) {
	if len(f.metadata.RowGroups) == 0 {
		return nil, nil, nil
	}

	columnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset
	offsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset
	columnIndexLength := int64(0)
	offsetIndexLength := int64(0)

	forEachColumnChunk := func(do func(int, int, *format.ColumnChunk) error) error {
		for i := range f.metadata.RowGroups {
			for j := range f.metadata.RowGroups[i].Columns {
				c := &f.metadata.RowGroups[i].Columns[j]
				if err := do(i, j, c); err != nil {
					return err
				}
			}
		}
		return nil
	}

	forEachColumnChunk(func(_, _ int, c *format.ColumnChunk) error {
		columnIndexLength += int64(c.ColumnIndexLength)
		offsetIndexLength += int64(c.OffsetIndexLength)
		return nil
	})

	if columnIndexLength == 0 && offsetIndexLength == 0 {
		return nil, nil, nil
	}

	numRowGroups := len(f.metadata.RowGroups)
	numColumns := len(f.metadata.RowGroups[0].Columns)
	numColumnChunks := numRowGroups * numColumns

	columnIndexes := make([]format.ColumnIndex, numColumnChunks)
	offsetIndexes := make([]format.OffsetIndex, numColumnChunks)
	indexBuffer := make([]byte, max(int(columnIndexLength), int(offsetIndexLength)))

	if columnIndexOffset > 0 {
		columnIndexData := indexBuffer[:columnIndexLength]

		if cast, ok := f.reader.(interface{ SetColumnIndexSection(offset, length int64) }); ok {
			cast.SetColumnIndexSection(columnIndexOffset, columnIndexLength)
		}
		if _, err := f.readAt(columnIndexData, columnIndexOffset); err != nil {
			return nil, nil, fmt.Errorf("reading %d bytes column index at offset %d: %w", columnIndexLength, columnIndexOffset, err)
		}

		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
			// Some parquet files are missing the column index on some columns.
			//
			// An example of this file is testdata/alltypes_tiny_pages_plain.parquet
			// which was added in https://github.com/apache/parquet-testing/pull/24.
			if c.ColumnIndexOffset > 0 {
				offset := c.ColumnIndexOffset - columnIndexOffset
				length := int64(c.ColumnIndexLength)
				buffer := columnIndexData[offset : offset+length]
				if err := thrift.Unmarshal(&f.protocol, buffer, &columnIndexes[(i*numColumns)+j]); err != nil {
					return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
				}
			}
			return nil
		})
		if err != nil {
			return nil, nil, err
		}
	}

	if offsetIndexOffset > 0 {
		offsetIndexData := indexBuffer[:offsetIndexLength]

		if cast, ok := f.reader.(interface{ SetOffsetIndexSection(offset, length int64) }); ok {
			cast.SetOffsetIndexSection(offsetIndexOffset, offsetIndexLength)
		}
		if _, err := f.readAt(offsetIndexData, offsetIndexOffset); err != nil {
			return nil, nil, fmt.Errorf("reading %d bytes offset index at offset %d: %w", offsetIndexLength, offsetIndexOffset, err)
		}

		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
			if c.OffsetIndexOffset > 0 {
				offset := c.OffsetIndexOffset - offsetIndexOffset
				length := int64(c.OffsetIndexLength)
				buffer := offsetIndexData[offset : offset+length]
				if err := thrift.Unmarshal(&f.protocol, buffer, &offsetIndexes[(i*numColumns)+j]); err != nil {
					return fmt.Errorf("decoding offset index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
				}
			}
			return nil
		})
		if err != nil {
			return nil, nil, err
		}
	}

	return columnIndexes, offsetIndexes, nil
}

// NumRows returns the number of rows in the file.
func (f *File) NumRows() int64 { return f.metadata.NumRows }

// RowGroups returns the list of row groups in the file.
func (f *File) RowGroups() []RowGroup { return f.rowGroups }

// Root returns the root column of f.
func (f *File) Root() *Column { return f.root }

// Schema returns the schema of f.
func (f *File) Schema() *Schema { return f.schema }

// Metadata returns the metadata of f.
func (f *File) Metadata() *format.FileMetaData { return &f.metadata }

// Size returns the size of f (in bytes).
func (f *File) Size() int64 { return f.size }

// ReadAt reads bytes into b from f at the given offset.
//
// The method satisfies the io.ReaderAt interface.
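//
// Because File implements io.ReaderAt, it composes with the standard library;
// for example (a sketch), a reader over the file's full contents:
//
//	section := io.NewSectionReader(f, 0, f.Size())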
func (f *File) ReadAt(b []byte, off int64) (int, error) {
	if off < 0 || off >= f.size {
		return 0, io.EOF
	}

	if limit := f.size - off; limit < int64(len(b)) {
		n, err := f.readAt(b[:limit], off)
		if err == nil {
			err = io.EOF
		}
		return n, err
	}

	return f.readAt(b, off)
}

// ColumnIndexes returns the column index section of the page index of the
// parquet file f.
//
// If the file did not contain a column index, the method returns an empty
// slice.
func (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes }

// OffsetIndexes returns the offset index section of the page index of the
// parquet file f.
//
// If the file did not contain an offset index, the method returns an empty
// slice.
func (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes }

// Lookup returns the value associated with the given key in the file key/value
// metadata.
//
// The ok boolean will be true if the key was found, false otherwise.
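//
// A sketch (the "writer.note" key is hypothetical; keys depend on the
// application that wrote the file):
//
//	if v, ok := f.Lookup("writer.note"); ok {
//		fmt.Println("note:", v)
//	}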
func (f *File) Lookup(key string) (value string, ok bool) {
	return lookupKeyValueMetadata(f.metadata.KeyValueMetadata, key)
}

func (f *File) hasIndexes() bool {
	return f.columnIndexes != nil && f.offsetIndexes != nil
}

var _ io.ReaderAt = (*File)(nil)

func sortKeyValueMetadata(keyValueMetadata []format.KeyValue) {
	sort.Slice(keyValueMetadata, func(i, j int) bool {
		switch {
		case keyValueMetadata[i].Key < keyValueMetadata[j].Key:
			return true
		case keyValueMetadata[i].Key > keyValueMetadata[j].Key:
			return false
		default:
			return keyValueMetadata[i].Value < keyValueMetadata[j].Value
		}
	})
}

func lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) {
	i := sort.Search(len(keyValueMetadata), func(i int) bool {
		return keyValueMetadata[i].Key >= key
	})
	if i == len(keyValueMetadata) || keyValueMetadata[i].Key != key {
		return "", false
	}
	return keyValueMetadata[i].Value, true
}

type fileRowGroup struct {
	schema   *Schema
	rowGroup *format.RowGroup
	columns  []ColumnChunk
	sorting  []SortingColumn
	config   *FileConfig
}

func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) {
	g.schema = schema
	g.rowGroup = rowGroup
	g.config = file.config
	g.columns = make([]ColumnChunk, len(rowGroup.Columns))
	g.sorting = make([]SortingColumn, len(rowGroup.SortingColumns))
	fileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns))

	for i := range g.columns {
		fileColumnChunks[i] = fileColumnChunk{
			file:     file,
			column:   columns[i],
			rowGroup: rowGroup,
			chunk:    &rowGroup.Columns[i],
		}

		if file.hasIndexes() {
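			// The page index slices read by ReadPageIndex are laid out flat,
			// one entry per column chunk: row group ordinal times the number
			// of leaf columns, plus the column position.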
			j := (int(rowGroup.Ordinal) * len(columns)) + i
			fileColumnChunks[i].columnIndex = &file.columnIndexes[j]
			fileColumnChunks[i].offsetIndex = &file.offsetIndexes[j]
		}

		g.columns[i] = &fileColumnChunks[i]
	}

	for i := range g.sorting {
		g.sorting[i] = &fileSortingColumn{
			column:     columns[rowGroup.SortingColumns[i].ColumnIdx],
			descending: rowGroup.SortingColumns[i].Descending,
			nullsFirst: rowGroup.SortingColumns[i].NullsFirst,
		}
	}
}

func (g *fileRowGroup) Schema() *Schema                 { return g.schema }
func (g *fileRowGroup) NumRows() int64                  { return g.rowGroup.NumRows }
func (g *fileRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }
func (g *fileRowGroup) SortingColumns() []SortingColumn { return g.sorting }
func (g *fileRowGroup) Rows() Rows                      { return newRowGroupRows(g, g.config.ReadMode) }

type fileSortingColumn struct {
	column     *Column
	descending bool
	nullsFirst bool
}

func (s *fileSortingColumn) Path() []string   { return s.column.Path() }
func (s *fileSortingColumn) Descending() bool { return s.descending }
func (s *fileSortingColumn) NullsFirst() bool { return s.nullsFirst }
func (s *fileSortingColumn) String() string {
	b := new(strings.Builder)
	if s.nullsFirst {
		b.WriteString("nulls_first+")
	}
	if s.descending {
		b.WriteString("descending(")
	} else {
		b.WriteString("ascending(")
	}
	b.WriteString(columnPath(s.Path()).String())
	b.WriteString(")")
	return b.String()
}

type fileColumnChunk struct {
	file        *File
	column      *Column
	bloomFilter *bloomFilter
	rowGroup    *format.RowGroup
	columnIndex *format.ColumnIndex
	offsetIndex *format.OffsetIndex
	chunk       *format.ColumnChunk
}

func (c *fileColumnChunk) Type() Type {
	return c.column.Type()
}

func (c *fileColumnChunk) Column() int {
	return int(c.column.Index())
}

func (c *fileColumnChunk) Pages() Pages {
	r := new(filePages)
	r.init(c)
	return r
}

func (c *fileColumnChunk) ColumnIndex() (ColumnIndex, error) {
	if err := c.readColumnIndex(); err != nil {
		return nil, err
	}
	if c.columnIndex == nil || c.chunk.ColumnIndexOffset == 0 {
		return nil, ErrMissingColumnIndex
	}
	return fileColumnIndex{c}, nil
}

func (c *fileColumnChunk) OffsetIndex() (OffsetIndex, error) {
	if err := c.readOffsetIndex(); err != nil {
		return nil, err
	}
	if c.offsetIndex == nil || c.chunk.OffsetIndexOffset == 0 {
		return nil, ErrMissingOffsetIndex
	}
	return (*fileOffsetIndex)(c.offsetIndex), nil
}

func (c *fileColumnChunk) BloomFilter() BloomFilter {
	if c.bloomFilter == nil {
		return nil
	}
	return c.bloomFilter
}

func (c *fileColumnChunk) NumValues() int64 {
	return c.chunk.MetaData.NumValues
}

func (c *fileColumnChunk) readColumnIndex() error {
	if c.columnIndex != nil {
		return nil
	}
	chunkMeta := c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()]
	offset, length := chunkMeta.ColumnIndexOffset, chunkMeta.ColumnIndexLength
	if offset == 0 {
		return nil
	}

	indexData := make([]byte, int(length))
	var columnIndex format.ColumnIndex
	if _, err := readAt(c.file.reader, indexData, offset); err != nil {
		return fmt.Errorf("read %d bytes column index at offset %d: %w", length, offset, err)
	}
	if err := thrift.Unmarshal(&c.file.protocol, indexData, &columnIndex); err != nil {
		return fmt.Errorf("decode column index: rowGroup=%d columnChunk=%d/%d: %w", c.rowGroup.Ordinal, c.Column(), len(c.rowGroup.Columns), err)
	}
	c.columnIndex = &columnIndex
	return nil
}

func (c *fileColumnChunk) readOffsetIndex() error {
	if c.offsetIndex != nil {
		return nil
	}
	chunkMeta := c.file.metadata.RowGroups[c.rowGroup.Ordinal].Columns[c.Column()]
	offset, length := chunkMeta.OffsetIndexOffset, chunkMeta.OffsetIndexLength
	if offset == 0 {
		return nil
	}

	indexData := make([]byte, int(length))
	var offsetIndex format.OffsetIndex
	if _, err := readAt(c.file.reader, indexData, offset); err != nil {
		return fmt.Errorf("read %d bytes offset index at offset %d: %w", length, offset, err)
	}
	if err := thrift.Unmarshal(&c.file.protocol, indexData, &offsetIndex); err != nil {
		return fmt.Errorf("decode offset index: rowGroup=%d columnChunk=%d/%d: %w", c.rowGroup.Ordinal, c.Column(), len(c.rowGroup.Columns), err)
	}
	c.offsetIndex = &offsetIndex
	return nil
}

type filePages struct {
	chunk    *fileColumnChunk
	rbuf     *bufio.Reader
	rbufpool *sync.Pool
	section  io.SectionReader

	protocol thrift.CompactProtocol
	decoder  thrift.Decoder

	baseOffset int64
	dataOffset int64
	dictOffset int64
	index      int
	skip       int64
	dictionary Dictionary

	bufferSize int
}

func (f *filePages) init(c *fileColumnChunk) {
	f.chunk = c
	f.baseOffset = c.chunk.MetaData.DataPageOffset
	f.dataOffset = f.baseOffset
	f.bufferSize = c.file.config.ReadBufferSize

	if c.chunk.MetaData.DictionaryPageOffset != 0 {
		f.baseOffset = c.chunk.MetaData.DictionaryPageOffset
		f.dictOffset = f.baseOffset
	}

	f.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize)
	f.rbuf, f.rbufpool = getBufioReader(&f.section, f.bufferSize)
	f.decoder.Reset(f.protocol.NewReader(f.rbuf))
}

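// ReadPage reads and decodes the next page of the column chunk. Dictionary
// pages encountered along the way are decoded and retained but not returned,
// and pending row skips recorded by SeekToRow are consumed before a page is
// handed back to the caller.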
func (f *filePages) ReadPage() (Page, error) {
	if f.chunk == nil {
		return nil, io.EOF
	}

	for {
		// Instantiate a new format.PageHeader for each page.
		//
		// A previous implementation reused page headers to save allocations.
		// https://github.com/segmentio/parquet-go/pull/484
		// The optimization turned out to be less effective than expected,
		// because all the values referenced by pointers in the page header
		// are lost when the header is reset and put back in the pool.
		// https://github.com/parquet-go/parquet-go/pull/11
		//
		// Even after being reset, reusing page headers still produced instability
		// issues.
		// https://github.com/parquet-go/parquet-go/issues/70
		header := new(format.PageHeader)
		if err := f.decoder.Decode(header); err != nil {
			return nil, err
		}
		data, err := f.readPage(header, f.rbuf)
		if err != nil {
			return nil, err
		}

		var page Page
		switch header.Type {
		case format.DataPageV2:
			page, err = f.readDataPageV2(header, data)
		case format.DataPage:
			page, err = f.readDataPageV1(header, data)
		case format.DictionaryPage:
			// Sometimes parquet files do not have the dictionary page offset
			// recorded in the column metadata. We account for this by lazily
			// reading dictionary pages when we encounter them.
			err = f.readDictionaryPage(header, data)
		default:
			err = fmt.Errorf("cannot read values of type %s from page", header.Type)
		}

		data.unref()

		if err != nil {
			return nil, fmt.Errorf("decoding page %d of column %q: %w", f.index, f.columnPath(), err)
		}

		if page == nil {
			continue
		}

		f.index++
		if f.skip == 0 {
			return page, nil
		}

		// TODO: what about pages that don't embed the number of rows?
		// (data page v1 with no offset index in the column chunk).
		numRows := page.NumRows()

		if numRows <= f.skip {
			Release(page)
		} else {
			tail := page.Slice(f.skip, numRows)
			Release(page)
			f.skip = 0
			return tail, nil
		}

		f.skip -= numRows
	}
}

func (f *filePages) readDictionary() error {
	chunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize)
	rbuf, pool := getBufioReader(chunk, f.bufferSize)
	defer putBufioReader(rbuf, pool)

	decoder := thrift.NewDecoder(f.protocol.NewReader(rbuf))

	header := new(format.PageHeader)

	if err := decoder.Decode(header); err != nil {
		return err
	}

	page := buffers.get(int(header.CompressedPageSize))
	defer page.unref()

	if _, err := io.ReadFull(rbuf, page.data); err != nil {
		return err
	}

	return f.readDictionaryPage(header, page)
}

func (f *filePages) readDictionaryPage(header *format.PageHeader, page *buffer) error {
	if header.DictionaryPageHeader == nil {
		return ErrMissingPageHeader
	}
	d, err := f.chunk.column.decodeDictionary(DictionaryPageHeader{header.DictionaryPageHeader}, page, header.UncompressedPageSize)
	if err != nil {
		return err
	}
	f.dictionary = d
	return nil
}

func (f *filePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, error) {
	if header.DataPageHeader == nil {
		return nil, ErrMissingPageHeader
	}
	if isDictionaryFormat(header.DataPageHeader.Encoding) && f.dictionary == nil {
		if err := f.readDictionary(); err != nil {
			return nil, err
		}
	}
	return f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, page, f.dictionary, header.UncompressedPageSize)
}

func (f *filePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) {
	if header.DataPageHeaderV2 == nil {
		return nil, ErrMissingPageHeader
	}
	if isDictionaryFormat(header.DataPageHeaderV2.Encoding) && f.dictionary == nil {
		// If the program seeked to a row past the first page, the dictionary
		// page may not have been seen, in which case we have to lazily load it
		// from the beginning of the column chunk.
		if err := f.readDictionary(); err != nil {
			return nil, err
		}
	}
	return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, page, f.dictionary, header.UncompressedPageSize)
}

func (f *filePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) {
	page := buffers.get(int(header.CompressedPageSize))
	defer page.unref()

	if _, err := io.ReadFull(reader, page.data); err != nil {
		return nil, err
	}

	if header.CRC != 0 {
		headerChecksum := uint32(header.CRC)
		bufferChecksum := crc32.ChecksumIEEE(page.data)

		if headerChecksum != bufferChecksum {
			// The parquet specs indicate that corruption errors could be
			// handled gracefully by skipping pages, though this may not
			// always be practical. Depending on how the pages are consumed,
			// missing rows may cause unpredictable behaviors in algorithms.
			//
			// For now, we assume these errors to be fatal, but we may
			// revisit later and improve error handling to be more resilient
			// to data corruption.
			return nil, fmt.Errorf("crc32 checksum mismatch in page of column %q: want=0x%08X got=0x%08X: %w",
				f.columnPath(),
				headerChecksum,
				bufferChecksum,
				ErrCorrupted,
			)
		}
	}

	page.ref()
	return page, nil
}

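// SeekToRow positions the reader so that the next call to ReadPage returns
// the page holding rowIndex. With an offset index, a binary search over the
// page locations finds the page directly; without one, the reader rewinds to
// the first data page and records the number of rows left to skip.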
func (f *filePages) SeekToRow(rowIndex int64) (err error) {
	if f.chunk == nil {
		return io.ErrClosedPipe
	}
	if f.chunk.offsetIndex == nil {
		_, err = f.section.Seek(f.dataOffset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex
		f.index = 0
		if f.dictOffset > 0 {
			f.index = 1
		}
	} else {
		pages := f.chunk.offsetIndex.PageLocations
		index := sort.Search(len(pages), func(i int) bool {
			return pages[i].FirstRowIndex > rowIndex
		}) - 1
		if index < 0 {
			return ErrSeekOutOfRange
		}
		_, err = f.section.Seek(pages[index].Offset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex - pages[index].FirstRowIndex
		f.index = index
	}
	f.rbuf.Reset(&f.section)
	return err
}

func (f *filePages) Close() error {
	putBufioReader(f.rbuf, f.rbufpool)
	f.chunk = nil
	f.section = io.SectionReader{}
	f.rbuf = nil
	f.rbufpool = nil
	f.baseOffset = 0
	f.dataOffset = 0
	f.dictOffset = 0
	f.index = 0
	f.skip = 0
	f.dictionary = nil
	return nil
}

func (f *filePages) columnPath() columnPath {
	return columnPath(f.chunk.column.Path())
}

type putBufioReaderFunc func()

var (
	bufioReaderPoolLock sync.Mutex
	bufioReaderPool     = map[int]*sync.Pool{}
)

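// getBufioReader borrows a bufio.Reader from a pool keyed by buffer size, so
// readers configured with different sizes are never mixed; the matching pool
// is returned alongside the reader so it can be given back with
// putBufioReader.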
func getBufioReader(r io.Reader, bufferSize int) (*bufio.Reader, *sync.Pool) {
	pool := getBufioReaderPool(bufferSize)
	rbuf, _ := pool.Get().(*bufio.Reader)
	if rbuf == nil {
		rbuf = bufio.NewReaderSize(r, bufferSize)
	} else {
		rbuf.Reset(r)
	}
	return rbuf, pool
}

func putBufioReader(rbuf *bufio.Reader, pool *sync.Pool) {
	if rbuf != nil && pool != nil {
		rbuf.Reset(nil)
		pool.Put(rbuf)
	}
}

func getBufioReaderPool(size int) *sync.Pool {
	bufioReaderPoolLock.Lock()
	defer bufioReaderPoolLock.Unlock()

	if pool := bufioReaderPool[size]; pool != nil {
		return pool
	}

	pool := &sync.Pool{}
	bufioReaderPool[size] = pool
	return pool
}

func (f *File) readAt(p []byte, off int64) (int, error) {
	return readAt(f.reader, p, off)
}

func readAt(r io.ReaderAt, p []byte, off int64) (n int, err error) {
	n, err = r.ReadAt(p, off)
	if n == len(p) {
		err = nil
		// p was fully read. There is no further need to check for errors;
		// the operation is a success in principle.
		return
	}
	return
}