github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/file.go (about)

     1  package parquet
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/binary"
     6  	"fmt"
     7  	"hash/crc32"
     8  	"io"
     9  	"sort"
    10  	"strings"
    11  	"sync"
    12  
    13  	"github.com/segmentio/encoding/thrift"
    14  	"github.com/segmentio/parquet-go/format"
    15  )
    16  
const (
	// defaultDictBufferSize is the default buffer size, in bytes, used when
	// reading dictionary pages.
	defaultDictBufferSize = 8192
	// defaultReadBufferSize is the default buffer size, in bytes, used for
	// the buffered readers that scan through the file.
	defaultReadBufferSize = 4096
)
    21  
// File represents a parquet file. The layout of a Parquet file can be found
// here: https://github.com/apache/parquet-format#file-format
type File struct {
	metadata      format.FileMetaData    // decoded thrift footer of the file
	protocol      thrift.CompactProtocol // thrift protocol used to decode footer sections
	reader        io.ReaderAt            // underlying storage the file reads from
	size          int64                  // total size of the file, in bytes
	schema        *Schema                // schema of the file (from footer, or overridden by FileConfig.Schema)
	root          *Column                // root of the column tree
	columnIndexes []format.ColumnIndex   // column index entries; nil when skipped or absent
	offsetIndexes []format.OffsetIndex   // offset index entries; nil when skipped or absent
	rowGroups     []RowGroup             // row groups exposed via RowGroups()
	config        *FileConfig            // configuration the file was opened with
}
    36  
    37  // OpenFile opens a parquet file and reads the content between offset 0 and the given
    38  // size in r.
    39  //
    40  // Only the parquet magic bytes and footer are read, column chunks and other
    41  // parts of the file are left untouched; this means that successfully opening
    42  // a file does not validate that the pages have valid checksums.
    43  func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) {
    44  	b := make([]byte, 8)
    45  	c, err := NewFileConfig(options...)
    46  	if err != nil {
    47  		return nil, err
    48  	}
    49  	f := &File{reader: r, size: size, config: c}
    50  
    51  	if _, err := r.ReadAt(b[:4], 0); err != nil {
    52  		return nil, fmt.Errorf("reading magic header of parquet file: %w", err)
    53  	}
    54  	if string(b[:4]) != "PAR1" {
    55  		return nil, fmt.Errorf("invalid magic header of parquet file: %q", b[:4])
    56  	}
    57  
    58  	if cast, ok := f.reader.(interface{ SetMagicFooterSection(offset, length int64) }); ok {
    59  		cast.SetMagicFooterSection(size-8, 8)
    60  	}
    61  	if n, err := r.ReadAt(b[:8], size-8); n != 8 {
    62  		return nil, fmt.Errorf("reading magic footer of parquet file: %w", err)
    63  	}
    64  	if string(b[4:8]) != "PAR1" {
    65  		return nil, fmt.Errorf("invalid magic footer of parquet file: %q", b[4:8])
    66  	}
    67  
    68  	footerSize := int64(binary.LittleEndian.Uint32(b[:4]))
    69  	footerData := make([]byte, footerSize)
    70  
    71  	if cast, ok := f.reader.(interface{ SetFooterSection(offset, length int64) }); ok {
    72  		cast.SetFooterSection(size-(footerSize+8), footerSize)
    73  	}
    74  	if _, err := f.reader.ReadAt(footerData, size-(footerSize+8)); err != nil {
    75  		return nil, fmt.Errorf("reading footer of parquet file: %w", err)
    76  	}
    77  	if err := thrift.Unmarshal(&f.protocol, footerData, &f.metadata); err != nil {
    78  		return nil, fmt.Errorf("reading parquet file metadata: %w", err)
    79  	}
    80  	if len(f.metadata.Schema) == 0 {
    81  		return nil, ErrMissingRootColumn
    82  	}
    83  
    84  	if !c.SkipPageIndex {
    85  		if f.columnIndexes, f.offsetIndexes, err = f.ReadPageIndex(); err != nil {
    86  			return nil, fmt.Errorf("reading page index of parquet file: %w", err)
    87  		}
    88  	}
    89  
    90  	if f.root, err = openColumns(f); err != nil {
    91  		return nil, fmt.Errorf("opening columns of parquet file: %w", err)
    92  	}
    93  
    94  	var schema *Schema
    95  	if c.Schema != nil {
    96  		schema = c.Schema
    97  	} else {
    98  		schema = NewSchema(f.root.Name(), f.root)
    99  	}
   100  	columns := make([]*Column, 0, numLeafColumnsOf(f.root))
   101  	f.schema = schema
   102  	f.root.forEachLeaf(func(c *Column) { columns = append(columns, c) })
   103  
   104  	rowGroups := make([]fileRowGroup, len(f.metadata.RowGroups))
   105  	for i := range rowGroups {
   106  		rowGroups[i].init(f, schema, columns, &f.metadata.RowGroups[i])
   107  	}
   108  	f.rowGroups = make([]RowGroup, len(rowGroups))
   109  	for i := range rowGroups {
   110  		f.rowGroups[i] = &rowGroups[i]
   111  	}
   112  
   113  	if !c.SkipBloomFilters {
   114  		section := io.NewSectionReader(r, 0, size)
   115  		rbuf, rbufpool := getBufioReader(section, c.ReadBufferSize)
   116  		defer putBufioReader(rbuf, rbufpool)
   117  
   118  		header := format.BloomFilterHeader{}
   119  		compact := thrift.CompactProtocol{}
   120  		decoder := thrift.NewDecoder(compact.NewReader(rbuf))
   121  
   122  		for i := range rowGroups {
   123  			g := &rowGroups[i]
   124  
   125  			for j := range g.columns {
   126  				c := g.columns[j].(*fileColumnChunk)
   127  
   128  				if offset := c.chunk.MetaData.BloomFilterOffset; offset > 0 {
   129  					section.Seek(offset, io.SeekStart)
   130  					rbuf.Reset(section)
   131  
   132  					header = format.BloomFilterHeader{}
   133  					if err := decoder.Decode(&header); err != nil {
   134  						return nil, fmt.Errorf("decoding bloom filter header: %w", err)
   135  					}
   136  
   137  					offset, _ = section.Seek(0, io.SeekCurrent)
   138  					offset -= int64(rbuf.Buffered())
   139  
   140  					if cast, ok := r.(interface{ SetBloomFilterSection(offset, length int64) }); ok {
   141  						bloomFilterOffset := c.chunk.MetaData.BloomFilterOffset
   142  						bloomFilterLength := (offset - bloomFilterOffset) + int64(header.NumBytes)
   143  						cast.SetBloomFilterSection(bloomFilterOffset, bloomFilterLength)
   144  					}
   145  
   146  					c.bloomFilter = newBloomFilter(r, offset, &header)
   147  				}
   148  			}
   149  		}
   150  	}
   151  
   152  	sortKeyValueMetadata(f.metadata.KeyValueMetadata)
   153  	return f, nil
   154  }
   155  
   156  // ReadPageIndex reads the page index section of the parquet file f.
   157  //
   158  // If the file did not contain a page index, the method returns two empty slices
   159  // and a nil error.
   160  //
   161  // Only leaf columns have indexes, the returned indexes are arranged using the
   162  // following layout:
   163  //
   164  //	------------------
   165  //	| col 0: chunk 0 |
   166  //	------------------
   167  //	| col 1: chunk 0 |
   168  //	------------------
   169  //	| ...            |
   170  //	------------------
   171  //	| col 0: chunk 1 |
   172  //	------------------
   173  //	| col 1: chunk 1 |
   174  //	------------------
   175  //	| ...            |
   176  //	------------------
   177  //
   178  // This method is useful in combination with the SkipPageIndex option to delay
   179  // reading the page index section until after the file was opened. Note that in
   180  // this case the page index is not cached within the file, programs are expected
   181  // to make use of independently from the parquet package.
   182  func (f *File) ReadPageIndex() ([]format.ColumnIndex, []format.OffsetIndex, error) {
   183  	if len(f.metadata.RowGroups) == 0 {
   184  		return nil, nil, nil
   185  	}
   186  
   187  	columnIndexOffset := f.metadata.RowGroups[0].Columns[0].ColumnIndexOffset
   188  	offsetIndexOffset := f.metadata.RowGroups[0].Columns[0].OffsetIndexOffset
   189  	columnIndexLength := int64(0)
   190  	offsetIndexLength := int64(0)
   191  
   192  	forEachColumnChunk := func(do func(int, int, *format.ColumnChunk) error) error {
   193  		for i := range f.metadata.RowGroups {
   194  			for j := range f.metadata.RowGroups[i].Columns {
   195  				c := &f.metadata.RowGroups[i].Columns[j]
   196  				if err := do(i, j, c); err != nil {
   197  					return err
   198  				}
   199  			}
   200  		}
   201  		return nil
   202  	}
   203  
   204  	forEachColumnChunk(func(_, _ int, c *format.ColumnChunk) error {
   205  		columnIndexLength += int64(c.ColumnIndexLength)
   206  		offsetIndexLength += int64(c.OffsetIndexLength)
   207  		return nil
   208  	})
   209  
   210  	if columnIndexLength == 0 && offsetIndexLength == 0 {
   211  		return nil, nil, nil
   212  	}
   213  
   214  	numRowGroups := len(f.metadata.RowGroups)
   215  	numColumns := len(f.metadata.RowGroups[0].Columns)
   216  	numColumnChunks := numRowGroups * numColumns
   217  
   218  	columnIndexes := make([]format.ColumnIndex, numColumnChunks)
   219  	offsetIndexes := make([]format.OffsetIndex, numColumnChunks)
   220  	indexBuffer := make([]byte, max(int(columnIndexLength), int(offsetIndexLength)))
   221  
   222  	if columnIndexOffset > 0 {
   223  		columnIndexData := indexBuffer[:columnIndexLength]
   224  
   225  		if cast, ok := f.reader.(interface{ SetColumnIndexSection(offset, length int64) }); ok {
   226  			cast.SetColumnIndexSection(columnIndexOffset, columnIndexLength)
   227  		}
   228  		if _, err := f.reader.ReadAt(columnIndexData, columnIndexOffset); err != nil {
   229  			return nil, nil, fmt.Errorf("reading %d bytes column index at offset %d: %w", columnIndexLength, columnIndexOffset, err)
   230  		}
   231  
   232  		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
   233  			// Some parquet files are missing the column index on some columns.
   234  			//
   235  			// An example of this file is testdata/alltypes_tiny_pages_plain.parquet
   236  			// which was added in https://github.com/apache/parquet-testing/pull/24.
   237  			if c.ColumnIndexOffset > 0 {
   238  				offset := c.ColumnIndexOffset - columnIndexOffset
   239  				length := int64(c.ColumnIndexLength)
   240  				buffer := columnIndexData[offset : offset+length]
   241  				if err := thrift.Unmarshal(&f.protocol, buffer, &columnIndexes[(i*numColumns)+j]); err != nil {
   242  					return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
   243  				}
   244  			}
   245  			return nil
   246  		})
   247  		if err != nil {
   248  			return nil, nil, err
   249  		}
   250  	}
   251  
   252  	if offsetIndexOffset > 0 {
   253  		offsetIndexData := indexBuffer[:offsetIndexLength]
   254  
   255  		if cast, ok := f.reader.(interface{ SetOffsetIndexSection(offset, length int64) }); ok {
   256  			cast.SetOffsetIndexSection(offsetIndexOffset, offsetIndexLength)
   257  		}
   258  		if _, err := f.reader.ReadAt(offsetIndexData, offsetIndexOffset); err != nil {
   259  			return nil, nil, fmt.Errorf("reading %d bytes offset index at offset %d: %w", offsetIndexLength, offsetIndexOffset, err)
   260  		}
   261  
   262  		err := forEachColumnChunk(func(i, j int, c *format.ColumnChunk) error {
   263  			if c.OffsetIndexOffset > 0 {
   264  				offset := c.OffsetIndexOffset - offsetIndexOffset
   265  				length := int64(c.OffsetIndexLength)
   266  				buffer := offsetIndexData[offset : offset+length]
   267  				if err := thrift.Unmarshal(&f.protocol, buffer, &offsetIndexes[(i*numColumns)+j]); err != nil {
   268  					return fmt.Errorf("decoding column index: rowGroup=%d columnChunk=%d/%d: %w", i, j, numColumns, err)
   269  				}
   270  			}
   271  			return nil
   272  		})
   273  		if err != nil {
   274  			return nil, nil, err
   275  		}
   276  	}
   277  
   278  	return columnIndexes, offsetIndexes, nil
   279  }
   280  
// NumRows returns the number of rows in the file, as recorded in the footer
// metadata.
func (f *File) NumRows() int64 { return f.metadata.NumRows }

// RowGroups returns the list of row groups in the file.
func (f *File) RowGroups() []RowGroup { return f.rowGroups }

// Root returns the root column of f.
func (f *File) Root() *Column { return f.root }

// Schema returns the schema of f.
func (f *File) Schema() *Schema { return f.schema }

// Metadata returns the metadata of f. The returned pointer aliases the file's
// internal state; callers should treat it as read-only.
func (f *File) Metadata() *format.FileMetaData { return &f.metadata }

// Size returns the size of f (in bytes).
func (f *File) Size() int64 { return f.size }
   298  
   299  // ReadAt reads bytes into b from f at the given offset.
   300  //
   301  // The method satisfies the io.ReaderAt interface.
   302  func (f *File) ReadAt(b []byte, off int64) (int, error) {
   303  	if off < 0 || off >= f.size {
   304  		return 0, io.EOF
   305  	}
   306  
   307  	if limit := f.size - off; limit < int64(len(b)) {
   308  		n, err := f.reader.ReadAt(b[:limit], off)
   309  		if err == nil {
   310  			err = io.EOF
   311  		}
   312  		return n, err
   313  	}
   314  
   315  	return f.reader.ReadAt(b, off)
   316  }
   317  
// ColumnIndexes returns the page index of the parquet file f.
//
// If the file did not contain a column index, the method returns an empty
// slice.
func (f *File) ColumnIndexes() []format.ColumnIndex { return f.columnIndexes }

// OffsetIndexes returns the page index of the parquet file f.
//
// If the file did not contain an offset index, the method returns an empty
// slice.
func (f *File) OffsetIndexes() []format.OffsetIndex { return f.offsetIndexes }

// Lookup returns the value associated with the given key in the file key/value
// metadata.
//
// The ok boolean will be true if the key was found, false otherwise.
func (f *File) Lookup(key string) (value string, ok bool) {
	return lookupKeyValueMetadata(f.metadata.KeyValueMetadata, key)
}
   337  
// hasIndexes reports whether both the column and offset indexes were loaded
// when the file was opened (they are nil when SkipPageIndex was used or the
// file has no page index).
func (f *File) hasIndexes() bool {
	return f.columnIndexes != nil && f.offsetIndexes != nil
}

// Compile-time check that *File implements io.ReaderAt.
var _ io.ReaderAt = (*File)(nil)
   343  
   344  func sortKeyValueMetadata(keyValueMetadata []format.KeyValue) {
   345  	sort.Slice(keyValueMetadata, func(i, j int) bool {
   346  		switch {
   347  		case keyValueMetadata[i].Key < keyValueMetadata[j].Key:
   348  			return true
   349  		case keyValueMetadata[i].Key > keyValueMetadata[j].Key:
   350  			return false
   351  		default:
   352  			return keyValueMetadata[i].Value < keyValueMetadata[j].Value
   353  		}
   354  	})
   355  }
   356  
   357  func lookupKeyValueMetadata(keyValueMetadata []format.KeyValue, key string) (value string, ok bool) {
   358  	i := sort.Search(len(keyValueMetadata), func(i int) bool {
   359  		return keyValueMetadata[i].Key >= key
   360  	})
   361  	if i == len(keyValueMetadata) || keyValueMetadata[i].Key != key {
   362  		return "", false
   363  	}
   364  	return keyValueMetadata[i].Value, true
   365  }
   366  
// fileRowGroup is the RowGroup implementation backed by a row group section
// of a parquet file.
type fileRowGroup struct {
	schema   *Schema          // schema shared by all row groups of the file
	rowGroup *format.RowGroup // footer metadata of this row group
	columns  []ColumnChunk    // one *fileColumnChunk per column
	sorting  []SortingColumn  // sorting columns declared in the metadata
	config   *FileConfig      // configuration inherited from the file
}
   374  
// init populates g from the file footer metadata for one row group, creating
// a fileColumnChunk per column and wiring in page index entries when they
// were loaded at open time.
func (g *fileRowGroup) init(file *File, schema *Schema, columns []*Column, rowGroup *format.RowGroup) {
	g.schema = schema
	g.rowGroup = rowGroup
	g.config = file.config
	g.columns = make([]ColumnChunk, len(rowGroup.Columns))
	g.sorting = make([]SortingColumn, len(rowGroup.SortingColumns))
	// Allocate all chunks in one slice so the per-column pointers below share
	// a single allocation.
	fileColumnChunks := make([]fileColumnChunk, len(rowGroup.Columns))

	for i := range g.columns {
		fileColumnChunks[i] = fileColumnChunk{
			file:     file,
			column:   columns[i],
			rowGroup: rowGroup,
			chunk:    &rowGroup.Columns[i],
		}

		if file.hasIndexes() {
			// Page index entries are laid out row group by row group, with
			// one entry per column; see ReadPageIndex for the layout.
			j := (int(rowGroup.Ordinal) * len(columns)) + i
			fileColumnChunks[i].columnIndex = &file.columnIndexes[j]
			fileColumnChunks[i].offsetIndex = &file.offsetIndexes[j]
		}

		g.columns[i] = &fileColumnChunks[i]
	}

	for i := range g.sorting {
		g.sorting[i] = &fileSortingColumn{
			column:     columns[rowGroup.SortingColumns[i].ColumnIdx],
			descending: rowGroup.SortingColumns[i].Descending,
			nullsFirst: rowGroup.SortingColumns[i].NullsFirst,
		}
	}
}
   408  
// Accessors implementing the RowGroup interface.
func (g *fileRowGroup) Schema() *Schema                 { return g.schema }
func (g *fileRowGroup) NumRows() int64                  { return g.rowGroup.NumRows }
func (g *fileRowGroup) ColumnChunks() []ColumnChunk     { return g.columns }
func (g *fileRowGroup) SortingColumns() []SortingColumn { return g.sorting }
func (g *fileRowGroup) Rows() Rows                      { return newRowGroupRows(g, g.config.ReadMode) }
   414  
// fileSortingColumn is the SortingColumn implementation describing one
// sorting column entry of a row group's metadata.
type fileSortingColumn struct {
	column     *Column // column the row group is sorted by
	descending bool    // true when sorted in descending order
	nullsFirst bool    // true when nulls sort before non-null values
}
   420  
// Accessors implementing the SortingColumn interface.
func (s *fileSortingColumn) Path() []string   { return s.column.Path() }
func (s *fileSortingColumn) Descending() bool { return s.descending }
func (s *fileSortingColumn) NullsFirst() bool { return s.nullsFirst }
   424  func (s *fileSortingColumn) String() string {
   425  	b := new(strings.Builder)
   426  	if s.nullsFirst {
   427  		b.WriteString("nulls_first+")
   428  	}
   429  	if s.descending {
   430  		b.WriteString("descending(")
   431  	} else {
   432  		b.WriteString("ascending(")
   433  	}
   434  	b.WriteString(columnPath(s.Path()).String())
   435  	b.WriteString(")")
   436  	return b.String()
   437  }
   438  
// fileColumnChunk is the ColumnChunk implementation backed by one column
// chunk section of a row group in a parquet file.
type fileColumnChunk struct {
	file        *File               // file the chunk belongs to
	column      *Column             // leaf column this chunk stores values for
	bloomFilter *bloomFilter        // nil unless a bloom filter was found at open time
	rowGroup    *format.RowGroup    // metadata of the enclosing row group
	columnIndex *format.ColumnIndex // nil when the page index was skipped or absent
	offsetIndex *format.OffsetIndex // nil when the page index was skipped or absent
	chunk       *format.ColumnChunk // footer metadata of this chunk
}
   448  
// Type returns the parquet type of the column the chunk belongs to.
func (c *fileColumnChunk) Type() Type {
	return c.column.Type()
}

// Column returns the index of the column within the schema.
func (c *fileColumnChunk) Column() int {
	return int(c.column.Index())
}

// Pages returns a new page reader positioned at the beginning of the chunk.
func (c *fileColumnChunk) Pages() Pages {
	r := new(filePages)
	r.init(c)
	return r
}

// ColumnIndex returns the column index of the chunk, or nil when the file had
// none (or the page index was skipped at open time).
func (c *fileColumnChunk) ColumnIndex() ColumnIndex {
	if c.columnIndex == nil {
		return nil
	}
	return fileColumnIndex{c}
}

// OffsetIndex returns the offset index of the chunk, or nil when the file had
// none (or the page index was skipped at open time).
func (c *fileColumnChunk) OffsetIndex() OffsetIndex {
	if c.offsetIndex == nil {
		return nil
	}
	return (*fileOffsetIndex)(c.offsetIndex)
}

// BloomFilter returns the bloom filter of the chunk, or nil when none was
// loaded. The explicit nil check avoids returning a non-nil interface
// wrapping a nil *bloomFilter.
func (c *fileColumnChunk) BloomFilter() BloomFilter {
	if c.bloomFilter == nil {
		return nil
	}
	return c.bloomFilter
}

// NumValues returns the number of values stored in the chunk.
func (c *fileColumnChunk) NumValues() int64 {
	return c.chunk.MetaData.NumValues
}
   487  
// filePages is the Pages implementation reading pages sequentially from a
// column chunk of a parquet file.
type filePages struct {
	chunk    *fileColumnChunk // chunk being read; nil after Close
	rbuf     *bufio.Reader    // pooled buffered reader over section
	rbufpool *sync.Pool       // pool rbuf is returned to on Close
	section  io.SectionReader // window over the chunk's compressed bytes

	protocol thrift.CompactProtocol // protocol used to decode page headers
	decoder  thrift.Decoder         // decoder reading page headers from rbuf

	baseOffset int64 // file offset where the chunk starts (dictionary page if any)
	dataOffset int64 // file offset of the first data page
	dictOffset int64 // file offset of the dictionary page, 0 when absent
	index      int   // index of the current page
	skip       int64 // number of rows left to skip, set by SeekToRow
	dictionary Dictionary

	bufferSize int // read buffer size, from the file configuration
}
   506  
// init prepares f to read pages from column chunk c, positioning the section
// reader at the dictionary page when the chunk has one, or at the first data
// page otherwise.
func (f *filePages) init(c *fileColumnChunk) {
	f.chunk = c
	f.baseOffset = c.chunk.MetaData.DataPageOffset
	f.dataOffset = f.baseOffset
	f.bufferSize = c.file.config.ReadBufferSize

	if c.chunk.MetaData.DictionaryPageOffset != 0 {
		// The dictionary page precedes the data pages; start the section
		// there so it is read first.
		f.baseOffset = c.chunk.MetaData.DictionaryPageOffset
		f.dictOffset = f.baseOffset
	}

	f.section = *io.NewSectionReader(c.file, f.baseOffset, c.chunk.MetaData.TotalCompressedSize)
	f.rbuf, f.rbufpool = getBufioReader(&f.section, f.bufferSize)
	f.decoder.Reset(f.protocol.NewReader(f.rbuf))
}
   522  
// ReadPage returns the next data page of the column chunk, decoding any
// dictionary page encountered along the way and honoring a pending row skip
// requested by SeekToRow.
func (f *filePages) ReadPage() (Page, error) {
	if f.chunk == nil {
		return nil, io.EOF // Close was called
	}

	header := getPageHeader()
	defer putPageHeader(header)

	for {
		if err := f.decoder.Decode(header); err != nil {
			return nil, err
		}
		data, err := f.readPage(header, f.rbuf)
		if err != nil {
			return nil, err
		}

		var page Page
		switch header.Type {
		case format.DataPageV2:
			page, err = f.readDataPageV2(header, data)
		case format.DataPage:
			page, err = f.readDataPageV1(header, data)
		case format.DictionaryPage:
			// Sometimes parquet files do not have the dictionary page offset
			// recorded in the column metadata. We account for this by lazily
			// reading dictionary pages when we encounter them.
			err = f.readDictionaryPage(header, data)
		default:
			err = fmt.Errorf("cannot read values of type %s from page", header.Type)
		}

		// Drop this function's reference on the page buffer.
		data.unref()

		if err != nil {
			return nil, fmt.Errorf("decoding page %d of column %q: %w", f.index, f.columnPath(), err)
		}

		// Dictionary pages produce no Page value; keep scanning for the next
		// data page.
		if page == nil {
			continue
		}

		f.index++
		if f.skip == 0 {
			return page, nil
		}

		// TODO: what about pages that don't embed the number of rows?
		// (data page v1 with no offset index in the column chunk).
		numRows := page.NumRows()

		if numRows <= f.skip {
			// The page falls entirely within the rows being skipped.
			Release(page)
		} else {
			// The target row is inside this page; return its tail.
			tail := page.Slice(f.skip, numRows)
			Release(page)
			f.skip = 0
			return tail, nil
		}

		f.skip -= numRows
	}
}
   586  
// readDictionary reads the dictionary page at the beginning of the column
// chunk (f.baseOffset). It is used to lazily load the dictionary when a
// dictionary-encoded data page is reached before a dictionary page was seen,
// e.g. after seeking past the beginning of the chunk.
func (f *filePages) readDictionary() error {
	// Use a dedicated section, buffered reader and decoder so the position
	// of the main page stream is left untouched.
	chunk := io.NewSectionReader(f.chunk.file, f.baseOffset, f.chunk.chunk.MetaData.TotalCompressedSize)
	rbuf, pool := getBufioReader(chunk, f.bufferSize)
	defer putBufioReader(rbuf, pool)

	decoder := thrift.NewDecoder(f.protocol.NewReader(rbuf))

	header := getPageHeader()
	defer putPageHeader(header)

	if err := decoder.Decode(header); err != nil {
		return err
	}

	page := buffers.get(int(header.CompressedPageSize))
	defer page.unref()

	if _, err := io.ReadFull(rbuf, page.data); err != nil {
		return err
	}

	return f.readDictionaryPage(header, page)
}
   610  
   611  func (f *filePages) readDictionaryPage(header *format.PageHeader, page *buffer) error {
   612  	if header.DictionaryPageHeader == nil {
   613  		return ErrMissingPageHeader
   614  	}
   615  	d, err := f.chunk.column.decodeDictionary(DictionaryPageHeader{header.DictionaryPageHeader}, page, header.UncompressedPageSize)
   616  	if err != nil {
   617  		return err
   618  	}
   619  	f.dictionary = d
   620  	return nil
   621  }
   622  
   623  func (f *filePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, error) {
   624  	if header.DataPageHeader == nil {
   625  		return nil, ErrMissingPageHeader
   626  	}
   627  	if isDictionaryFormat(header.DataPageHeader.Encoding) && f.dictionary == nil {
   628  		if err := f.readDictionary(); err != nil {
   629  			return nil, err
   630  		}
   631  	}
   632  	return f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, page, f.dictionary, header.UncompressedPageSize)
   633  }
   634  
   635  func (f *filePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) {
   636  	if header.DataPageHeaderV2 == nil {
   637  		return nil, ErrMissingPageHeader
   638  	}
   639  	if isDictionaryFormat(header.DataPageHeaderV2.Encoding) && f.dictionary == nil {
   640  		// If the program seeked to a row passed the first page, the dictionary
   641  		// page may not have been seen, in which case we have to lazily load it
   642  		// from the beginning of column chunk.
   643  		if err := f.readDictionary(); err != nil {
   644  			return nil, err
   645  		}
   646  	}
   647  	return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, page, f.dictionary, header.UncompressedPageSize)
   648  }
   649  
// readPage reads the compressed body of a page into a pooled buffer and
// verifies the checksum when one is recorded in the header. The returned
// buffer carries a reference owned by the caller, which must unref it.
func (f *filePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) {
	page := buffers.get(int(header.CompressedPageSize))
	defer page.unref()

	if _, err := io.ReadFull(reader, page.data); err != nil {
		return nil, err
	}

	// A zero CRC is treated as "no checksum recorded"; a page whose real
	// checksum happens to be zero is therefore not verified.
	if header.CRC != 0 {
		headerChecksum := uint32(header.CRC)
		bufferChecksum := crc32.ChecksumIEEE(page.data)

		if headerChecksum != bufferChecksum {
			// The parquet specs indicate that corruption errors could be
			// handled gracefully by skipping pages, tho this may not always
			// be practical. Depending on how the pages are consumed,
			// missing rows may cause unpredictable behaviors in algorithms.
			//
			// For now, we assume these errors to be fatal, but we may
			// revisit later and improve error handling to be more resilient
			// to data corruption.
			return nil, fmt.Errorf("crc32 checksum mismatch in page of column %q: want=0x%08X got=0x%08X: %w",
				f.columnPath(),
				headerChecksum,
				bufferChecksum,
				ErrCorrupted,
			)
		}
	}

	// Transfer ownership to the caller: this ref cancels the deferred unref.
	page.ref()
	return page, nil
}
   683  
// SeekToRow positions the page reader so the next ReadPage call produces the
// page containing rowIndex; the rows preceding it within that page are
// skipped by ReadPage via f.skip.
func (f *filePages) SeekToRow(rowIndex int64) (err error) {
	if f.chunk == nil {
		return io.ErrClosedPipe
	}
	if f.chunk.offsetIndex == nil {
		// Without an offset index, rewind to the first data page and let
		// ReadPage skip rows one page at a time.
		_, err = f.section.Seek(f.dataOffset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex
		f.index = 0
		if f.dictOffset > 0 {
			// Presumably accounts for the dictionary page preceding the data
			// pages — NOTE(review): ReadPage does not increment f.index for
			// dictionary pages, confirm this offset is intended.
			f.index = 1
		}
	} else {
		// Binary search for the last page whose first row is at or before
		// rowIndex, then seek directly to that page.
		pages := f.chunk.offsetIndex.PageLocations
		index := sort.Search(len(pages), func(i int) bool {
			return pages[i].FirstRowIndex > rowIndex
		}) - 1
		if index < 0 {
			return ErrSeekOutOfRange
		}
		_, err = f.section.Seek(pages[index].Offset-f.baseOffset, io.SeekStart)
		f.skip = rowIndex - pages[index].FirstRowIndex
		f.index = index
	}
	// Discard whatever was buffered at the previous position.
	f.rbuf.Reset(&f.section)
	return err
}
   710  
   711  func (f *filePages) Close() error {
   712  	putBufioReader(f.rbuf, f.rbufpool)
   713  	f.chunk = nil
   714  	f.section = io.SectionReader{}
   715  	f.rbuf = nil
   716  	f.rbufpool = nil
   717  	f.baseOffset = 0
   718  	f.dataOffset = 0
   719  	f.dictOffset = 0
   720  	f.index = 0
   721  	f.skip = 0
   722  	f.dictionary = nil
   723  	return nil
   724  }
   725  
// columnPath returns the path of the column this page reader was created
// for; it is used when formatting error messages.
func (f *filePages) columnPath() columnPath {
	return columnPath(f.chunk.column.Path())
}
   729  
// putBufioReaderFunc is the signature of functions releasing a pooled bufio
// reader. NOTE(review): it is not referenced in this file; presumably used by
// other files of the package — confirm before removing.
type putBufioReaderFunc func()

var (
	// bufioReaderPoolLock guards bufioReaderPool.
	bufioReaderPoolLock sync.Mutex
	// bufioReaderPool maps buffer sizes to pools of readers of that size.
	bufioReaderPool = map[int]*sync.Pool{}
)
   736  
   737  func getBufioReader(r io.Reader, bufferSize int) (*bufio.Reader, *sync.Pool) {
   738  	pool := getBufioReaderPool(bufferSize)
   739  	rbuf, _ := pool.Get().(*bufio.Reader)
   740  	if rbuf == nil {
   741  		rbuf = bufio.NewReaderSize(r, bufferSize)
   742  	} else {
   743  		rbuf.Reset(r)
   744  	}
   745  	return rbuf, pool
   746  }
   747  
   748  func putBufioReader(rbuf *bufio.Reader, pool *sync.Pool) {
   749  	if rbuf != nil && pool != nil {
   750  		rbuf.Reset(nil)
   751  		pool.Put(rbuf)
   752  	}
   753  }
   754  
   755  func getBufioReaderPool(size int) *sync.Pool {
   756  	bufioReaderPoolLock.Lock()
   757  	defer bufioReaderPoolLock.Unlock()
   758  
   759  	if pool := bufioReaderPool[size]; pool != nil {
   760  		return pool
   761  	}
   762  
   763  	pool := &sync.Pool{}
   764  	bufioReaderPool[size] = pool
   765  	return pool
   766  }
   767  
// pageHeaderPool recycles format.PageHeader values across page reads.
var pageHeaderPool = &sync.Pool{}
   769  
   770  func getPageHeader() *format.PageHeader {
   771  	h, _ := pageHeaderPool.Get().(*format.PageHeader)
   772  	if h != nil {
   773  		return h
   774  	}
   775  	return new(format.PageHeader)
   776  }
   777  
   778  func putPageHeader(h *format.PageHeader) {
   779  	if h != nil {
   780  		h.CRC = 0
   781  		pageHeaderPool.Put(h)
   782  	}
   783  }