github.com/apache/arrow/go/v14@v14.0.2/parquet/file/file_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"io"
    24  	"os"
    25  	"runtime"
    26  	"sync"
    27  
    28  	"github.com/apache/arrow/go/v14/arrow/memory"
    29  	"github.com/apache/arrow/go/v14/parquet"
    30  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    31  	"github.com/apache/arrow/go/v14/parquet/metadata"
    32  	"golang.org/x/xerrors"
    33  )
    34  
    35  const (
    36  	footerSize uint32 = 8
    37  )
    38  
    39  var (
    40  	magicBytes                  = []byte("PAR1")
    41  	magicEBytes                 = []byte("PARE")
    42  	errInconsistentFileMetadata = xerrors.New("parquet: file is smaller than indicated metadata size")
    43  )
    44  
    45  // Reader is the main interface for reading a parquet file
    46  type Reader struct {
    47  	r             parquet.ReaderAtSeeker
    48  	props         *parquet.ReaderProperties
    49  	metadata      *metadata.FileMetaData
    50  	footerOffset  int64
    51  	fileDecryptor encryption.FileDecryptor
    52  
    53  	bufferPool sync.Pool
    54  }
    55  
    56  type ReadOption func(*Reader)
    57  
    58  // WithReadProps specifies a specific reader properties instance to use, rather
    59  // than using the default ReaderProperties.
    60  func WithReadProps(props *parquet.ReaderProperties) ReadOption {
    61  	return func(r *Reader) {
    62  		r.props = props
    63  	}
    64  }
    65  
    66  // WithMetadata allows providing a specific FileMetaData object rather than reading
    67  // the file metadata from the file itself.
    68  func WithMetadata(m *metadata.FileMetaData) ReadOption {
    69  	return func(r *Reader) {
    70  		r.metadata = m
    71  	}
    72  }
    73  
    74  // OpenParquetFile will return a Reader for the given parquet file on the local file system.
    75  //
    76  // Optionally the file can be memory mapped for faster reading. If no read properties are provided
    77  // then the default ReaderProperties will be used. The WithMetadata option can be used to provide
    78  // a FileMetaData object rather than reading the file metadata from the file.
    79  func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error) {
    80  	var source parquet.ReaderAtSeeker
    81  
    82  	var err error
    83  	if memoryMap {
    84  		source, err = mmapOpen(filename)
    85  		if err != nil {
    86  			return nil, err
    87  		}
    88  	} else {
    89  		source, err = os.Open(filename)
    90  		if err != nil {
    91  			return nil, err
    92  		}
    93  	}
    94  	return NewParquetReader(source, opts...)
    95  }
    96  
    97  // NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r.
    98  // This reader needs to support Read, ReadAt and Seeking.
    99  //
   100  // If no read properties are provided then the default ReaderProperties will be used. The WithMetadata
   101  // option can be used to provide a FileMetaData object rather than reading the file metadata from the file.
   102  func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error) {
   103  	var err error
   104  	f := &Reader{r: r}
   105  	for _, o := range opts {
   106  		o(f)
   107  	}
   108  
   109  	if f.footerOffset <= 0 {
   110  		f.footerOffset, err = r.Seek(0, io.SeekEnd)
   111  		if err != nil {
   112  			return nil, fmt.Errorf("parquet: could not retrieve footer offset: %w", err)
   113  		}
   114  	}
   115  
   116  	if f.props == nil {
   117  		f.props = parquet.NewReaderProperties(memory.NewGoAllocator())
   118  	}
   119  
   120  	f.bufferPool = sync.Pool{
   121  		New: func() interface{} {
   122  			buf := memory.NewResizableBuffer(f.props.Allocator())
   123  			runtime.SetFinalizer(buf, func(obj *memory.Buffer) {
   124  				obj.Release()
   125  			})
   126  			return buf
   127  		},
   128  	}
   129  
   130  	if f.metadata == nil {
   131  		return f, f.parseMetaData()
   132  	}
   133  
   134  	return f, nil
   135  }
   136  
   137  // BufferPool returns the internal buffer pool being utilized by this reader.
   138  // This is primarily for use by the pqarrow.FileReader or anything that builds
   139  // on top of the Reader and constructs their own ColumnReaders (like the
   140  // RecordReader)
   141  func (f *Reader) BufferPool() *sync.Pool {
   142  	return &f.bufferPool
   143  }
   144  
   145  // Close will close the current reader, and if the underlying reader being used
   146  // is an `io.Closer` then Close will be called on it too.
   147  func (f *Reader) Close() error {
   148  	if r, ok := f.r.(io.Closer); ok {
   149  		return r.Close()
   150  	}
   151  	return nil
   152  }
   153  
   154  // MetaData returns the underlying FileMetadata object
   155  func (f *Reader) MetaData() *metadata.FileMetaData { return f.metadata }
   156  
   157  // parseMetaData handles parsing the metadata from the opened file.
   158  func (f *Reader) parseMetaData() error {
   159  	if f.footerOffset <= int64(footerSize) {
   160  		return fmt.Errorf("parquet: file too small (size=%d)", f.footerOffset)
   161  	}
   162  
   163  	buf := make([]byte, footerSize)
   164  	// backup 8 bytes to read the footer size (first four bytes) and the magic bytes (last 4 bytes)
   165  	n, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize))
   166  	if err != nil && err != io.EOF {
   167  		return fmt.Errorf("parquet: could not read footer: %w", err)
   168  	}
   169  	if n != len(buf) {
   170  		return fmt.Errorf("parquet: could not read %d bytes from end of file", len(buf))
   171  	}
   172  
   173  	size := int64(binary.LittleEndian.Uint32(buf[:4]))
   174  	if size < 0 || size+int64(footerSize) > f.footerOffset {
   175  		return errInconsistentFileMetadata
   176  	}
   177  
   178  	fileDecryptProps := f.props.FileDecryptProps
   179  
   180  	switch {
   181  	case bytes.Equal(buf[4:], magicBytes): // non-encrypted metadata
   182  		buf = make([]byte, size)
   183  		if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil {
   184  			return fmt.Errorf("parquet: could not read footer: %w", err)
   185  		}
   186  
   187  		f.metadata, err = metadata.NewFileMetaData(buf, nil)
   188  		if err != nil {
   189  			return fmt.Errorf("parquet: could not read footer: %w", err)
   190  		}
   191  
   192  		if !f.metadata.IsSetEncryptionAlgorithm() {
   193  			if fileDecryptProps != nil && !fileDecryptProps.PlaintextFilesAllowed() {
   194  				return fmt.Errorf("parquet: applying decryption properties on plaintext file")
   195  			}
   196  		} else {
   197  			if err := f.parseMetaDataEncryptedFilePlaintextFooter(fileDecryptProps, buf); err != nil {
   198  				return err
   199  			}
   200  		}
   201  	case bytes.Equal(buf[4:], magicEBytes): // encrypted metadata
   202  		buf = make([]byte, size)
   203  		if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil {
   204  			return fmt.Errorf("parquet: could not read footer: %w", err)
   205  		}
   206  
   207  		if fileDecryptProps == nil {
   208  			return xerrors.New("could not read encrypted metadata, no decryption found in reader's properties")
   209  		}
   210  
   211  		fileCryptoMetadata, err := metadata.NewFileCryptoMetaData(buf)
   212  		if err != nil {
   213  			return err
   214  		}
   215  		algo := fileCryptoMetadata.EncryptionAlgorithm()
   216  		fileAad, err := f.handleAadPrefix(fileDecryptProps, &algo)
   217  		if err != nil {
   218  			return err
   219  		}
   220  		f.fileDecryptor = encryption.NewFileDecryptor(fileDecryptProps, fileAad, algo.Algo, string(fileCryptoMetadata.KeyMetadata()), f.props.Allocator())
   221  
   222  		f.metadata, err = metadata.NewFileMetaData(buf[fileCryptoMetadata.Len():], f.fileDecryptor)
   223  		if err != nil {
   224  			return fmt.Errorf("parquet: could not read footer: %w", err)
   225  		}
   226  	default:
   227  		return fmt.Errorf("parquet: magic bytes not found in footer. Either the file is corrupted or this isn't a parquet file")
   228  	}
   229  
   230  	return nil
   231  }
   232  
   233  func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, algo *parquet.Algorithm) (string, error) {
   234  	aadPrefixInProps := fileDecrypt.AadPrefix()
   235  	aadPrefix := []byte(aadPrefixInProps)
   236  	fileHasAadPrefix := algo.Aad.AadPrefix != nil && len(algo.Aad.AadPrefix) > 0
   237  	aadPrefixInFile := algo.Aad.AadPrefix
   238  
   239  	if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" {
   240  		return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not suppliedin decryption props")
   241  	}
   242  
   243  	if fileHasAadPrefix {
   244  		if aadPrefixInProps != "" {
   245  			if aadPrefixInProps != string(aadPrefixInFile) {
   246  				return "", xerrors.New("AAD prefix in file and in properties but not the same")
   247  			}
   248  		}
   249  		aadPrefix = aadPrefixInFile
   250  		if fileDecrypt.Verifier != nil {
   251  			fileDecrypt.Verifier.Verify(string(aadPrefix))
   252  		}
   253  	} else {
   254  		if !algo.Aad.SupplyAadPrefix && aadPrefixInProps != "" {
   255  			return "", xerrors.New("AAD Prefix set in decryptionproperties but was not used for file encryption")
   256  		}
   257  		if fileDecrypt.Verifier != nil {
   258  			return "", xerrors.New("AAD Prefix Verifier is set but AAD Prefix not found in file")
   259  		}
   260  	}
   261  	return string(append(aadPrefix, algo.Aad.AadFileUnique...)), nil
   262  }
   263  
   264  func (f *Reader) parseMetaDataEncryptedFilePlaintextFooter(decryptProps *parquet.FileDecryptionProperties, data []byte) error {
   265  	if decryptProps != nil {
   266  		algo := f.metadata.EncryptionAlgorithm()
   267  		fileAad, err := f.handleAadPrefix(decryptProps, &algo)
   268  		if err != nil {
   269  			return err
   270  		}
   271  		f.fileDecryptor = encryption.NewFileDecryptor(decryptProps, fileAad, algo.Algo, string(f.metadata.GetFooterSigningKeyMetadata()), f.props.Allocator())
   272  		// set the InternalFileDecryptor in the metadata as well, as it's used
   273  		// for signature verification and for ColumnChunkMetaData creation.
   274  		f.metadata.FileDecryptor = f.fileDecryptor
   275  		if decryptProps.PlaintextFooterIntegrity() {
   276  			if len(data)-f.metadata.Size() != encryption.GcmTagLength+encryption.NonceLength {
   277  				return xerrors.New("failed reading metadata for encryption signature")
   278  			}
   279  
   280  			if !f.metadata.VerifySignature(data[f.metadata.Size():]) {
   281  				return xerrors.New("parquet crypto signature verification failed")
   282  			}
   283  		}
   284  	}
   285  	return nil
   286  }
   287  
   288  // WriterVersion returns the Application Version that was written in the file
   289  // metadata
   290  func (f *Reader) WriterVersion() *metadata.AppVersion {
   291  	return f.metadata.WriterVersion()
   292  }
   293  
   294  // NumRows returns the total number of rows in this parquet file.
   295  func (f *Reader) NumRows() int64 {
   296  	return f.metadata.GetNumRows()
   297  }
   298  
   299  // NumRowGroups returns the total number of row groups in this file.
   300  func (f *Reader) NumRowGroups() int {
   301  	return len(f.metadata.GetRowGroups())
   302  }
   303  
   304  // RowGroup returns a reader for the desired (0-based) row group
   305  func (f *Reader) RowGroup(i int) *RowGroupReader {
   306  	rg := f.metadata.RowGroups[i]
   307  
   308  	return &RowGroupReader{
   309  		fileMetadata:  f.metadata,
   310  		rgMetadata:    metadata.NewRowGroupMetaData(rg, f.metadata.Schema, f.WriterVersion(), f.fileDecryptor),
   311  		props:         f.props,
   312  		r:             f.r,
   313  		sourceSz:      f.footerOffset,
   314  		fileDecryptor: f.fileDecryptor,
   315  		bufferPool:    &f.bufferPool,
   316  	}
   317  }