github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"io"
    23  	"os"
    24  
    25  	"github.com/apache/arrow/go/v7/arrow/memory"
    26  	"github.com/apache/arrow/go/v7/parquet"
    27  	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
    28  	"github.com/apache/arrow/go/v7/parquet/metadata"
    29  	"golang.org/x/exp/mmap"
    30  	"golang.org/x/xerrors"
    31  )
    32  
const (
	// footerSize is the length in bytes of the fixed trailer at the very end
	// of the file: a 4-byte little-endian metadata length followed by the
	// 4 magic bytes (see parseMetaData).
	footerSize uint32 = 8
)

var (
	// magicBytes is the trailing magic of a file whose footer metadata is
	// stored in plaintext.
	magicBytes                  = []byte("PAR1")
	// magicEBytes is the trailing magic of a file whose footer metadata is
	// encrypted.
	magicEBytes                 = []byte("PARE")
	// errInconsistentFileMetadata is returned by parseMetaData when the
	// metadata length recorded in the trailer does not fit inside the file.
	errInconsistentFileMetadata = xerrors.New("parquet: file is smaller than indicated metadata size")
)
    42  
// Reader is the main interface for reading a parquet file
type Reader struct {
	// r is the underlying data source; Close closes it when it also
	// implements io.Closer.
	r             parquet.ReaderAtSeeker
	// props holds the reader properties; NewParquetReader installs a default
	// instance when no WithReadProps option was supplied.
	props         *parquet.ReaderProperties
	// metadata is either supplied through WithMetadata or parsed from the
	// file footer by parseMetaData.
	metadata      *metadata.FileMetaData
	// footerOffset is the offset of the end of the source, obtained in
	// NewParquetReader via Seek(0, io.SeekEnd); the footer is located
	// relative to it.
	footerOffset  int64
	// fileDecryptor is populated while parsing the footer when decryption
	// properties are in use; it stays nil otherwise.
	fileDecryptor encryption.FileDecryptor
}
    51  
// mmapAdapter is an adapter for mmap'd files: it layers the sequential
// Read and Seek methods defined below on top of the random-access
// *mmap.ReaderAt by tracking a current position.
type mmapAdapter struct {
	*mmap.ReaderAt

	// pos is the offset used by the next Read call; it is advanced by Read
	// and repositioned by Seek.
	pos int64
}
    58  
    59  func (m *mmapAdapter) Close() error {
    60  	return m.ReaderAt.Close()
    61  }
    62  
    63  func (m *mmapAdapter) ReadAt(p []byte, off int64) (int, error) {
    64  	return m.ReaderAt.ReadAt(p, off)
    65  }
    66  
    67  func (m *mmapAdapter) Read(p []byte) (n int, err error) {
    68  	n, err = m.ReaderAt.ReadAt(p, m.pos)
    69  	m.pos += int64(n)
    70  	return
    71  }
    72  
    73  func (m *mmapAdapter) Seek(offset int64, whence int) (int64, error) {
    74  	newPos, offs := int64(0), offset
    75  	switch whence {
    76  	case io.SeekStart:
    77  		newPos = offs
    78  	case io.SeekCurrent:
    79  		newPos = m.pos + offs
    80  	case io.SeekEnd:
    81  		newPos = int64(m.ReaderAt.Len()) + offs
    82  	}
    83  	if newPos < 0 {
    84  		return 0, xerrors.New("negative result pos")
    85  	}
    86  	if newPos > int64(m.ReaderAt.Len()) {
    87  		return 0, xerrors.New("new position exceeds size of file")
    88  	}
    89  	m.pos = newPos
    90  	return newPos, nil
    91  }
    92  
// ReadOption is a functional option that configures a Reader. Options are
// applied by NewParquetReader, in the order given, before the footer offset
// is determined and before any file metadata is parsed.
type ReadOption func(*Reader)
    94  
    95  // WithReadProps specifies a specific reader properties instance to use, rather
    96  // than using the default ReaderProperties.
    97  func WithReadProps(props *parquet.ReaderProperties) ReadOption {
    98  	return func(r *Reader) {
    99  		r.props = props
   100  	}
   101  }
   102  
   103  // WithMetadata allows providing a specific FileMetaData object rather than reading
   104  // the file metadata from the file itself.
   105  func WithMetadata(m *metadata.FileMetaData) ReadOption {
   106  	return func(r *Reader) {
   107  		r.metadata = m
   108  	}
   109  }
   110  
   111  // OpenParquetFile will return a Reader for the given parquet file on the local file system.
   112  //
   113  // Optionally the file can be memory mapped for faster reading. If no read properties are provided
   114  // then the default ReaderProperties will be used. The WithMetadata option can be used to provide
   115  // a FileMetaData object rather than reading the file metadata from the file.
   116  func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error) {
   117  	var source parquet.ReaderAtSeeker
   118  
   119  	var err error
   120  	if memoryMap {
   121  		rdr, err := mmap.Open(filename)
   122  		if err != nil {
   123  			return nil, err
   124  		}
   125  		source = &mmapAdapter{rdr, 0}
   126  	} else {
   127  		source, err = os.Open(filename)
   128  		if err != nil {
   129  			return nil, err
   130  		}
   131  	}
   132  	return NewParquetReader(source, opts...)
   133  }
   134  
   135  // NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r.
   136  // This reader needs to support Read, ReadAt and Seeking.
   137  //
   138  // If no read properties are provided then the default ReaderProperties will be used. The WithMetadata
   139  // option can be used to provide a FileMetaData object rather than reading the file metadata from the file.
   140  func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error) {
   141  	var err error
   142  	f := &Reader{r: r}
   143  	for _, o := range opts {
   144  		o(f)
   145  	}
   146  
   147  	if f.footerOffset <= 0 {
   148  		f.footerOffset, err = r.Seek(0, io.SeekEnd)
   149  		if err != nil {
   150  			return nil, xerrors.Errorf("parquet: could not retrieve footer offset: %w", err)
   151  		}
   152  	}
   153  
   154  	if f.props == nil {
   155  		f.props = parquet.NewReaderProperties(memory.NewGoAllocator())
   156  	}
   157  
   158  	if f.metadata == nil {
   159  		return f, f.parseMetaData()
   160  	}
   161  
   162  	return f, nil
   163  }
   164  
   165  // Close will close the current reader, and if the underlying reader being used
   166  // is an `io.Closer` then Close will be called on it too.
   167  func (f *Reader) Close() error {
   168  	if r, ok := f.r.(io.Closer); ok {
   169  		return r.Close()
   170  	}
   171  	return nil
   172  }
   173  
   174  // MetaData returns the underlying FileMetadata object
   175  func (f *Reader) MetaData() *metadata.FileMetaData { return f.metadata }
   176  
   177  // parseMetaData handles parsing the metadata from the opened file.
   178  func (f *Reader) parseMetaData() error {
   179  	if f.footerOffset <= int64(footerSize) {
   180  		return xerrors.Errorf("parquet: file too small (size=%d)", f.footerOffset)
   181  	}
   182  
   183  	buf := make([]byte, footerSize)
   184  	// backup 8 bytes to read the footer size (first four bytes) and the magic bytes (last 4 bytes)
   185  	n, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize))
   186  	if err != nil {
   187  		return xerrors.Errorf("parquet: could not read footer: %w", err)
   188  	}
   189  	if n != len(buf) {
   190  		return xerrors.Errorf("parquet: could not read %d bytes from end of file", len(buf))
   191  	}
   192  
   193  	size := int64(binary.LittleEndian.Uint32(buf[:4]))
   194  	if size < 0 || size+int64(footerSize) > f.footerOffset {
   195  		return errInconsistentFileMetadata
   196  	}
   197  
   198  	fileDecryptProps := f.props.FileDecryptProps
   199  
   200  	switch {
   201  	case bytes.Equal(buf[4:], magicBytes): // non-encrypted metadata
   202  		buf = make([]byte, size)
   203  		if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil {
   204  			return xerrors.Errorf("parquet: could not read footer: %w", err)
   205  		}
   206  
   207  		f.metadata, err = metadata.NewFileMetaData(buf, nil)
   208  		if err != nil {
   209  			return xerrors.Errorf("parquet: could not read footer: %w", err)
   210  		}
   211  
   212  		if !f.metadata.IsSetEncryptionAlgorithm() {
   213  			if fileDecryptProps != nil && !fileDecryptProps.PlaintextFilesAllowed() {
   214  				return xerrors.Errorf("parquet: applying decryption properties on plaintext file")
   215  			}
   216  		} else {
   217  			if err := f.parseMetaDataEncryptedFilePlaintextFooter(fileDecryptProps, buf); err != nil {
   218  				return err
   219  			}
   220  		}
   221  	case bytes.Equal(buf[4:], magicEBytes): // encrypted metadata
   222  		buf = make([]byte, size)
   223  		if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil {
   224  			return xerrors.Errorf("parquet: could not read footer: %w", err)
   225  		}
   226  
   227  		if fileDecryptProps == nil {
   228  			return xerrors.New("could not read encrypted metadata, no decryption found in reader's properties")
   229  		}
   230  
   231  		fileCryptoMetadata, err := metadata.NewFileCryptoMetaData(buf)
   232  		if err != nil {
   233  			return err
   234  		}
   235  		algo := fileCryptoMetadata.EncryptionAlgorithm()
   236  		fileAad, err := f.handleAadPrefix(fileDecryptProps, &algo)
   237  		if err != nil {
   238  			return err
   239  		}
   240  		f.fileDecryptor = encryption.NewFileDecryptor(fileDecryptProps, fileAad, algo.Algo, string(fileCryptoMetadata.KeyMetadata()), f.props.Allocator())
   241  
   242  		f.metadata, err = metadata.NewFileMetaData(buf[fileCryptoMetadata.Len():], f.fileDecryptor)
   243  		if err != nil {
   244  			return xerrors.Errorf("parquet: could not read footer: %w", err)
   245  		}
   246  	default:
   247  		return xerrors.Errorf("parquet: magic bytes not found in footer. Either the file is corrupted or this isn't a parquet file")
   248  	}
   249  
   250  	return nil
   251  }
   252  
   253  func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, algo *parquet.Algorithm) (string, error) {
   254  	aadPrefixInProps := fileDecrypt.AadPrefix()
   255  	aadPrefix := []byte(aadPrefixInProps)
   256  	fileHasAadPrefix := algo.Aad.AadPrefix != nil && len(algo.Aad.AadPrefix) > 0
   257  	aadPrefixInFile := algo.Aad.AadPrefix
   258  
   259  	if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" {
   260  		return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not suppliedin decryption props")
   261  	}
   262  
   263  	if fileHasAadPrefix {
   264  		if aadPrefixInProps != "" {
   265  			if aadPrefixInProps != string(aadPrefixInFile) {
   266  				return "", xerrors.New("AAD prefix in file and in properties but not the same")
   267  			}
   268  		}
   269  		aadPrefix = aadPrefixInFile
   270  		if fileDecrypt.Verifier != nil {
   271  			fileDecrypt.Verifier.Verify(string(aadPrefix))
   272  		}
   273  	} else {
   274  		if !algo.Aad.SupplyAadPrefix && aadPrefixInProps != "" {
   275  			return "", xerrors.New("AAD Prefix set in decryptionproperties but was not used for file encryption")
   276  		}
   277  		if fileDecrypt.Verifier != nil {
   278  			return "", xerrors.New("AAD Prefix Verifier is set but AAD Prefix not found in file")
   279  		}
   280  	}
   281  	return string(append(aadPrefix, algo.Aad.AadFileUnique...)), nil
   282  }
   283  
   284  func (f *Reader) parseMetaDataEncryptedFilePlaintextFooter(decryptProps *parquet.FileDecryptionProperties, data []byte) error {
   285  	if decryptProps != nil {
   286  		algo := f.metadata.EncryptionAlgorithm()
   287  		fileAad, err := f.handleAadPrefix(decryptProps, &algo)
   288  		if err != nil {
   289  			return err
   290  		}
   291  		f.fileDecryptor = encryption.NewFileDecryptor(decryptProps, fileAad, algo.Algo, string(f.metadata.GetFooterSigningKeyMetadata()), f.props.Allocator())
   292  		// set the InternalFileDecryptor in the metadata as well, as it's used
   293  		// for signature verification and for ColumnChunkMetaData creation.
   294  		f.metadata.FileDecryptor = f.fileDecryptor
   295  		if decryptProps.PlaintextFooterIntegrity() {
   296  			if len(data)-f.metadata.Size() != encryption.GcmTagLength+encryption.NonceLength {
   297  				return xerrors.New("failed reading metadata for encryption signature")
   298  			}
   299  
   300  			if !f.metadata.VerifySignature(data[f.metadata.Size():]) {
   301  				return xerrors.New("parquet crypto signature verification failed")
   302  			}
   303  		}
   304  	}
   305  	return nil
   306  }
   307  
   308  // WriterVersion returns the Application Version that was written in the file
   309  // metadata
   310  func (f *Reader) WriterVersion() *metadata.AppVersion {
   311  	return f.metadata.WriterVersion()
   312  }
   313  
   314  // NumRows returns the total number of rows in this parquet file.
   315  func (f *Reader) NumRows() int64 {
   316  	return f.metadata.GetNumRows()
   317  }
   318  
   319  // NumRowGroups returns the total number of row groups in this file.
   320  func (f *Reader) NumRowGroups() int {
   321  	return len(f.metadata.GetRowGroups())
   322  }
   323  
   324  // RowGroup returns a reader for the desired (0-based) row group
   325  func (f *Reader) RowGroup(i int) *RowGroupReader {
   326  	rg := f.metadata.RowGroups[i]
   327  
   328  	return &RowGroupReader{
   329  		fileMetadata:  f.metadata,
   330  		rgMetadata:    metadata.NewRowGroupMetaData(rg, f.metadata.Schema, f.WriterVersion(), f.fileDecryptor),
   331  		props:         f.props,
   332  		r:             f.r,
   333  		sourceSz:      f.footerOffset,
   334  		fileDecryptor: f.fileDecryptor,
   335  	}
   336  }