github.com/apache/arrow/go/v14@v14.0.1/parquet/file/file_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "fmt" 23 "io" 24 "os" 25 "runtime" 26 "sync" 27 28 "github.com/apache/arrow/go/v14/arrow/memory" 29 "github.com/apache/arrow/go/v14/parquet" 30 "github.com/apache/arrow/go/v14/parquet/internal/encryption" 31 "github.com/apache/arrow/go/v14/parquet/metadata" 32 "golang.org/x/xerrors" 33 ) 34 35 const ( 36 footerSize uint32 = 8 37 ) 38 39 var ( 40 magicBytes = []byte("PAR1") 41 magicEBytes = []byte("PARE") 42 errInconsistentFileMetadata = xerrors.New("parquet: file is smaller than indicated metadata size") 43 ) 44 45 // Reader is the main interface for reading a parquet file 46 type Reader struct { 47 r parquet.ReaderAtSeeker 48 props *parquet.ReaderProperties 49 metadata *metadata.FileMetaData 50 footerOffset int64 51 fileDecryptor encryption.FileDecryptor 52 53 bufferPool sync.Pool 54 } 55 56 type ReadOption func(*Reader) 57 58 // WithReadProps specifies a specific reader properties instance to use, rather 59 // than using the default ReaderProperties. 60 func WithReadProps(props *parquet.ReaderProperties) ReadOption { 61 return func(r *Reader) { 62 r.props = props 63 } 64 } 65 66 // WithMetadata allows providing a specific FileMetaData object rather than reading 67 // the file metadata from the file itself. 68 func WithMetadata(m *metadata.FileMetaData) ReadOption { 69 return func(r *Reader) { 70 r.metadata = m 71 } 72 } 73 74 // OpenParquetFile will return a Reader for the given parquet file on the local file system. 75 // 76 // Optionally the file can be memory mapped for faster reading. If no read properties are provided 77 // then the default ReaderProperties will be used. The WithMetadata option can be used to provide 78 // a FileMetaData object rather than reading the file metadata from the file. 79 func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error) { 80 var source parquet.ReaderAtSeeker 81 82 var err error 83 if memoryMap { 84 source, err = mmapOpen(filename) 85 if err != nil { 86 return nil, err 87 } 88 } else { 89 source, err = os.Open(filename) 90 if err != nil { 91 return nil, err 92 } 93 } 94 return NewParquetReader(source, opts...) 95 } 96 97 // NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r. 98 // This reader needs to support Read, ReadAt and Seeking. 99 // 100 // If no read properties are provided then the default ReaderProperties will be used. The WithMetadata 101 // option can be used to provide a FileMetaData object rather than reading the file metadata from the file. 102 func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error) { 103 var err error 104 f := &Reader{r: r} 105 for _, o := range opts { 106 o(f) 107 } 108 109 if f.footerOffset <= 0 { 110 f.footerOffset, err = r.Seek(0, io.SeekEnd) 111 if err != nil { 112 return nil, fmt.Errorf("parquet: could not retrieve footer offset: %w", err) 113 } 114 } 115 116 if f.props == nil { 117 f.props = parquet.NewReaderProperties(memory.NewGoAllocator()) 118 } 119 120 f.bufferPool = sync.Pool{ 121 New: func() interface{} { 122 buf := memory.NewResizableBuffer(f.props.Allocator()) 123 runtime.SetFinalizer(buf, func(obj *memory.Buffer) { 124 obj.Release() 125 }) 126 return buf 127 }, 128 } 129 130 if f.metadata == nil { 131 return f, f.parseMetaData() 132 } 133 134 return f, nil 135 } 136 137 // BufferPool returns the internal buffer pool being utilized by this reader. 138 // This is primarily for use by the pqarrow.FileReader or anything that builds 139 // on top of the Reader and constructs their own ColumnReaders (like the 140 // RecordReader) 141 func (f *Reader) BufferPool() *sync.Pool { 142 return &f.bufferPool 143 } 144 145 // Close will close the current reader, and if the underlying reader being used 146 // is an `io.Closer` then Close will be called on it too. 147 func (f *Reader) Close() error { 148 if r, ok := f.r.(io.Closer); ok { 149 return r.Close() 150 } 151 return nil 152 } 153 154 // MetaData returns the underlying FileMetadata object 155 func (f *Reader) MetaData() *metadata.FileMetaData { return f.metadata } 156 157 // parseMetaData handles parsing the metadata from the opened file. 158 func (f *Reader) parseMetaData() error { 159 if f.footerOffset <= int64(footerSize) { 160 return fmt.Errorf("parquet: file too small (size=%d)", f.footerOffset) 161 } 162 163 buf := make([]byte, footerSize) 164 // backup 8 bytes to read the footer size (first four bytes) and the magic bytes (last 4 bytes) 165 n, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)) 166 if err != nil && err != io.EOF { 167 return fmt.Errorf("parquet: could not read footer: %w", err) 168 } 169 if n != len(buf) { 170 return fmt.Errorf("parquet: could not read %d bytes from end of file", len(buf)) 171 } 172 173 size := int64(binary.LittleEndian.Uint32(buf[:4])) 174 if size < 0 || size+int64(footerSize) > f.footerOffset { 175 return errInconsistentFileMetadata 176 } 177 178 fileDecryptProps := f.props.FileDecryptProps 179 180 switch { 181 case bytes.Equal(buf[4:], magicBytes): // non-encrypted metadata 182 buf = make([]byte, size) 183 if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil { 184 return fmt.Errorf("parquet: could not read footer: %w", err) 185 } 186 187 f.metadata, err = metadata.NewFileMetaData(buf, nil) 188 if err != nil { 189 return fmt.Errorf("parquet: could not read footer: %w", err) 190 } 191 192 if !f.metadata.IsSetEncryptionAlgorithm() { 193 if fileDecryptProps != nil && !fileDecryptProps.PlaintextFilesAllowed() { 194 return fmt.Errorf("parquet: applying decryption properties on plaintext file") 195 } 196 } else { 197 if err := f.parseMetaDataEncryptedFilePlaintextFooter(fileDecryptProps, buf); err != nil { 198 return err 199 } 200 } 201 case bytes.Equal(buf[4:], magicEBytes): // encrypted metadata 202 buf = make([]byte, size) 203 if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil { 204 return fmt.Errorf("parquet: could not read footer: %w", err) 205 } 206 207 if fileDecryptProps == nil { 208 return xerrors.New("could not read encrypted metadata, no decryption found in reader's properties") 209 } 210 211 fileCryptoMetadata, err := metadata.NewFileCryptoMetaData(buf) 212 if err != nil { 213 return err 214 } 215 algo := fileCryptoMetadata.EncryptionAlgorithm() 216 fileAad, err := f.handleAadPrefix(fileDecryptProps, &algo) 217 if err != nil { 218 return err 219 } 220 f.fileDecryptor = encryption.NewFileDecryptor(fileDecryptProps, fileAad, algo.Algo, string(fileCryptoMetadata.KeyMetadata()), f.props.Allocator()) 221 222 f.metadata, err = metadata.NewFileMetaData(buf[fileCryptoMetadata.Len():], f.fileDecryptor) 223 if err != nil { 224 return fmt.Errorf("parquet: could not read footer: %w", err) 225 } 226 default: 227 return fmt.Errorf("parquet: magic bytes not found in footer. Either the file is corrupted or this isn't a parquet file") 228 } 229 230 return nil 231 } 232 233 func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, algo *parquet.Algorithm) (string, error) { 234 aadPrefixInProps := fileDecrypt.AadPrefix() 235 aadPrefix := []byte(aadPrefixInProps) 236 fileHasAadPrefix := algo.Aad.AadPrefix != nil && len(algo.Aad.AadPrefix) > 0 237 aadPrefixInFile := algo.Aad.AadPrefix 238 239 if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" { 240 return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not suppliedin decryption props") 241 } 242 243 if fileHasAadPrefix { 244 if aadPrefixInProps != "" { 245 if aadPrefixInProps != string(aadPrefixInFile) { 246 return "", xerrors.New("AAD prefix in file and in properties but not the same") 247 } 248 } 249 aadPrefix = aadPrefixInFile 250 if fileDecrypt.Verifier != nil { 251 fileDecrypt.Verifier.Verify(string(aadPrefix)) 252 } 253 } else { 254 if !algo.Aad.SupplyAadPrefix && aadPrefixInProps != "" { 255 return "", xerrors.New("AAD Prefix set in decryptionproperties but was not used for file encryption") 256 } 257 if fileDecrypt.Verifier != nil { 258 return "", xerrors.New("AAD Prefix Verifier is set but AAD Prefix not found in file") 259 } 260 } 261 return string(append(aadPrefix, algo.Aad.AadFileUnique...)), nil 262 } 263 264 func (f *Reader) parseMetaDataEncryptedFilePlaintextFooter(decryptProps *parquet.FileDecryptionProperties, data []byte) error { 265 if decryptProps != nil { 266 algo := f.metadata.EncryptionAlgorithm() 267 fileAad, err := f.handleAadPrefix(decryptProps, &algo) 268 if err != nil { 269 return err 270 } 271 f.fileDecryptor = encryption.NewFileDecryptor(decryptProps, fileAad, algo.Algo, string(f.metadata.GetFooterSigningKeyMetadata()), f.props.Allocator()) 272 // set the InternalFileDecryptor in the metadata as well, as it's used 273 // for signature verification and for ColumnChunkMetaData creation. 274 f.metadata.FileDecryptor = f.fileDecryptor 275 if decryptProps.PlaintextFooterIntegrity() { 276 if len(data)-f.metadata.Size() != encryption.GcmTagLength+encryption.NonceLength { 277 return xerrors.New("failed reading metadata for encryption signature") 278 } 279 280 if !f.metadata.VerifySignature(data[f.metadata.Size():]) { 281 return xerrors.New("parquet crypto signature verification failed") 282 } 283 } 284 } 285 return nil 286 } 287 288 // WriterVersion returns the Application Version that was written in the file 289 // metadata 290 func (f *Reader) WriterVersion() *metadata.AppVersion { 291 return f.metadata.WriterVersion() 292 } 293 294 // NumRows returns the total number of rows in this parquet file. 295 func (f *Reader) NumRows() int64 { 296 return f.metadata.GetNumRows() 297 } 298 299 // NumRowGroups returns the total number of row groups in this file. 300 func (f *Reader) NumRowGroups() int { 301 return len(f.metadata.GetRowGroups()) 302 } 303 304 // RowGroup returns a reader for the desired (0-based) row group 305 func (f *Reader) RowGroup(i int) *RowGroupReader { 306 rg := f.metadata.RowGroups[i] 307 308 return &RowGroupReader{ 309 fileMetadata: f.metadata, 310 rgMetadata: metadata.NewRowGroupMetaData(rg, f.metadata.Schema, f.WriterVersion(), f.fileDecryptor), 311 props: f.props, 312 r: f.r, 313 sourceSz: f.footerOffset, 314 fileDecryptor: f.fileDecryptor, 315 bufferPool: &f.bufferPool, 316 } 317 }