github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "io" 23 "os" 24 25 "github.com/apache/arrow/go/v7/arrow/memory" 26 "github.com/apache/arrow/go/v7/parquet" 27 "github.com/apache/arrow/go/v7/parquet/internal/encryption" 28 "github.com/apache/arrow/go/v7/parquet/metadata" 29 "golang.org/x/exp/mmap" 30 "golang.org/x/xerrors" 31 ) 32 33 const ( 34 footerSize uint32 = 8 35 ) 36 37 var ( 38 magicBytes = []byte("PAR1") 39 magicEBytes = []byte("PARE") 40 errInconsistentFileMetadata = xerrors.New("parquet: file is smaller than indicated metadata size") 41 ) 42 43 // Reader is the main interface for reading a parquet file 44 type Reader struct { 45 r parquet.ReaderAtSeeker 46 props *parquet.ReaderProperties 47 metadata *metadata.FileMetaData 48 footerOffset int64 49 fileDecryptor encryption.FileDecryptor 50 } 51 52 // an adapter for mmap'd files 53 type mmapAdapter struct { 54 *mmap.ReaderAt 55 56 pos int64 57 } 58 59 func (m *mmapAdapter) Close() error { 60 return m.ReaderAt.Close() 61 } 62 63 func (m *mmapAdapter) ReadAt(p []byte, off int64) (int, error) { 64 return m.ReaderAt.ReadAt(p, off) 65 } 66 67 func (m *mmapAdapter) Read(p []byte) (n int, err error) { 68 n, err = m.ReaderAt.ReadAt(p, m.pos) 69 m.pos += int64(n) 70 return 71 } 72 73 func (m *mmapAdapter) Seek(offset int64, whence int) (int64, error) { 74 newPos, offs := int64(0), offset 75 switch whence { 76 case io.SeekStart: 77 newPos = offs 78 case io.SeekCurrent: 79 newPos = m.pos + offs 80 case io.SeekEnd: 81 newPos = int64(m.ReaderAt.Len()) + offs 82 } 83 if newPos < 0 { 84 return 0, xerrors.New("negative result pos") 85 } 86 if newPos > int64(m.ReaderAt.Len()) { 87 return 0, xerrors.New("new position exceeds size of file") 88 } 89 m.pos = newPos 90 return newPos, nil 91 } 92 93 type ReadOption func(*Reader) 94 95 // WithReadProps specifies a specific reader properties instance to use, rather 96 // than using the default ReaderProperties. 97 func WithReadProps(props *parquet.ReaderProperties) ReadOption { 98 return func(r *Reader) { 99 r.props = props 100 } 101 } 102 103 // WithMetadata allows providing a specific FileMetaData object rather than reading 104 // the file metadata from the file itself. 105 func WithMetadata(m *metadata.FileMetaData) ReadOption { 106 return func(r *Reader) { 107 r.metadata = m 108 } 109 } 110 111 // OpenParquetFile will return a Reader for the given parquet file on the local file system. 112 // 113 // Optionally the file can be memory mapped for faster reading. If no read properties are provided 114 // then the default ReaderProperties will be used. The WithMetadata option can be used to provide 115 // a FileMetaData object rather than reading the file metadata from the file. 116 func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error) { 117 var source parquet.ReaderAtSeeker 118 119 var err error 120 if memoryMap { 121 rdr, err := mmap.Open(filename) 122 if err != nil { 123 return nil, err 124 } 125 source = &mmapAdapter{rdr, 0} 126 } else { 127 source, err = os.Open(filename) 128 if err != nil { 129 return nil, err 130 } 131 } 132 return NewParquetReader(source, opts...) 133 } 134 135 // NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r. 136 // This reader needs to support Read, ReadAt and Seeking. 137 // 138 // If no read properties are provided then the default ReaderProperties will be used. The WithMetadata 139 // option can be used to provide a FileMetaData object rather than reading the file metadata from the file. 140 func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error) { 141 var err error 142 f := &Reader{r: r} 143 for _, o := range opts { 144 o(f) 145 } 146 147 if f.footerOffset <= 0 { 148 f.footerOffset, err = r.Seek(0, io.SeekEnd) 149 if err != nil { 150 return nil, xerrors.Errorf("parquet: could not retrieve footer offset: %w", err) 151 } 152 } 153 154 if f.props == nil { 155 f.props = parquet.NewReaderProperties(memory.NewGoAllocator()) 156 } 157 158 if f.metadata == nil { 159 return f, f.parseMetaData() 160 } 161 162 return f, nil 163 } 164 165 // Close will close the current reader, and if the underlying reader being used 166 // is an `io.Closer` then Close will be called on it too. 167 func (f *Reader) Close() error { 168 if r, ok := f.r.(io.Closer); ok { 169 return r.Close() 170 } 171 return nil 172 } 173 174 // MetaData returns the underlying FileMetadata object 175 func (f *Reader) MetaData() *metadata.FileMetaData { return f.metadata } 176 177 // parseMetaData handles parsing the metadata from the opened file. 178 func (f *Reader) parseMetaData() error { 179 if f.footerOffset <= int64(footerSize) { 180 return xerrors.Errorf("parquet: file too small (size=%d)", f.footerOffset) 181 } 182 183 buf := make([]byte, footerSize) 184 // backup 8 bytes to read the footer size (first four bytes) and the magic bytes (last 4 bytes) 185 n, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)) 186 if err != nil { 187 return xerrors.Errorf("parquet: could not read footer: %w", err) 188 } 189 if n != len(buf) { 190 return xerrors.Errorf("parquet: could not read %d bytes from end of file", len(buf)) 191 } 192 193 size := int64(binary.LittleEndian.Uint32(buf[:4])) 194 if size < 0 || size+int64(footerSize) > f.footerOffset { 195 return errInconsistentFileMetadata 196 } 197 198 fileDecryptProps := f.props.FileDecryptProps 199 200 switch { 201 case bytes.Equal(buf[4:], magicBytes): // non-encrypted metadata 202 buf = make([]byte, size) 203 if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil { 204 return xerrors.Errorf("parquet: could not read footer: %w", err) 205 } 206 207 f.metadata, err = metadata.NewFileMetaData(buf, nil) 208 if err != nil { 209 return xerrors.Errorf("parquet: could not read footer: %w", err) 210 } 211 212 if !f.metadata.IsSetEncryptionAlgorithm() { 213 if fileDecryptProps != nil && !fileDecryptProps.PlaintextFilesAllowed() { 214 return xerrors.Errorf("parquet: applying decryption properties on plaintext file") 215 } 216 } else { 217 if err := f.parseMetaDataEncryptedFilePlaintextFooter(fileDecryptProps, buf); err != nil { 218 return err 219 } 220 } 221 case bytes.Equal(buf[4:], magicEBytes): // encrypted metadata 222 buf = make([]byte, size) 223 if _, err := f.r.ReadAt(buf, f.footerOffset-int64(footerSize)-size); err != nil { 224 return xerrors.Errorf("parquet: could not read footer: %w", err) 225 } 226 227 if fileDecryptProps == nil { 228 return xerrors.New("could not read encrypted metadata, no decryption found in reader's properties") 229 } 230 231 fileCryptoMetadata, err := metadata.NewFileCryptoMetaData(buf) 232 if err != nil { 233 return err 234 } 235 algo := fileCryptoMetadata.EncryptionAlgorithm() 236 fileAad, err := f.handleAadPrefix(fileDecryptProps, &algo) 237 if err != nil { 238 return err 239 } 240 f.fileDecryptor = encryption.NewFileDecryptor(fileDecryptProps, fileAad, algo.Algo, string(fileCryptoMetadata.KeyMetadata()), f.props.Allocator()) 241 242 f.metadata, err = metadata.NewFileMetaData(buf[fileCryptoMetadata.Len():], f.fileDecryptor) 243 if err != nil { 244 return xerrors.Errorf("parquet: could not read footer: %w", err) 245 } 246 default: 247 return xerrors.Errorf("parquet: magic bytes not found in footer. Either the file is corrupted or this isn't a parquet file") 248 } 249 250 return nil 251 } 252 253 func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, algo *parquet.Algorithm) (string, error) { 254 aadPrefixInProps := fileDecrypt.AadPrefix() 255 aadPrefix := []byte(aadPrefixInProps) 256 fileHasAadPrefix := algo.Aad.AadPrefix != nil && len(algo.Aad.AadPrefix) > 0 257 aadPrefixInFile := algo.Aad.AadPrefix 258 259 if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" { 260 return "", xerrors.New("AAD Prefix used for file encryption but not stored in file and not suppliedin decryption props") 261 } 262 263 if fileHasAadPrefix { 264 if aadPrefixInProps != "" { 265 if aadPrefixInProps != string(aadPrefixInFile) { 266 return "", xerrors.New("AAD prefix in file and in properties but not the same") 267 } 268 } 269 aadPrefix = aadPrefixInFile 270 if fileDecrypt.Verifier != nil { 271 fileDecrypt.Verifier.Verify(string(aadPrefix)) 272 } 273 } else { 274 if !algo.Aad.SupplyAadPrefix && aadPrefixInProps != "" { 275 return "", xerrors.New("AAD Prefix set in decryptionproperties but was not used for file encryption") 276 } 277 if fileDecrypt.Verifier != nil { 278 return "", xerrors.New("AAD Prefix Verifier is set but AAD Prefix not found in file") 279 } 280 } 281 return string(append(aadPrefix, algo.Aad.AadFileUnique...)), nil 282 } 283 284 func (f *Reader) parseMetaDataEncryptedFilePlaintextFooter(decryptProps *parquet.FileDecryptionProperties, data []byte) error { 285 if decryptProps != nil { 286 algo := f.metadata.EncryptionAlgorithm() 287 fileAad, err := f.handleAadPrefix(decryptProps, &algo) 288 if err != nil { 289 return err 290 } 291 f.fileDecryptor = encryption.NewFileDecryptor(decryptProps, fileAad, algo.Algo, string(f.metadata.GetFooterSigningKeyMetadata()), f.props.Allocator()) 292 // set the InternalFileDecryptor in the metadata as well, as it's used 293 // for signature verification and for ColumnChunkMetaData creation. 294 f.metadata.FileDecryptor = f.fileDecryptor 295 if decryptProps.PlaintextFooterIntegrity() { 296 if len(data)-f.metadata.Size() != encryption.GcmTagLength+encryption.NonceLength { 297 return xerrors.New("failed reading metadata for encryption signature") 298 } 299 300 if !f.metadata.VerifySignature(data[f.metadata.Size():]) { 301 return xerrors.New("parquet crypto signature verification failed") 302 } 303 } 304 } 305 return nil 306 } 307 308 // WriterVersion returns the Application Version that was written in the file 309 // metadata 310 func (f *Reader) WriterVersion() *metadata.AppVersion { 311 return f.metadata.WriterVersion() 312 } 313 314 // NumRows returns the total number of rows in this parquet file. 315 func (f *Reader) NumRows() int64 { 316 return f.metadata.GetNumRows() 317 } 318 319 // NumRowGroups returns the total number of row groups in this file. 320 func (f *Reader) NumRowGroups() int { 321 return len(f.metadata.GetRowGroups()) 322 } 323 324 // RowGroup returns a reader for the desired (0-based) row group 325 func (f *Reader) RowGroup(i int) *RowGroupReader { 326 rg := f.metadata.RowGroups[i] 327 328 return &RowGroupReader{ 329 fileMetadata: f.metadata, 330 rgMetadata: metadata.NewRowGroupMetaData(rg, f.metadata.Schema, f.WriterVersion(), f.fileDecryptor), 331 props: f.props, 332 r: f.r, 333 sourceSz: f.footerOffset, 334 fileDecryptor: f.fileDecryptor, 335 } 336 }