github.com/apache/arrow/go/v7@v7.0.1/parquet/file/row_group_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "github.com/apache/arrow/go/v7/arrow/ipc" 21 "github.com/apache/arrow/go/v7/parquet" 22 "github.com/apache/arrow/go/v7/parquet/internal/encryption" 23 "github.com/apache/arrow/go/v7/parquet/internal/utils" 24 "github.com/apache/arrow/go/v7/parquet/metadata" 25 "golang.org/x/xerrors" 26 ) 27 28 const ( 29 maxDictHeaderSize int64 = 100 30 ) 31 32 // RowGroupReader is the primary interface for reading a single row group 33 type RowGroupReader struct { 34 r ipc.ReadAtSeeker 35 sourceSz int64 36 fileMetadata *metadata.FileMetaData 37 rgMetadata *metadata.RowGroupMetaData 38 props *parquet.ReaderProperties 39 fileDecryptor encryption.FileDecryptor 40 } 41 42 // MetaData returns the metadata of the current Row Group 43 func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData { return r.rgMetadata } 44 45 // NumColumns returns the number of columns of data as defined in the metadata of this row group 46 func (r *RowGroupReader) NumColumns() int { return r.rgMetadata.NumColumns() } 47 48 // NumRows returns the number of rows in just this row group 49 func (r *RowGroupReader) NumRows() int64 { return r.rgMetadata.NumRows() } 50 51 // ByteSize returns the full byte size of this row group as defined in its metadata 52 func (r *RowGroupReader) ByteSize() int64 { return r.rgMetadata.TotalByteSize() } 53 54 // Column returns a column reader for the requested (0-indexed) column 55 // 56 // panics if passed a column not in the range [0, NumColumns) 57 func (r *RowGroupReader) Column(i int) ColumnChunkReader { 58 if i >= r.NumColumns() || i < 0 { 59 panic(xerrors.Errorf("parquet: trying to read column index %d but row group metadata only has %d columns", i, r.rgMetadata.NumColumns())) 60 } 61 62 descr := r.fileMetadata.Schema.Column(i) 63 pageRdr, err := r.GetColumnPageReader(i) 64 if err != nil { 65 panic(xerrors.Errorf("parquet: unable to initialize page reader: %w", err)) 66 } 67 return NewColumnReader(descr, pageRdr, r.props.Allocator()) 68 } 69 70 func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error) { 71 col, err := r.rgMetadata.ColumnChunk(i) 72 if err != nil { 73 return nil, err 74 } 75 76 colStart := col.DataPageOffset() 77 if col.HasDictionaryPage() && col.DictionaryPageOffset() > 0 && colStart > col.DictionaryPageOffset() { 78 colStart = col.DictionaryPageOffset() 79 } 80 81 colLen := col.TotalCompressedSize() 82 // PARQUET-816 workaround for old files created by older parquet-mr 83 if r.fileMetadata.WriterVersion().LessThan(metadata.Parquet816FixedVersion) { 84 // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the 85 // dictionary page header size in total_compressed_size and total_uncompressed_size 86 // (see IMPALA-694). We add padding to compensate. 87 if colStart < 0 || colLen < 0 { 88 return nil, xerrors.Errorf("invalid column chunk metadata, offset (%d) and length (%d) should both be positive", colStart, colLen) 89 } 90 if colStart > r.sourceSz || colLen > r.sourceSz { 91 return nil, xerrors.Errorf("invalid column chunk metadata, offset (%d) and length (%d) must both be less than total source size (%d)", colStart, colLen, r.sourceSz) 92 } 93 bytesRemain := r.sourceSz - (colStart + colLen) 94 padding := utils.Min(maxDictHeaderSize, bytesRemain) 95 colLen += padding 96 } 97 98 stream, err := r.props.GetStream(r.r, colStart, colLen) 99 if err != nil { 100 return nil, err 101 } 102 103 cryptoMetadata := col.CryptoMetadata() 104 if cryptoMetadata == nil { 105 return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), nil) 106 } 107 108 if r.fileDecryptor == nil { 109 return nil, xerrors.New("column in rowgroup is encrypted, but no file decryptor") 110 } 111 112 const encryptedRowGroupsLimit = 32767 113 if i > encryptedRowGroupsLimit { 114 return nil, xerrors.New("encrypted files cannot contain more than 32767 column chunks") 115 } 116 117 if cryptoMetadata.IsSetENCRYPTION_WITH_FOOTER_KEY() { 118 ctx := CryptoContext{ 119 StartDecryptWithDictionaryPage: col.HasDictionaryPage(), 120 RowGroupOrdinal: r.rgMetadata.Ordinal(), 121 ColumnOrdinal: int16(i), 122 MetaDecryptor: r.fileDecryptor.GetFooterDecryptorForColumnMeta(""), 123 DataDecryptor: r.fileDecryptor.GetFooterDecryptorForColumnData(""), 124 } 125 return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx) 126 } 127 128 // column encrypted with it's own key 129 columnKeyMeta := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().KeyMetadata 130 columnPath := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().PathInSchema 131 132 ctx := CryptoContext{ 133 StartDecryptWithDictionaryPage: col.HasDictionaryPage(), 134 RowGroupOrdinal: r.rgMetadata.Ordinal(), 135 ColumnOrdinal: int16(i), 136 MetaDecryptor: r.fileDecryptor.GetColumnMetaDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""), 137 DataDecryptor: r.fileDecryptor.GetColumnDataDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""), 138 } 139 return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx) 140 }