github.com/apache/arrow/go/v7@v7.0.1/parquet/file/row_group_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"github.com/apache/arrow/go/v7/arrow/ipc"
    21  	"github.com/apache/arrow/go/v7/parquet"
    22  	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
    23  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    24  	"github.com/apache/arrow/go/v7/parquet/metadata"
    25  	"golang.org/x/xerrors"
    26  )
    27  
    28  const (
    29  	maxDictHeaderSize int64 = 100
    30  )
    31  
// RowGroupReader is the primary interface for reading a single row group.
//
// It bundles the underlying data source with the file-level and row-group-level
// metadata needed to locate, and if necessary decrypt, each column chunk.
type RowGroupReader struct {
	// r is the underlying source handed to props.GetStream to read column chunk bytes.
	r             ipc.ReadAtSeeker
	// sourceSz is the size that column chunk offsets and lengths are validated
	// against when the PARQUET-816 workaround is applied.
	sourceSz      int64
	// fileMetadata supplies the schema (column descriptors) and the writer version.
	fileMetadata  *metadata.FileMetaData
	// rgMetadata describes this row group: its columns, row count, byte size and ordinal.
	rgMetadata    *metadata.RowGroupMetaData
	// props provides the allocator and creates the streams used to read column data.
	props         *parquet.ReaderProperties
	// fileDecryptor provides the decryptors for encrypted column chunks. It is only
	// required when a column chunk carries crypto metadata; GetColumnPageReader
	// returns an error if it is nil in that case.
	fileDecryptor encryption.FileDecryptor
}
    41  
    42  // MetaData returns the metadata of the current Row Group
    43  func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData { return r.rgMetadata }
    44  
    45  // NumColumns returns the number of columns of data as defined in the metadata of this row group
    46  func (r *RowGroupReader) NumColumns() int { return r.rgMetadata.NumColumns() }
    47  
    48  // NumRows returns the number of rows in just this row group
    49  func (r *RowGroupReader) NumRows() int64 { return r.rgMetadata.NumRows() }
    50  
    51  // ByteSize returns the full byte size of this row group as defined in its metadata
    52  func (r *RowGroupReader) ByteSize() int64 { return r.rgMetadata.TotalByteSize() }
    53  
    54  // Column returns a column reader for the requested (0-indexed) column
    55  //
    56  // panics if passed a column not in the range [0, NumColumns)
    57  func (r *RowGroupReader) Column(i int) ColumnChunkReader {
    58  	if i >= r.NumColumns() || i < 0 {
    59  		panic(xerrors.Errorf("parquet: trying to read column index %d but row group metadata only has %d columns", i, r.rgMetadata.NumColumns()))
    60  	}
    61  
    62  	descr := r.fileMetadata.Schema.Column(i)
    63  	pageRdr, err := r.GetColumnPageReader(i)
    64  	if err != nil {
    65  		panic(xerrors.Errorf("parquet: unable to initialize page reader: %w", err))
    66  	}
    67  	return NewColumnReader(descr, pageRdr, r.props.Allocator())
    68  }
    69  
    70  func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error) {
    71  	col, err := r.rgMetadata.ColumnChunk(i)
    72  	if err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	colStart := col.DataPageOffset()
    77  	if col.HasDictionaryPage() && col.DictionaryPageOffset() > 0 && colStart > col.DictionaryPageOffset() {
    78  		colStart = col.DictionaryPageOffset()
    79  	}
    80  
    81  	colLen := col.TotalCompressedSize()
    82  	// PARQUET-816 workaround for old files created by older parquet-mr
    83  	if r.fileMetadata.WriterVersion().LessThan(metadata.Parquet816FixedVersion) {
    84  		// The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
    85  		// dictionary page header size in total_compressed_size and total_uncompressed_size
    86  		// (see IMPALA-694). We add padding to compensate.
    87  		if colStart < 0 || colLen < 0 {
    88  			return nil, xerrors.Errorf("invalid column chunk metadata, offset (%d) and length (%d) should both be positive", colStart, colLen)
    89  		}
    90  		if colStart > r.sourceSz || colLen > r.sourceSz {
    91  			return nil, xerrors.Errorf("invalid column chunk metadata, offset (%d) and length (%d) must both be less than total source size (%d)", colStart, colLen, r.sourceSz)
    92  		}
    93  		bytesRemain := r.sourceSz - (colStart + colLen)
    94  		padding := utils.Min(maxDictHeaderSize, bytesRemain)
    95  		colLen += padding
    96  	}
    97  
    98  	stream, err := r.props.GetStream(r.r, colStart, colLen)
    99  	if err != nil {
   100  		return nil, err
   101  	}
   102  
   103  	cryptoMetadata := col.CryptoMetadata()
   104  	if cryptoMetadata == nil {
   105  		return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), nil)
   106  	}
   107  
   108  	if r.fileDecryptor == nil {
   109  		return nil, xerrors.New("column in rowgroup is encrypted, but no file decryptor")
   110  	}
   111  
   112  	const encryptedRowGroupsLimit = 32767
   113  	if i > encryptedRowGroupsLimit {
   114  		return nil, xerrors.New("encrypted files cannot contain more than 32767 column chunks")
   115  	}
   116  
   117  	if cryptoMetadata.IsSetENCRYPTION_WITH_FOOTER_KEY() {
   118  		ctx := CryptoContext{
   119  			StartDecryptWithDictionaryPage: col.HasDictionaryPage(),
   120  			RowGroupOrdinal:                r.rgMetadata.Ordinal(),
   121  			ColumnOrdinal:                  int16(i),
   122  			MetaDecryptor:                  r.fileDecryptor.GetFooterDecryptorForColumnMeta(""),
   123  			DataDecryptor:                  r.fileDecryptor.GetFooterDecryptorForColumnData(""),
   124  		}
   125  		return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx)
   126  	}
   127  
   128  	// column encrypted with it's own key
   129  	columnKeyMeta := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().KeyMetadata
   130  	columnPath := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().PathInSchema
   131  
   132  	ctx := CryptoContext{
   133  		StartDecryptWithDictionaryPage: col.HasDictionaryPage(),
   134  		RowGroupOrdinal:                r.rgMetadata.Ordinal(),
   135  		ColumnOrdinal:                  int16(i),
   136  		MetaDecryptor:                  r.fileDecryptor.GetColumnMetaDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""),
   137  		DataDecryptor:                  r.fileDecryptor.GetColumnDataDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""),
   138  	}
   139  	return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx)
   140  }