github.com/apache/arrow/go/v14@v14.0.2/parquet/file/row_group_reader.go

github.com/apache/arrow/go/v14@v14.0.2/parquet/file/row_group_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"fmt"
    21  	"sync"
    22  
    23  	"github.com/apache/arrow/go/v14/internal/utils"
    24  	"github.com/apache/arrow/go/v14/parquet"
    25  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    26  	"github.com/apache/arrow/go/v14/parquet/metadata"
    27  	"golang.org/x/xerrors"
    28  )
    29  
    30  const (
    31  	maxDictHeaderSize int64 = 100
    32  )
    33  
// RowGroupReader is the primary interface for reading a single row group
type RowGroupReader struct {
	// r is the underlying source that column chunk streams are opened on.
	r             parquet.ReaderAtSeeker
	// sourceSz is the total size of the source in bytes; column chunk offsets
	// and lengths are sanity-checked against it.
	sourceSz      int64
	// fileMetadata is the file-level metadata, consulted for the schema and
	// the writer version.
	fileMetadata  *metadata.FileMetaData
	// rgMetadata is the metadata of the row group this reader covers.
	rgMetadata    *metadata.RowGroupMetaData
	// props supplies the allocator and creates the streams used to read pages.
	props         *parquet.ReaderProperties
	// fileDecryptor provides decryptors for encrypted columns; it may be nil,
	// in which case reading an encrypted column fails with an error.
	fileDecryptor encryption.FileDecryptor

	// bufferPool is handed to every column reader created by Column.
	bufferPool *sync.Pool
}
    45  
    46  // MetaData returns the metadata of the current Row Group
    47  func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData { return r.rgMetadata }
    48  
    49  // NumColumns returns the number of columns of data as defined in the metadata of this row group
    50  func (r *RowGroupReader) NumColumns() int { return r.rgMetadata.NumColumns() }
    51  
    52  // NumRows returns the number of rows in just this row group
    53  func (r *RowGroupReader) NumRows() int64 { return r.rgMetadata.NumRows() }
    54  
    55  // ByteSize returns the full byte size of this row group as defined in its metadata
    56  func (r *RowGroupReader) ByteSize() int64 { return r.rgMetadata.TotalByteSize() }
    57  
    58  // Column returns a column reader for the requested (0-indexed) column
    59  //
    60  // panics if passed a column not in the range [0, NumColumns)
    61  func (r *RowGroupReader) Column(i int) (ColumnChunkReader, error) {
    62  	if i >= r.NumColumns() || i < 0 {
    63  		return nil, fmt.Errorf("parquet: trying to read column index %d but row group metadata only has %d columns", i, r.rgMetadata.NumColumns())
    64  	}
    65  
    66  	descr := r.fileMetadata.Schema.Column(i)
    67  	pageRdr, err := r.GetColumnPageReader(i)
    68  	if err != nil {
    69  		return nil, fmt.Errorf("parquet: unable to initialize page reader: %w", err)
    70  	}
    71  	return NewColumnReader(descr, pageRdr, r.props.Allocator(), r.bufferPool), nil
    72  }
    73  
    74  func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error) {
    75  	col, err := r.rgMetadata.ColumnChunk(i)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  
    80  	colStart := col.DataPageOffset()
    81  	if col.HasDictionaryPage() && col.DictionaryPageOffset() > 0 && colStart > col.DictionaryPageOffset() {
    82  		colStart = col.DictionaryPageOffset()
    83  	}
    84  
    85  	colLen := col.TotalCompressedSize()
    86  	// PARQUET-816 workaround for old files created by older parquet-mr
    87  	if r.fileMetadata.WriterVersion().LessThan(metadata.Parquet816FixedVersion) {
    88  		// The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
    89  		// dictionary page header size in total_compressed_size and total_uncompressed_size
    90  		// (see IMPALA-694). We add padding to compensate.
    91  		if colStart < 0 || colLen < 0 {
    92  			return nil, fmt.Errorf("invalid column chunk metadata, offset (%d) and length (%d) should both be positive", colStart, colLen)
    93  		}
    94  		if colStart > r.sourceSz || colLen > r.sourceSz {
    95  			return nil, fmt.Errorf("invalid column chunk metadata, offset (%d) and length (%d) must both be less than total source size (%d)", colStart, colLen, r.sourceSz)
    96  		}
    97  		bytesRemain := r.sourceSz - (colStart + colLen)
    98  		padding := utils.Min(maxDictHeaderSize, bytesRemain)
    99  		colLen += padding
   100  	}
   101  
   102  	stream, err := r.props.GetStream(r.r, colStart, colLen)
   103  	if err != nil {
   104  		return nil, err
   105  	}
   106  
   107  	cryptoMetadata := col.CryptoMetadata()
   108  	if cryptoMetadata == nil {
   109  		return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), nil)
   110  	}
   111  
   112  	if r.fileDecryptor == nil {
   113  		return nil, xerrors.New("column in rowgroup is encrypted, but no file decryptor")
   114  	}
   115  
   116  	const encryptedRowGroupsLimit = 32767
   117  	if i > encryptedRowGroupsLimit {
   118  		return nil, xerrors.New("encrypted files cannot contain more than 32767 column chunks")
   119  	}
   120  
   121  	if cryptoMetadata.IsSetENCRYPTION_WITH_FOOTER_KEY() {
   122  		ctx := CryptoContext{
   123  			StartDecryptWithDictionaryPage: col.HasDictionaryPage(),
   124  			RowGroupOrdinal:                r.rgMetadata.Ordinal(),
   125  			ColumnOrdinal:                  int16(i),
   126  			MetaDecryptor:                  r.fileDecryptor.GetFooterDecryptorForColumnMeta(""),
   127  			DataDecryptor:                  r.fileDecryptor.GetFooterDecryptorForColumnData(""),
   128  		}
   129  		return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx)
   130  	}
   131  
   132  	// column encrypted with it's own key
   133  	columnKeyMeta := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().KeyMetadata
   134  	columnPath := cryptoMetadata.GetENCRYPTION_WITH_COLUMN_KEY().PathInSchema
   135  
   136  	ctx := CryptoContext{
   137  		StartDecryptWithDictionaryPage: col.HasDictionaryPage(),
   138  		RowGroupOrdinal:                r.rgMetadata.Ordinal(),
   139  		ColumnOrdinal:                  int16(i),
   140  		MetaDecryptor:                  r.fileDecryptor.GetColumnMetaDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""),
   141  		DataDecryptor:                  r.fileDecryptor.GetColumnDataDecryptor(parquet.ColumnPath(columnPath).String(), string(columnKeyMeta), ""),
   142  	}
   143  	return NewPageReader(stream, col.NumValues(), col.Compression(), r.props.Allocator(), &ctx)
   144  }