github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/row_group.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metadata

import (
	"fmt"
	"reflect"

	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v14/parquet/schema"
)

// RowGroupMetaData is a proxy around the thrift RowGroup metadata object.
type RowGroupMetaData struct {
	rowGroup      *format.RowGroup
	Schema        *schema.Schema
	version       *AppVersion
	fileDecryptor encryption.FileDecryptor
}

// NewRowGroupMetaData constructs an object from the underlying thrift objects and schema,
// decrypting if a decryptor is provided and necessary. This is primarily used internally;
// consumers should use the RowGroupMetaDataBuilder rather than this directly.
func NewRowGroupMetaData(rg *format.RowGroup, sc *schema.Schema, version *AppVersion, decryptor encryption.FileDecryptor) *RowGroupMetaData {
	return &RowGroupMetaData{
		rowGroup:      rg,
		Schema:        sc,
		version:       version,
		fileDecryptor: decryptor,
	}
}

// NumColumns returns the number of column metadata objects in this row group.
func (r *RowGroupMetaData) NumColumns() int {
	return len(r.rowGroup.GetColumns())
}

// Equals returns true if the underlying thrift RowGroup objects are deeply equal.
func (r *RowGroupMetaData) Equals(other *RowGroupMetaData) bool {
	return reflect.DeepEqual(r.rowGroup, other.rowGroup)
}

// NumRows is the number of rows in this row group. All columns have the same
// number of rows for a row group regardless of repetition and definition levels.
func (r *RowGroupMetaData) NumRows() int64 { return r.rowGroup.NumRows }

// TotalByteSize is the total uncompressed size of the data in this row group.
func (r *RowGroupMetaData) TotalByteSize() int64 { return r.rowGroup.GetTotalByteSize() }

// FileOffset is the location in the file where the data for this row group begins.
func (r *RowGroupMetaData) FileOffset() int64 { return r.rowGroup.GetFileOffset() }

// TotalCompressedSize is the total compressed size of the data in this row group.
func (r *RowGroupMetaData) TotalCompressedSize() int64 { return r.rowGroup.GetTotalCompressedSize() }

// Ordinal is the index of this row group within the file: the first row group
// is 0, the second is 1, and so on.
func (r *RowGroupMetaData) Ordinal() int16 { return r.rowGroup.GetOrdinal() }

// ColumnChunk returns the metadata for the requested (0-based) chunk index.
func (r *RowGroupMetaData) ColumnChunk(i int) (*ColumnChunkMetaData, error) {
	if i >= r.NumColumns() {
		return nil, fmt.Errorf("parquet: the file only has %d columns, requested metadata for column: %d", r.NumColumns(), i)
	}

	return NewColumnChunkMetaData(r.rowGroup.Columns[i], r.Schema.Column(i), r.version, r.rowGroup.GetOrdinal(), int16(i), r.fileDecryptor)
}
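
// The accessors above are the read-side entry points for row group metadata.
// A minimal usage sketch, assuming a *RowGroupMetaData has already been
// obtained elsewhere (for example from the file-level metadata); the helper
// name inspectRowGroup is illustrative only:
//
//	func inspectRowGroup(rgMeta *RowGroupMetaData) error {
//		fmt.Printf("row group %d: %d rows, %d columns, %d uncompressed bytes\n",
//			rgMeta.Ordinal(), rgMeta.NumRows(), rgMeta.NumColumns(), rgMeta.TotalByteSize())
//		for i := 0; i < rgMeta.NumColumns(); i++ {
//			colChunk, err := rgMeta.ColumnChunk(i)
//			if err != nil {
//				return err
//			}
//			_ = colChunk // per-column metadata is available here
//		}
//		return nil
//	}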

// RowGroupMetaDataBuilder is a convenience object for constructing row group
// metadata information. Primarily used in conjunction with writing new files.
type RowGroupMetaDataBuilder struct {
	rg          *format.RowGroup
	props       *parquet.WriterProperties
	schema      *schema.Schema
	colBuilders []*ColumnChunkMetaDataBuilder
	nextCol     int
}

// NewRowGroupMetaDataBuilder returns a builder using the given properties and underlying thrift object.
//
// This is primarily used internally; consumers should use the file metadata builder and call
// AppendRowGroup on it to get instances of RowGroupMetaDataBuilder.
func NewRowGroupMetaDataBuilder(props *parquet.WriterProperties, schema *schema.Schema, rg *format.RowGroup) *RowGroupMetaDataBuilder {
	r := &RowGroupMetaDataBuilder{
		rg:          rg,
		props:       props,
		schema:      schema,
		colBuilders: make([]*ColumnChunkMetaDataBuilder, 0),
	}
	r.rg.Columns = make([]*format.ColumnChunk, schema.NumColumns())
	return r
}

// NumColumns returns the current number of columns in this metadata.
func (r *RowGroupMetaDataBuilder) NumColumns() int {
	return len(r.rg.GetColumns())
}

// NumRows returns the number of rows recorded for this row group so far.
func (r *RowGroupMetaDataBuilder) NumRows() int64 {
	return r.rg.GetNumRows()
}

// SetNumRows sets the number of rows in this row group's metadata.
func (r *RowGroupMetaDataBuilder) SetNumRows(nrows int) {
	r.rg.NumRows = int64(nrows)
}

// CurrentColumn returns the index (0-based) of the column chunk currently being built.
//
// Returns -1 until the first time NextColumnChunk is called.
func (r *RowGroupMetaDataBuilder) CurrentColumn() int { return r.nextCol - 1 }

// NextColumnChunk appends a new column chunk, updates the column index,
// and returns a builder for that column chunk's metadata.
func (r *RowGroupMetaDataBuilder) NextColumnChunk() *ColumnChunkMetaDataBuilder {
	if r.nextCol >= r.NumColumns() {
		panic(fmt.Errorf("parquet: the schema only has %d columns, requested metadata for col: %d", r.NumColumns(), r.nextCol))
	}

	col := r.schema.Column(r.nextCol)
	if r.rg.Columns[r.nextCol] == nil {
		r.rg.Columns[r.nextCol] = &format.ColumnChunk{MetaData: format.NewColumnMetaData()}
	}
	colBldr := NewColumnChunkMetaDataBuilderWithContents(r.props, col, r.rg.Columns[r.nextCol])
	r.nextCol++
	r.colBuilders = append(r.colBuilders, colBldr)
	return colBldr
}

// Finish should be called when the row group is complete; it updates the metadata
// with the final file offset and total compressed size. totalBytesWritten is recorded
// as the TotalByteSize for the row group, and ordinal should be the index of the row
// group being written, e.g. the first row group is 0, the second is 1, and so on.
func (r *RowGroupMetaDataBuilder) Finish(totalBytesWritten int64, ordinal int16) error {
	if r.nextCol != r.NumColumns() {
		return fmt.Errorf("parquet: only %d out of %d columns are initialized", r.nextCol, r.schema.NumColumns())
	}

	var (
		fileOffset      int64 = 0
		totalCompressed int64 = 0
	)

	for idx, col := range r.rg.Columns {
		if col.FileOffset < 0 {
			return fmt.Errorf("parquet: Column %d is not complete", idx)
		}
		if idx == 0 {
			// the row group's file offset is the offset of the first column's first page:
			// the dictionary page if one exists, otherwise the first data page
			if col.MetaData.IsSetDictionaryPageOffset() && col.MetaData.GetDictionaryPageOffset() > 0 {
				fileOffset = col.MetaData.GetDictionaryPageOffset()
			} else {
				fileOffset = col.MetaData.DataPageOffset
			}
		}
		// column metadata is sometimes encrypted and not available to read,
		// so we must get the total compressed size from the column builder
		totalCompressed += r.colBuilders[idx].TotalCompressedSize()
	}

	r.rg.FileOffset = &fileOffset
	r.rg.TotalCompressedSize = &totalCompressed
	r.rg.TotalByteSize = totalBytesWritten
	r.rg.Ordinal = &ordinal
	return nil
}
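
// A minimal write-side sketch of the builder lifecycle above, assuming a
// *schema.Schema and the desired row/byte counts are already in hand. In
// practice the file writer drives these calls through the file metadata
// builder; the helper name buildRowGroupMetadata is illustrative only:
//
//	func buildRowGroupMetadata(sc *schema.Schema, numRows int, totalBytes int64, ordinal int16) (*format.RowGroup, error) {
//		props := parquet.NewWriterProperties()
//		rg := format.NewRowGroup()
//		bldr := NewRowGroupMetaDataBuilder(props, sc, rg)
//		for i := 0; i < sc.NumColumns(); i++ {
//			colBldr := bldr.NextColumnChunk()
//			_ = colBldr // normally populated by the column chunk writer for column i
//		}
//		bldr.SetNumRows(numRows)
//		if err := bldr.Finish(totalBytes, ordinal); err != nil {
//			return nil, err
//		}
//		return rg, nil
//	}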