// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metadata

import (
	"fmt"
	"reflect"

	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v14/parquet/schema"
)

// RowGroupMetaData is a proxy around the thrift RowGroup meta data object.
//
// Apart from Schema, the fields are unexported and are read through the
// accessor methods defined on this type.
type RowGroupMetaData struct {
	// rowGroup is the wrapped thrift object that the accessors read from.
	rowGroup *format.RowGroup
	// Schema supplies the column descriptor for each column chunk index.
	Schema *schema.Schema
	// version is forwarded to every ColumnChunkMetaData created by ColumnChunk.
	version *AppVersion
	// fileDecryptor is forwarded to every ColumnChunkMetaData created by
	// ColumnChunk.
	fileDecryptor encryption.FileDecryptor
}

// NewRowGroupMetaData constructs an object from the underlying thrift objects and schema,
// decrypting if provided and necessary. This is primarily used internally and consumers
// should use the RowGroupMetaDataBuilder rather than this directly.
40 func NewRowGroupMetaData(rg *format.RowGroup, sc *schema.Schema, version *AppVersion, decryptor encryption.FileDecryptor) *RowGroupMetaData { 41 return &RowGroupMetaData{ 42 rowGroup: rg, 43 Schema: sc, 44 version: version, 45 fileDecryptor: decryptor, 46 } 47 } 48 49 // NumColumns returns the number of column metadata objects in this row group 50 func (r *RowGroupMetaData) NumColumns() int { 51 return len(r.rowGroup.GetColumns()) 52 } 53 54 func (r *RowGroupMetaData) Equals(other *RowGroupMetaData) bool { 55 return reflect.DeepEqual(r.rowGroup, other.rowGroup) 56 } 57 58 // NumRows is just the number of rows in this row group. All columns have the same 59 // number of rows for a row group regardless of repetition and definition levels. 60 func (r *RowGroupMetaData) NumRows() int64 { return r.rowGroup.NumRows } 61 62 // TotalByteSize is the total size of this rowgroup on disk 63 func (r *RowGroupMetaData) TotalByteSize() int64 { return r.rowGroup.GetTotalByteSize() } 64 65 // FileOffset is the location in the file where the data for this rowgroup begins 66 func (r *RowGroupMetaData) FileOffset() int64 { return r.rowGroup.GetFileOffset() } 67 68 func (r *RowGroupMetaData) TotalCompressedSize() int64 { return r.rowGroup.GetTotalCompressedSize() } 69 70 // Ordinal is the row group number in order for the given file. 71 func (r *RowGroupMetaData) Ordinal() int16 { return r.rowGroup.GetOrdinal() } 72 73 // ColumnChunk returns the metadata for the requested (0-based) chunk index 74 func (r *RowGroupMetaData) ColumnChunk(i int) (*ColumnChunkMetaData, error) { 75 if i >= r.NumColumns() { 76 panic(fmt.Errorf("parquet: the file only has %d columns, requested metadata for column: %d", r.NumColumns(), i)) 77 } 78 79 return NewColumnChunkMetaData(r.rowGroup.Columns[i], r.Schema.Column(i), r.version, r.rowGroup.GetOrdinal(), int16(i), r.fileDecryptor) 80 } 81 82 // RowGroupMetaDataBuilder is a convenience object for constructing row group 83 // metadata information. 
Primarily used in conjunction with writing new files. 84 type RowGroupMetaDataBuilder struct { 85 rg *format.RowGroup 86 props *parquet.WriterProperties 87 schema *schema.Schema 88 colBuilders []*ColumnChunkMetaDataBuilder 89 nextCol int 90 } 91 92 // NewRowGroupMetaDataBuilder returns a builder using the given properties and underlying thrift object. 93 // 94 // This is primarily used internally, consumers should use the file metadatabuilder and call 95 // AppendRowGroup on it to get instances of RowGroupMetaDataBuilder 96 func NewRowGroupMetaDataBuilder(props *parquet.WriterProperties, schema *schema.Schema, rg *format.RowGroup) *RowGroupMetaDataBuilder { 97 r := &RowGroupMetaDataBuilder{ 98 rg: rg, 99 props: props, 100 schema: schema, 101 colBuilders: make([]*ColumnChunkMetaDataBuilder, 0), 102 } 103 r.rg.Columns = make([]*format.ColumnChunk, schema.NumColumns()) 104 return r 105 } 106 107 // NumColumns returns the current number of columns in this metadata 108 func (r *RowGroupMetaDataBuilder) NumColumns() int { 109 return int(len(r.rg.GetColumns())) 110 } 111 112 func (r *RowGroupMetaDataBuilder) NumRows() int64 { 113 return r.rg.GetNumRows() 114 } 115 116 func (r *RowGroupMetaDataBuilder) SetNumRows(nrows int) { 117 r.rg.NumRows = int64(nrows) 118 } 119 120 // CurrentColumn returns the current column chunk (0-based) index that is being built. 121 // 122 // Returns -1 until the first time NextColumnChunk is called. 
123 func (r *RowGroupMetaDataBuilder) CurrentColumn() int { return r.nextCol - 1 } 124 125 // NextColumnChunk appends a new column chunk, updates the column index, 126 // and returns a builder for that column chunk's metadata 127 func (r *RowGroupMetaDataBuilder) NextColumnChunk() *ColumnChunkMetaDataBuilder { 128 if r.nextCol >= r.NumColumns() { 129 panic(fmt.Errorf("parquet: the schema only has %d columns, requested metadata for col: %d", r.NumColumns(), r.nextCol)) 130 } 131 132 col := r.schema.Column(r.nextCol) 133 if r.rg.Columns[r.nextCol] == nil { 134 r.rg.Columns[r.nextCol] = &format.ColumnChunk{MetaData: format.NewColumnMetaData()} 135 } 136 colBldr := NewColumnChunkMetaDataBuilderWithContents(r.props, col, r.rg.Columns[r.nextCol]) 137 r.nextCol++ 138 r.colBuilders = append(r.colBuilders, colBldr) 139 return colBldr 140 } 141 142 // Finish should be called when complete and updates the metadata with the final 143 // file offset, and total compressed sizes. totalBytesWritten gets written as the 144 // TotalByteSize for the row group and Ordinal should be the index of the row group 145 // being written. e.g. first row group should be 0, second is 1, and so on... 
146 func (r *RowGroupMetaDataBuilder) Finish(totalBytesWritten int64, ordinal int16) error { 147 if r.nextCol != r.NumColumns() { 148 return fmt.Errorf("parquet: only %d out of %d columns are initialized", r.nextCol-1, r.schema.NumColumns()) 149 } 150 151 var ( 152 fileOffset int64 = 0 153 totalCompressed int64 = 0 154 ) 155 156 for idx, col := range r.rg.Columns { 157 if col.FileOffset < 0 { 158 return fmt.Errorf("parquet: Column %d is not complete", idx) 159 } 160 if idx == 0 { 161 if col.MetaData.IsSetDictionaryPageOffset() && col.MetaData.GetDictionaryPageOffset() > 0 { 162 fileOffset = col.MetaData.GetDictionaryPageOffset() 163 } else { 164 fileOffset = col.MetaData.DataPageOffset 165 } 166 } 167 // sometimes column metadata is encrypted and not available to read 168 // so we must get total compressed size from column builder 169 totalCompressed += r.colBuilders[idx].TotalCompressedSize() 170 } 171 172 r.rg.FileOffset = &fileOffset 173 r.rg.TotalCompressedSize = &totalCompressed 174 r.rg.TotalByteSize = totalBytesWritten 175 r.rg.Ordinal = &ordinal 176 return nil 177 }