github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/column_chunk.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package metadata 18 19 import ( 20 "bytes" 21 "context" 22 "io" 23 "reflect" 24 25 "github.com/apache/arrow/go/v14/arrow/memory" 26 "github.com/apache/arrow/go/v14/parquet" 27 "github.com/apache/arrow/go/v14/parquet/compress" 28 "github.com/apache/arrow/go/v14/parquet/internal/encryption" 29 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 30 "github.com/apache/arrow/go/v14/parquet/internal/thrift" 31 "github.com/apache/arrow/go/v14/parquet/schema" 32 "golang.org/x/xerrors" 33 ) 34 35 // PageEncodingStats is used for counting the number of pages of specific 36 // types with the given internal encoding. 37 type PageEncodingStats struct { 38 Encoding parquet.Encoding 39 PageType format.PageType 40 } 41 42 type statvalues struct { 43 *format.Statistics 44 } 45 46 func (s *statvalues) GetMin() []byte { return s.GetMinValue() } 47 func (s *statvalues) GetMax() []byte { return s.GetMaxValue() } 48 func (s *statvalues) IsSetMin() bool { return s.IsSetMinValue() } 49 func (s *statvalues) IsSetMax() bool { return s.IsSetMaxValue() } 50 51 func makeColumnStats(metadata *format.ColumnMetaData, descr *schema.Column, mem memory.Allocator) TypedStatistics { 52 if descr.ColumnOrder() == parquet.ColumnOrders.TypeDefinedOrder { 53 return NewStatisticsFromEncoded(descr, mem, 54 metadata.NumValues-metadata.Statistics.GetNullCount(), 55 &statvalues{metadata.Statistics}) 56 } 57 return NewStatisticsFromEncoded(descr, mem, 58 metadata.NumValues-metadata.Statistics.GetNullCount(), 59 metadata.Statistics) 60 } 61 62 // ColumnChunkMetaData is a proxy around format.ColumnChunkMetaData 63 // containing all of the information and metadata for a given column chunk 64 // and it's associated Column 65 type ColumnChunkMetaData struct { 66 column *format.ColumnChunk 67 columnMeta *format.ColumnMetaData 68 decryptedMeta format.ColumnMetaData 69 descr *schema.Column 70 writerVersion *AppVersion 71 encodings []parquet.Encoding 72 encodingStats []format.PageEncodingStats 73 possibleStats TypedStatistics 74 mem memory.Allocator 75 } 76 77 // NewColumnChunkMetaData creates an instance of the metadata from a column chunk and descriptor 78 // 79 // this is primarily used internally or between the subpackages. ColumnChunkMetaDataBuilder should 80 // be used by consumers instead of using this directly. 81 func NewColumnChunkMetaData(column *format.ColumnChunk, descr *schema.Column, writerVersion *AppVersion, rowGroupOrdinal, columnOrdinal int16, fileDecryptor encryption.FileDecryptor) (*ColumnChunkMetaData, error) { 82 c := &ColumnChunkMetaData{ 83 column: column, 84 columnMeta: column.GetMetaData(), 85 descr: descr, 86 writerVersion: writerVersion, 87 mem: memory.DefaultAllocator, 88 } 89 if column.IsSetCryptoMetadata() { 90 ccmd := column.CryptoMetadata 91 92 if ccmd.IsSetENCRYPTION_WITH_COLUMN_KEY() { 93 if fileDecryptor != nil && fileDecryptor.Properties() != nil { 94 // should decrypt metadata 95 path := parquet.ColumnPath(ccmd.ENCRYPTION_WITH_COLUMN_KEY.GetPathInSchema()) 96 keyMetadata := ccmd.ENCRYPTION_WITH_COLUMN_KEY.GetKeyMetadata() 97 aadColumnMetadata := encryption.CreateModuleAad(fileDecryptor.FileAad(), encryption.ColumnMetaModule, rowGroupOrdinal, columnOrdinal, -1) 98 decryptor := fileDecryptor.GetColumnMetaDecryptor(path.String(), string(keyMetadata), aadColumnMetadata) 99 thrift.DeserializeThrift(&c.decryptedMeta, decryptor.Decrypt(column.GetEncryptedColumnMetadata())) 100 c.columnMeta = &c.decryptedMeta 101 } else { 102 return nil, xerrors.New("cannot decrypt column metadata. file decryption not setup correctly") 103 } 104 } 105 } 106 for _, enc := range c.columnMeta.Encodings { 107 c.encodings = append(c.encodings, parquet.Encoding(enc)) 108 } 109 for _, enc := range c.columnMeta.EncodingStats { 110 c.encodingStats = append(c.encodingStats, *enc) 111 } 112 return c, nil 113 } 114 115 // CryptoMetadata returns the cryptographic metadata for how this column was 116 // encrypted and how to decrypt it. 117 func (c *ColumnChunkMetaData) CryptoMetadata() *format.ColumnCryptoMetaData { 118 return c.column.GetCryptoMetadata() 119 } 120 121 // FileOffset is the location in the file where the column data begins 122 func (c *ColumnChunkMetaData) FileOffset() int64 { return c.column.FileOffset } 123 124 // FilePath gives the name of the parquet file if provided in the metadata 125 func (c *ColumnChunkMetaData) FilePath() string { return c.column.GetFilePath() } 126 127 // Type is the physical storage type used in the parquet file for this column chunk. 128 func (c *ColumnChunkMetaData) Type() parquet.Type { return parquet.Type(c.columnMeta.Type) } 129 130 // NumValues is the number of values stored in just this chunk including nulls. 131 func (c *ColumnChunkMetaData) NumValues() int64 { return c.columnMeta.NumValues } 132 133 // PathInSchema is the full path to this column from the root of the schema including 134 // any nested columns 135 func (c *ColumnChunkMetaData) PathInSchema() parquet.ColumnPath { 136 return c.columnMeta.GetPathInSchema() 137 } 138 139 // Compression provides the type of compression used for this particular chunk. 140 func (c *ColumnChunkMetaData) Compression() compress.Compression { 141 return compress.Compression(c.columnMeta.Codec) 142 } 143 144 // Encodings returns the list of different encodings used in this chunk 145 func (c *ColumnChunkMetaData) Encodings() []parquet.Encoding { return c.encodings } 146 147 // EncodingStats connects the order of encodings based on the list of pages and types 148 func (c *ColumnChunkMetaData) EncodingStats() []PageEncodingStats { 149 ret := make([]PageEncodingStats, len(c.encodingStats)) 150 for idx := range ret { 151 ret[idx].Encoding = parquet.Encoding(c.encodingStats[idx].Encoding) 152 ret[idx].PageType = c.encodingStats[idx].PageType 153 } 154 return ret 155 } 156 157 // HasDictionaryPage returns true if there is a dictionary page offset set in 158 // this metadata. 159 func (c *ColumnChunkMetaData) HasDictionaryPage() bool { 160 return c.columnMeta.IsSetDictionaryPageOffset() 161 } 162 163 // DictionaryPageOffset returns the location in the file where the dictionary page starts 164 func (c *ColumnChunkMetaData) DictionaryPageOffset() int64 { 165 return c.columnMeta.GetDictionaryPageOffset() 166 } 167 168 // DataPageOffset returns the location in the file where the data pages begin for this column 169 func (c *ColumnChunkMetaData) DataPageOffset() int64 { return c.columnMeta.GetDataPageOffset() } 170 171 // HasIndexPage returns true if the offset for the index page is set in the metadata 172 func (c *ColumnChunkMetaData) HasIndexPage() bool { return c.columnMeta.IsSetIndexPageOffset() } 173 174 // IndexPageOffset is the location in the file where the index page starts. 175 func (c *ColumnChunkMetaData) IndexPageOffset() int64 { return c.columnMeta.GetIndexPageOffset() } 176 177 // TotalCompressedSize will be equal to TotalUncompressedSize if the data is not compressed. 178 // Otherwise this will be the size of the actual data in the file. 179 func (c *ColumnChunkMetaData) TotalCompressedSize() int64 { 180 return c.columnMeta.GetTotalCompressedSize() 181 } 182 183 // TotalUncompressedSize is the total size of the raw data after uncompressing the chunk 184 func (c *ColumnChunkMetaData) TotalUncompressedSize() int64 { 185 return c.columnMeta.GetTotalUncompressedSize() 186 } 187 188 // BloomFilterOffset is the byte offset from the beginning of the file to the bloom 189 // filter data. 190 func (c *ColumnChunkMetaData) BloomFilterOffset() int64 { 191 return c.columnMeta.GetBloomFilterOffset() 192 } 193 194 // StatsSet returns true only if there are statistics set in the metadata and the column 195 // descriptor has a sort order that is not SortUnknown 196 // 197 // It also checks the writer version to ensure that it was not written by a version 198 // of parquet which is known to have incorrect stat computations. 199 func (c *ColumnChunkMetaData) StatsSet() (bool, error) { 200 if !c.columnMeta.IsSetStatistics() || c.descr.SortOrder() == schema.SortUNKNOWN { 201 return false, nil 202 } 203 204 if c.possibleStats == nil { 205 c.possibleStats = makeColumnStats(c.columnMeta, c.descr, c.mem) 206 } 207 208 encoded, err := c.possibleStats.Encode() 209 if err != nil { 210 return false, err 211 } 212 213 return c.writerVersion.HasCorrectStatistics(c.Type(), c.descr.LogicalType(), encoded, c.descr.SortOrder()), nil 214 } 215 216 func (c *ColumnChunkMetaData) Equals(other *ColumnChunkMetaData) bool { 217 return reflect.DeepEqual(c.columnMeta, other.columnMeta) 218 } 219 220 // Statistics can return nil if there are no stats in this metadata 221 func (c *ColumnChunkMetaData) Statistics() (TypedStatistics, error) { 222 ok, err := c.StatsSet() 223 if err != nil { 224 return nil, err 225 } 226 227 if ok { 228 return c.possibleStats, nil 229 } 230 return nil, nil 231 } 232 233 // ColumnChunkMetaDataBuilder is used during writing to construct metadata 234 // for a given column chunk while writing, providing a proxy around constructing 235 // the actual thrift object. 236 type ColumnChunkMetaDataBuilder struct { 237 chunk *format.ColumnChunk 238 props *parquet.WriterProperties 239 column *schema.Column 240 241 compressedSize int64 242 } 243 244 func NewColumnChunkMetaDataBuilder(props *parquet.WriterProperties, column *schema.Column) *ColumnChunkMetaDataBuilder { 245 return NewColumnChunkMetaDataBuilderWithContents(props, column, format.NewColumnChunk()) 246 } 247 248 // NewColumnChunkMetaDataBuilderWithContents will construct a builder and start it with the provided 249 // column chunk information rather than with an empty column chunk. 250 func NewColumnChunkMetaDataBuilderWithContents(props *parquet.WriterProperties, column *schema.Column, chunk *format.ColumnChunk) *ColumnChunkMetaDataBuilder { 251 b := &ColumnChunkMetaDataBuilder{ 252 props: props, 253 column: column, 254 chunk: chunk, 255 } 256 b.init(chunk) 257 return b 258 } 259 260 // Contents returns the underlying thrift ColumnChunk object so that it can be used 261 // for constructing or duplicating column metadata 262 func (c *ColumnChunkMetaDataBuilder) Contents() *format.ColumnChunk { return c.chunk } 263 264 func (c *ColumnChunkMetaDataBuilder) init(chunk *format.ColumnChunk) { 265 c.chunk = chunk 266 if !c.chunk.IsSetMetaData() { 267 c.chunk.MetaData = format.NewColumnMetaData() 268 } 269 c.chunk.MetaData.Type = format.Type(c.column.PhysicalType()) 270 c.chunk.MetaData.PathInSchema = schema.ColumnPathFromNode(c.column.SchemaNode()) 271 c.chunk.MetaData.Codec = format.CompressionCodec(c.props.CompressionFor(c.column.Path())) 272 } 273 274 func (c *ColumnChunkMetaDataBuilder) SetFilePath(val string) { 275 c.chunk.FilePath = &val 276 } 277 278 // Descr returns the associated column descriptor for this column chunk 279 func (c *ColumnChunkMetaDataBuilder) Descr() *schema.Column { return c.column } 280 281 func (c *ColumnChunkMetaDataBuilder) TotalCompressedSize() int64 { 282 // if this column is encrypted, after Finish is called, the MetaData 283 // field is set to nil and we store the compressed size so return that 284 if c.chunk.MetaData == nil { 285 return c.compressedSize 286 } 287 return c.chunk.MetaData.GetTotalCompressedSize() 288 } 289 290 func (c *ColumnChunkMetaDataBuilder) SetStats(val EncodedStatistics) { 291 c.chunk.MetaData.Statistics = val.ToThrift() 292 } 293 294 // ChunkMetaInfo is a helper struct for passing the offset and size information 295 // for finishing the building of column chunk metadata 296 type ChunkMetaInfo struct { 297 NumValues int64 298 DictPageOffset int64 299 IndexPageOffset int64 300 DataPageOffset int64 301 CompressedSize int64 302 UncompressedSize int64 303 } 304 305 // EncodingStats is a helper struct for passing the encoding stat information 306 // for finishing up metadata for a column chunk. 307 type EncodingStats struct { 308 DictEncodingStats map[parquet.Encoding]int32 309 DataEncodingStats map[parquet.Encoding]int32 310 } 311 312 // Finish finalizes the metadata with the given offsets, 313 // flushes any compression that needs to be done, and performs 314 // any encryption if an encryptor is provided. 315 func (c *ColumnChunkMetaDataBuilder) Finish(info ChunkMetaInfo, hasDict, dictFallback bool, encStats EncodingStats, metaEncryptor encryption.Encryptor) error { 316 if info.DictPageOffset > 0 { 317 c.chunk.MetaData.DictionaryPageOffset = &info.DictPageOffset 318 c.chunk.FileOffset = info.DictPageOffset + info.CompressedSize 319 } else { 320 c.chunk.FileOffset = info.DataPageOffset + info.CompressedSize 321 } 322 323 c.chunk.MetaData.NumValues = info.NumValues 324 if info.IndexPageOffset >= 0 { 325 c.chunk.MetaData.IndexPageOffset = &info.IndexPageOffset 326 } 327 328 c.chunk.MetaData.DataPageOffset = info.DataPageOffset 329 c.chunk.MetaData.TotalUncompressedSize = info.UncompressedSize 330 c.chunk.MetaData.TotalCompressedSize = info.CompressedSize 331 332 // no matter the configuration, the maximum number of thrift encodings we'll 333 // populate is going to be 3: 334 // 1. potential dictionary index encoding 335 // 2. page encoding 336 // 3. RLE for repetition and definition levels 337 // so let's preallocate a capacity of 3 but initialize the slice at 0 len 338 const maxEncodings = 3 339 340 thriftEncodings := make([]format.Encoding, 0, maxEncodings) 341 if hasDict { 342 thriftEncodings = append(thriftEncodings, format.Encoding(c.props.DictionaryIndexEncoding())) 343 if c.props.Version() == parquet.V1_0 { 344 thriftEncodings = append(thriftEncodings, format.Encoding_PLAIN) 345 } else { 346 thriftEncodings = append(thriftEncodings, format.Encoding(c.props.DictionaryPageEncoding())) 347 } 348 } else { // no dictionary 349 thriftEncodings = append(thriftEncodings, format.Encoding(c.props.EncodingFor(c.column.Path()))) 350 } 351 352 thriftEncodings = append(thriftEncodings, format.Encoding(parquet.Encodings.RLE)) 353 // Only PLAIN encoding is supported for fallback in V1 354 // TODO(zeroshade): Use user specified encoding for V2 355 if dictFallback { 356 thriftEncodings = append(thriftEncodings, format.Encoding_PLAIN) 357 } 358 c.chunk.MetaData.Encodings = thriftEncodings 359 360 thriftEncodingStats := make([]*format.PageEncodingStats, 0, len(encStats.DictEncodingStats)+len(encStats.DataEncodingStats)) 361 for k, v := range encStats.DictEncodingStats { 362 thriftEncodingStats = append(thriftEncodingStats, &format.PageEncodingStats{ 363 PageType: format.PageType_DICTIONARY_PAGE, 364 Encoding: format.Encoding(k), 365 Count: v, 366 }) 367 } 368 for k, v := range encStats.DataEncodingStats { 369 thriftEncodingStats = append(thriftEncodingStats, &format.PageEncodingStats{ 370 PageType: format.PageType_DATA_PAGE, 371 Encoding: format.Encoding(k), 372 Count: v, 373 }) 374 } 375 c.chunk.MetaData.EncodingStats = thriftEncodingStats 376 377 encryptProps := c.props.ColumnEncryptionProperties(c.column.Path()) 378 if encryptProps != nil && encryptProps.IsEncrypted() { 379 ccmd := format.NewColumnCryptoMetaData() 380 if encryptProps.IsEncryptedWithFooterKey() { 381 ccmd.ENCRYPTION_WITH_FOOTER_KEY = format.NewEncryptionWithFooterKey() 382 } else { 383 ccmd.ENCRYPTION_WITH_COLUMN_KEY = &format.EncryptionWithColumnKey{ 384 KeyMetadata: []byte(encryptProps.KeyMetadata()), 385 PathInSchema: c.column.ColumnPath(), 386 } 387 } 388 c.chunk.CryptoMetadata = ccmd 389 390 encryptedFooter := c.props.FileEncryptionProperties().EncryptedFooter() 391 encryptMetadata := !encryptedFooter || !encryptProps.IsEncryptedWithFooterKey() 392 if encryptMetadata { 393 // Serialize and encrypt ColumnMetadata separately 394 // Thrift-serialize the ColumnMetaData structure, 395 // encrypt it with the column key, and write to encrypted_column_metadata 396 serializer := thrift.NewThriftSerializer() 397 data, err := serializer.Write(context.Background(), c.chunk.MetaData) 398 if err != nil { 399 return err 400 } 401 var buf bytes.Buffer 402 metaEncryptor.Encrypt(&buf, data) 403 c.chunk.EncryptedColumnMetadata = buf.Bytes() 404 405 if encryptedFooter { 406 c.compressedSize = c.chunk.MetaData.GetTotalCompressedSize() 407 c.chunk.MetaData = nil 408 } else { 409 // Keep redacted metadata version for old readers 410 c.chunk.MetaData.Statistics = nil 411 c.chunk.MetaData.EncodingStats = nil 412 } 413 } 414 } 415 return nil 416 } 417 418 // WriteTo will always return 0 as the int64 since the thrift writer library 419 // does not return the number of bytes written, we only use the signature 420 // of (int64, error) in order to match the standard WriteTo interfaces. 421 func (c *ColumnChunkMetaDataBuilder) WriteTo(w io.Writer) (int64, error) { 422 return 0, thrift.SerializeThriftStream(c.chunk, w) 423 }