github.com/apache/arrow/go/v16@v16.1.0/parquet/metadata/file.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package metadata 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "io" 24 "reflect" 25 "unicode/utf8" 26 27 "github.com/apache/arrow/go/v16/parquet" 28 "github.com/apache/arrow/go/v16/parquet/compress" 29 "github.com/apache/arrow/go/v16/parquet/internal/encryption" 30 format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet" 31 "github.com/apache/arrow/go/v16/parquet/internal/thrift" 32 "github.com/apache/arrow/go/v16/parquet/schema" 33 "golang.org/x/xerrors" 34 ) 35 36 // DefaultCompressionType is used unless a different compression is specified 37 // in the properties 38 var DefaultCompressionType = compress.Codecs.Uncompressed 39 40 // FileMetaDataBuilder is a proxy for more easily constructing file metadata 41 // particularly used when writing a file out. 42 type FileMetaDataBuilder struct { 43 metadata *format.FileMetaData 44 props *parquet.WriterProperties 45 schema *schema.Schema 46 rowGroups []*format.RowGroup 47 currentRgBldr *RowGroupMetaDataBuilder 48 kvmeta KeyValueMetadata 49 cryptoMetadata *format.FileCryptoMetaData 50 } 51 52 // NewFileMetadataBuilder will use the default writer properties if nil is passed for 53 // the writer properties and nil is allowable for the key value metadata. 54 func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder { 55 var crypto *format.FileCryptoMetaData 56 if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() { 57 crypto = format.NewFileCryptoMetaData() 58 } 59 return &FileMetaDataBuilder{ 60 metadata: format.NewFileMetaData(), 61 props: props, 62 schema: schema, 63 kvmeta: kvmeta, 64 cryptoMetadata: crypto, 65 } 66 } 67 68 // GetFileCryptoMetaData returns the cryptographic information for encrypting/ 69 // decrypting the file. 70 func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata { 71 if f.cryptoMetadata == nil { 72 return nil 73 } 74 75 props := f.props.FileEncryptionProperties() 76 f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift() 77 keyMetadata := props.FooterKeyMetadata() 78 if keyMetadata != "" { 79 f.cryptoMetadata.KeyMetadata = []byte(keyMetadata) 80 } 81 82 return &FileCryptoMetadata{f.cryptoMetadata, 0} 83 } 84 85 // AppendRowGroup adds a rowgroup to the list and returns a builder 86 // for that row group 87 func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder { 88 if f.rowGroups == nil { 89 f.rowGroups = make([]*format.RowGroup, 0, 1) 90 } 91 92 rg := format.NewRowGroup() 93 f.rowGroups = append(f.rowGroups, rg) 94 f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg) 95 return f.currentRgBldr 96 } 97 98 // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata 99 func (f *FileMetaDataBuilder) AppendKeyValueMetadata(key string, value string) error { 100 return f.kvmeta.Append(key, value) 101 } 102 103 // Finish will finalize the metadata of the number of rows, row groups, 104 // version etc. This will clear out this filemetadatabuilder so it can 105 // be re-used 106 func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) { 107 out, err := f.Snapshot() 108 f.Clear() 109 return out, err 110 } 111 112 // Snapshot returns finalized metadata of the number of rows, row groups, version etc. 113 // The snapshot must be used (e.g., serialized) before any additional (meta)data is 114 // written, as it refers to builder datastructures that will continue to mutate. 115 func (f *FileMetaDataBuilder) Snapshot() (*FileMetaData, error) { 116 totalRows := int64(0) 117 for _, rg := range f.rowGroups { 118 totalRows += rg.NumRows 119 } 120 f.metadata.NumRows = totalRows 121 f.metadata.RowGroups = f.rowGroups 122 switch f.props.Version() { 123 case parquet.V1_0: 124 f.metadata.Version = 1 125 default: 126 f.metadata.Version = 2 127 } 128 createdBy := f.props.CreatedBy() 129 f.metadata.CreatedBy = &createdBy 130 131 // Users cannot set the `ColumnOrder` since we do not have user defined sort order 132 // in the spec yet. 133 // 134 // We always default to `TYPE_DEFINED_ORDER`. We can expose it in 135 // the API once we have user defined sort orders in the Parquet format. 136 // TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType 137 typeDefined := format.NewTypeDefinedOrder() 138 colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined} 139 f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns()) 140 for idx := range f.metadata.ColumnOrders { 141 f.metadata.ColumnOrders[idx] = colOrder 142 } 143 144 encryptProps := f.props.FileEncryptionProperties() 145 if encryptProps != nil && !encryptProps.EncryptedFooter() { 146 var signingAlgo parquet.Algorithm 147 algo := encryptProps.Algorithm() 148 signingAlgo.Aad.AadFileUnique = algo.Aad.AadFileUnique 149 signingAlgo.Aad.SupplyAadPrefix = algo.Aad.SupplyAadPrefix 150 if !algo.Aad.SupplyAadPrefix { 151 signingAlgo.Aad.AadPrefix = algo.Aad.AadPrefix 152 } 153 signingAlgo.Algo = parquet.AesGcm 154 f.metadata.EncryptionAlgorithm = signingAlgo.ToThrift() 155 footerSigningMetadata := f.props.FileEncryptionProperties().FooterKeyMetadata() 156 if footerSigningMetadata != "" { 157 f.metadata.FooterSigningKeyMetadata = []byte(footerSigningMetadata) 158 } 159 } 160 161 f.metadata.Schema = schema.ToThrift(f.schema.Root()) 162 f.metadata.KeyValueMetadata = f.kvmeta 163 164 out := &FileMetaData{ 165 FileMetaData: f.metadata, 166 version: NewAppVersion(f.metadata.GetCreatedBy()), 167 } 168 if err := out.initSchema(); err != nil { 169 return nil, err 170 } 171 out.initColumnOrders() 172 173 return out, nil 174 } 175 176 // Clears out this filemetadatabuilder so it can be re-used 177 func (f *FileMetaDataBuilder) Clear() { 178 f.metadata = format.NewFileMetaData() 179 f.rowGroups = nil 180 } 181 182 // KeyValueMetadata is an alias for a slice of thrift keyvalue pairs. 183 // 184 // It is presumed that the metadata should all be utf8 valid. 185 type KeyValueMetadata []*format.KeyValue 186 187 // NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0) 188 func NewKeyValueMetadata() KeyValueMetadata { 189 return make(KeyValueMetadata, 0) 190 } 191 192 // Append adds the passed in key and value to the metadata, if either contains 193 // any invalid utf8 runes, then it is not added and an error is returned. 194 func (k *KeyValueMetadata) Append(key, value string) error { 195 if !utf8.ValidString(key) || !utf8.ValidString(value) { 196 return fmt.Errorf("metadata must be valid utf8 strings, got key = '%s' and value = '%s'", key, value) 197 } 198 *k = append(*k, &format.KeyValue{Key: key, Value: &value}) 199 return nil 200 } 201 202 func (k KeyValueMetadata) Len() int { return len(k) } 203 204 // Equals compares all of the metadata keys and values to check they are equal 205 func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool { 206 return reflect.DeepEqual(k, other) 207 } 208 209 func (k KeyValueMetadata) Keys() (ret []string) { 210 ret = make([]string, len(k)) 211 for idx, v := range k { 212 ret[idx] = v.GetKey() 213 } 214 return 215 } 216 217 func (k KeyValueMetadata) Values() (ret []string) { 218 ret = make([]string, len(k)) 219 for idx, v := range k { 220 ret[idx] = v.GetValue() 221 } 222 return 223 } 224 225 func (k KeyValueMetadata) FindValue(key string) *string { 226 for _, v := range k { 227 if v.Key == key { 228 return v.Value 229 } 230 } 231 return nil 232 } 233 234 // FileMetaData is a proxy around the underlying thrift FileMetaData object 235 // to make it easier to use and interact with. 236 type FileMetaData struct { 237 *format.FileMetaData 238 Schema *schema.Schema 239 FileDecryptor encryption.FileDecryptor 240 241 // app version of the writer for this file 242 version *AppVersion 243 // size of the raw bytes of the metadata in the file which were 244 // decoded by thrift, Size() getter returns the value. 245 metadataLen int 246 } 247 248 // NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize 249 // and will attempt to decrypt the footer if a decryptor is provided. 250 func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error) { 251 meta := format.NewFileMetaData() 252 if fileDecryptor != nil { 253 footerDecryptor := fileDecryptor.GetFooterDecryptor() 254 data = footerDecryptor.Decrypt(data) 255 } 256 257 remain, err := thrift.DeserializeThrift(meta, data) 258 if err != nil { 259 return nil, err 260 } 261 262 f := &FileMetaData{ 263 FileMetaData: meta, 264 version: NewAppVersion(meta.GetCreatedBy()), 265 metadataLen: len(data) - int(remain), 266 FileDecryptor: fileDecryptor, 267 } 268 269 f.initSchema() 270 f.initColumnOrders() 271 272 return f, nil 273 } 274 275 // Size is the length of the raw serialized metadata bytes in the footer 276 func (f *FileMetaData) Size() int { return f.metadataLen } 277 278 // NumSchemaElements is the length of the flattened schema list in the thrift 279 func (f *FileMetaData) NumSchemaElements() int { 280 return len(f.FileMetaData.Schema) 281 } 282 283 // RowGroup provides the metadata for the (0-based) index of the row group 284 func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData { 285 return &RowGroupMetaData{ 286 f.RowGroups[i], f.Schema, f.version, f.FileDecryptor, 287 } 288 } 289 290 func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error) { 291 return thrift.NewThriftSerializer().Write(ctx, f.FileMetaData) 292 } 293 294 func (f *FileMetaData) SerializeString(ctx context.Context) (string, error) { 295 return thrift.NewThriftSerializer().WriteString(ctx, f.FileMetaData) 296 } 297 298 // EncryptionAlgorithm constructs the algorithm object from the thrift 299 // information or returns an empty instance if it was not set. 300 func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm { 301 if f.IsSetEncryptionAlgorithm() { 302 return parquet.AlgorithmFromThrift(f.GetEncryptionAlgorithm()) 303 } 304 return parquet.Algorithm{} 305 } 306 307 func (f *FileMetaData) initSchema() error { 308 root, err := schema.FromParquet(f.FileMetaData.Schema) 309 if err != nil { 310 return err 311 } 312 f.Schema = schema.NewSchema(root.(*schema.GroupNode)) 313 return nil 314 } 315 316 func (f *FileMetaData) initColumnOrders() { 317 orders := make([]parquet.ColumnOrder, 0, f.Schema.NumColumns()) 318 if f.IsSetColumnOrders() { 319 for _, o := range f.GetColumnOrders() { 320 if o.IsSetTYPE_ORDER() { 321 orders = append(orders, parquet.ColumnOrders.TypeDefinedOrder) 322 } else { 323 orders = append(orders, parquet.ColumnOrders.Undefined) 324 } 325 } 326 } else { 327 orders = orders[:f.Schema.NumColumns()] 328 orders[0] = parquet.ColumnOrders.Undefined 329 for i := 1; i < len(orders); i *= 2 { 330 copy(orders[i:], orders[:i]) 331 } 332 } 333 f.Schema.UpdateColumnOrders(orders) 334 } 335 336 // WriterVersion returns the constructed application version from the 337 // created by string 338 func (f *FileMetaData) WriterVersion() *AppVersion { 339 if f.version == nil { 340 f.version = NewAppVersion(f.GetCreatedBy()) 341 } 342 return f.version 343 } 344 345 // SetFilePath will set the file path into all of the columns in each row group. 346 func (f *FileMetaData) SetFilePath(path string) { 347 for _, rg := range f.RowGroups { 348 for _, chunk := range rg.Columns { 349 chunk.FilePath = &path 350 } 351 } 352 } 353 354 // AppendRowGroups will add all of the rowgroup metadata from other to the 355 // current file metadata 356 func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error { 357 if !f.Schema.Equals(other.Schema) { 358 return xerrors.New("parquet/FileMetaData: AppendRowGroups requires equal schemas") 359 } 360 361 f.RowGroups = append(f.RowGroups, other.GetRowGroups()...) 362 for _, rg := range other.GetRowGroups() { 363 f.NumRows += rg.NumRows 364 } 365 return nil 366 } 367 368 // Subset will construct a new FileMetaData object containing only the requested 369 // row groups by index 370 func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error) { 371 for _, i := range rowGroups { 372 if i < len(f.RowGroups) { 373 continue 374 } 375 return nil, fmt.Errorf("parquet: this file only has %d row groups, but requested a subset including row group: %d", len(f.RowGroups), i) 376 } 377 378 out := &FileMetaData{ 379 &format.FileMetaData{ 380 Schema: f.FileMetaData.Schema, 381 CreatedBy: f.CreatedBy, 382 ColumnOrders: f.GetColumnOrders(), 383 EncryptionAlgorithm: f.FileMetaData.EncryptionAlgorithm, 384 FooterSigningKeyMetadata: f.FooterSigningKeyMetadata, 385 Version: f.FileMetaData.Version, 386 KeyValueMetadata: f.KeyValueMetadata(), 387 }, 388 f.Schema, 389 f.FileDecryptor, 390 f.version, 391 0, 392 } 393 394 out.RowGroups = make([]*format.RowGroup, 0, len(rowGroups)) 395 for _, selected := range rowGroups { 396 out.RowGroups = append(out.RowGroups, f.RowGroups[selected]) 397 out.NumRows += f.RowGroups[selected].GetNumRows() 398 } 399 400 return out, nil 401 } 402 403 func (f *FileMetaData) Equals(other *FileMetaData) bool { 404 return reflect.DeepEqual(f.FileMetaData, other.FileMetaData) 405 } 406 407 func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata { 408 return f.GetKeyValueMetadata() 409 } 410 411 // VerifySignature constructs a cryptographic signature using the FileDecryptor 412 // of the footer and then verifies it's integrity. 413 // 414 // Panics if f.FileDecryptor is nil 415 func (f *FileMetaData) VerifySignature(signature []byte) bool { 416 if f.FileDecryptor == nil { 417 panic("decryption not set properly, cannot verify signature") 418 } 419 420 serializer := thrift.NewThriftSerializer() 421 data, _ := serializer.Write(context.Background(), f.FileMetaData) 422 nonce := signature[:encryption.NonceLength] 423 tag := signature[encryption.NonceLength : encryption.NonceLength+encryption.GcmTagLength] 424 425 key := f.FileDecryptor.GetFooterKey() 426 aad := encryption.CreateFooterAad(f.FileDecryptor.FileAad()) 427 428 enc := encryption.NewAesEncryptor(f.FileDecryptor.Algorithm(), true) 429 var buf bytes.Buffer 430 buf.Grow(enc.CiphertextSizeDelta() + len(data)) 431 encryptedLen := enc.SignedFooterEncrypt(&buf, data, []byte(key), []byte(aad), nonce) 432 return bytes.Equal(buf.Bytes()[encryptedLen-encryption.GcmTagLength:], tag) 433 } 434 435 // WriteTo will serialize and write out this file metadata, encrypting it if 436 // appropriate. 437 // 438 // If it is an encrypted file with a plaintext footer, then we will write the 439 // signature with the unencrypted footer. 440 func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error) { 441 serializer := thrift.NewThriftSerializer() 442 // only in encrypted files with plaintext footers, the encryption algorithm is set in the footer 443 if f.IsSetEncryptionAlgorithm() { 444 data, err := serializer.Write(context.Background(), f.FileMetaData) 445 if err != nil { 446 return 0, err 447 } 448 449 // encrypt the footer key 450 var buf bytes.Buffer 451 buf.Grow(encryptor.CiphertextSizeDelta() + len(data)) 452 encryptedLen := encryptor.Encrypt(&buf, data) 453 454 wrote := 0 455 n := 0 456 // write unencrypted footer 457 if n, err = w.Write(data); err != nil { 458 return int64(n), err 459 } 460 wrote += n 461 // write signature (nonce and tag) 462 buf.Next(4) 463 if n, err = w.Write(buf.Next(encryption.NonceLength)); err != nil { 464 return int64(wrote + n), err 465 } 466 wrote += n 467 buf.Next(encryptedLen - 4 - encryption.NonceLength - encryption.GcmTagLength) 468 n, err = w.Write(buf.Next(encryption.GcmTagLength)) 469 return int64(wrote + n), err 470 } 471 n, err := serializer.Serialize(f.FileMetaData, w, encryptor) 472 return int64(n), err 473 } 474 475 // Version returns the "version" of the file 476 // 477 // WARNING: The value returned by this method is unreliable as 1) the 478 // parquet file metadata stores the version as a single integer and 479 // 2) some producers are known to always write a hardcoded value. Therefore 480 // you cannot use this value to know which features are used in the file. 481 func (f *FileMetaData) Version() parquet.Version { 482 switch f.FileMetaData.Version { 483 case 1: 484 return parquet.V1_0 485 case 2: 486 return parquet.V2_LATEST 487 default: 488 // improperly set version, assume parquet 1.0 489 return parquet.V1_0 490 } 491 } 492 493 // FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object 494 type FileCryptoMetadata struct { 495 metadata *format.FileCryptoMetaData 496 cryptoMetadataLen uint32 497 } 498 499 // NewFileCryptoMetaData takes in the raw serialized bytes to deserialize 500 // storing the number of bytes that were actually deserialized. 501 func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error) { 502 ret.metadata = format.NewFileCryptoMetaData() 503 var remain uint64 504 remain, err = thrift.DeserializeThrift(ret.metadata, metadata) 505 ret.cryptoMetadataLen = uint32(uint64(len(metadata)) - remain) 506 return 507 } 508 509 // WriteTo writes out the serialized crypto metadata to w 510 func (fc FileCryptoMetadata) WriteTo(w io.Writer) (int64, error) { 511 serializer := thrift.NewThriftSerializer() 512 n, err := serializer.Serialize(fc.metadata, w, nil) 513 return int64(n), err 514 } 515 516 // Len is the number of bytes that were deserialized to create this object 517 func (fc FileCryptoMetadata) Len() int { return int(fc.cryptoMetadataLen) } 518 519 func (fc FileCryptoMetadata) KeyMetadata() []byte { 520 return fc.metadata.KeyMetadata 521 } 522 523 // EncryptionAlgorithm constructs the object from the thrift instance of 524 // the encryption algorithm 525 func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm { 526 return parquet.AlgorithmFromThrift(fc.metadata.GetEncryptionAlgorithm()) 527 }