// Origin: github.com/apache/arrow/go/v7@v7.0.1/parquet/metadata/file.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metadata

import (
	"bytes"
	"context"
	"io"
	"reflect"
	"unicode/utf8"

	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/compress"
	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/thrift"
	"github.com/apache/arrow/go/v7/parquet/schema"
	"golang.org/x/xerrors"
)

// DefaultCompressionType is used unless a different compression is specified
// in the properties.
//
// It is a package-level variable initialized to compress.Codecs.Uncompressed.
var DefaultCompressionType = compress.Codecs.Uncompressed

// FileMetaDataBuilder is a proxy for more easily constructing file metadata
// particularly used when writing a file out.
41 type FileMetaDataBuilder struct { 42 metadata *format.FileMetaData 43 props *parquet.WriterProperties 44 schema *schema.Schema 45 rowGroups []*format.RowGroup 46 currentRgBldr *RowGroupMetaDataBuilder 47 kvmeta KeyValueMetadata 48 cryptoMetadata *format.FileCryptoMetaData 49 } 50 51 // NewFileMetadataBuilder will use the default writer properties if nil is passed for 52 // the writer properties and nil is allowable for the key value metadata. 53 func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder { 54 var crypto *format.FileCryptoMetaData 55 if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() { 56 crypto = format.NewFileCryptoMetaData() 57 } 58 return &FileMetaDataBuilder{ 59 metadata: format.NewFileMetaData(), 60 props: props, 61 schema: schema, 62 kvmeta: kvmeta, 63 cryptoMetadata: crypto, 64 } 65 } 66 67 // GetFileCryptoMetaData returns the cryptographic information for encrypting/ 68 // decrypting the file. 
69 func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata { 70 if f.cryptoMetadata == nil { 71 return nil 72 } 73 74 props := f.props.FileEncryptionProperties() 75 f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift() 76 keyMetadata := props.FooterKeyMetadata() 77 if keyMetadata != "" { 78 f.cryptoMetadata.KeyMetadata = []byte(keyMetadata) 79 } 80 81 return &FileCryptoMetadata{f.cryptoMetadata, 0} 82 } 83 84 // AppendRowGroup adds a rowgroup to the list and returns a builder 85 // for that row group 86 func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder { 87 if f.rowGroups == nil { 88 f.rowGroups = make([]*format.RowGroup, 0, 1) 89 } 90 91 rg := format.NewRowGroup() 92 f.rowGroups = append(f.rowGroups, rg) 93 f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg) 94 return f.currentRgBldr 95 } 96 97 // Finish will finalize the metadata of the number of rows, row groups, 98 // version etc. This will clear out this filemetadatabuilder so it can 99 // be re-used 100 func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) { 101 totalRows := int64(0) 102 for _, rg := range f.rowGroups { 103 totalRows += rg.NumRows 104 } 105 f.metadata.NumRows = totalRows 106 f.metadata.RowGroups = f.rowGroups 107 switch f.props.Version() { 108 case parquet.V1_0: 109 f.metadata.Version = 1 110 default: 111 f.metadata.Version = 2 112 } 113 createdBy := f.props.CreatedBy() 114 f.metadata.CreatedBy = &createdBy 115 116 // Users cannot set the `ColumnOrder` since we do not not have user defined sort order 117 // in the spec yet. 118 // 119 // We always default to `TYPE_DEFINED_ORDER`. We can expose it in 120 // the API once we have user defined sort orders in the Parquet format. 
121 // TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType 122 typeDefined := format.NewTypeDefinedOrder() 123 colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined} 124 f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns()) 125 for idx := range f.metadata.ColumnOrders { 126 f.metadata.ColumnOrders[idx] = colOrder 127 } 128 129 encryptProps := f.props.FileEncryptionProperties() 130 if encryptProps != nil && !encryptProps.EncryptedFooter() { 131 var signingAlgo parquet.Algorithm 132 algo := encryptProps.Algorithm() 133 signingAlgo.Aad.AadFileUnique = algo.Aad.AadFileUnique 134 signingAlgo.Aad.SupplyAadPrefix = algo.Aad.SupplyAadPrefix 135 if !algo.Aad.SupplyAadPrefix { 136 signingAlgo.Aad.AadPrefix = algo.Aad.AadPrefix 137 } 138 signingAlgo.Algo = parquet.AesGcm 139 f.metadata.EncryptionAlgorithm = signingAlgo.ToThrift() 140 footerSigningMetadata := f.props.FileEncryptionProperties().FooterKeyMetadata() 141 if footerSigningMetadata != "" { 142 f.metadata.FooterSigningKeyMetadata = []byte(footerSigningMetadata) 143 } 144 } 145 146 f.metadata.Schema = schema.ToThrift(f.schema.Root()) 147 f.metadata.KeyValueMetadata = f.kvmeta 148 149 out := &FileMetaData{ 150 FileMetaData: f.metadata, 151 version: NewAppVersion(f.metadata.GetCreatedBy()), 152 } 153 if err := out.initSchema(); err != nil { 154 return nil, err 155 } 156 out.initColumnOrders() 157 158 f.metadata = format.NewFileMetaData() 159 f.rowGroups = nil 160 return out, nil 161 } 162 163 // KeyValueMetadata is an alias for a slice of thrift keyvalue pairs. 164 // 165 // It is presumed that the metadata should all be utf8 valid. 
166 type KeyValueMetadata []*format.KeyValue 167 168 // NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0) 169 func NewKeyValueMetadata() KeyValueMetadata { 170 return make(KeyValueMetadata, 0) 171 } 172 173 // Append adds the passed in key and value to the metadata, if either contains 174 // any invalid utf8 runes, then it is not added and an error is returned. 175 func (k *KeyValueMetadata) Append(key, value string) error { 176 if !utf8.ValidString(key) || !utf8.ValidString(value) { 177 return xerrors.Errorf("metadata must be valid utf8 strings, got key = '%s' and value = '%s'", key, value) 178 } 179 *k = append(*k, &format.KeyValue{Key: key, Value: &value}) 180 return nil 181 } 182 183 func (k KeyValueMetadata) Len() int { return len(k) } 184 185 // Equals compares all of the metadata keys and values to check they are equal 186 func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool { 187 return reflect.DeepEqual(k, other) 188 } 189 190 func (k KeyValueMetadata) Keys() (ret []string) { 191 ret = make([]string, len(k)) 192 for idx, v := range k { 193 ret[idx] = v.GetKey() 194 } 195 return 196 } 197 198 func (k KeyValueMetadata) Values() (ret []string) { 199 ret = make([]string, len(k)) 200 for idx, v := range k { 201 ret[idx] = v.GetValue() 202 } 203 return 204 } 205 206 func (k KeyValueMetadata) FindValue(key string) *string { 207 for _, v := range k { 208 if v.Key == key { 209 return v.Value 210 } 211 } 212 return nil 213 } 214 215 // FileMetaData is a proxy around the underlying thrift FileMetaData object 216 // to make it easier to use and interact with. 217 type FileMetaData struct { 218 *format.FileMetaData 219 Schema *schema.Schema 220 FileDecryptor encryption.FileDecryptor 221 222 // app version of the writer for this file 223 version *AppVersion 224 // size of the raw bytes of the metadata in the file which were 225 // decoded by thrift, Size() getter returns the value. 
226 metadataLen int 227 } 228 229 // NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize 230 // and will attempt to decrypt the footer if a decryptor is provided. 231 func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error) { 232 meta := format.NewFileMetaData() 233 if fileDecryptor != nil { 234 footerDecryptor := fileDecryptor.GetFooterDecryptor() 235 data = footerDecryptor.Decrypt(data) 236 } 237 238 remain, err := thrift.DeserializeThrift(meta, data) 239 if err != nil { 240 return nil, err 241 } 242 243 f := &FileMetaData{ 244 FileMetaData: meta, 245 version: NewAppVersion(meta.GetCreatedBy()), 246 metadataLen: len(data) - int(remain), 247 FileDecryptor: fileDecryptor, 248 } 249 250 f.initSchema() 251 f.initColumnOrders() 252 253 return f, nil 254 } 255 256 // Size is the length of the raw serialized metadata bytes in the footer 257 func (f *FileMetaData) Size() int { return f.metadataLen } 258 259 // NumSchemaElements is the length of the flattened schema list in the thrift 260 func (f *FileMetaData) NumSchemaElements() int { 261 return len(f.FileMetaData.Schema) 262 } 263 264 // RowGroup provides the metadata for the (0-based) index of the row group 265 func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData { 266 return &RowGroupMetaData{ 267 f.RowGroups[i], f.Schema, f.version, f.FileDecryptor, 268 } 269 } 270 271 func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error) { 272 return thrift.NewThriftSerializer().Write(ctx, f.FileMetaData) 273 } 274 275 func (f *FileMetaData) SerializeString(ctx context.Context) (string, error) { 276 return thrift.NewThriftSerializer().WriteString(ctx, f.FileMetaData) 277 } 278 279 // EncryptionAlgorithm constructs the algorithm object from the thrift 280 // information or returns an empty instance if it was not set. 
281 func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm { 282 if f.IsSetEncryptionAlgorithm() { 283 return parquet.AlgorithmFromThrift(f.GetEncryptionAlgorithm()) 284 } 285 return parquet.Algorithm{} 286 } 287 288 func (f *FileMetaData) initSchema() error { 289 root, err := schema.FromParquet(f.FileMetaData.Schema) 290 if err != nil { 291 return err 292 } 293 f.Schema = schema.NewSchema(root.(*schema.GroupNode)) 294 return nil 295 } 296 297 func (f *FileMetaData) initColumnOrders() { 298 orders := make([]parquet.ColumnOrder, 0, f.Schema.NumColumns()) 299 if f.IsSetColumnOrders() { 300 for _, o := range f.GetColumnOrders() { 301 if o.IsSetTYPE_ORDER() { 302 orders = append(orders, parquet.ColumnOrders.TypeDefinedOrder) 303 } else { 304 orders = append(orders, parquet.ColumnOrders.Undefined) 305 } 306 } 307 } else { 308 orders = orders[:f.Schema.NumColumns()] 309 orders[0] = parquet.ColumnOrders.Undefined 310 for i := 1; i < len(orders); i *= 2 { 311 copy(orders[i:], orders[:i]) 312 } 313 } 314 f.Schema.UpdateColumnOrders(orders) 315 } 316 317 // WriterVersion returns the constructed application version from the 318 // created by string 319 func (f *FileMetaData) WriterVersion() *AppVersion { 320 if f.version == nil { 321 f.version = NewAppVersion(f.GetCreatedBy()) 322 } 323 return f.version 324 } 325 326 // SetFilePath will set the file path into all of the columns in each row group. 327 func (f *FileMetaData) SetFilePath(path string) { 328 for _, rg := range f.RowGroups { 329 for _, chunk := range rg.Columns { 330 chunk.FilePath = &path 331 } 332 } 333 } 334 335 // AppendRowGroups will add all of the rowgroup metadata from other to the 336 // current file metadata 337 func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error { 338 if !f.Schema.Equals(other.Schema) { 339 return xerrors.New("parquet/FileMetaData: AppendRowGroups requires equal schemas") 340 } 341 342 f.RowGroups = append(f.RowGroups, other.GetRowGroups()...) 
343 for _, rg := range other.GetRowGroups() { 344 f.NumRows += rg.NumRows 345 } 346 return nil 347 } 348 349 // Subset will construct a new FileMetaData object containing only the requested 350 // row groups by index 351 func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error) { 352 for _, i := range rowGroups { 353 if i < len(f.RowGroups) { 354 continue 355 } 356 return nil, xerrors.Errorf("parquet: this file only has %d row groups, but requested a subset including row group: %d", len(f.RowGroups), i) 357 } 358 359 out := &FileMetaData{ 360 &format.FileMetaData{ 361 Schema: f.FileMetaData.Schema, 362 CreatedBy: f.CreatedBy, 363 ColumnOrders: f.GetColumnOrders(), 364 EncryptionAlgorithm: f.FileMetaData.EncryptionAlgorithm, 365 FooterSigningKeyMetadata: f.FooterSigningKeyMetadata, 366 Version: f.FileMetaData.Version, 367 KeyValueMetadata: f.KeyValueMetadata(), 368 }, 369 f.Schema, 370 f.FileDecryptor, 371 f.version, 372 0, 373 } 374 375 out.RowGroups = make([]*format.RowGroup, 0, len(rowGroups)) 376 for _, selected := range rowGroups { 377 out.RowGroups = append(out.RowGroups, f.RowGroups[selected]) 378 out.NumRows += f.RowGroups[selected].GetNumRows() 379 } 380 381 return out, nil 382 } 383 384 func (f *FileMetaData) Equals(other *FileMetaData) bool { 385 return reflect.DeepEqual(f.FileMetaData, other.FileMetaData) 386 } 387 388 func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata { 389 return f.GetKeyValueMetadata() 390 } 391 392 // VerifySignature constructs a cryptographic signature using the FileDecryptor 393 // of the footer and then verifies it's integrity. 
394 // 395 // Panics if f.FileDecryptor is nil 396 func (f *FileMetaData) VerifySignature(signature []byte) bool { 397 if f.FileDecryptor == nil { 398 panic("decryption not set propertly, cannot verify signature") 399 } 400 401 serializer := thrift.NewThriftSerializer() 402 data, _ := serializer.Write(context.Background(), f.FileMetaData) 403 nonce := signature[:encryption.NonceLength] 404 tag := signature[encryption.NonceLength : encryption.NonceLength+encryption.GcmTagLength] 405 406 key := f.FileDecryptor.GetFooterKey() 407 aad := encryption.CreateFooterAad(f.FileDecryptor.FileAad()) 408 409 enc := encryption.NewAesEncryptor(f.FileDecryptor.Algorithm(), true) 410 var buf bytes.Buffer 411 buf.Grow(enc.CiphertextSizeDelta() + len(data)) 412 encryptedLen := enc.SignedFooterEncrypt(&buf, data, []byte(key), []byte(aad), nonce) 413 return bytes.Equal(buf.Bytes()[encryptedLen-encryption.GcmTagLength:], tag) 414 } 415 416 // WriteTo will serialize and write out this file metadata, encrypting it if 417 // appropriate. 418 // 419 // If it is an encrypted file with a plaintext footer, then we will write the 420 // signature with the unencrypted footer. 
421 func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error) { 422 serializer := thrift.NewThriftSerializer() 423 // only in encrypted files with plaintext footers, the encryption algorithm is set in the footer 424 if f.IsSetEncryptionAlgorithm() { 425 data, err := serializer.Write(context.Background(), f.FileMetaData) 426 if err != nil { 427 return 0, err 428 } 429 430 // encrypt the footer key 431 var buf bytes.Buffer 432 buf.Grow(encryptor.CiphertextSizeDelta() + len(data)) 433 encryptedLen := encryptor.Encrypt(&buf, data) 434 435 wrote := 0 436 n := 0 437 // write unencrypted footer 438 if n, err = w.Write(data); err != nil { 439 return int64(n), err 440 } 441 wrote += n 442 // write signature (nonce and tag) 443 buf.Next(4) 444 if n, err = w.Write(buf.Next(encryption.NonceLength)); err != nil { 445 return int64(wrote + n), err 446 } 447 wrote += n 448 buf.Next(encryptedLen - 4 - encryption.NonceLength - encryption.GcmTagLength) 449 n, err = w.Write(buf.Next(encryption.GcmTagLength)) 450 return int64(wrote + n), err 451 } 452 n, err := serializer.Serialize(f.FileMetaData, w, encryptor) 453 return int64(n), err 454 } 455 456 // Version returns the "version" of the file 457 // 458 // WARNING: The value returned by this method is unreliable as 1) the 459 // parquet file metadata stores the version as a single integer and 460 // 2) some producers are known to always write a hardcoded value. Therefore 461 // you cannot use this value to know which features are used in the file. 
462 func (f *FileMetaData) Version() parquet.Version { 463 switch f.FileMetaData.Version { 464 case 1: 465 return parquet.V1_0 466 case 2: 467 return parquet.V2_LATEST 468 default: 469 // imporperly set version, assume parquet 1.0 470 return parquet.V1_0 471 } 472 } 473 474 // FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object 475 type FileCryptoMetadata struct { 476 metadata *format.FileCryptoMetaData 477 cryptoMetadataLen uint32 478 } 479 480 // NewFileCryptoMetaData takes in the raw serialized bytes to deserialize 481 // storing the number of bytes that were actually deserialized. 482 func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error) { 483 ret.metadata = format.NewFileCryptoMetaData() 484 var remain uint64 485 remain, err = thrift.DeserializeThrift(ret.metadata, metadata) 486 ret.cryptoMetadataLen = uint32(uint64(len(metadata)) - remain) 487 return 488 } 489 490 // WriteTo writes out the serialized crypto metadata to w 491 func (fc FileCryptoMetadata) WriteTo(w io.Writer) (int64, error) { 492 serializer := thrift.NewThriftSerializer() 493 n, err := serializer.Serialize(fc.metadata, w, nil) 494 return int64(n), err 495 } 496 497 // Len is the number of bytes that were deserialized to create this object 498 func (fc FileCryptoMetadata) Len() int { return int(fc.cryptoMetadataLen) } 499 500 func (fc FileCryptoMetadata) KeyMetadata() []byte { 501 return fc.metadata.KeyMetadata 502 } 503 504 // EncryptionAlgorithm constructs the object from the thrift instance of 505 // the encryption algorithm 506 func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm { 507 return parquet.AlgorithmFromThrift(fc.metadata.GetEncryptionAlgorithm()) 508 }