github.com/apache/arrow/go/v10@v10.0.1/parquet/metadata/file.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package metadata 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "io" 24 "reflect" 25 "unicode/utf8" 26 27 "github.com/apache/arrow/go/v10/parquet" 28 "github.com/apache/arrow/go/v10/parquet/compress" 29 "github.com/apache/arrow/go/v10/parquet/internal/encryption" 30 format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet" 31 "github.com/apache/arrow/go/v10/parquet/internal/thrift" 32 "github.com/apache/arrow/go/v10/parquet/schema" 33 "golang.org/x/xerrors" 34 ) 35 36 // DefaultCompressionType is used unless a different compression is specified 37 // in the properties 38 var DefaultCompressionType = compress.Codecs.Uncompressed 39 40 // FileMetaDataBuilder is a proxy for more easily constructing file metadata 41 // particularly used when writing a file out. 42 type FileMetaDataBuilder struct { 43 metadata *format.FileMetaData 44 props *parquet.WriterProperties 45 schema *schema.Schema 46 rowGroups []*format.RowGroup 47 currentRgBldr *RowGroupMetaDataBuilder 48 kvmeta KeyValueMetadata 49 cryptoMetadata *format.FileCryptoMetaData 50 } 51 52 // NewFileMetadataBuilder will use the default writer properties if nil is passed for 53 // the writer properties and nil is allowable for the key value metadata. 54 func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder { 55 var crypto *format.FileCryptoMetaData 56 if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() { 57 crypto = format.NewFileCryptoMetaData() 58 } 59 return &FileMetaDataBuilder{ 60 metadata: format.NewFileMetaData(), 61 props: props, 62 schema: schema, 63 kvmeta: kvmeta, 64 cryptoMetadata: crypto, 65 } 66 } 67 68 // GetFileCryptoMetaData returns the cryptographic information for encrypting/ 69 // decrypting the file. 70 func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata { 71 if f.cryptoMetadata == nil { 72 return nil 73 } 74 75 props := f.props.FileEncryptionProperties() 76 f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift() 77 keyMetadata := props.FooterKeyMetadata() 78 if keyMetadata != "" { 79 f.cryptoMetadata.KeyMetadata = []byte(keyMetadata) 80 } 81 82 return &FileCryptoMetadata{f.cryptoMetadata, 0} 83 } 84 85 // AppendRowGroup adds a rowgroup to the list and returns a builder 86 // for that row group 87 func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder { 88 if f.rowGroups == nil { 89 f.rowGroups = make([]*format.RowGroup, 0, 1) 90 } 91 92 rg := format.NewRowGroup() 93 f.rowGroups = append(f.rowGroups, rg) 94 f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg) 95 return f.currentRgBldr 96 } 97 98 // Finish will finalize the metadata of the number of rows, row groups, 99 // version etc. This will clear out this filemetadatabuilder so it can 100 // be re-used 101 func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) { 102 totalRows := int64(0) 103 for _, rg := range f.rowGroups { 104 totalRows += rg.NumRows 105 } 106 f.metadata.NumRows = totalRows 107 f.metadata.RowGroups = f.rowGroups 108 switch f.props.Version() { 109 case parquet.V1_0: 110 f.metadata.Version = 1 111 default: 112 f.metadata.Version = 2 113 } 114 createdBy := f.props.CreatedBy() 115 f.metadata.CreatedBy = &createdBy 116 117 // Users cannot set the `ColumnOrder` since we do not not have user defined sort order 118 // in the spec yet. 119 // 120 // We always default to `TYPE_DEFINED_ORDER`. We can expose it in 121 // the API once we have user defined sort orders in the Parquet format. 122 // TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType 123 typeDefined := format.NewTypeDefinedOrder() 124 colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined} 125 f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns()) 126 for idx := range f.metadata.ColumnOrders { 127 f.metadata.ColumnOrders[idx] = colOrder 128 } 129 130 encryptProps := f.props.FileEncryptionProperties() 131 if encryptProps != nil && !encryptProps.EncryptedFooter() { 132 var signingAlgo parquet.Algorithm 133 algo := encryptProps.Algorithm() 134 signingAlgo.Aad.AadFileUnique = algo.Aad.AadFileUnique 135 signingAlgo.Aad.SupplyAadPrefix = algo.Aad.SupplyAadPrefix 136 if !algo.Aad.SupplyAadPrefix { 137 signingAlgo.Aad.AadPrefix = algo.Aad.AadPrefix 138 } 139 signingAlgo.Algo = parquet.AesGcm 140 f.metadata.EncryptionAlgorithm = signingAlgo.ToThrift() 141 footerSigningMetadata := f.props.FileEncryptionProperties().FooterKeyMetadata() 142 if footerSigningMetadata != "" { 143 f.metadata.FooterSigningKeyMetadata = []byte(footerSigningMetadata) 144 } 145 } 146 147 f.metadata.Schema = schema.ToThrift(f.schema.Root()) 148 f.metadata.KeyValueMetadata = f.kvmeta 149 150 out := &FileMetaData{ 151 FileMetaData: f.metadata, 152 version: NewAppVersion(f.metadata.GetCreatedBy()), 153 } 154 if err := out.initSchema(); err != nil { 155 return nil, err 156 } 157 out.initColumnOrders() 158 159 f.metadata = format.NewFileMetaData() 160 f.rowGroups = nil 161 return out, nil 162 } 163 164 // KeyValueMetadata is an alias for a slice of thrift keyvalue pairs. 165 // 166 // It is presumed that the metadata should all be utf8 valid. 167 type KeyValueMetadata []*format.KeyValue 168 169 // NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0) 170 func NewKeyValueMetadata() KeyValueMetadata { 171 return make(KeyValueMetadata, 0) 172 } 173 174 // Append adds the passed in key and value to the metadata, if either contains 175 // any invalid utf8 runes, then it is not added and an error is returned. 176 func (k *KeyValueMetadata) Append(key, value string) error { 177 if !utf8.ValidString(key) || !utf8.ValidString(value) { 178 return fmt.Errorf("metadata must be valid utf8 strings, got key = '%s' and value = '%s'", key, value) 179 } 180 *k = append(*k, &format.KeyValue{Key: key, Value: &value}) 181 return nil 182 } 183 184 func (k KeyValueMetadata) Len() int { return len(k) } 185 186 // Equals compares all of the metadata keys and values to check they are equal 187 func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool { 188 return reflect.DeepEqual(k, other) 189 } 190 191 func (k KeyValueMetadata) Keys() (ret []string) { 192 ret = make([]string, len(k)) 193 for idx, v := range k { 194 ret[idx] = v.GetKey() 195 } 196 return 197 } 198 199 func (k KeyValueMetadata) Values() (ret []string) { 200 ret = make([]string, len(k)) 201 for idx, v := range k { 202 ret[idx] = v.GetValue() 203 } 204 return 205 } 206 207 func (k KeyValueMetadata) FindValue(key string) *string { 208 for _, v := range k { 209 if v.Key == key { 210 return v.Value 211 } 212 } 213 return nil 214 } 215 216 // FileMetaData is a proxy around the underlying thrift FileMetaData object 217 // to make it easier to use and interact with. 218 type FileMetaData struct { 219 *format.FileMetaData 220 Schema *schema.Schema 221 FileDecryptor encryption.FileDecryptor 222 223 // app version of the writer for this file 224 version *AppVersion 225 // size of the raw bytes of the metadata in the file which were 226 // decoded by thrift, Size() getter returns the value. 227 metadataLen int 228 } 229 230 // NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize 231 // and will attempt to decrypt the footer if a decryptor is provided. 232 func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error) { 233 meta := format.NewFileMetaData() 234 if fileDecryptor != nil { 235 footerDecryptor := fileDecryptor.GetFooterDecryptor() 236 data = footerDecryptor.Decrypt(data) 237 } 238 239 remain, err := thrift.DeserializeThrift(meta, data) 240 if err != nil { 241 return nil, err 242 } 243 244 f := &FileMetaData{ 245 FileMetaData: meta, 246 version: NewAppVersion(meta.GetCreatedBy()), 247 metadataLen: len(data) - int(remain), 248 FileDecryptor: fileDecryptor, 249 } 250 251 f.initSchema() 252 f.initColumnOrders() 253 254 return f, nil 255 } 256 257 // Size is the length of the raw serialized metadata bytes in the footer 258 func (f *FileMetaData) Size() int { return f.metadataLen } 259 260 // NumSchemaElements is the length of the flattened schema list in the thrift 261 func (f *FileMetaData) NumSchemaElements() int { 262 return len(f.FileMetaData.Schema) 263 } 264 265 // RowGroup provides the metadata for the (0-based) index of the row group 266 func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData { 267 return &RowGroupMetaData{ 268 f.RowGroups[i], f.Schema, f.version, f.FileDecryptor, 269 } 270 } 271 272 func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error) { 273 return thrift.NewThriftSerializer().Write(ctx, f.FileMetaData) 274 } 275 276 func (f *FileMetaData) SerializeString(ctx context.Context) (string, error) { 277 return thrift.NewThriftSerializer().WriteString(ctx, f.FileMetaData) 278 } 279 280 // EncryptionAlgorithm constructs the algorithm object from the thrift 281 // information or returns an empty instance if it was not set. 282 func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm { 283 if f.IsSetEncryptionAlgorithm() { 284 return parquet.AlgorithmFromThrift(f.GetEncryptionAlgorithm()) 285 } 286 return parquet.Algorithm{} 287 } 288 289 func (f *FileMetaData) initSchema() error { 290 root, err := schema.FromParquet(f.FileMetaData.Schema) 291 if err != nil { 292 return err 293 } 294 f.Schema = schema.NewSchema(root.(*schema.GroupNode)) 295 return nil 296 } 297 298 func (f *FileMetaData) initColumnOrders() { 299 orders := make([]parquet.ColumnOrder, 0, f.Schema.NumColumns()) 300 if f.IsSetColumnOrders() { 301 for _, o := range f.GetColumnOrders() { 302 if o.IsSetTYPE_ORDER() { 303 orders = append(orders, parquet.ColumnOrders.TypeDefinedOrder) 304 } else { 305 orders = append(orders, parquet.ColumnOrders.Undefined) 306 } 307 } 308 } else { 309 orders = orders[:f.Schema.NumColumns()] 310 orders[0] = parquet.ColumnOrders.Undefined 311 for i := 1; i < len(orders); i *= 2 { 312 copy(orders[i:], orders[:i]) 313 } 314 } 315 f.Schema.UpdateColumnOrders(orders) 316 } 317 318 // WriterVersion returns the constructed application version from the 319 // created by string 320 func (f *FileMetaData) WriterVersion() *AppVersion { 321 if f.version == nil { 322 f.version = NewAppVersion(f.GetCreatedBy()) 323 } 324 return f.version 325 } 326 327 // SetFilePath will set the file path into all of the columns in each row group. 328 func (f *FileMetaData) SetFilePath(path string) { 329 for _, rg := range f.RowGroups { 330 for _, chunk := range rg.Columns { 331 chunk.FilePath = &path 332 } 333 } 334 } 335 336 // AppendRowGroups will add all of the rowgroup metadata from other to the 337 // current file metadata 338 func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error { 339 if !f.Schema.Equals(other.Schema) { 340 return xerrors.New("parquet/FileMetaData: AppendRowGroups requires equal schemas") 341 } 342 343 f.RowGroups = append(f.RowGroups, other.GetRowGroups()...) 344 for _, rg := range other.GetRowGroups() { 345 f.NumRows += rg.NumRows 346 } 347 return nil 348 } 349 350 // Subset will construct a new FileMetaData object containing only the requested 351 // row groups by index 352 func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error) { 353 for _, i := range rowGroups { 354 if i < len(f.RowGroups) { 355 continue 356 } 357 return nil, fmt.Errorf("parquet: this file only has %d row groups, but requested a subset including row group: %d", len(f.RowGroups), i) 358 } 359 360 out := &FileMetaData{ 361 &format.FileMetaData{ 362 Schema: f.FileMetaData.Schema, 363 CreatedBy: f.CreatedBy, 364 ColumnOrders: f.GetColumnOrders(), 365 EncryptionAlgorithm: f.FileMetaData.EncryptionAlgorithm, 366 FooterSigningKeyMetadata: f.FooterSigningKeyMetadata, 367 Version: f.FileMetaData.Version, 368 KeyValueMetadata: f.KeyValueMetadata(), 369 }, 370 f.Schema, 371 f.FileDecryptor, 372 f.version, 373 0, 374 } 375 376 out.RowGroups = make([]*format.RowGroup, 0, len(rowGroups)) 377 for _, selected := range rowGroups { 378 out.RowGroups = append(out.RowGroups, f.RowGroups[selected]) 379 out.NumRows += f.RowGroups[selected].GetNumRows() 380 } 381 382 return out, nil 383 } 384 385 func (f *FileMetaData) Equals(other *FileMetaData) bool { 386 return reflect.DeepEqual(f.FileMetaData, other.FileMetaData) 387 } 388 389 func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata { 390 return f.GetKeyValueMetadata() 391 } 392 393 // VerifySignature constructs a cryptographic signature using the FileDecryptor 394 // of the footer and then verifies it's integrity. 395 // 396 // Panics if f.FileDecryptor is nil 397 func (f *FileMetaData) VerifySignature(signature []byte) bool { 398 if f.FileDecryptor == nil { 399 panic("decryption not set propertly, cannot verify signature") 400 } 401 402 serializer := thrift.NewThriftSerializer() 403 data, _ := serializer.Write(context.Background(), f.FileMetaData) 404 nonce := signature[:encryption.NonceLength] 405 tag := signature[encryption.NonceLength : encryption.NonceLength+encryption.GcmTagLength] 406 407 key := f.FileDecryptor.GetFooterKey() 408 aad := encryption.CreateFooterAad(f.FileDecryptor.FileAad()) 409 410 enc := encryption.NewAesEncryptor(f.FileDecryptor.Algorithm(), true) 411 var buf bytes.Buffer 412 buf.Grow(enc.CiphertextSizeDelta() + len(data)) 413 encryptedLen := enc.SignedFooterEncrypt(&buf, data, []byte(key), []byte(aad), nonce) 414 return bytes.Equal(buf.Bytes()[encryptedLen-encryption.GcmTagLength:], tag) 415 } 416 417 // WriteTo will serialize and write out this file metadata, encrypting it if 418 // appropriate. 419 // 420 // If it is an encrypted file with a plaintext footer, then we will write the 421 // signature with the unencrypted footer. 422 func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error) { 423 serializer := thrift.NewThriftSerializer() 424 // only in encrypted files with plaintext footers, the encryption algorithm is set in the footer 425 if f.IsSetEncryptionAlgorithm() { 426 data, err := serializer.Write(context.Background(), f.FileMetaData) 427 if err != nil { 428 return 0, err 429 } 430 431 // encrypt the footer key 432 var buf bytes.Buffer 433 buf.Grow(encryptor.CiphertextSizeDelta() + len(data)) 434 encryptedLen := encryptor.Encrypt(&buf, data) 435 436 wrote := 0 437 n := 0 438 // write unencrypted footer 439 if n, err = w.Write(data); err != nil { 440 return int64(n), err 441 } 442 wrote += n 443 // write signature (nonce and tag) 444 buf.Next(4) 445 if n, err = w.Write(buf.Next(encryption.NonceLength)); err != nil { 446 return int64(wrote + n), err 447 } 448 wrote += n 449 buf.Next(encryptedLen - 4 - encryption.NonceLength - encryption.GcmTagLength) 450 n, err = w.Write(buf.Next(encryption.GcmTagLength)) 451 return int64(wrote + n), err 452 } 453 n, err := serializer.Serialize(f.FileMetaData, w, encryptor) 454 return int64(n), err 455 } 456 457 // Version returns the "version" of the file 458 // 459 // WARNING: The value returned by this method is unreliable as 1) the 460 // parquet file metadata stores the version as a single integer and 461 // 2) some producers are known to always write a hardcoded value. Therefore 462 // you cannot use this value to know which features are used in the file. 463 func (f *FileMetaData) Version() parquet.Version { 464 switch f.FileMetaData.Version { 465 case 1: 466 return parquet.V1_0 467 case 2: 468 return parquet.V2_LATEST 469 default: 470 // imporperly set version, assume parquet 1.0 471 return parquet.V1_0 472 } 473 } 474 475 // FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object 476 type FileCryptoMetadata struct { 477 metadata *format.FileCryptoMetaData 478 cryptoMetadataLen uint32 479 } 480 481 // NewFileCryptoMetaData takes in the raw serialized bytes to deserialize 482 // storing the number of bytes that were actually deserialized. 483 func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error) { 484 ret.metadata = format.NewFileCryptoMetaData() 485 var remain uint64 486 remain, err = thrift.DeserializeThrift(ret.metadata, metadata) 487 ret.cryptoMetadataLen = uint32(uint64(len(metadata)) - remain) 488 return 489 } 490 491 // WriteTo writes out the serialized crypto metadata to w 492 func (fc FileCryptoMetadata) WriteTo(w io.Writer) (int64, error) { 493 serializer := thrift.NewThriftSerializer() 494 n, err := serializer.Serialize(fc.metadata, w, nil) 495 return int64(n), err 496 } 497 498 // Len is the number of bytes that were deserialized to create this object 499 func (fc FileCryptoMetadata) Len() int { return int(fc.cryptoMetadataLen) } 500 501 func (fc FileCryptoMetadata) KeyMetadata() []byte { 502 return fc.metadata.KeyMetadata 503 } 504 505 // EncryptionAlgorithm constructs the object from the thrift instance of 506 // the encryption algorithm 507 func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm { 508 return parquet.AlgorithmFromThrift(fc.metadata.GetEncryptionAlgorithm()) 509 }