github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/file.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"reflect"
    25  	"unicode/utf8"
    26  
    27  	"github.com/apache/arrow/go/v14/parquet"
    28  	"github.com/apache/arrow/go/v14/parquet/compress"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    30  	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v14/parquet/internal/thrift"
    32  	"github.com/apache/arrow/go/v14/parquet/schema"
    33  	"golang.org/x/xerrors"
    34  )
    35  
    36  // DefaultCompressionType is used unless a different compression is specified
    37  // in the properties
    38  var DefaultCompressionType = compress.Codecs.Uncompressed
    39  
    40  // FileMetaDataBuilder is a proxy for more easily constructing file metadata
    41  // particularly used when writing a file out.
    42  type FileMetaDataBuilder struct {
    43  	metadata       *format.FileMetaData
    44  	props          *parquet.WriterProperties
    45  	schema         *schema.Schema
    46  	rowGroups      []*format.RowGroup
    47  	currentRgBldr  *RowGroupMetaDataBuilder
    48  	kvmeta         KeyValueMetadata
    49  	cryptoMetadata *format.FileCryptoMetaData
    50  }
    51  
    52  // NewFileMetadataBuilder will use the default writer properties if nil is passed for
    53  // the writer properties and nil is allowable for the key value metadata.
    54  func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder {
    55  	var crypto *format.FileCryptoMetaData
    56  	if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() {
    57  		crypto = format.NewFileCryptoMetaData()
    58  	}
    59  	return &FileMetaDataBuilder{
    60  		metadata:       format.NewFileMetaData(),
    61  		props:          props,
    62  		schema:         schema,
    63  		kvmeta:         kvmeta,
    64  		cryptoMetadata: crypto,
    65  	}
    66  }
    67  
    68  // GetFileCryptoMetaData returns the cryptographic information for encrypting/
    69  // decrypting the file.
    70  func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata {
    71  	if f.cryptoMetadata == nil {
    72  		return nil
    73  	}
    74  
    75  	props := f.props.FileEncryptionProperties()
    76  	f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift()
    77  	keyMetadata := props.FooterKeyMetadata()
    78  	if keyMetadata != "" {
    79  		f.cryptoMetadata.KeyMetadata = []byte(keyMetadata)
    80  	}
    81  
    82  	return &FileCryptoMetadata{f.cryptoMetadata, 0}
    83  }
    84  
    85  // AppendRowGroup adds a rowgroup to the list and returns a builder
    86  // for that row group
    87  func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder {
    88  	if f.rowGroups == nil {
    89  		f.rowGroups = make([]*format.RowGroup, 0, 1)
    90  	}
    91  
    92  	rg := format.NewRowGroup()
    93  	f.rowGroups = append(f.rowGroups, rg)
    94  	f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg)
    95  	return f.currentRgBldr
    96  }
    97  
    98  // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata
    99  func (f *FileMetaDataBuilder) AppendKeyValueMetadata(key string, value string) error {
   100  	return f.kvmeta.Append(key, value)
   101  }
   102  
   103  // Finish will finalize the metadata of the number of rows, row groups,
   104  // version etc. This will clear out this filemetadatabuilder so it can
   105  // be re-used
   106  func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) {
   107  	totalRows := int64(0)
   108  	for _, rg := range f.rowGroups {
   109  		totalRows += rg.NumRows
   110  	}
   111  	f.metadata.NumRows = totalRows
   112  	f.metadata.RowGroups = f.rowGroups
   113  	switch f.props.Version() {
   114  	case parquet.V1_0:
   115  		f.metadata.Version = 1
   116  	default:
   117  		f.metadata.Version = 2
   118  	}
   119  	createdBy := f.props.CreatedBy()
   120  	f.metadata.CreatedBy = &createdBy
   121  
   122  	// Users cannot set the `ColumnOrder` since we do not not have user defined sort order
   123  	// in the spec yet.
   124  	//
   125  	// We always default to `TYPE_DEFINED_ORDER`. We can expose it in
   126  	// the API once we have user defined sort orders in the Parquet format.
   127  	// TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType
   128  	typeDefined := format.NewTypeDefinedOrder()
   129  	colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined}
   130  	f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns())
   131  	for idx := range f.metadata.ColumnOrders {
   132  		f.metadata.ColumnOrders[idx] = colOrder
   133  	}
   134  
   135  	encryptProps := f.props.FileEncryptionProperties()
   136  	if encryptProps != nil && !encryptProps.EncryptedFooter() {
   137  		var signingAlgo parquet.Algorithm
   138  		algo := encryptProps.Algorithm()
   139  		signingAlgo.Aad.AadFileUnique = algo.Aad.AadFileUnique
   140  		signingAlgo.Aad.SupplyAadPrefix = algo.Aad.SupplyAadPrefix
   141  		if !algo.Aad.SupplyAadPrefix {
   142  			signingAlgo.Aad.AadPrefix = algo.Aad.AadPrefix
   143  		}
   144  		signingAlgo.Algo = parquet.AesGcm
   145  		f.metadata.EncryptionAlgorithm = signingAlgo.ToThrift()
   146  		footerSigningMetadata := f.props.FileEncryptionProperties().FooterKeyMetadata()
   147  		if footerSigningMetadata != "" {
   148  			f.metadata.FooterSigningKeyMetadata = []byte(footerSigningMetadata)
   149  		}
   150  	}
   151  
   152  	f.metadata.Schema = schema.ToThrift(f.schema.Root())
   153  	f.metadata.KeyValueMetadata = f.kvmeta
   154  
   155  	out := &FileMetaData{
   156  		FileMetaData: f.metadata,
   157  		version:      NewAppVersion(f.metadata.GetCreatedBy()),
   158  	}
   159  	if err := out.initSchema(); err != nil {
   160  		return nil, err
   161  	}
   162  	out.initColumnOrders()
   163  
   164  	f.metadata = format.NewFileMetaData()
   165  	f.rowGroups = nil
   166  	return out, nil
   167  }
   168  
   169  // KeyValueMetadata is an alias for a slice of thrift keyvalue pairs.
   170  //
   171  // It is presumed that the metadata should all be utf8 valid.
   172  type KeyValueMetadata []*format.KeyValue
   173  
   174  // NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0)
   175  func NewKeyValueMetadata() KeyValueMetadata {
   176  	return make(KeyValueMetadata, 0)
   177  }
   178  
   179  // Append adds the passed in key and value to the metadata, if either contains
   180  // any invalid utf8 runes, then it is not added and an error is returned.
   181  func (k *KeyValueMetadata) Append(key, value string) error {
   182  	if !utf8.ValidString(key) || !utf8.ValidString(value) {
   183  		return fmt.Errorf("metadata must be valid utf8 strings, got key = '%s' and value = '%s'", key, value)
   184  	}
   185  	*k = append(*k, &format.KeyValue{Key: key, Value: &value})
   186  	return nil
   187  }
   188  
   189  func (k KeyValueMetadata) Len() int { return len(k) }
   190  
   191  // Equals compares all of the metadata keys and values to check they are equal
   192  func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool {
   193  	return reflect.DeepEqual(k, other)
   194  }
   195  
   196  func (k KeyValueMetadata) Keys() (ret []string) {
   197  	ret = make([]string, len(k))
   198  	for idx, v := range k {
   199  		ret[idx] = v.GetKey()
   200  	}
   201  	return
   202  }
   203  
   204  func (k KeyValueMetadata) Values() (ret []string) {
   205  	ret = make([]string, len(k))
   206  	for idx, v := range k {
   207  		ret[idx] = v.GetValue()
   208  	}
   209  	return
   210  }
   211  
   212  func (k KeyValueMetadata) FindValue(key string) *string {
   213  	for _, v := range k {
   214  		if v.Key == key {
   215  			return v.Value
   216  		}
   217  	}
   218  	return nil
   219  }
   220  
   221  // FileMetaData is a proxy around the underlying thrift FileMetaData object
   222  // to make it easier to use and interact with.
   223  type FileMetaData struct {
   224  	*format.FileMetaData
   225  	Schema        *schema.Schema
   226  	FileDecryptor encryption.FileDecryptor
   227  
   228  	// app version of the writer for this file
   229  	version *AppVersion
   230  	// size of the raw bytes of the metadata in the file which were
   231  	// decoded by thrift, Size() getter returns the value.
   232  	metadataLen int
   233  }
   234  
   235  // NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize
   236  // and will attempt to decrypt the footer if a decryptor is provided.
   237  func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error) {
   238  	meta := format.NewFileMetaData()
   239  	if fileDecryptor != nil {
   240  		footerDecryptor := fileDecryptor.GetFooterDecryptor()
   241  		data = footerDecryptor.Decrypt(data)
   242  	}
   243  
   244  	remain, err := thrift.DeserializeThrift(meta, data)
   245  	if err != nil {
   246  		return nil, err
   247  	}
   248  
   249  	f := &FileMetaData{
   250  		FileMetaData:  meta,
   251  		version:       NewAppVersion(meta.GetCreatedBy()),
   252  		metadataLen:   len(data) - int(remain),
   253  		FileDecryptor: fileDecryptor,
   254  	}
   255  
   256  	f.initSchema()
   257  	f.initColumnOrders()
   258  
   259  	return f, nil
   260  }
   261  
   262  // Size is the length of the raw serialized metadata bytes in the footer
   263  func (f *FileMetaData) Size() int { return f.metadataLen }
   264  
   265  // NumSchemaElements is the length of the flattened schema list in the thrift
   266  func (f *FileMetaData) NumSchemaElements() int {
   267  	return len(f.FileMetaData.Schema)
   268  }
   269  
   270  // RowGroup provides the metadata for the (0-based) index of the row group
   271  func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData {
   272  	return &RowGroupMetaData{
   273  		f.RowGroups[i], f.Schema, f.version, f.FileDecryptor,
   274  	}
   275  }
   276  
   277  func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error) {
   278  	return thrift.NewThriftSerializer().Write(ctx, f.FileMetaData)
   279  }
   280  
   281  func (f *FileMetaData) SerializeString(ctx context.Context) (string, error) {
   282  	return thrift.NewThriftSerializer().WriteString(ctx, f.FileMetaData)
   283  }
   284  
   285  // EncryptionAlgorithm constructs the algorithm object from the thrift
   286  // information or returns an empty instance if it was not set.
   287  func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm {
   288  	if f.IsSetEncryptionAlgorithm() {
   289  		return parquet.AlgorithmFromThrift(f.GetEncryptionAlgorithm())
   290  	}
   291  	return parquet.Algorithm{}
   292  }
   293  
   294  func (f *FileMetaData) initSchema() error {
   295  	root, err := schema.FromParquet(f.FileMetaData.Schema)
   296  	if err != nil {
   297  		return err
   298  	}
   299  	f.Schema = schema.NewSchema(root.(*schema.GroupNode))
   300  	return nil
   301  }
   302  
   303  func (f *FileMetaData) initColumnOrders() {
   304  	orders := make([]parquet.ColumnOrder, 0, f.Schema.NumColumns())
   305  	if f.IsSetColumnOrders() {
   306  		for _, o := range f.GetColumnOrders() {
   307  			if o.IsSetTYPE_ORDER() {
   308  				orders = append(orders, parquet.ColumnOrders.TypeDefinedOrder)
   309  			} else {
   310  				orders = append(orders, parquet.ColumnOrders.Undefined)
   311  			}
   312  		}
   313  	} else {
   314  		orders = orders[:f.Schema.NumColumns()]
   315  		orders[0] = parquet.ColumnOrders.Undefined
   316  		for i := 1; i < len(orders); i *= 2 {
   317  			copy(orders[i:], orders[:i])
   318  		}
   319  	}
   320  	f.Schema.UpdateColumnOrders(orders)
   321  }
   322  
   323  // WriterVersion returns the constructed application version from the
   324  // created by string
   325  func (f *FileMetaData) WriterVersion() *AppVersion {
   326  	if f.version == nil {
   327  		f.version = NewAppVersion(f.GetCreatedBy())
   328  	}
   329  	return f.version
   330  }
   331  
   332  // SetFilePath will set the file path into all of the columns in each row group.
   333  func (f *FileMetaData) SetFilePath(path string) {
   334  	for _, rg := range f.RowGroups {
   335  		for _, chunk := range rg.Columns {
   336  			chunk.FilePath = &path
   337  		}
   338  	}
   339  }
   340  
   341  // AppendRowGroups will add all of the rowgroup metadata from other to the
   342  // current file metadata
   343  func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error {
   344  	if !f.Schema.Equals(other.Schema) {
   345  		return xerrors.New("parquet/FileMetaData: AppendRowGroups requires equal schemas")
   346  	}
   347  
   348  	f.RowGroups = append(f.RowGroups, other.GetRowGroups()...)
   349  	for _, rg := range other.GetRowGroups() {
   350  		f.NumRows += rg.NumRows
   351  	}
   352  	return nil
   353  }
   354  
   355  // Subset will construct a new FileMetaData object containing only the requested
   356  // row groups by index
   357  func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error) {
   358  	for _, i := range rowGroups {
   359  		if i < len(f.RowGroups) {
   360  			continue
   361  		}
   362  		return nil, fmt.Errorf("parquet: this file only has %d row groups, but requested a subset including row group: %d", len(f.RowGroups), i)
   363  	}
   364  
   365  	out := &FileMetaData{
   366  		&format.FileMetaData{
   367  			Schema:                   f.FileMetaData.Schema,
   368  			CreatedBy:                f.CreatedBy,
   369  			ColumnOrders:             f.GetColumnOrders(),
   370  			EncryptionAlgorithm:      f.FileMetaData.EncryptionAlgorithm,
   371  			FooterSigningKeyMetadata: f.FooterSigningKeyMetadata,
   372  			Version:                  f.FileMetaData.Version,
   373  			KeyValueMetadata:         f.KeyValueMetadata(),
   374  		},
   375  		f.Schema,
   376  		f.FileDecryptor,
   377  		f.version,
   378  		0,
   379  	}
   380  
   381  	out.RowGroups = make([]*format.RowGroup, 0, len(rowGroups))
   382  	for _, selected := range rowGroups {
   383  		out.RowGroups = append(out.RowGroups, f.RowGroups[selected])
   384  		out.NumRows += f.RowGroups[selected].GetNumRows()
   385  	}
   386  
   387  	return out, nil
   388  }
   389  
   390  func (f *FileMetaData) Equals(other *FileMetaData) bool {
   391  	return reflect.DeepEqual(f.FileMetaData, other.FileMetaData)
   392  }
   393  
   394  func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata {
   395  	return f.GetKeyValueMetadata()
   396  }
   397  
   398  // VerifySignature constructs a cryptographic signature using the FileDecryptor
   399  // of the footer and then verifies it's integrity.
   400  //
   401  // Panics if f.FileDecryptor is nil
   402  func (f *FileMetaData) VerifySignature(signature []byte) bool {
   403  	if f.FileDecryptor == nil {
   404  		panic("decryption not set propertly, cannot verify signature")
   405  	}
   406  
   407  	serializer := thrift.NewThriftSerializer()
   408  	data, _ := serializer.Write(context.Background(), f.FileMetaData)
   409  	nonce := signature[:encryption.NonceLength]
   410  	tag := signature[encryption.NonceLength : encryption.NonceLength+encryption.GcmTagLength]
   411  
   412  	key := f.FileDecryptor.GetFooterKey()
   413  	aad := encryption.CreateFooterAad(f.FileDecryptor.FileAad())
   414  
   415  	enc := encryption.NewAesEncryptor(f.FileDecryptor.Algorithm(), true)
   416  	var buf bytes.Buffer
   417  	buf.Grow(enc.CiphertextSizeDelta() + len(data))
   418  	encryptedLen := enc.SignedFooterEncrypt(&buf, data, []byte(key), []byte(aad), nonce)
   419  	return bytes.Equal(buf.Bytes()[encryptedLen-encryption.GcmTagLength:], tag)
   420  }
   421  
   422  // WriteTo will serialize and write out this file metadata, encrypting it if
   423  // appropriate.
   424  //
   425  // If it is an encrypted file with a plaintext footer, then we will write the
   426  // signature with the unencrypted footer.
   427  func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error) {
   428  	serializer := thrift.NewThriftSerializer()
   429  	// only in encrypted files with plaintext footers, the encryption algorithm is set in the footer
   430  	if f.IsSetEncryptionAlgorithm() {
   431  		data, err := serializer.Write(context.Background(), f.FileMetaData)
   432  		if err != nil {
   433  			return 0, err
   434  		}
   435  
   436  		// encrypt the footer key
   437  		var buf bytes.Buffer
   438  		buf.Grow(encryptor.CiphertextSizeDelta() + len(data))
   439  		encryptedLen := encryptor.Encrypt(&buf, data)
   440  
   441  		wrote := 0
   442  		n := 0
   443  		// write unencrypted footer
   444  		if n, err = w.Write(data); err != nil {
   445  			return int64(n), err
   446  		}
   447  		wrote += n
   448  		// write signature (nonce and tag)
   449  		buf.Next(4)
   450  		if n, err = w.Write(buf.Next(encryption.NonceLength)); err != nil {
   451  			return int64(wrote + n), err
   452  		}
   453  		wrote += n
   454  		buf.Next(encryptedLen - 4 - encryption.NonceLength - encryption.GcmTagLength)
   455  		n, err = w.Write(buf.Next(encryption.GcmTagLength))
   456  		return int64(wrote + n), err
   457  	}
   458  	n, err := serializer.Serialize(f.FileMetaData, w, encryptor)
   459  	return int64(n), err
   460  }
   461  
   462  // Version returns the "version" of the file
   463  //
   464  // WARNING: The value returned by this method is unreliable as 1) the
   465  // parquet file metadata stores the version as a single integer and
   466  // 2) some producers are known to always write a hardcoded value. Therefore
   467  // you cannot use this value to know which features are used in the file.
   468  func (f *FileMetaData) Version() parquet.Version {
   469  	switch f.FileMetaData.Version {
   470  	case 1:
   471  		return parquet.V1_0
   472  	case 2:
   473  		return parquet.V2_LATEST
   474  	default:
   475  		// imporperly set version, assume parquet 1.0
   476  		return parquet.V1_0
   477  	}
   478  }
   479  
   480  // FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object
   481  type FileCryptoMetadata struct {
   482  	metadata          *format.FileCryptoMetaData
   483  	cryptoMetadataLen uint32
   484  }
   485  
   486  // NewFileCryptoMetaData takes in the raw serialized bytes to deserialize
   487  // storing the number of bytes that were actually deserialized.
   488  func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error) {
   489  	ret.metadata = format.NewFileCryptoMetaData()
   490  	var remain uint64
   491  	remain, err = thrift.DeserializeThrift(ret.metadata, metadata)
   492  	ret.cryptoMetadataLen = uint32(uint64(len(metadata)) - remain)
   493  	return
   494  }
   495  
   496  // WriteTo writes out the serialized crypto metadata to w
   497  func (fc FileCryptoMetadata) WriteTo(w io.Writer) (int64, error) {
   498  	serializer := thrift.NewThriftSerializer()
   499  	n, err := serializer.Serialize(fc.metadata, w, nil)
   500  	return int64(n), err
   501  }
   502  
   503  // Len is the number of bytes that were deserialized to create this object
   504  func (fc FileCryptoMetadata) Len() int { return int(fc.cryptoMetadataLen) }
   505  
   506  func (fc FileCryptoMetadata) KeyMetadata() []byte {
   507  	return fc.metadata.KeyMetadata
   508  }
   509  
   510  // EncryptionAlgorithm constructs the object from the thrift instance of
   511  // the encryption algorithm
   512  func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm {
   513  	return parquet.AlgorithmFromThrift(fc.metadata.GetEncryptionAlgorithm())
   514  }