github.com/apache/arrow/go/v10@v10.0.1/parquet/metadata/file.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"reflect"
    25  	"unicode/utf8"
    26  
    27  	"github.com/apache/arrow/go/v10/parquet"
    28  	"github.com/apache/arrow/go/v10/parquet/compress"
    29  	"github.com/apache/arrow/go/v10/parquet/internal/encryption"
    30  	format "github.com/apache/arrow/go/v10/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v10/parquet/internal/thrift"
    32  	"github.com/apache/arrow/go/v10/parquet/schema"
    33  	"golang.org/x/xerrors"
    34  )
    35  
// DefaultCompressionType is the codec used unless a different compression is
// specified in the properties. It is initialized to compress.Codecs.Uncompressed.
var DefaultCompressionType = compress.Codecs.Uncompressed
    39  
// FileMetaDataBuilder is a proxy for more easily constructing file metadata
// particularly used when writing a file out.
//
// The builder accumulates state in its unexported fields:
//   - metadata is the thrift object that Finish populates and hands off.
//   - props are the writer properties consulted for version, created-by and
//     encryption settings.
//   - schema is the file schema; it is converted to thrift by Finish.
//   - rowGroups collects the thrift row groups created by AppendRowGroup.
//   - currentRgBldr is the builder most recently returned by AppendRowGroup.
//   - kvmeta is the key/value metadata written into the footer by Finish.
//   - cryptoMetadata is non-nil only when the constructor found file
//     encryption properties with an encrypted footer.
type FileMetaDataBuilder struct {
	metadata       *format.FileMetaData
	props          *parquet.WriterProperties
	schema         *schema.Schema
	rowGroups      []*format.RowGroup
	currentRgBldr  *RowGroupMetaDataBuilder
	kvmeta         KeyValueMetadata
	cryptoMetadata *format.FileCryptoMetaData
}
    51  
    52  // NewFileMetadataBuilder will use the default writer properties if nil is passed for
    53  // the writer properties and nil is allowable for the key value metadata.
    54  func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder {
    55  	var crypto *format.FileCryptoMetaData
    56  	if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() {
    57  		crypto = format.NewFileCryptoMetaData()
    58  	}
    59  	return &FileMetaDataBuilder{
    60  		metadata:       format.NewFileMetaData(),
    61  		props:          props,
    62  		schema:         schema,
    63  		kvmeta:         kvmeta,
    64  		cryptoMetadata: crypto,
    65  	}
    66  }
    67  
    68  // GetFileCryptoMetaData returns the cryptographic information for encrypting/
    69  // decrypting the file.
    70  func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata {
    71  	if f.cryptoMetadata == nil {
    72  		return nil
    73  	}
    74  
    75  	props := f.props.FileEncryptionProperties()
    76  	f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift()
    77  	keyMetadata := props.FooterKeyMetadata()
    78  	if keyMetadata != "" {
    79  		f.cryptoMetadata.KeyMetadata = []byte(keyMetadata)
    80  	}
    81  
    82  	return &FileCryptoMetadata{f.cryptoMetadata, 0}
    83  }
    84  
    85  // AppendRowGroup adds a rowgroup to the list and returns a builder
    86  // for that row group
    87  func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder {
    88  	if f.rowGroups == nil {
    89  		f.rowGroups = make([]*format.RowGroup, 0, 1)
    90  	}
    91  
    92  	rg := format.NewRowGroup()
    93  	f.rowGroups = append(f.rowGroups, rg)
    94  	f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg)
    95  	return f.currentRgBldr
    96  }
    97  
    98  // Finish will finalize the metadata of the number of rows, row groups,
    99  // version etc. This will clear out this filemetadatabuilder so it can
   100  // be re-used
   101  func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) {
   102  	totalRows := int64(0)
   103  	for _, rg := range f.rowGroups {
   104  		totalRows += rg.NumRows
   105  	}
   106  	f.metadata.NumRows = totalRows
   107  	f.metadata.RowGroups = f.rowGroups
   108  	switch f.props.Version() {
   109  	case parquet.V1_0:
   110  		f.metadata.Version = 1
   111  	default:
   112  		f.metadata.Version = 2
   113  	}
   114  	createdBy := f.props.CreatedBy()
   115  	f.metadata.CreatedBy = &createdBy
   116  
   117  	// Users cannot set the `ColumnOrder` since we do not not have user defined sort order
   118  	// in the spec yet.
   119  	//
   120  	// We always default to `TYPE_DEFINED_ORDER`. We can expose it in
   121  	// the API once we have user defined sort orders in the Parquet format.
   122  	// TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType
   123  	typeDefined := format.NewTypeDefinedOrder()
   124  	colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined}
   125  	f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns())
   126  	for idx := range f.metadata.ColumnOrders {
   127  		f.metadata.ColumnOrders[idx] = colOrder
   128  	}
   129  
   130  	encryptProps := f.props.FileEncryptionProperties()
   131  	if encryptProps != nil && !encryptProps.EncryptedFooter() {
   132  		var signingAlgo parquet.Algorithm
   133  		algo := encryptProps.Algorithm()
   134  		signingAlgo.Aad.AadFileUnique = algo.Aad.AadFileUnique
   135  		signingAlgo.Aad.SupplyAadPrefix = algo.Aad.SupplyAadPrefix
   136  		if !algo.Aad.SupplyAadPrefix {
   137  			signingAlgo.Aad.AadPrefix = algo.Aad.AadPrefix
   138  		}
   139  		signingAlgo.Algo = parquet.AesGcm
   140  		f.metadata.EncryptionAlgorithm = signingAlgo.ToThrift()
   141  		footerSigningMetadata := f.props.FileEncryptionProperties().FooterKeyMetadata()
   142  		if footerSigningMetadata != "" {
   143  			f.metadata.FooterSigningKeyMetadata = []byte(footerSigningMetadata)
   144  		}
   145  	}
   146  
   147  	f.metadata.Schema = schema.ToThrift(f.schema.Root())
   148  	f.metadata.KeyValueMetadata = f.kvmeta
   149  
   150  	out := &FileMetaData{
   151  		FileMetaData: f.metadata,
   152  		version:      NewAppVersion(f.metadata.GetCreatedBy()),
   153  	}
   154  	if err := out.initSchema(); err != nil {
   155  		return nil, err
   156  	}
   157  	out.initColumnOrders()
   158  
   159  	f.metadata = format.NewFileMetaData()
   160  	f.rowGroups = nil
   161  	return out, nil
   162  }
   163  
// KeyValueMetadata is a named slice type whose elements are pointers to thrift
// KeyValue pairs.
//
// It is presumed that the metadata should all be utf8 valid; the Append method
// enforces this for entries added through it.
type KeyValueMetadata []*format.KeyValue
   168  
   169  // NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0)
   170  func NewKeyValueMetadata() KeyValueMetadata {
   171  	return make(KeyValueMetadata, 0)
   172  }
   173  
   174  // Append adds the passed in key and value to the metadata, if either contains
   175  // any invalid utf8 runes, then it is not added and an error is returned.
   176  func (k *KeyValueMetadata) Append(key, value string) error {
   177  	if !utf8.ValidString(key) || !utf8.ValidString(value) {
   178  		return fmt.Errorf("metadata must be valid utf8 strings, got key = '%s' and value = '%s'", key, value)
   179  	}
   180  	*k = append(*k, &format.KeyValue{Key: key, Value: &value})
   181  	return nil
   182  }
   183  
   184  func (k KeyValueMetadata) Len() int { return len(k) }
   185  
   186  // Equals compares all of the metadata keys and values to check they are equal
   187  func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool {
   188  	return reflect.DeepEqual(k, other)
   189  }
   190  
   191  func (k KeyValueMetadata) Keys() (ret []string) {
   192  	ret = make([]string, len(k))
   193  	for idx, v := range k {
   194  		ret[idx] = v.GetKey()
   195  	}
   196  	return
   197  }
   198  
   199  func (k KeyValueMetadata) Values() (ret []string) {
   200  	ret = make([]string, len(k))
   201  	for idx, v := range k {
   202  		ret[idx] = v.GetValue()
   203  	}
   204  	return
   205  }
   206  
   207  func (k KeyValueMetadata) FindValue(key string) *string {
   208  	for _, v := range k {
   209  		if v.Key == key {
   210  			return v.Value
   211  		}
   212  	}
   213  	return nil
   214  }
   215  
// FileMetaData is a proxy around the underlying thrift FileMetaData object
// to make it easier to use and interact with.
//
// The thrift object is embedded, so its fields and getters are available
// directly on FileMetaData. Schema is the schema rebuilt from the flattened
// thrift schema elements, and FileDecryptor is the decryptor (possibly nil)
// associated with the file.
type FileMetaData struct {
	*format.FileMetaData
	Schema        *schema.Schema
	FileDecryptor encryption.FileDecryptor

	// app version of the writer for this file, parsed from the created-by
	// string
	version *AppVersion
	// size of the raw bytes of the metadata in the file which were
	// decoded by thrift, Size() getter returns the value.
	metadataLen int
}
   229  
   230  // NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize
   231  // and will attempt to decrypt the footer if a decryptor is provided.
   232  func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error) {
   233  	meta := format.NewFileMetaData()
   234  	if fileDecryptor != nil {
   235  		footerDecryptor := fileDecryptor.GetFooterDecryptor()
   236  		data = footerDecryptor.Decrypt(data)
   237  	}
   238  
   239  	remain, err := thrift.DeserializeThrift(meta, data)
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  
   244  	f := &FileMetaData{
   245  		FileMetaData:  meta,
   246  		version:       NewAppVersion(meta.GetCreatedBy()),
   247  		metadataLen:   len(data) - int(remain),
   248  		FileDecryptor: fileDecryptor,
   249  	}
   250  
   251  	f.initSchema()
   252  	f.initColumnOrders()
   253  
   254  	return f, nil
   255  }
   256  
   257  // Size is the length of the raw serialized metadata bytes in the footer
   258  func (f *FileMetaData) Size() int { return f.metadataLen }
   259  
   260  // NumSchemaElements is the length of the flattened schema list in the thrift
   261  func (f *FileMetaData) NumSchemaElements() int {
   262  	return len(f.FileMetaData.Schema)
   263  }
   264  
   265  // RowGroup provides the metadata for the (0-based) index of the row group
   266  func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData {
   267  	return &RowGroupMetaData{
   268  		f.RowGroups[i], f.Schema, f.version, f.FileDecryptor,
   269  	}
   270  }
   271  
   272  func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error) {
   273  	return thrift.NewThriftSerializer().Write(ctx, f.FileMetaData)
   274  }
   275  
   276  func (f *FileMetaData) SerializeString(ctx context.Context) (string, error) {
   277  	return thrift.NewThriftSerializer().WriteString(ctx, f.FileMetaData)
   278  }
   279  
   280  // EncryptionAlgorithm constructs the algorithm object from the thrift
   281  // information or returns an empty instance if it was not set.
   282  func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm {
   283  	if f.IsSetEncryptionAlgorithm() {
   284  		return parquet.AlgorithmFromThrift(f.GetEncryptionAlgorithm())
   285  	}
   286  	return parquet.Algorithm{}
   287  }
   288  
   289  func (f *FileMetaData) initSchema() error {
   290  	root, err := schema.FromParquet(f.FileMetaData.Schema)
   291  	if err != nil {
   292  		return err
   293  	}
   294  	f.Schema = schema.NewSchema(root.(*schema.GroupNode))
   295  	return nil
   296  }
   297  
   298  func (f *FileMetaData) initColumnOrders() {
   299  	orders := make([]parquet.ColumnOrder, 0, f.Schema.NumColumns())
   300  	if f.IsSetColumnOrders() {
   301  		for _, o := range f.GetColumnOrders() {
   302  			if o.IsSetTYPE_ORDER() {
   303  				orders = append(orders, parquet.ColumnOrders.TypeDefinedOrder)
   304  			} else {
   305  				orders = append(orders, parquet.ColumnOrders.Undefined)
   306  			}
   307  		}
   308  	} else {
   309  		orders = orders[:f.Schema.NumColumns()]
   310  		orders[0] = parquet.ColumnOrders.Undefined
   311  		for i := 1; i < len(orders); i *= 2 {
   312  			copy(orders[i:], orders[:i])
   313  		}
   314  	}
   315  	f.Schema.UpdateColumnOrders(orders)
   316  }
   317  
   318  // WriterVersion returns the constructed application version from the
   319  // created by string
   320  func (f *FileMetaData) WriterVersion() *AppVersion {
   321  	if f.version == nil {
   322  		f.version = NewAppVersion(f.GetCreatedBy())
   323  	}
   324  	return f.version
   325  }
   326  
   327  // SetFilePath will set the file path into all of the columns in each row group.
   328  func (f *FileMetaData) SetFilePath(path string) {
   329  	for _, rg := range f.RowGroups {
   330  		for _, chunk := range rg.Columns {
   331  			chunk.FilePath = &path
   332  		}
   333  	}
   334  }
   335  
   336  // AppendRowGroups will add all of the rowgroup metadata from other to the
   337  // current file metadata
   338  func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error {
   339  	if !f.Schema.Equals(other.Schema) {
   340  		return xerrors.New("parquet/FileMetaData: AppendRowGroups requires equal schemas")
   341  	}
   342  
   343  	f.RowGroups = append(f.RowGroups, other.GetRowGroups()...)
   344  	for _, rg := range other.GetRowGroups() {
   345  		f.NumRows += rg.NumRows
   346  	}
   347  	return nil
   348  }
   349  
   350  // Subset will construct a new FileMetaData object containing only the requested
   351  // row groups by index
   352  func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error) {
   353  	for _, i := range rowGroups {
   354  		if i < len(f.RowGroups) {
   355  			continue
   356  		}
   357  		return nil, fmt.Errorf("parquet: this file only has %d row groups, but requested a subset including row group: %d", len(f.RowGroups), i)
   358  	}
   359  
   360  	out := &FileMetaData{
   361  		&format.FileMetaData{
   362  			Schema:                   f.FileMetaData.Schema,
   363  			CreatedBy:                f.CreatedBy,
   364  			ColumnOrders:             f.GetColumnOrders(),
   365  			EncryptionAlgorithm:      f.FileMetaData.EncryptionAlgorithm,
   366  			FooterSigningKeyMetadata: f.FooterSigningKeyMetadata,
   367  			Version:                  f.FileMetaData.Version,
   368  			KeyValueMetadata:         f.KeyValueMetadata(),
   369  		},
   370  		f.Schema,
   371  		f.FileDecryptor,
   372  		f.version,
   373  		0,
   374  	}
   375  
   376  	out.RowGroups = make([]*format.RowGroup, 0, len(rowGroups))
   377  	for _, selected := range rowGroups {
   378  		out.RowGroups = append(out.RowGroups, f.RowGroups[selected])
   379  		out.NumRows += f.RowGroups[selected].GetNumRows()
   380  	}
   381  
   382  	return out, nil
   383  }
   384  
   385  func (f *FileMetaData) Equals(other *FileMetaData) bool {
   386  	return reflect.DeepEqual(f.FileMetaData, other.FileMetaData)
   387  }
   388  
   389  func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata {
   390  	return f.GetKeyValueMetadata()
   391  }
   392  
   393  // VerifySignature constructs a cryptographic signature using the FileDecryptor
   394  // of the footer and then verifies it's integrity.
   395  //
   396  // Panics if f.FileDecryptor is nil
   397  func (f *FileMetaData) VerifySignature(signature []byte) bool {
   398  	if f.FileDecryptor == nil {
   399  		panic("decryption not set propertly, cannot verify signature")
   400  	}
   401  
   402  	serializer := thrift.NewThriftSerializer()
   403  	data, _ := serializer.Write(context.Background(), f.FileMetaData)
   404  	nonce := signature[:encryption.NonceLength]
   405  	tag := signature[encryption.NonceLength : encryption.NonceLength+encryption.GcmTagLength]
   406  
   407  	key := f.FileDecryptor.GetFooterKey()
   408  	aad := encryption.CreateFooterAad(f.FileDecryptor.FileAad())
   409  
   410  	enc := encryption.NewAesEncryptor(f.FileDecryptor.Algorithm(), true)
   411  	var buf bytes.Buffer
   412  	buf.Grow(enc.CiphertextSizeDelta() + len(data))
   413  	encryptedLen := enc.SignedFooterEncrypt(&buf, data, []byte(key), []byte(aad), nonce)
   414  	return bytes.Equal(buf.Bytes()[encryptedLen-encryption.GcmTagLength:], tag)
   415  }
   416  
   417  // WriteTo will serialize and write out this file metadata, encrypting it if
   418  // appropriate.
   419  //
   420  // If it is an encrypted file with a plaintext footer, then we will write the
   421  // signature with the unencrypted footer.
   422  func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error) {
   423  	serializer := thrift.NewThriftSerializer()
   424  	// only in encrypted files with plaintext footers, the encryption algorithm is set in the footer
   425  	if f.IsSetEncryptionAlgorithm() {
   426  		data, err := serializer.Write(context.Background(), f.FileMetaData)
   427  		if err != nil {
   428  			return 0, err
   429  		}
   430  
   431  		// encrypt the footer key
   432  		var buf bytes.Buffer
   433  		buf.Grow(encryptor.CiphertextSizeDelta() + len(data))
   434  		encryptedLen := encryptor.Encrypt(&buf, data)
   435  
   436  		wrote := 0
   437  		n := 0
   438  		// write unencrypted footer
   439  		if n, err = w.Write(data); err != nil {
   440  			return int64(n), err
   441  		}
   442  		wrote += n
   443  		// write signature (nonce and tag)
   444  		buf.Next(4)
   445  		if n, err = w.Write(buf.Next(encryption.NonceLength)); err != nil {
   446  			return int64(wrote + n), err
   447  		}
   448  		wrote += n
   449  		buf.Next(encryptedLen - 4 - encryption.NonceLength - encryption.GcmTagLength)
   450  		n, err = w.Write(buf.Next(encryption.GcmTagLength))
   451  		return int64(wrote + n), err
   452  	}
   453  	n, err := serializer.Serialize(f.FileMetaData, w, encryptor)
   454  	return int64(n), err
   455  }
   456  
   457  // Version returns the "version" of the file
   458  //
   459  // WARNING: The value returned by this method is unreliable as 1) the
   460  // parquet file metadata stores the version as a single integer and
   461  // 2) some producers are known to always write a hardcoded value. Therefore
   462  // you cannot use this value to know which features are used in the file.
   463  func (f *FileMetaData) Version() parquet.Version {
   464  	switch f.FileMetaData.Version {
   465  	case 1:
   466  		return parquet.V1_0
   467  	case 2:
   468  		return parquet.V2_LATEST
   469  	default:
   470  		// imporperly set version, assume parquet 1.0
   471  		return parquet.V1_0
   472  	}
   473  }
   474  
// FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object.
//
// metadata is the wrapped thrift object and cryptoMetadataLen is the number of
// serialized bytes recorded when the object was deserialized (reported by Len).
type FileCryptoMetadata struct {
	metadata          *format.FileCryptoMetaData
	cryptoMetadataLen uint32
}
   480  
   481  // NewFileCryptoMetaData takes in the raw serialized bytes to deserialize
   482  // storing the number of bytes that were actually deserialized.
   483  func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error) {
   484  	ret.metadata = format.NewFileCryptoMetaData()
   485  	var remain uint64
   486  	remain, err = thrift.DeserializeThrift(ret.metadata, metadata)
   487  	ret.cryptoMetadataLen = uint32(uint64(len(metadata)) - remain)
   488  	return
   489  }
   490  
   491  // WriteTo writes out the serialized crypto metadata to w
   492  func (fc FileCryptoMetadata) WriteTo(w io.Writer) (int64, error) {
   493  	serializer := thrift.NewThriftSerializer()
   494  	n, err := serializer.Serialize(fc.metadata, w, nil)
   495  	return int64(n), err
   496  }
   497  
   498  // Len is the number of bytes that were deserialized to create this object
   499  func (fc FileCryptoMetadata) Len() int { return int(fc.cryptoMetadataLen) }
   500  
   501  func (fc FileCryptoMetadata) KeyMetadata() []byte {
   502  	return fc.metadata.KeyMetadata
   503  }
   504  
   505  // EncryptionAlgorithm constructs the object from the thrift instance of
   506  // the encryption algorithm
   507  func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm {
   508  	return parquet.AlgorithmFromThrift(fc.metadata.GetEncryptionAlgorithm())
   509  }