github.com/apache/arrow/go/v16@v16.1.0/parquet/metadata/file.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"reflect"
    25  	"unicode/utf8"
    26  
    27  	"github.com/apache/arrow/go/v16/parquet"
    28  	"github.com/apache/arrow/go/v16/parquet/compress"
    29  	"github.com/apache/arrow/go/v16/parquet/internal/encryption"
    30  	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
    31  	"github.com/apache/arrow/go/v16/parquet/internal/thrift"
    32  	"github.com/apache/arrow/go/v16/parquet/schema"
    33  	"golang.org/x/xerrors"
    34  )
    35  
// DefaultCompressionType is used unless a different compression is specified
// in the properties. It is initialized to compress.Codecs.Uncompressed.
var DefaultCompressionType = compress.Codecs.Uncompressed
    39  
// FileMetaDataBuilder is a proxy for more easily constructing file metadata
// particularly used when writing a file out.
type FileMetaDataBuilder struct {
	// metadata is the thrift object populated by Snapshot; Clear swaps in a fresh one.
	metadata       *format.FileMetaData
	// props supplies the version, created-by string and encryption settings.
	props          *parquet.WriterProperties
	// schema is converted to its thrift form in Snapshot and handed to row group builders.
	schema         *schema.Schema
	// rowGroups collects the row groups created by AppendRowGroup; Clear resets it.
	rowGroups      []*format.RowGroup
	// currentRgBldr is the builder returned by the most recent AppendRowGroup call.
	currentRgBldr  *RowGroupMetaDataBuilder
	// kvmeta is the key/value metadata attached to the file metadata in Snapshot.
	kvmeta         KeyValueMetadata
	// cryptoMetadata is only non-nil when the properties request an encrypted footer.
	cryptoMetadata *format.FileCryptoMetaData
}
    51  
    52  // NewFileMetadataBuilder will use the default writer properties if nil is passed for
    53  // the writer properties and nil is allowable for the key value metadata.
    54  func NewFileMetadataBuilder(schema *schema.Schema, props *parquet.WriterProperties, kvmeta KeyValueMetadata) *FileMetaDataBuilder {
    55  	var crypto *format.FileCryptoMetaData
    56  	if props.FileEncryptionProperties() != nil && props.FileEncryptionProperties().EncryptedFooter() {
    57  		crypto = format.NewFileCryptoMetaData()
    58  	}
    59  	return &FileMetaDataBuilder{
    60  		metadata:       format.NewFileMetaData(),
    61  		props:          props,
    62  		schema:         schema,
    63  		kvmeta:         kvmeta,
    64  		cryptoMetadata: crypto,
    65  	}
    66  }
    67  
    68  // GetFileCryptoMetaData returns the cryptographic information for encrypting/
    69  // decrypting the file.
    70  func (f *FileMetaDataBuilder) GetFileCryptoMetaData() *FileCryptoMetadata {
    71  	if f.cryptoMetadata == nil {
    72  		return nil
    73  	}
    74  
    75  	props := f.props.FileEncryptionProperties()
    76  	f.cryptoMetadata.EncryptionAlgorithm = props.Algorithm().ToThrift()
    77  	keyMetadata := props.FooterKeyMetadata()
    78  	if keyMetadata != "" {
    79  		f.cryptoMetadata.KeyMetadata = []byte(keyMetadata)
    80  	}
    81  
    82  	return &FileCryptoMetadata{f.cryptoMetadata, 0}
    83  }
    84  
    85  // AppendRowGroup adds a rowgroup to the list and returns a builder
    86  // for that row group
    87  func (f *FileMetaDataBuilder) AppendRowGroup() *RowGroupMetaDataBuilder {
    88  	if f.rowGroups == nil {
    89  		f.rowGroups = make([]*format.RowGroup, 0, 1)
    90  	}
    91  
    92  	rg := format.NewRowGroup()
    93  	f.rowGroups = append(f.rowGroups, rg)
    94  	f.currentRgBldr = NewRowGroupMetaDataBuilder(f.props, f.schema, rg)
    95  	return f.currentRgBldr
    96  }
    97  
    98  // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata
    99  func (f *FileMetaDataBuilder) AppendKeyValueMetadata(key string, value string) error {
   100  	return f.kvmeta.Append(key, value)
   101  }
   102  
   103  // Finish will finalize the metadata of the number of rows, row groups,
   104  // version etc. This will clear out this filemetadatabuilder so it can
   105  // be re-used
   106  func (f *FileMetaDataBuilder) Finish() (*FileMetaData, error) {
   107  	out, err := f.Snapshot()
   108  	f.Clear()
   109  	return out, err
   110  }
   111  
   112  // Snapshot returns finalized metadata of the number of rows, row groups, version etc.
   113  // The snapshot must be used (e.g., serialized) before any additional (meta)data is
   114  // written, as it refers to builder datastructures that will continue to mutate.
   115  func (f *FileMetaDataBuilder) Snapshot() (*FileMetaData, error) {
   116  	totalRows := int64(0)
   117  	for _, rg := range f.rowGroups {
   118  		totalRows += rg.NumRows
   119  	}
   120  	f.metadata.NumRows = totalRows
   121  	f.metadata.RowGroups = f.rowGroups
   122  	switch f.props.Version() {
   123  	case parquet.V1_0:
   124  		f.metadata.Version = 1
   125  	default:
   126  		f.metadata.Version = 2
   127  	}
   128  	createdBy := f.props.CreatedBy()
   129  	f.metadata.CreatedBy = &createdBy
   130  
   131  	// Users cannot set the `ColumnOrder` since we do not have user defined sort order
   132  	// in the spec yet.
   133  	//
   134  	// We always default to `TYPE_DEFINED_ORDER`. We can expose it in
   135  	// the API once we have user defined sort orders in the Parquet format.
   136  	// TypeDefinedOrder implies choose SortOrder based on ConvertedType/PhysicalType
   137  	typeDefined := format.NewTypeDefinedOrder()
   138  	colOrder := &format.ColumnOrder{TYPE_ORDER: typeDefined}
   139  	f.metadata.ColumnOrders = make([]*format.ColumnOrder, f.schema.NumColumns())
   140  	for idx := range f.metadata.ColumnOrders {
   141  		f.metadata.ColumnOrders[idx] = colOrder
   142  	}
   143  
   144  	encryptProps := f.props.FileEncryptionProperties()
   145  	if encryptProps != nil && !encryptProps.EncryptedFooter() {
   146  		var signingAlgo parquet.Algorithm
   147  		algo := encryptProps.Algorithm()
   148  		signingAlgo.Aad.AadFileUnique = algo.Aad.AadFileUnique
   149  		signingAlgo.Aad.SupplyAadPrefix = algo.Aad.SupplyAadPrefix
   150  		if !algo.Aad.SupplyAadPrefix {
   151  			signingAlgo.Aad.AadPrefix = algo.Aad.AadPrefix
   152  		}
   153  		signingAlgo.Algo = parquet.AesGcm
   154  		f.metadata.EncryptionAlgorithm = signingAlgo.ToThrift()
   155  		footerSigningMetadata := f.props.FileEncryptionProperties().FooterKeyMetadata()
   156  		if footerSigningMetadata != "" {
   157  			f.metadata.FooterSigningKeyMetadata = []byte(footerSigningMetadata)
   158  		}
   159  	}
   160  
   161  	f.metadata.Schema = schema.ToThrift(f.schema.Root())
   162  	f.metadata.KeyValueMetadata = f.kvmeta
   163  
   164  	out := &FileMetaData{
   165  		FileMetaData: f.metadata,
   166  		version:      NewAppVersion(f.metadata.GetCreatedBy()),
   167  	}
   168  	if err := out.initSchema(); err != nil {
   169  		return nil, err
   170  	}
   171  	out.initColumnOrders()
   172  
   173  	return out, nil
   174  }
   175  
   176  // Clears out this filemetadatabuilder so it can be re-used
   177  func (f *FileMetaDataBuilder) Clear() {
   178  	f.metadata = format.NewFileMetaData()
   179  	f.rowGroups = nil
   180  }
   181  
// KeyValueMetadata is a named slice type holding thrift keyvalue pairs.
//
// It is presumed that the metadata should all be utf8 valid.
type KeyValueMetadata []*format.KeyValue
   186  
   187  // NewKeyValueMetadata is equivalent to make(KeyValueMetadata, 0)
   188  func NewKeyValueMetadata() KeyValueMetadata {
   189  	return make(KeyValueMetadata, 0)
   190  }
   191  
   192  // Append adds the passed in key and value to the metadata, if either contains
   193  // any invalid utf8 runes, then it is not added and an error is returned.
   194  func (k *KeyValueMetadata) Append(key, value string) error {
   195  	if !utf8.ValidString(key) || !utf8.ValidString(value) {
   196  		return fmt.Errorf("metadata must be valid utf8 strings, got key = '%s' and value = '%s'", key, value)
   197  	}
   198  	*k = append(*k, &format.KeyValue{Key: key, Value: &value})
   199  	return nil
   200  }
   201  
   202  func (k KeyValueMetadata) Len() int { return len(k) }
   203  
   204  // Equals compares all of the metadata keys and values to check they are equal
   205  func (k KeyValueMetadata) Equals(other KeyValueMetadata) bool {
   206  	return reflect.DeepEqual(k, other)
   207  }
   208  
   209  func (k KeyValueMetadata) Keys() (ret []string) {
   210  	ret = make([]string, len(k))
   211  	for idx, v := range k {
   212  		ret[idx] = v.GetKey()
   213  	}
   214  	return
   215  }
   216  
   217  func (k KeyValueMetadata) Values() (ret []string) {
   218  	ret = make([]string, len(k))
   219  	for idx, v := range k {
   220  		ret[idx] = v.GetValue()
   221  	}
   222  	return
   223  }
   224  
   225  func (k KeyValueMetadata) FindValue(key string) *string {
   226  	for _, v := range k {
   227  		if v.Key == key {
   228  			return v.Value
   229  		}
   230  	}
   231  	return nil
   232  }
   233  
// FileMetaData is a proxy around the underlying thrift FileMetaData object
// to make it easier to use and interact with.
type FileMetaData struct {
	*format.FileMetaData
	// Schema is the schema object built from the thrift schema elements by
	// initSchema. It shadows the embedded thrift Schema field, which must be
	// reached as FileMetaData.Schema.
	Schema        *schema.Schema
	// FileDecryptor is the decryptor supplied at construction; it is required
	// by VerifySignature and is handed to row group metadata.
	FileDecryptor encryption.FileDecryptor

	// app version of the writer for this file
	version *AppVersion
	// size of the raw bytes of the metadata in the file which were
	// decoded by thrift, Size() getter returns the value.
	metadataLen int
}
   247  
   248  // NewFileMetaData takes in the raw bytes of the serialized metadata to deserialize
   249  // and will attempt to decrypt the footer if a decryptor is provided.
   250  func NewFileMetaData(data []byte, fileDecryptor encryption.FileDecryptor) (*FileMetaData, error) {
   251  	meta := format.NewFileMetaData()
   252  	if fileDecryptor != nil {
   253  		footerDecryptor := fileDecryptor.GetFooterDecryptor()
   254  		data = footerDecryptor.Decrypt(data)
   255  	}
   256  
   257  	remain, err := thrift.DeserializeThrift(meta, data)
   258  	if err != nil {
   259  		return nil, err
   260  	}
   261  
   262  	f := &FileMetaData{
   263  		FileMetaData:  meta,
   264  		version:       NewAppVersion(meta.GetCreatedBy()),
   265  		metadataLen:   len(data) - int(remain),
   266  		FileDecryptor: fileDecryptor,
   267  	}
   268  
   269  	f.initSchema()
   270  	f.initColumnOrders()
   271  
   272  	return f, nil
   273  }
   274  
   275  // Size is the length of the raw serialized metadata bytes in the footer
   276  func (f *FileMetaData) Size() int { return f.metadataLen }
   277  
   278  // NumSchemaElements is the length of the flattened schema list in the thrift
   279  func (f *FileMetaData) NumSchemaElements() int {
   280  	return len(f.FileMetaData.Schema)
   281  }
   282  
   283  // RowGroup provides the metadata for the (0-based) index of the row group
   284  func (f *FileMetaData) RowGroup(i int) *RowGroupMetaData {
   285  	return &RowGroupMetaData{
   286  		f.RowGroups[i], f.Schema, f.version, f.FileDecryptor,
   287  	}
   288  }
   289  
   290  func (f *FileMetaData) Serialize(ctx context.Context) ([]byte, error) {
   291  	return thrift.NewThriftSerializer().Write(ctx, f.FileMetaData)
   292  }
   293  
   294  func (f *FileMetaData) SerializeString(ctx context.Context) (string, error) {
   295  	return thrift.NewThriftSerializer().WriteString(ctx, f.FileMetaData)
   296  }
   297  
   298  // EncryptionAlgorithm constructs the algorithm object from the thrift
   299  // information or returns an empty instance if it was not set.
   300  func (f *FileMetaData) EncryptionAlgorithm() parquet.Algorithm {
   301  	if f.IsSetEncryptionAlgorithm() {
   302  		return parquet.AlgorithmFromThrift(f.GetEncryptionAlgorithm())
   303  	}
   304  	return parquet.Algorithm{}
   305  }
   306  
   307  func (f *FileMetaData) initSchema() error {
   308  	root, err := schema.FromParquet(f.FileMetaData.Schema)
   309  	if err != nil {
   310  		return err
   311  	}
   312  	f.Schema = schema.NewSchema(root.(*schema.GroupNode))
   313  	return nil
   314  }
   315  
   316  func (f *FileMetaData) initColumnOrders() {
   317  	orders := make([]parquet.ColumnOrder, 0, f.Schema.NumColumns())
   318  	if f.IsSetColumnOrders() {
   319  		for _, o := range f.GetColumnOrders() {
   320  			if o.IsSetTYPE_ORDER() {
   321  				orders = append(orders, parquet.ColumnOrders.TypeDefinedOrder)
   322  			} else {
   323  				orders = append(orders, parquet.ColumnOrders.Undefined)
   324  			}
   325  		}
   326  	} else {
   327  		orders = orders[:f.Schema.NumColumns()]
   328  		orders[0] = parquet.ColumnOrders.Undefined
   329  		for i := 1; i < len(orders); i *= 2 {
   330  			copy(orders[i:], orders[:i])
   331  		}
   332  	}
   333  	f.Schema.UpdateColumnOrders(orders)
   334  }
   335  
   336  // WriterVersion returns the constructed application version from the
   337  // created by string
   338  func (f *FileMetaData) WriterVersion() *AppVersion {
   339  	if f.version == nil {
   340  		f.version = NewAppVersion(f.GetCreatedBy())
   341  	}
   342  	return f.version
   343  }
   344  
   345  // SetFilePath will set the file path into all of the columns in each row group.
   346  func (f *FileMetaData) SetFilePath(path string) {
   347  	for _, rg := range f.RowGroups {
   348  		for _, chunk := range rg.Columns {
   349  			chunk.FilePath = &path
   350  		}
   351  	}
   352  }
   353  
   354  // AppendRowGroups will add all of the rowgroup metadata from other to the
   355  // current file metadata
   356  func (f *FileMetaData) AppendRowGroups(other *FileMetaData) error {
   357  	if !f.Schema.Equals(other.Schema) {
   358  		return xerrors.New("parquet/FileMetaData: AppendRowGroups requires equal schemas")
   359  	}
   360  
   361  	f.RowGroups = append(f.RowGroups, other.GetRowGroups()...)
   362  	for _, rg := range other.GetRowGroups() {
   363  		f.NumRows += rg.NumRows
   364  	}
   365  	return nil
   366  }
   367  
   368  // Subset will construct a new FileMetaData object containing only the requested
   369  // row groups by index
   370  func (f *FileMetaData) Subset(rowGroups []int) (*FileMetaData, error) {
   371  	for _, i := range rowGroups {
   372  		if i < len(f.RowGroups) {
   373  			continue
   374  		}
   375  		return nil, fmt.Errorf("parquet: this file only has %d row groups, but requested a subset including row group: %d", len(f.RowGroups), i)
   376  	}
   377  
   378  	out := &FileMetaData{
   379  		&format.FileMetaData{
   380  			Schema:                   f.FileMetaData.Schema,
   381  			CreatedBy:                f.CreatedBy,
   382  			ColumnOrders:             f.GetColumnOrders(),
   383  			EncryptionAlgorithm:      f.FileMetaData.EncryptionAlgorithm,
   384  			FooterSigningKeyMetadata: f.FooterSigningKeyMetadata,
   385  			Version:                  f.FileMetaData.Version,
   386  			KeyValueMetadata:         f.KeyValueMetadata(),
   387  		},
   388  		f.Schema,
   389  		f.FileDecryptor,
   390  		f.version,
   391  		0,
   392  	}
   393  
   394  	out.RowGroups = make([]*format.RowGroup, 0, len(rowGroups))
   395  	for _, selected := range rowGroups {
   396  		out.RowGroups = append(out.RowGroups, f.RowGroups[selected])
   397  		out.NumRows += f.RowGroups[selected].GetNumRows()
   398  	}
   399  
   400  	return out, nil
   401  }
   402  
   403  func (f *FileMetaData) Equals(other *FileMetaData) bool {
   404  	return reflect.DeepEqual(f.FileMetaData, other.FileMetaData)
   405  }
   406  
   407  func (f *FileMetaData) KeyValueMetadata() KeyValueMetadata {
   408  	return f.GetKeyValueMetadata()
   409  }
   410  
   411  // VerifySignature constructs a cryptographic signature using the FileDecryptor
   412  // of the footer and then verifies it's integrity.
   413  //
   414  // Panics if f.FileDecryptor is nil
   415  func (f *FileMetaData) VerifySignature(signature []byte) bool {
   416  	if f.FileDecryptor == nil {
   417  		panic("decryption not set properly, cannot verify signature")
   418  	}
   419  
   420  	serializer := thrift.NewThriftSerializer()
   421  	data, _ := serializer.Write(context.Background(), f.FileMetaData)
   422  	nonce := signature[:encryption.NonceLength]
   423  	tag := signature[encryption.NonceLength : encryption.NonceLength+encryption.GcmTagLength]
   424  
   425  	key := f.FileDecryptor.GetFooterKey()
   426  	aad := encryption.CreateFooterAad(f.FileDecryptor.FileAad())
   427  
   428  	enc := encryption.NewAesEncryptor(f.FileDecryptor.Algorithm(), true)
   429  	var buf bytes.Buffer
   430  	buf.Grow(enc.CiphertextSizeDelta() + len(data))
   431  	encryptedLen := enc.SignedFooterEncrypt(&buf, data, []byte(key), []byte(aad), nonce)
   432  	return bytes.Equal(buf.Bytes()[encryptedLen-encryption.GcmTagLength:], tag)
   433  }
   434  
   435  // WriteTo will serialize and write out this file metadata, encrypting it if
   436  // appropriate.
   437  //
   438  // If it is an encrypted file with a plaintext footer, then we will write the
   439  // signature with the unencrypted footer.
   440  func (f *FileMetaData) WriteTo(w io.Writer, encryptor encryption.Encryptor) (int64, error) {
   441  	serializer := thrift.NewThriftSerializer()
   442  	// only in encrypted files with plaintext footers, the encryption algorithm is set in the footer
   443  	if f.IsSetEncryptionAlgorithm() {
   444  		data, err := serializer.Write(context.Background(), f.FileMetaData)
   445  		if err != nil {
   446  			return 0, err
   447  		}
   448  
   449  		// encrypt the footer key
   450  		var buf bytes.Buffer
   451  		buf.Grow(encryptor.CiphertextSizeDelta() + len(data))
   452  		encryptedLen := encryptor.Encrypt(&buf, data)
   453  
   454  		wrote := 0
   455  		n := 0
   456  		// write unencrypted footer
   457  		if n, err = w.Write(data); err != nil {
   458  			return int64(n), err
   459  		}
   460  		wrote += n
   461  		// write signature (nonce and tag)
   462  		buf.Next(4)
   463  		if n, err = w.Write(buf.Next(encryption.NonceLength)); err != nil {
   464  			return int64(wrote + n), err
   465  		}
   466  		wrote += n
   467  		buf.Next(encryptedLen - 4 - encryption.NonceLength - encryption.GcmTagLength)
   468  		n, err = w.Write(buf.Next(encryption.GcmTagLength))
   469  		return int64(wrote + n), err
   470  	}
   471  	n, err := serializer.Serialize(f.FileMetaData, w, encryptor)
   472  	return int64(n), err
   473  }
   474  
   475  // Version returns the "version" of the file
   476  //
   477  // WARNING: The value returned by this method is unreliable as 1) the
   478  // parquet file metadata stores the version as a single integer and
   479  // 2) some producers are known to always write a hardcoded value. Therefore
   480  // you cannot use this value to know which features are used in the file.
   481  func (f *FileMetaData) Version() parquet.Version {
   482  	switch f.FileMetaData.Version {
   483  	case 1:
   484  		return parquet.V1_0
   485  	case 2:
   486  		return parquet.V2_LATEST
   487  	default:
   488  		// improperly set version, assume parquet 1.0
   489  		return parquet.V1_0
   490  	}
   491  }
   492  
// FileCryptoMetadata is a proxy for the thrift fileCryptoMetadata object
type FileCryptoMetadata struct {
	// metadata is the wrapped thrift object.
	metadata          *format.FileCryptoMetaData
	// cryptoMetadataLen is the number of bytes consumed when deserializing in
	// NewFileCryptoMetaData; it is zero for instances built by a
	// FileMetaDataBuilder.
	cryptoMetadataLen uint32
}
   498  
   499  // NewFileCryptoMetaData takes in the raw serialized bytes to deserialize
   500  // storing the number of bytes that were actually deserialized.
   501  func NewFileCryptoMetaData(metadata []byte) (ret FileCryptoMetadata, err error) {
   502  	ret.metadata = format.NewFileCryptoMetaData()
   503  	var remain uint64
   504  	remain, err = thrift.DeserializeThrift(ret.metadata, metadata)
   505  	ret.cryptoMetadataLen = uint32(uint64(len(metadata)) - remain)
   506  	return
   507  }
   508  
   509  // WriteTo writes out the serialized crypto metadata to w
   510  func (fc FileCryptoMetadata) WriteTo(w io.Writer) (int64, error) {
   511  	serializer := thrift.NewThriftSerializer()
   512  	n, err := serializer.Serialize(fc.metadata, w, nil)
   513  	return int64(n), err
   514  }
   515  
   516  // Len is the number of bytes that were deserialized to create this object
   517  func (fc FileCryptoMetadata) Len() int { return int(fc.cryptoMetadataLen) }
   518  
   519  func (fc FileCryptoMetadata) KeyMetadata() []byte {
   520  	return fc.metadata.KeyMetadata
   521  }
   522  
   523  // EncryptionAlgorithm constructs the object from the thrift instance of
   524  // the encryption algorithm
   525  func (fc FileCryptoMetadata) EncryptionAlgorithm() parquet.Algorithm {
   526  	return parquet.AlgorithmFromThrift(fc.metadata.GetEncryptionAlgorithm())
   527  }