github.com/apache/arrow/go/v14@v14.0.2/parquet/encryption_write_config_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet_test
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"os"
    23  	"path/filepath"
    24  	"testing"
    25  
    26  	"github.com/apache/arrow/go/v14/parquet"
    27  	"github.com/apache/arrow/go/v14/parquet/compress"
    28  	"github.com/apache/arrow/go/v14/parquet/file"
    29  	"github.com/apache/arrow/go/v14/parquet/schema"
    30  	"github.com/stretchr/testify/suite"
    31  )
    32  
    33  /*
    34   * This file contains unit-tests for writing encrypted Parquet files with
    35   * different encryption configurations.
    36   * The files are saved in a temporary folder and will be deleted after the
    37   * tests in encryption_read_config_test.go have read them.
    38   *
    39   * A detailed description of the Parquet Modular Encryption specification can be found
    40   * here:
    41   * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
    42   *
    43   * Each unit-test creates a single parquet file with eight columns using one of the
    44   * following encryption configurations:
    45   *
    46   *  - Encryption configuration 1:   Encrypt all columns and the footer with the same key.
    47   *                                  (uniform encryption)
    48   *  - Encryption configuration 2:   Encrypt two columns and the footer, with different
    49   *                                  keys.
    50   *  - Encryption configuration 3:   Encrypt two columns, with different keys.
    51   *                                  Don’t encrypt footer (to enable legacy readers)
    52   *                                  - plaintext footer mode.
    53   *  - Encryption configuration 4:   Encrypt two columns and the footer, with different
    54   *                                  keys. Supply aad_prefix for file identity
    55   *                                  verification.
    56   *  - Encryption configuration 5:   Encrypt two columns and the footer, with different
    57   *                                  keys. Supply aad_prefix, and call
    58   *                                  disable_aad_prefix_storage to prevent file
    59   *                                  identity storage in file metadata.
    60   *  - Encryption configuration 6:   Encrypt two columns and the footer, with different
    61   *                                  keys. Use the alternative (AES_GCM_CTR_V1) algorithm.
    62   */
    63  
    64  var (
    65  	tempdir string
    66  )
    67  
    68  type EncryptionConfigTestSuite struct {
    69  	suite.Suite
    70  
    71  	pathToDoubleField    string
    72  	pathToFloatField     string
    73  	fileName             string
    74  	numRgs               int
    75  	rowsPerRG            int
    76  	schema               *schema.GroupNode
    77  	footerEncryptionKey  string
    78  	columnEncryptionKey1 string
    79  	columnEncryptionKey2 string
    80  }
    81  
    82  func (en *EncryptionConfigTestSuite) encryptFile(configs *parquet.FileEncryptionProperties, filename string) {
    83  	filename = filepath.Join(tempdir, filename)
    84  
    85  	props := parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy), parquet.WithEncryptionProperties(configs))
    86  	outFile, err := os.Create(filename)
    87  	en.Require().NoError(err)
    88  	en.Require().NotNil(outFile)
    89  
    90  	writer := file.NewParquetWriter(outFile, en.schema, file.WithWriterProps(props))
    91  	defer writer.Close()
    92  
    93  	for r := 0; r < en.numRgs; r++ {
    94  		var (
    95  			bufferedMode = r%2 == 0
    96  			rgr          file.RowGroupWriter
    97  			colIndex     = 0
    98  		)
    99  
   100  		if bufferedMode {
   101  			rgr = writer.AppendBufferedRowGroup()
   102  		} else {
   103  			rgr = writer.AppendRowGroup()
   104  		}
   105  
   106  		nextColumn := func() file.ColumnChunkWriter {
   107  			defer func() { colIndex++ }()
   108  			if bufferedMode {
   109  				cw, _ := rgr.(file.BufferedRowGroupWriter).Column(colIndex)
   110  				return cw
   111  			}
   112  			cw, _ := rgr.(file.SerialRowGroupWriter).NextColumn()
   113  			return cw
   114  		}
   115  
   116  		// write the bool col
   117  		boolWriter := nextColumn().(*file.BooleanColumnChunkWriter)
   118  		for i := 0; i < en.rowsPerRG; i++ {
   119  			value := (i % 2) == 0
   120  			n, err := boolWriter.WriteBatch([]bool{value}, nil, nil)
   121  			en.EqualValues(1, n)
   122  			en.Require().NoError(err)
   123  		}
   124  
   125  		// write the int32 col
   126  		int32Writer := nextColumn().(*file.Int32ColumnChunkWriter)
   127  		for i := int32(0); i < int32(en.rowsPerRG); i++ {
   128  			n, err := int32Writer.WriteBatch([]int32{i}, nil, nil)
   129  			en.EqualValues(1, n)
   130  			en.Require().NoError(err)
   131  		}
   132  
   133  		// write the int64 column, each row repeats twice
   134  		int64Writer := nextColumn().(*file.Int64ColumnChunkWriter)
   135  		for i := 0; i < 2*en.rowsPerRG; i++ {
   136  			var (
   137  				defLevel       = [1]int16{1}
   138  				repLevel       = [1]int16{0}
   139  				value    int64 = int64(i) * 1000 * 1000 * 1000 * 1000
   140  			)
   141  			if i%2 == 0 {
   142  				repLevel[0] = 1
   143  			}
   144  
   145  			n, err := int64Writer.WriteBatch([]int64{value}, defLevel[:], repLevel[:])
   146  			en.EqualValues(1, n)
   147  			en.Require().NoError(err)
   148  		}
   149  
   150  		// write the int96 col
   151  		int96Writer := nextColumn().(*file.Int96ColumnChunkWriter)
   152  		for i := 0; i < en.rowsPerRG; i++ {
   153  			val := parquet.Int96{}
   154  			binary.LittleEndian.PutUint32(val[:], uint32(i))
   155  			binary.LittleEndian.PutUint32(val[4:], uint32(i+1))
   156  			binary.LittleEndian.PutUint32(val[8:], uint32(i+2))
   157  			n, err := int96Writer.WriteBatch([]parquet.Int96{val}, nil, nil)
   158  			en.EqualValues(1, n)
   159  			en.Require().NoError(err)
   160  		}
   161  
   162  		// write the float column
   163  		floatWriter := nextColumn().(*file.Float32ColumnChunkWriter)
   164  		for i := 0; i < en.rowsPerRG; i++ {
   165  			val := float32(i) * 1.1
   166  			n, err := floatWriter.WriteBatch([]float32{val}, nil, nil)
   167  			en.EqualValues(1, n)
   168  			en.Require().NoError(err)
   169  		}
   170  
   171  		// write the double column
   172  		doubleWriter := nextColumn().(*file.Float64ColumnChunkWriter)
   173  		for i := 0; i < en.rowsPerRG; i++ {
   174  			value := float64(i) * 1.1111111
   175  			n, err := doubleWriter.WriteBatch([]float64{value}, nil, nil)
   176  			en.EqualValues(1, n)
   177  			en.Require().NoError(err)
   178  		}
   179  
   180  		// write the bytearray column. make every alternate value NULL
   181  		baWriter := nextColumn().(*file.ByteArrayColumnChunkWriter)
   182  		for i := 0; i < en.rowsPerRG; i++ {
   183  			var (
   184  				n     int64
   185  				err   error
   186  				hello = []byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0}
   187  			)
   188  			hello[7] = byte(int('0') + i/100)
   189  			hello[8] = byte(int('0') + (i/10)%10)
   190  			hello[9] = byte(int('0') + i%10)
   191  			if i%2 == 0 {
   192  				n, err = baWriter.WriteBatch([]parquet.ByteArray{hello}, []int16{1}, nil)
   193  				en.EqualValues(1, n)
   194  			} else {
   195  				n, err = baWriter.WriteBatch([]parquet.ByteArray{nil}, []int16{0}, nil)
   196  				en.Zero(n)
   197  			}
   198  
   199  			en.Require().NoError(err)
   200  		}
   201  
   202  		// write fixedlength byte array column
   203  		flbaWriter := nextColumn().(*file.FixedLenByteArrayColumnChunkWriter)
   204  		for i := 0; i < en.rowsPerRG; i++ {
   205  			v := byte(i)
   206  			value := parquet.FixedLenByteArray{v, v, v, v, v, v, v, v, v, v}
   207  			n, err := flbaWriter.WriteBatch([]parquet.FixedLenByteArray{value}, nil, nil)
   208  			en.EqualValues(1, n)
   209  			en.Require().NoError(err)
   210  		}
   211  	}
   212  }
   213  
   214  func (en *EncryptionConfigTestSuite) SetupSuite() {
   215  	var err error
   216  	tempdir, err = os.MkdirTemp("", "parquet-encryption-test-*")
   217  	en.Require().NoError(err)
   218  	fmt.Println(tempdir)
   219  
   220  	en.fileName = FileName
   221  	en.rowsPerRG = 50
   222  	en.numRgs = 5
   223  	en.pathToDoubleField = "double_field"
   224  	en.pathToFloatField = "float_field"
   225  	en.footerEncryptionKey = FooterEncryptionKey
   226  	en.columnEncryptionKey1 = ColumnEncryptionKey1
   227  	en.columnEncryptionKey2 = ColumnEncryptionKey2
   228  
   229  	fields := make(schema.FieldList, 0)
   230  	// create a primitive node named "boolean_field" with type BOOLEAN
   231  	// repetition:REQUIRED
   232  	fields = append(fields, schema.NewBooleanNode("boolean_field", parquet.Repetitions.Required, -1))
   233  	// create a primitive node named "int32_field" with type INT32 repetition REQUIRED
   234  	// and logical type: TIME_MILLIS
   235  	f, _ := schema.NewPrimitiveNodeLogical("int32_field", parquet.Repetitions.Required,
   236  		schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1)
   237  	fields = append(fields, f)
   238  
   239  	// create a primitive node named "int64_field" with type int64, repetition:REPEATED
   240  	fields = append(fields, schema.NewInt64Node("int64_field", parquet.Repetitions.Repeated, -1))
   241  
   242  	fields = append(fields,
   243  		schema.NewInt96Node("int96_field", parquet.Repetitions.Required, -1),
   244  		schema.NewFloat32Node("float_field", parquet.Repetitions.Required, -1),
   245  		schema.NewFloat64Node("double_field", parquet.Repetitions.Required, -1))
   246  
   247  	// create a primitive node named ba_field with type:BYTE_ARRAY repetition:OPTIONAL
   248  	fields = append(fields, schema.NewByteArrayNode("ba_field", parquet.Repetitions.Optional, -1))
   249  
   250  	// create a primitive node for flba_field
   251  	fields = append(fields, schema.NewFixedLenByteArrayNode("flba_field", parquet.Repetitions.Required, 10, -1))
   252  
   253  	// flba_field fixedlenbytearray
   254  	en.schema, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, -1)
   255  }
   256  
   257  // Encryption Config 1: Encrypt All columns and the footer with the same key
   258  // (uniform encryption)
   259  func (en *EncryptionConfigTestSuite) TestUniformEncryption() {
   260  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"))
   261  	en.encryptFile(props, "tmp_uniform_encryption.parquet.encrypted")
   262  }
   263  
   264  // Encryption config 2: Encrypt Two Columns and the Footer, with different keys
   265  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooter() {
   266  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   267  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   268  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   269  
   270  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols))
   271  	en.encryptFile(props, "tmp_encrypt_columns_and_footer.parquet.encrypted")
   272  }
   273  
   274  // Encryption Config 3: encrypt two columns, with different keys.
   275  // plaintext footer
   276  // (plaintext footer mode, readable by legacy readers)
   277  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsPlaintextFooter() {
   278  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   279  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   280  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   281  
   282  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithPlaintextFooter())
   283  	en.encryptFile(props, "tmp_encrypt_columns_plaintext_footer.parquet.encrypted")
   284  }
   285  
   286  // Encryption Config 4: Encrypt two columns and the footer, with different keys
   287  // use aad_prefix
   288  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterWithAadPrefix() {
   289  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   290  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   291  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   292  
   293  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithAadPrefix(en.fileName))
   294  	en.encryptFile(props, "tmp_encrypt_columns_and_footer_aad.parquet.encrypted")
   295  }
   296  
   297  // Encryption Config 5: Encrypt Two columns and the footer, with different keys
   298  // use aad_prefix and disable_aad_prefix_storage
   299  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterWithAadPrefixDisableAadStorage() {
   300  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   301  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   302  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   303  
   304  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithAadPrefix(en.fileName), parquet.DisableAadPrefixStorage())
   305  	en.encryptFile(props, "tmp_encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted")
   306  }
   307  
   308  // Encryption Config 6: Encrypt two columns and the footer, with different keys.
   309  // Use AES_GCM_CTR_V1
   310  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterAesGcmCtr() {
   311  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   312  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   313  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   314  
   315  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithAlg(parquet.AesCtr))
   316  	en.encryptFile(props, "tmp_encrypt_columns_and_footer_ctr.parquet.encrypted")
   317  }
   318  
   319  func TestFileEncryption(t *testing.T) {
   320  	suite.Run(t, new(EncryptionConfigTestSuite))
   321  }