github.com/apache/arrow/go/v7@v7.0.1/parquet/encryption_write_config_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet_test
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"io/ioutil"
    23  	"os"
    24  	"path/filepath"
    25  	"testing"
    26  
    27  	"github.com/apache/arrow/go/v7/parquet"
    28  	"github.com/apache/arrow/go/v7/parquet/compress"
    29  	"github.com/apache/arrow/go/v7/parquet/file"
    30  	"github.com/apache/arrow/go/v7/parquet/schema"
    31  	"github.com/stretchr/testify/suite"
    32  )
    33  
    34  /*
    35   * This file contains unit-tests for writing encrypted Parquet files with
    36   * different encryption configurations.
    37   * The files are saved in a temporary folder and will be deleted after reading
    38   * them in encryption_read_config_test.go test.
    39   *
    40   * A detailed description of the Parquet Modular Encryption specification can be found
    41   * here:
    42   * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
    43   *
    44   * Each unit-test creates a single parquet file with eight columns using one of the
    45   * following encryption configurations:
    46   *
    47   *  - Encryption configuration 1:   Encrypt all columns and the footer with the same key.
    48   *                                  (uniform encryption)
    49   *  - Encryption configuration 2:   Encrypt two columns and the footer, with different
    50   *                                  keys.
    51   *  - Encryption configuration 3:   Encrypt two columns, with different keys.
    52   *                                  Don’t encrypt footer (to enable legacy readers)
    53   *                                  - plaintext footer mode.
    54   *  - Encryption configuration 4:   Encrypt two columns and the footer, with different
    55   *                                  keys. Supply aad_prefix for file identity
    56   *                                  verification.
    57   *  - Encryption configuration 5:   Encrypt two columns and the footer, with different
    58   *                                  keys. Supply aad_prefix, and call
    59   *                                  disable_aad_prefix_storage to prevent file
    60   *                                  identity storage in file metadata.
    61   *  - Encryption configuration 6:   Encrypt two columns and the footer, with different
    62   *                                  keys. Use the alternative (AES_GCM_CTR_V1) algorithm.
    63   */
    64  
var (
	// tempdir is the directory in which the encrypted files are written. It is
	// assigned in SetupSuite and read by encryptFile when building output paths.
	tempdir string
)
    68  
// EncryptionConfigTestSuite writes parquet files under several encryption
// configurations. The fields are populated once in SetupSuite and then read by
// the individual tests and by encryptFile.
type EncryptionConfigTestSuite struct {
	suite.Suite

	// pathToDoubleField and pathToFloatField are the column paths of the two
	// columns that receive their own column encryption properties.
	pathToDoubleField string
	pathToFloatField  string
	// fileName is passed as the AAD prefix by the tests that configure one.
	fileName string
	// numRgs is the number of row groups written to each file.
	numRgs int
	// rowsPerRG drives how many values are written per column in a row group.
	rowsPerRG int
	// schema is the root group node handed to the parquet writer.
	schema *schema.GroupNode
	// footerEncryptionKey is the footer key given to the file encryption properties.
	footerEncryptionKey string
	// columnEncryptionKey1 and columnEncryptionKey2 are the keys used for the
	// double and float columns respectively.
	columnEncryptionKey1 string
	columnEncryptionKey2 string
}
    82  
    83  func (en *EncryptionConfigTestSuite) encryptFile(configs *parquet.FileEncryptionProperties, filename string) {
    84  	filename = filepath.Join(tempdir, filename)
    85  
    86  	props := parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy), parquet.WithEncryptionProperties(configs))
    87  	outFile, err := os.Create(filename)
    88  	en.Require().NoError(err)
    89  	en.Require().NotNil(outFile)
    90  
    91  	writer := file.NewParquetWriter(outFile, en.schema, file.WithWriterProps(props))
    92  	defer writer.Close()
    93  
    94  	for r := 0; r < en.numRgs; r++ {
    95  		var (
    96  			bufferedMode = r%2 == 0
    97  			rgr          file.RowGroupWriter
    98  			colIndex     = 0
    99  		)
   100  
   101  		if bufferedMode {
   102  			rgr = writer.AppendBufferedRowGroup()
   103  		} else {
   104  			rgr = writer.AppendRowGroup()
   105  		}
   106  
   107  		nextColumn := func() file.ColumnChunkWriter {
   108  			defer func() { colIndex++ }()
   109  			if bufferedMode {
   110  				cw, _ := rgr.(file.BufferedRowGroupWriter).Column(colIndex)
   111  				return cw
   112  			}
   113  			cw, _ := rgr.(file.SerialRowGroupWriter).NextColumn()
   114  			return cw
   115  		}
   116  
   117  		// write the bool col
   118  		boolWriter := nextColumn().(*file.BooleanColumnChunkWriter)
   119  		for i := 0; i < en.rowsPerRG; i++ {
   120  			value := (i % 2) == 0
   121  			boolWriter.WriteBatch([]bool{value}, nil, nil)
   122  		}
   123  
   124  		// write the int32 col
   125  		int32Writer := nextColumn().(*file.Int32ColumnChunkWriter)
   126  		for i := int32(0); i < int32(en.rowsPerRG); i++ {
   127  			int32Writer.WriteBatch([]int32{i}, nil, nil)
   128  		}
   129  
   130  		// write the int64 column, each row repeats twice
   131  		int64Writer := nextColumn().(*file.Int64ColumnChunkWriter)
   132  		for i := 0; i < 2*en.rowsPerRG; i++ {
   133  			var (
   134  				defLevel       = [1]int16{1}
   135  				repLevel       = [1]int16{0}
   136  				value    int64 = int64(i) * 1000 * 1000 * 1000 * 1000
   137  			)
   138  			if i%2 == 0 {
   139  				repLevel[0] = 1
   140  			}
   141  
   142  			int64Writer.WriteBatch([]int64{value}, defLevel[:], repLevel[:])
   143  		}
   144  
   145  		// write the int96 col
   146  		int96Writer := nextColumn().(*file.Int96ColumnChunkWriter)
   147  		for i := 0; i < en.rowsPerRG; i++ {
   148  			val := parquet.Int96{}
   149  			binary.LittleEndian.PutUint32(val[:], uint32(i))
   150  			binary.LittleEndian.PutUint32(val[4:], uint32(i+1))
   151  			binary.LittleEndian.PutUint32(val[8:], uint32(i+2))
   152  			int96Writer.WriteBatch([]parquet.Int96{val}, nil, nil)
   153  		}
   154  
   155  		// write the float column
   156  		floatWriter := nextColumn().(*file.Float32ColumnChunkWriter)
   157  		for i := 0; i < en.rowsPerRG; i++ {
   158  			val := float32(i) * 1.1
   159  			floatWriter.WriteBatch([]float32{val}, nil, nil)
   160  		}
   161  
   162  		// write the double column
   163  		doubleWriter := nextColumn().(*file.Float64ColumnChunkWriter)
   164  		for i := 0; i < en.rowsPerRG; i++ {
   165  			value := float64(i) * 1.1111111
   166  			doubleWriter.WriteBatch([]float64{value}, nil, nil)
   167  		}
   168  
   169  		// write the bytearray column. make every alternate value NULL
   170  		baWriter := nextColumn().(*file.ByteArrayColumnChunkWriter)
   171  		for i := 0; i < en.rowsPerRG; i++ {
   172  			var (
   173  				hello = []byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0}
   174  			)
   175  			hello[7] = byte(int('0') + i/100)
   176  			hello[8] = byte(int('0') + (i/10)%10)
   177  			hello[9] = byte(int('0') + i%10)
   178  			if i%2 == 0 {
   179  				baWriter.WriteBatch([]parquet.ByteArray{hello}, []int16{1}, nil)
   180  			} else {
   181  				baWriter.WriteBatch([]parquet.ByteArray{nil}, []int16{0}, nil)
   182  			}
   183  		}
   184  
   185  		// write fixedlength byte array column
   186  		flbaWriter := nextColumn().(*file.FixedLenByteArrayColumnChunkWriter)
   187  		for i := 0; i < en.rowsPerRG; i++ {
   188  			v := byte(i)
   189  			value := parquet.FixedLenByteArray{v, v, v, v, v, v, v, v, v, v}
   190  			flbaWriter.WriteBatch([]parquet.FixedLenByteArray{value}, nil, nil)
   191  		}
   192  	}
   193  }
   194  
   195  func (en *EncryptionConfigTestSuite) SetupSuite() {
   196  	var err error
   197  	tempdir, err = ioutil.TempDir("", "parquet-encryption-test-*")
   198  	en.Require().NoError(err)
   199  	fmt.Println(tempdir)
   200  
   201  	en.fileName = FileName
   202  	en.rowsPerRG = 50
   203  	en.numRgs = 5
   204  	en.pathToDoubleField = "double_field"
   205  	en.pathToFloatField = "float_field"
   206  	en.footerEncryptionKey = FooterEncryptionKey
   207  	en.columnEncryptionKey1 = ColumnEncryptionKey1
   208  	en.columnEncryptionKey2 = ColumnEncryptionKey2
   209  
   210  	fields := make(schema.FieldList, 0)
   211  	// create a primitive node named "boolean_field" with type BOOLEAN
   212  	// repetition:REQUIRED
   213  	fields = append(fields, schema.NewBooleanNode("boolean_field", parquet.Repetitions.Required, -1))
   214  	// create a primitive node named "int32_field" with type INT32 repetition REQUIRED
   215  	// and logical type: TIME_MILLIS
   216  	f, _ := schema.NewPrimitiveNodeLogical("int32_field", parquet.Repetitions.Required,
   217  		schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1)
   218  	fields = append(fields, f)
   219  
   220  	// create a primitive node named "int64_field" with type int64, repetition:REPEATED
   221  	fields = append(fields, schema.NewInt64Node("int64_field", parquet.Repetitions.Repeated, -1))
   222  
   223  	fields = append(fields,
   224  		schema.NewInt96Node("int96_field", parquet.Repetitions.Required, -1),
   225  		schema.NewFloat32Node("float_field", parquet.Repetitions.Required, -1),
   226  		schema.NewFloat64Node("double_field", parquet.Repetitions.Required, -1))
   227  
   228  	// create a primitive node named ba_field with type:BYTE_ARRAY repetition:OPTIONAL
   229  	fields = append(fields, schema.NewByteArrayNode("ba_field", parquet.Repetitions.Optional, -1))
   230  
   231  	// create a primitive node for flba_field
   232  	fields = append(fields, schema.NewFixedLenByteArrayNode("flba_field", parquet.Repetitions.Required, 10, -1))
   233  
   234  	// flba_field fixedlenbytearray
   235  	en.schema, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, -1)
   236  }
   237  
   238  // Encryption Config 1: Encrypt All columns and the footer with the same key
   239  // (uniform encryption)
   240  func (en *EncryptionConfigTestSuite) TestUniformEncryption() {
   241  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"))
   242  	en.encryptFile(props, "tmp_uniform_encryption.parquet.encrypted")
   243  }
   244  
   245  // Encryption config 2: Encrypt Two Columns and the Footer, with different keys
   246  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooter() {
   247  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   248  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   249  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   250  
   251  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols))
   252  	en.encryptFile(props, "tmp_encrypt_columns_and_footer.parquet.encrypted")
   253  }
   254  
   255  // Encryption Config 3: encrypt two columns, with different keys.
   256  // plaintext footer
   257  // (plaintext footer mode, readable by legacy readers)
   258  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsPlaintextFooter() {
   259  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   260  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   261  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   262  
   263  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithPlaintextFooter())
   264  	en.encryptFile(props, "tmp_encrypt_columns_plaintext_footer.parquet.encrypted")
   265  }
   266  
   267  // Encryption Config 4: Encrypt two columns and the footer, with different keys
   268  // use aad_prefix
   269  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterWithAadPrefix() {
   270  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   271  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   272  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   273  
   274  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithAadPrefix(en.fileName))
   275  	en.encryptFile(props, "tmp_encrypt_columns_and_footer_aad.parquet.encrypted")
   276  }
   277  
   278  // Encryption Config 5: Encrypt Two columns and the footer, with different keys
   279  // use aad_prefix and disable_aad_prefix_storage
   280  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterWithAadPrefixDisableAadStorage() {
   281  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   282  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   283  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   284  
   285  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithAadPrefix(en.fileName), parquet.DisableAadPrefixStorage())
   286  	en.encryptFile(props, "tmp_encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted")
   287  }
   288  
   289  // Encryption Config 6: Encrypt two columns and the footer, with different keys.
   290  // Use AES_GCM_CTR_V1
   291  func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterAesGcmCtr() {
   292  	encryptCols := make(parquet.ColumnPathToEncryptionPropsMap)
   293  	encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1"))
   294  	encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2"))
   295  
   296  	props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithAlg(parquet.AesCtr))
   297  	en.encryptFile(props, "tmp_encrypt_columns_and_footer_ctr.parquet.encrypted")
   298  }
   299  
   300  func TestFileEncryption(t *testing.T) {
   301  	suite.Run(t, new(EncryptionConfigTestSuite))
   302  }