github.com/apache/arrow/go/v10@v10.0.1/parquet/encryption_read_config_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet_test
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"os"
    23  	"path"
    24  	"testing"
    25  
    26  	"github.com/apache/arrow/go/v10/arrow/memory"
    27  	"github.com/apache/arrow/go/v10/parquet"
    28  	"github.com/apache/arrow/go/v10/parquet/file"
    29  	"github.com/apache/arrow/go/v10/parquet/internal/encryption"
    30  	"github.com/stretchr/testify/suite"
    31  )
    32  
    33  /*
    34   * This file contains a unit-test for reading encrypted Parquet files with
    35   * different decryption configurations.
    36   *
    37   * The unit-test is called multiple times, each time to decrypt parquet files using
    38   * different decryption configuration as described below.
    39   * In each call two encrypted files are read: one temporary file that was generated using
    40   * encryption_write_config_test.go test and will be deleted upon
    41   * reading it, while the second resides in
    42   * parquet-testing/data repository. Those two encrypted files were encrypted using the
    43   * same encryption configuration.
    44   * The encrypted parquet file names are passed as parameter to the unit-test.
    45   *
    46   * A detailed description of the Parquet Modular Encryption specification can be found
    47   * here:
    48   * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
    49   *
    50   * The following decryption configurations are used to decrypt each parquet file:
    51   *
    52   *  - Decryption configuration 1:   Decrypt using key retriever that holds the keys of
    53   *                                  two encrypted columns and the footer key.
    54   *  - Decryption configuration 2:   Decrypt using key retriever that holds the keys of
    55   *                                  two encrypted columns and the footer key. Supplies
    56   *                                  aad_prefix to verify file identity.
    57   *  - Decryption configuration 3:   Decrypt using explicit column and footer keys
    58   *                                  (instead of key retrieval callback).
    59   *  - Decryption Configuration 4:   PlainText Footer mode - test legacy reads,
    60   *                                  read the footer + all non-encrypted columns.
    61   *                                  (pairs with encryption configuration 3)
    62   *
 * Each encrypted parquet file that is read was encrypted using one of the configurations
 * below:
    65   *
    66   *  - Encryption configuration 1:   Encrypt all columns and the footer with the same key.
    67   *                                  (uniform encryption)
    68   *  - Encryption configuration 2:   Encrypt two columns and the footer, with different
    69   *                                  keys.
    70   *  - Encryption configuration 3:   Encrypt two columns, with different keys.
    71   *                                  Don’t encrypt footer (to enable legacy readers)
    72   *                                  - plaintext footer mode.
    73   *  - Encryption configuration 4:   Encrypt two columns and the footer, with different
    74   *                                  keys. Supply aad_prefix for file identity
    75   *                                  verification.
    76   *  - Encryption configuration 5:   Encrypt two columns and the footer, with different
    77   *                                  keys. Supply aad_prefix, and call
    78   *                                  disable_aad_prefix_storage to prevent file
    79   *                                  identity storage in file metadata.
    80   *  - Encryption configuration 6:   Encrypt two columns and the footer, with different
    81   *                                  keys. Use the alternative (AES_GCM_CTR_V1) algorithm.
    82   */
    83  
    84  func getDataDir() string {
    85  	datadir := os.Getenv("PARQUET_TEST_DATA")
    86  	if datadir == "" {
    87  		panic("please point the PARQUET_TEST_DATA environment variable to the test data dir")
    88  	}
    89  	return datadir
    90  }
    91  
    92  type TestDecryptionSuite struct {
    93  	suite.Suite
    94  
    95  	pathToDouble        string
    96  	pathToFloat         string
    97  	decryptionConfigs   []*parquet.FileDecryptionProperties
    98  	footerEncryptionKey string
    99  	colEncryptionKey1   string
   100  	colEncryptionKey2   string
   101  	fileName            string
   102  }
   103  
   104  func (d *TestDecryptionSuite) TearDownSuite() {
   105  	os.Remove(tempdir)
   106  }
   107  
   108  func TestFileEncryptionDecryption(t *testing.T) {
   109  	suite.Run(t, new(EncryptionConfigTestSuite))
   110  	suite.Run(t, new(TestDecryptionSuite))
   111  }
   112  
   113  func (d *TestDecryptionSuite) SetupSuite() {
   114  	d.pathToDouble = "double_field"
   115  	d.pathToFloat = "float_field"
   116  	d.footerEncryptionKey = FooterEncryptionKey
   117  	d.colEncryptionKey1 = ColumnEncryptionKey1
   118  	d.colEncryptionKey2 = ColumnEncryptionKey2
   119  	d.fileName = FileName
   120  
   121  	d.createDecryptionConfigs()
   122  }
   123  
   124  func (d *TestDecryptionSuite) createDecryptionConfigs() {
   125  	// Decryption configuration 1: Decrypt using key retriever callback that holds the
   126  	// keys of two encrypted columns and the footer key.
   127  	stringKr1 := make(encryption.StringKeyIDRetriever)
   128  	stringKr1.PutKey("kf", d.footerEncryptionKey)
   129  	stringKr1.PutKey("kc1", d.colEncryptionKey1)
   130  	stringKr1.PutKey("kc2", d.colEncryptionKey2)
   131  
   132  	d.decryptionConfigs = append(d.decryptionConfigs,
   133  		parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr1)))
   134  
   135  	// Decryption configuration 2: Decrypt using key retriever callback that holds the
   136  	// keys of two encrypted columns and the footer key. Supply aad_prefix.
   137  	stringKr2 := make(encryption.StringKeyIDRetriever)
   138  	stringKr2.PutKey("kf", d.footerEncryptionKey)
   139  	stringKr2.PutKey("kc1", d.colEncryptionKey1)
   140  	stringKr2.PutKey("kc2", d.colEncryptionKey2)
   141  	d.decryptionConfigs = append(d.decryptionConfigs,
   142  		parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr2), parquet.WithDecryptAadPrefix(d.fileName)))
   143  
   144  	// Decryption configuration 3: Decrypt using explicit column and footer keys. Supply
   145  	// aad_prefix.
   146  	decryptCols := make(parquet.ColumnPathToDecryptionPropsMap)
   147  	decryptCols[d.pathToFloat] = parquet.NewColumnDecryptionProperties(d.pathToFloat, parquet.WithDecryptKey(d.colEncryptionKey2))
   148  	decryptCols[d.pathToDouble] = parquet.NewColumnDecryptionProperties(d.pathToDouble, parquet.WithDecryptKey(d.colEncryptionKey1))
   149  	d.decryptionConfigs = append(d.decryptionConfigs,
   150  		parquet.NewFileDecryptionProperties(parquet.WithFooterKey(d.footerEncryptionKey), parquet.WithColumnKeys(decryptCols)))
   151  
   152  	// Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext
   153  	// columns.
   154  	d.decryptionConfigs = append(d.decryptionConfigs, nil)
   155  }
   156  
   157  func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) {
   158  	// if we get decryption_config_num = x then it means the actual number is x+1
   159  	// and since we want decryption_config_num=4 we set the condition to 3
   160  	props := parquet.NewReaderProperties(memory.DefaultAllocator)
   161  	if decryptConfigNum != 3 {
   162  		props.FileDecryptProps = d.decryptionConfigs[decryptConfigNum].Clone("")
   163  	}
   164  
   165  	fileReader, err := file.OpenParquetFile(filename, false, file.WithReadProps(props))
   166  	if err != nil {
   167  		panic(err)
   168  	}
   169  	defer fileReader.Close()
   170  	// get metadata
   171  	fileMetadata := fileReader.MetaData()
   172  	// get number of rowgroups
   173  	numRowGroups := len(fileMetadata.RowGroups)
   174  	// number of columns
   175  	numColumns := fileMetadata.Schema.NumColumns()
   176  	d.Equal(8, numColumns)
   177  
   178  	for r := 0; r < numRowGroups; r++ {
   179  		rowGroupReader := fileReader.RowGroup(r)
   180  
   181  		// get rowgroup meta
   182  		rgMeta := fileMetadata.RowGroup(r)
   183  
   184  		valuesRead := 0
   185  		rowsRead := int64(0)
   186  
   187  		// get col reader for boolean column
   188  		colReader, err := rowGroupReader.Column(0)
   189  		if err != nil {
   190  			panic(err)
   191  		}
   192  		boolReader := colReader.(*file.BooleanColumnChunkReader)
   193  
   194  		// get column chunk metadata for boolean column
   195  		boolMd, _ := rgMeta.ColumnChunk(0)
   196  
   197  		// Read all rows in column
   198  		i := 0
   199  		for boolReader.HasNext() {
   200  			var val [1]bool
   201  			// read one value at a time. the number of rows read is returned. values
   202  			// read contains the number of non-null rows
   203  			rowsRead, valuesRead, _ = boolReader.ReadBatch(1, val[:], nil, nil)
   204  			// ensure only 1 value is read
   205  			d.EqualValues(1, rowsRead)
   206  			// there are no null values
   207  			d.EqualValues(1, valuesRead)
   208  			// verify the value
   209  			expected := i%2 == 0
   210  			d.Equal(expected, val[0], "i: ", i)
   211  			i++
   212  		}
   213  		d.EqualValues(i, boolMd.NumValues())
   214  
   215  		// Get column reader for int32 column
   216  		colReader, err = rowGroupReader.Column(1)
   217  		if err != nil {
   218  			panic(err)
   219  		}
   220  		int32reader := colReader.(*file.Int32ColumnChunkReader)
   221  
   222  		int32md, _ := rgMeta.ColumnChunk(1)
   223  		// Read all rows in column
   224  		i = 0
   225  		for int32reader.HasNext() {
   226  			var val [1]int32
   227  			// read one value at a time. the number of rows read is returned. values
   228  			// read contains the number of non-null rows
   229  			rowsRead, valuesRead, _ = int32reader.ReadBatch(1, val[:], nil, nil)
   230  			// ensure only 1 value is read
   231  			d.EqualValues(1, rowsRead)
   232  			// there are no null values
   233  			d.EqualValues(1, valuesRead)
   234  			// verify the value
   235  			d.EqualValues(i, val[0])
   236  			i++
   237  		}
   238  		d.EqualValues(i, int32md.NumValues())
   239  
   240  		// Get column reader for int64 column
   241  		colReader, err = rowGroupReader.Column(2)
   242  		if err != nil {
   243  			panic(err)
   244  		}
   245  		int64reader := colReader.(*file.Int64ColumnChunkReader)
   246  
   247  		int64md, _ := rgMeta.ColumnChunk(2)
   248  		// Read all rows in column
   249  		i = 0
   250  		for int64reader.HasNext() {
   251  			var (
   252  				val [1]int64
   253  				def [1]int16
   254  				rep [1]int16
   255  			)
   256  
   257  			// read one value at a time. the number of rows read is returned. values
   258  			// read contains the number of non-null rows
   259  			rowsRead, valuesRead, _ = int64reader.ReadBatch(1, val[:], def[:], rep[:])
   260  			// ensure only 1 value is read
   261  			d.EqualValues(1, rowsRead)
   262  			// there are no null values
   263  			d.EqualValues(1, valuesRead)
   264  			// verify the value
   265  			expectedValue := int64(i) * 1000 * 1000 * 1000 * 1000
   266  			d.Equal(expectedValue, val[0])
   267  			if i%2 == 0 {
   268  				d.EqualValues(1, rep[0])
   269  			} else {
   270  				d.Zero(rep[0])
   271  			}
   272  			i++
   273  		}
   274  		d.EqualValues(i, int64md.NumValues())
   275  
   276  		// Get column reader for int96 column
   277  		colReader, err = rowGroupReader.Column(3)
   278  		if err != nil {
   279  			panic(err)
   280  		}
   281  		int96reader := colReader.(*file.Int96ColumnChunkReader)
   282  
   283  		int96md, _ := rgMeta.ColumnChunk(3)
   284  		// Read all rows in column
   285  		i = 0
   286  		for int96reader.HasNext() {
   287  			var (
   288  				val [1]parquet.Int96
   289  			)
   290  
   291  			// read one value at a time. the number of rows read is returned. values
   292  			// read contains the number of non-null rows
   293  			rowsRead, valuesRead, _ = int96reader.ReadBatch(1, val[:], nil, nil)
   294  			// ensure only 1 value is read
   295  			d.EqualValues(1, rowsRead)
   296  			// there are no null values
   297  			d.EqualValues(1, valuesRead)
   298  			// verify the value
   299  			var expectedValue parquet.Int96
   300  			binary.LittleEndian.PutUint32(expectedValue[:4], uint32(i))
   301  			binary.LittleEndian.PutUint32(expectedValue[4:], uint32(i+1))
   302  			binary.LittleEndian.PutUint32(expectedValue[8:], uint32(i+2))
   303  			d.Equal(expectedValue, val[0])
   304  			i++
   305  		}
   306  		d.EqualValues(i, int96md.NumValues())
   307  
   308  		// these two columns are always encrypted when we write them, so don't
   309  		// try to read them during the plaintext test.
   310  		if props.FileDecryptProps != nil {
   311  			// Get column reader for the float column
   312  			colReader, err = rowGroupReader.Column(4)
   313  			if err != nil {
   314  				panic(err)
   315  			}
   316  			floatReader := colReader.(*file.Float32ColumnChunkReader)
   317  
   318  			floatmd, _ := rgMeta.ColumnChunk(4)
   319  
   320  			i = 0
   321  			for floatReader.HasNext() {
   322  				var value [1]float32
   323  				// read one value at a time. the number of rows read is returned. values
   324  				// read contains the number of non-null rows
   325  				rowsRead, valuesRead, _ = floatReader.ReadBatch(1, value[:], nil, nil)
   326  				// ensure only 1 value is read
   327  				d.EqualValues(1, rowsRead)
   328  				// there are no null values
   329  				d.EqualValues(1, valuesRead)
   330  				// verify the value
   331  				expectedValue := float32(i) * 1.1
   332  				d.Equal(expectedValue, value[0])
   333  				i++
   334  			}
   335  			d.EqualValues(i, floatmd.NumValues())
   336  
   337  			// Get column reader for the double column
   338  			colReader, err = rowGroupReader.Column(5)
   339  			if err != nil {
   340  				panic(err)
   341  			}
   342  			dblReader := colReader.(*file.Float64ColumnChunkReader)
   343  
   344  			dblmd, _ := rgMeta.ColumnChunk(5)
   345  
   346  			i = 0
   347  			for dblReader.HasNext() {
   348  				var value [1]float64
   349  				// read one value at a time. the number of rows read is returned. values
   350  				// read contains the number of non-null rows
   351  				rowsRead, valuesRead, _ = dblReader.ReadBatch(1, value[:], nil, nil)
   352  				// ensure only 1 value is read
   353  				d.EqualValues(1, rowsRead)
   354  				// there are no null values
   355  				d.EqualValues(1, valuesRead)
   356  				// verify the value
   357  				expectedValue := float64(i) * 1.1111111
   358  				d.Equal(expectedValue, value[0])
   359  				i++
   360  			}
   361  			d.EqualValues(i, dblmd.NumValues())
   362  		}
   363  
   364  		colReader, err = rowGroupReader.Column(6)
   365  		if err != nil {
   366  			panic(err)
   367  		}
   368  		bareader := colReader.(*file.ByteArrayColumnChunkReader)
   369  
   370  		bamd, _ := rgMeta.ColumnChunk(6)
   371  
   372  		i = 0
   373  		for bareader.HasNext() {
   374  			var value [1]parquet.ByteArray
   375  			var def [1]int16
   376  
   377  			rowsRead, valuesRead, _ := bareader.ReadBatch(1, value[:], def[:], nil)
   378  			d.EqualValues(1, rowsRead)
   379  			expected := [10]byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0}
   380  			expected[7] = byte('0') + byte(i/100)
   381  			expected[8] = byte('0') + byte(i/10)%10
   382  			expected[9] = byte('0') + byte(i%10)
   383  			if i%2 == 0 {
   384  				d.Equal(1, valuesRead)
   385  				d.Len(value[0], 10)
   386  				d.EqualValues(expected[:], value[0])
   387  				d.EqualValues(1, def[0])
   388  			} else {
   389  				d.Zero(valuesRead)
   390  				d.Zero(def[0])
   391  			}
   392  			i++
   393  		}
   394  		d.EqualValues(i, bamd.NumValues())
   395  	}
   396  }
   397  
   398  func (d *TestDecryptionSuite) checkResults(fileName string, decryptionConfig, encryptionConfig uint) {
   399  	decFn := func() { d.decryptFile(fileName, int(decryptionConfig-1)) }
   400  
   401  	// Encryption configuration number 5 contains aad_prefix and disable_aad_prefix_storage
   402  	// an exception is expected to be thrown if the file is not decrypted with aad_prefix
   403  	if encryptionConfig == 5 {
   404  		if decryptionConfig == 1 || decryptionConfig == 3 {
   405  			d.Panics(decFn)
   406  			return
   407  		}
   408  	}
   409  
   410  	// decryption config number two contains aad_prefix. an exception
   411  	// is expected to be thrown if the file was not encrypted with the same aad_prefix
   412  	if decryptionConfig == 2 {
   413  		if encryptionConfig != 5 && encryptionConfig != 4 {
   414  			d.Panics(decFn)
   415  			return
   416  		}
   417  	}
   418  
   419  	// decryption config 4 can only work when the encryption config is 3
   420  	if decryptionConfig == 4 && encryptionConfig != 3 {
   421  		return
   422  	}
   423  	d.NotPanics(decFn)
   424  }
   425  
   426  // Read encrypted parquet file.
   427  // the test reads two parquet files that were encrypted using the same encryption config
   428  // one was generated in encryption_write_configurations_test.go tests and is deleted
   429  // once the file is read and the second exists in parquet-testing/data folder
   430  func (d *TestDecryptionSuite) TestDecryption() {
   431  	tests := []struct {
   432  		file   string
   433  		config uint
   434  	}{
   435  		{"uniform_encryption.parquet.encrypted", 1},
   436  		{"encrypt_columns_and_footer.parquet.encrypted", 2},
   437  		{"encrypt_columns_plaintext_footer.parquet.encrypted", 3},
   438  		{"encrypt_columns_and_footer_aad.parquet.encrypted", 4},
   439  		{"encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted", 5},
   440  		{"encrypt_columns_and_footer_ctr.parquet.encrypted", 6},
   441  	}
   442  	for _, tt := range tests {
   443  		d.Run(tt.file, func() {
   444  			// decrypt file that was generated in encryption-write-tests
   445  			tmpFile := path.Join(tempdir, "tmp_"+tt.file)
   446  			d.Require().FileExists(tmpFile)
   447  
   448  			// iterate over decryption configs and use each one to read the encrypted file
   449  			for idx := range d.decryptionConfigs {
   450  				decConfig := idx + 1
   451  				d.checkResults(tmpFile, uint(decConfig), tt.config)
   452  			}
   453  			os.Remove(tmpFile)
   454  
   455  			file := path.Join(getDataDir(), tt.file)
   456  			d.Require().FileExists(file)
   457  
   458  			for idx := range d.decryptionConfigs {
   459  				decConfig := idx + 1
   460  				d.Run(fmt.Sprintf("config %d", decConfig), func() {
   461  					d.checkResults(file, uint(decConfig), tt.config)
   462  				})
   463  			}
   464  		})
   465  	}
   466  }