github.com/apache/arrow/go/v14@v14.0.2/parquet/encryption_read_config_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet_test
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"os"
    23  	"path"
    24  	"testing"
    25  
    26  	"github.com/apache/arrow/go/v14/arrow/memory"
    27  	"github.com/apache/arrow/go/v14/parquet"
    28  	"github.com/apache/arrow/go/v14/parquet/file"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    30  	"github.com/stretchr/testify/suite"
    31  )
    32  
    33  /*
    34   * This file contains a unit-test for reading encrypted Parquet files with
    35   * different decryption configurations.
    36   *
    37   * The unit-test is called multiple times, each time to decrypt parquet files using
    38   * different decryption configuration as described below.
    39   * In each call two encrypted files are read: one temporary file that was generated using
    40   * encryption_write_config_test.go test and will be deleted upon
    41   * reading it, while the second resides in
    42   * parquet-testing/data repository. Those two encrypted files were encrypted using the
    43   * same encryption configuration.
    44   * The encrypted parquet file names are passed as parameter to the unit-test.
    45   *
    46   * A detailed description of the Parquet Modular Encryption specification can be found
    47   * here:
    48   * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
    49   *
    50   * The following decryption configurations are used to decrypt each parquet file:
    51   *
    52   *  - Decryption configuration 1:   Decrypt using key retriever that holds the keys of
    53   *                                  two encrypted columns and the footer key.
    54   *  - Decryption configuration 2:   Decrypt using key retriever that holds the keys of
    55   *                                  two encrypted columns and the footer key. Supplies
    56   *                                  aad_prefix to verify file identity.
    57   *  - Decryption configuration 3:   Decrypt using explicit column and footer keys
    58   *                                  (instead of key retrieval callback).
    59   *  - Decryption Configuration 4:   PlainText Footer mode - test legacy reads,
    60   *                                  read the footer + all non-encrypted columns.
    61   *                                  (pairs with encryption configuration 3)
    62   *
 * Each encrypted parquet file that is read was encrypted using one of the
 * configurations below:
    65   *
    66   *  - Encryption configuration 1:   Encrypt all columns and the footer with the same key.
    67   *                                  (uniform encryption)
    68   *  - Encryption configuration 2:   Encrypt two columns and the footer, with different
    69   *                                  keys.
    70   *  - Encryption configuration 3:   Encrypt two columns, with different keys.
    71   *                                  Don’t encrypt footer (to enable legacy readers)
    72   *                                  - plaintext footer mode.
    73   *  - Encryption configuration 4:   Encrypt two columns and the footer, with different
    74   *                                  keys. Supply aad_prefix for file identity
    75   *                                  verification.
    76   *  - Encryption configuration 5:   Encrypt two columns and the footer, with different
    77   *                                  keys. Supply aad_prefix, and call
    78   *                                  disable_aad_prefix_storage to prevent file
    79   *                                  identity storage in file metadata.
    80   *  - Encryption configuration 6:   Encrypt two columns and the footer, with different
    81   *                                  keys. Use the alternative (AES_GCM_CTR_V1) algorithm.
    82   */
    83  
    84  func getDataDir() string {
    85  	datadir := os.Getenv("PARQUET_TEST_DATA")
    86  	if datadir == "" {
    87  		panic("please point the PARQUET_TEST_DATA environment variable to the test data dir")
    88  	}
    89  	return datadir
    90  }
    91  
    92  type TestDecryptionSuite struct {
    93  	suite.Suite
    94  
    95  	pathToDouble        string
    96  	pathToFloat         string
    97  	decryptionConfigs   []*parquet.FileDecryptionProperties
    98  	footerEncryptionKey string
    99  	colEncryptionKey1   string
   100  	colEncryptionKey2   string
   101  	fileName            string
   102  	rowsPerRG           int
   103  }
   104  
   105  func (d *TestDecryptionSuite) TearDownSuite() {
   106  	os.Remove(tempdir)
   107  }
   108  
   109  func TestFileEncryptionDecryption(t *testing.T) {
   110  	suite.Run(t, new(EncryptionConfigTestSuite))
   111  	suite.Run(t, new(TestDecryptionSuite))
   112  }
   113  
   114  func (d *TestDecryptionSuite) SetupSuite() {
   115  	d.pathToDouble = "double_field"
   116  	d.pathToFloat = "float_field"
   117  	d.footerEncryptionKey = FooterEncryptionKey
   118  	d.colEncryptionKey1 = ColumnEncryptionKey1
   119  	d.colEncryptionKey2 = ColumnEncryptionKey2
   120  	d.fileName = FileName
   121  	d.rowsPerRG = 50 // same as write encryption test
   122  
   123  	d.createDecryptionConfigs()
   124  }
   125  
   126  func (d *TestDecryptionSuite) createDecryptionConfigs() {
   127  	// Decryption configuration 1: Decrypt using key retriever callback that holds the
   128  	// keys of two encrypted columns and the footer key.
   129  	stringKr1 := make(encryption.StringKeyIDRetriever)
   130  	stringKr1.PutKey("kf", d.footerEncryptionKey)
   131  	stringKr1.PutKey("kc1", d.colEncryptionKey1)
   132  	stringKr1.PutKey("kc2", d.colEncryptionKey2)
   133  
   134  	d.decryptionConfigs = append(d.decryptionConfigs,
   135  		parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr1)))
   136  
   137  	// Decryption configuration 2: Decrypt using key retriever callback that holds the
   138  	// keys of two encrypted columns and the footer key. Supply aad_prefix.
   139  	stringKr2 := make(encryption.StringKeyIDRetriever)
   140  	stringKr2.PutKey("kf", d.footerEncryptionKey)
   141  	stringKr2.PutKey("kc1", d.colEncryptionKey1)
   142  	stringKr2.PutKey("kc2", d.colEncryptionKey2)
   143  	d.decryptionConfigs = append(d.decryptionConfigs,
   144  		parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr2), parquet.WithDecryptAadPrefix(d.fileName)))
   145  
   146  	// Decryption configuration 3: Decrypt using explicit column and footer keys. Supply
   147  	// aad_prefix.
   148  	decryptCols := make(parquet.ColumnPathToDecryptionPropsMap)
   149  	decryptCols[d.pathToFloat] = parquet.NewColumnDecryptionProperties(d.pathToFloat, parquet.WithDecryptKey(d.colEncryptionKey2))
   150  	decryptCols[d.pathToDouble] = parquet.NewColumnDecryptionProperties(d.pathToDouble, parquet.WithDecryptKey(d.colEncryptionKey1))
   151  	d.decryptionConfigs = append(d.decryptionConfigs,
   152  		parquet.NewFileDecryptionProperties(parquet.WithFooterKey(d.footerEncryptionKey), parquet.WithColumnKeys(decryptCols)))
   153  
   154  	// Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext
   155  	// columns.
   156  	d.decryptionConfigs = append(d.decryptionConfigs, nil)
   157  }
   158  
   159  func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) {
   160  	// if we get decryption_config_num = x then it means the actual number is x+1
   161  	// and since we want decryption_config_num=4 we set the condition to 3
   162  	props := parquet.NewReaderProperties(memory.DefaultAllocator)
   163  	if decryptConfigNum != 3 {
   164  		props.FileDecryptProps = d.decryptionConfigs[decryptConfigNum].Clone("")
   165  	}
   166  
   167  	fileReader, err := file.OpenParquetFile(filename, false, file.WithReadProps(props))
   168  	if err != nil {
   169  		panic(err)
   170  	}
   171  	defer fileReader.Close()
   172  	// get metadata
   173  	fileMetadata := fileReader.MetaData()
   174  	// get number of rowgroups
   175  	numRowGroups := len(fileMetadata.RowGroups)
   176  	// number of columns
   177  	numColumns := fileMetadata.Schema.NumColumns()
   178  	d.Equal(8, numColumns)
   179  
   180  	for r := 0; r < numRowGroups; r++ {
   181  		rowGroupReader := fileReader.RowGroup(r)
   182  
   183  		// get rowgroup meta
   184  		rgMeta := fileMetadata.RowGroup(r)
   185  		d.EqualValues(d.rowsPerRG, rgMeta.NumRows())
   186  
   187  		valuesRead := 0
   188  		rowsRead := int64(0)
   189  
   190  		// get col reader for boolean column
   191  		colReader, err := rowGroupReader.Column(0)
   192  		if err != nil {
   193  			panic(err)
   194  		}
   195  		boolReader := colReader.(*file.BooleanColumnChunkReader)
   196  
   197  		// get column chunk metadata for boolean column
   198  		boolMd, _ := rgMeta.ColumnChunk(0)
   199  		d.EqualValues(d.rowsPerRG, boolMd.NumValues())
   200  
   201  		// Read all rows in column
   202  		i := 0
   203  		for boolReader.HasNext() {
   204  			var val [1]bool
   205  			// read one value at a time. the number of rows read is returned. values
   206  			// read contains the number of non-null rows
   207  			rowsRead, valuesRead, _ = boolReader.ReadBatch(1, val[:], nil, nil)
   208  			// ensure only 1 value is read
   209  			d.EqualValues(1, rowsRead)
   210  			// there are no null values
   211  			d.EqualValues(1, valuesRead)
   212  			// verify the value
   213  			expected := i%2 == 0
   214  			d.Equal(expected, val[0], "i: ", i)
   215  			i++
   216  		}
   217  		d.EqualValues(i, boolMd.NumValues())
   218  
   219  		// Get column reader for int32 column
   220  		colReader, err = rowGroupReader.Column(1)
   221  		if err != nil {
   222  			panic(err)
   223  		}
   224  		int32reader := colReader.(*file.Int32ColumnChunkReader)
   225  
   226  		int32md, _ := rgMeta.ColumnChunk(1)
   227  		d.EqualValues(d.rowsPerRG, int32md.NumValues())
   228  		// Read all rows in column
   229  		i = 0
   230  		for int32reader.HasNext() {
   231  			var val [1]int32
   232  			// read one value at a time. the number of rows read is returned. values
   233  			// read contains the number of non-null rows
   234  			rowsRead, valuesRead, _ = int32reader.ReadBatch(1, val[:], nil, nil)
   235  			// ensure only 1 value is read
   236  			d.EqualValues(1, rowsRead)
   237  			// there are no null values
   238  			d.EqualValues(1, valuesRead)
   239  			// verify the value
   240  			d.EqualValues(i, val[0])
   241  			i++
   242  		}
   243  		d.EqualValues(i, int32md.NumValues())
   244  
   245  		// Get column reader for int64 column
   246  		colReader, err = rowGroupReader.Column(2)
   247  		if err != nil {
   248  			panic(err)
   249  		}
   250  		int64reader := colReader.(*file.Int64ColumnChunkReader)
   251  
   252  		int64md, _ := rgMeta.ColumnChunk(2)
   253  		// repeated column, we should have 2*d.rowsPerRG values
   254  		d.EqualValues(2*d.rowsPerRG, int64md.NumValues())
   255  		// Read all rows in column
   256  		i = 0
   257  		for int64reader.HasNext() {
   258  			var (
   259  				val [1]int64
   260  				def [1]int16
   261  				rep [1]int16
   262  			)
   263  
   264  			// read one value at a time. the number of rows read is returned. values
   265  			// read contains the number of non-null rows
   266  			rowsRead, valuesRead, _ = int64reader.ReadBatch(1, val[:], def[:], rep[:])
   267  			// ensure only 1 value is read
   268  			d.EqualValues(1, rowsRead)
   269  			// there are no null values
   270  			d.EqualValues(1, valuesRead)
   271  			// verify the value
   272  			expectedValue := int64(i) * 1000 * 1000 * 1000 * 1000
   273  			d.Equal(expectedValue, val[0])
   274  			if i%2 == 0 {
   275  				d.EqualValues(1, rep[0])
   276  			} else {
   277  				d.Zero(rep[0])
   278  			}
   279  			i++
   280  		}
   281  		d.EqualValues(i, int64md.NumValues())
   282  
   283  		// Get column reader for int96 column
   284  		colReader, err = rowGroupReader.Column(3)
   285  		if err != nil {
   286  			panic(err)
   287  		}
   288  		int96reader := colReader.(*file.Int96ColumnChunkReader)
   289  
   290  		int96md, _ := rgMeta.ColumnChunk(3)
   291  		// Read all rows in column
   292  		i = 0
   293  		for int96reader.HasNext() {
   294  			var (
   295  				val [1]parquet.Int96
   296  			)
   297  
   298  			// read one value at a time. the number of rows read is returned. values
   299  			// read contains the number of non-null rows
   300  			rowsRead, valuesRead, _ = int96reader.ReadBatch(1, val[:], nil, nil)
   301  			// ensure only 1 value is read
   302  			d.EqualValues(1, rowsRead)
   303  			// there are no null values
   304  			d.EqualValues(1, valuesRead)
   305  			// verify the value
   306  			var expectedValue parquet.Int96
   307  			binary.LittleEndian.PutUint32(expectedValue[:4], uint32(i))
   308  			binary.LittleEndian.PutUint32(expectedValue[4:], uint32(i+1))
   309  			binary.LittleEndian.PutUint32(expectedValue[8:], uint32(i+2))
   310  			d.Equal(expectedValue, val[0])
   311  			i++
   312  		}
   313  		d.EqualValues(i, int96md.NumValues())
   314  
   315  		// these two columns are always encrypted when we write them, so don't
   316  		// try to read them during the plaintext test.
   317  		if props.FileDecryptProps != nil {
   318  			// Get column reader for the float column
   319  			colReader, err = rowGroupReader.Column(4)
   320  			if err != nil {
   321  				panic(err)
   322  			}
   323  			floatReader := colReader.(*file.Float32ColumnChunkReader)
   324  
   325  			floatmd, _ := rgMeta.ColumnChunk(4)
   326  
   327  			i = 0
   328  			for floatReader.HasNext() {
   329  				var value [1]float32
   330  				// read one value at a time. the number of rows read is returned. values
   331  				// read contains the number of non-null rows
   332  				rowsRead, valuesRead, _ = floatReader.ReadBatch(1, value[:], nil, nil)
   333  				// ensure only 1 value is read
   334  				d.EqualValues(1, rowsRead)
   335  				// there are no null values
   336  				d.EqualValues(1, valuesRead)
   337  				// verify the value
   338  				expectedValue := float32(i) * 1.1
   339  				d.Equal(expectedValue, value[0])
   340  				i++
   341  			}
   342  			d.EqualValues(i, floatmd.NumValues())
   343  
   344  			// Get column reader for the double column
   345  			colReader, err = rowGroupReader.Column(5)
   346  			if err != nil {
   347  				panic(err)
   348  			}
   349  			dblReader := colReader.(*file.Float64ColumnChunkReader)
   350  
   351  			dblmd, _ := rgMeta.ColumnChunk(5)
   352  
   353  			i = 0
   354  			for dblReader.HasNext() {
   355  				var value [1]float64
   356  				// read one value at a time. the number of rows read is returned. values
   357  				// read contains the number of non-null rows
   358  				rowsRead, valuesRead, _ = dblReader.ReadBatch(1, value[:], nil, nil)
   359  				// ensure only 1 value is read
   360  				d.EqualValues(1, rowsRead)
   361  				// there are no null values
   362  				d.EqualValues(1, valuesRead)
   363  				// verify the value
   364  				expectedValue := float64(i) * 1.1111111
   365  				d.Equal(expectedValue, value[0])
   366  				i++
   367  			}
   368  			d.EqualValues(i, dblmd.NumValues())
   369  		}
   370  
   371  		colReader, err = rowGroupReader.Column(6)
   372  		if err != nil {
   373  			panic(err)
   374  		}
   375  		bareader := colReader.(*file.ByteArrayColumnChunkReader)
   376  
   377  		bamd, _ := rgMeta.ColumnChunk(6)
   378  
   379  		i = 0
   380  		for bareader.HasNext() {
   381  			var value [1]parquet.ByteArray
   382  			var def [1]int16
   383  
   384  			rowsRead, valuesRead, _ := bareader.ReadBatch(1, value[:], def[:], nil)
   385  			d.EqualValues(1, rowsRead)
   386  			expected := [10]byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0}
   387  			expected[7] = byte('0') + byte(i/100)
   388  			expected[8] = byte('0') + byte(i/10)%10
   389  			expected[9] = byte('0') + byte(i%10)
   390  			if i%2 == 0 {
   391  				d.Equal(1, valuesRead)
   392  				d.Len(value[0], 10)
   393  				d.EqualValues(expected[:], value[0])
   394  				d.EqualValues(1, def[0])
   395  			} else {
   396  				d.Zero(valuesRead)
   397  				d.Zero(def[0])
   398  			}
   399  			i++
   400  		}
   401  		d.EqualValues(i, bamd.NumValues())
   402  	}
   403  }
   404  
   405  func (d *TestDecryptionSuite) checkResults(fileName string, decryptionConfig, encryptionConfig uint) {
   406  	decFn := func() { d.decryptFile(fileName, int(decryptionConfig-1)) }
   407  
   408  	// Encryption configuration number 5 contains aad_prefix and disable_aad_prefix_storage
   409  	// an exception is expected to be thrown if the file is not decrypted with aad_prefix
   410  	if encryptionConfig == 5 {
   411  		if decryptionConfig == 1 || decryptionConfig == 3 {
   412  			d.Panics(decFn)
   413  			return
   414  		}
   415  	}
   416  
   417  	// decryption config number two contains aad_prefix. an exception
   418  	// is expected to be thrown if the file was not encrypted with the same aad_prefix
   419  	if decryptionConfig == 2 {
   420  		if encryptionConfig != 5 && encryptionConfig != 4 {
   421  			d.Panics(decFn)
   422  			return
   423  		}
   424  	}
   425  
   426  	// decryption config 4 can only work when the encryption config is 3
   427  	if decryptionConfig == 4 && encryptionConfig != 3 {
   428  		return
   429  	}
   430  	d.NotPanics(decFn)
   431  }
   432  
   433  // Read encrypted parquet file.
   434  // the test reads two parquet files that were encrypted using the same encryption config
   435  // one was generated in encryption_write_configurations_test.go tests and is deleted
   436  // once the file is read and the second exists in parquet-testing/data folder
   437  func (d *TestDecryptionSuite) TestDecryption() {
   438  	tests := []struct {
   439  		file   string
   440  		config uint
   441  	}{
   442  		{"uniform_encryption.parquet.encrypted", 1},
   443  		{"encrypt_columns_and_footer.parquet.encrypted", 2},
   444  		{"encrypt_columns_plaintext_footer.parquet.encrypted", 3},
   445  		{"encrypt_columns_and_footer_aad.parquet.encrypted", 4},
   446  		{"encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted", 5},
   447  		{"encrypt_columns_and_footer_ctr.parquet.encrypted", 6},
   448  	}
   449  	for _, tt := range tests {
   450  		d.Run(tt.file, func() {
   451  			// decrypt file that was generated in encryption-write-tests
   452  			tmpFile := path.Join(tempdir, "tmp_"+tt.file)
   453  			d.Require().FileExists(tmpFile)
   454  
   455  			// iterate over decryption configs and use each one to read the encrypted file
   456  			for idx := range d.decryptionConfigs {
   457  				decConfig := idx + 1
   458  				d.checkResults(tmpFile, uint(decConfig), tt.config)
   459  			}
   460  			os.Remove(tmpFile)
   461  
   462  			file := path.Join(getDataDir(), tt.file)
   463  			d.Require().FileExists(file)
   464  
   465  			for idx := range d.decryptionConfigs {
   466  				decConfig := idx + 1
   467  				d.Run(fmt.Sprintf("config %d", decConfig), func() {
   468  					d.checkResults(file, uint(decConfig), tt.config)
   469  				})
   470  			}
   471  		})
   472  	}
   473  }