github.com/apache/arrow/go/v14@v14.0.1/parquet/encryption_read_config_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package parquet_test

import (
	"encoding/binary"
	"fmt"
	"os"
	"path"
	"testing"

	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
	"github.com/stretchr/testify/suite"
)

/*
 * This file contains a unit-test for reading encrypted Parquet files with
 * different decryption configurations.
 *
 * The unit-test is called multiple times, each time decrypting parquet files using
 * a different decryption configuration, as described below.
 * In each call two encrypted files are read: a temporary file that was generated by
 * the encryption_write_config_test.go test and is deleted after being read, and a
 * second file that resides in the parquet-testing/data repository. Both encrypted
 * files were encrypted using the same encryption configuration.
 * The encrypted parquet file names are passed as a parameter to the unit-test.
 *
 * A detailed description of the Parquet Modular Encryption specification can be found
 * here:
 * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
 *
 * The following decryption configurations are used to decrypt each parquet file:
 *
 *  - Decryption configuration 1: Decrypt using key retriever that holds the keys of
 *                                two encrypted columns and the footer key.
 *  - Decryption configuration 2: Decrypt using key retriever that holds the keys of
 *                                two encrypted columns and the footer key. Supplies
 *                                aad_prefix to verify file identity.
 *  - Decryption configuration 3: Decrypt using explicit column and footer keys
 *                                (instead of key retrieval callback).
 *  - Decryption configuration 4: Plaintext footer mode - test legacy reads,
 *                                read the footer + all non-encrypted columns.
 *                                (pairs with encryption configuration 3)
 *
 * The encrypted parquet files that are read were encrypted using one of the
 * configurations below:
 *
 *  - Encryption configuration 1: Encrypt all columns and the footer with the same key.
 *                                (uniform encryption)
 *  - Encryption configuration 2: Encrypt two columns and the footer, with different
 *                                keys.
 *  - Encryption configuration 3: Encrypt two columns, with different keys.
 *                                Don't encrypt the footer (to enable legacy readers)
 *                                - plaintext footer mode.
 *  - Encryption configuration 4: Encrypt two columns and the footer, with different
 *                                keys. Supply aad_prefix for file identity
 *                                verification.
 *  - Encryption configuration 5: Encrypt two columns and the footer, with different
 *                                keys. Supply aad_prefix, and call
 *                                disable_aad_prefix_storage to prevent file
 *                                identity storage in file metadata.
 *  - Encryption configuration 6: Encrypt two columns and the footer, with different
 *                                keys. Use the alternative (AES_GCM_CTR_V1) algorithm.
 */

func getDataDir() string {
	datadir := os.Getenv("PARQUET_TEST_DATA")
	if datadir == "" {
		panic("please point the PARQUET_TEST_DATA environment variable to the test data dir")
	}
	return datadir
}

type TestDecryptionSuite struct {
	suite.Suite

	pathToDouble        string
	pathToFloat         string
	decryptionConfigs   []*parquet.FileDecryptionProperties
	footerEncryptionKey string
	colEncryptionKey1   string
	colEncryptionKey2   string
	fileName            string
	rowsPerRG           int
}

func (d *TestDecryptionSuite) TearDownSuite() {
	os.Remove(tempdir)
}

func TestFileEncryptionDecryption(t *testing.T) {
	suite.Run(t, new(EncryptionConfigTestSuite))
	suite.Run(t, new(TestDecryptionSuite))
}

func (d *TestDecryptionSuite) SetupSuite() {
	d.pathToDouble = "double_field"
	d.pathToFloat = "float_field"
	d.footerEncryptionKey = FooterEncryptionKey
	d.colEncryptionKey1 = ColumnEncryptionKey1
	d.colEncryptionKey2 = ColumnEncryptionKey2
	d.fileName = FileName
	d.rowsPerRG = 50 // same as write encryption test

	d.createDecryptionConfigs()
}

func (d *TestDecryptionSuite) createDecryptionConfigs() {
	// Decryption configuration 1: Decrypt using key retriever callback that holds the
	// keys of two encrypted columns and the footer key.
	stringKr1 := make(encryption.StringKeyIDRetriever)
	stringKr1.PutKey("kf", d.footerEncryptionKey)
	stringKr1.PutKey("kc1", d.colEncryptionKey1)
	stringKr1.PutKey("kc2", d.colEncryptionKey2)

	d.decryptionConfigs = append(d.decryptionConfigs,
		parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr1)))

	// Decryption configuration 2: Decrypt using key retriever callback that holds the
	// keys of two encrypted columns and the footer key. Supply aad_prefix.
	stringKr2 := make(encryption.StringKeyIDRetriever)
	stringKr2.PutKey("kf", d.footerEncryptionKey)
	stringKr2.PutKey("kc1", d.colEncryptionKey1)
	stringKr2.PutKey("kc2", d.colEncryptionKey2)
	d.decryptionConfigs = append(d.decryptionConfigs,
		parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr2), parquet.WithDecryptAadPrefix(d.fileName)))

	// Decryption configuration 3: Decrypt using explicit column and footer keys. Supply
	// aad_prefix.
	decryptCols := make(parquet.ColumnPathToDecryptionPropsMap)
	decryptCols[d.pathToFloat] = parquet.NewColumnDecryptionProperties(d.pathToFloat, parquet.WithDecryptKey(d.colEncryptionKey2))
	decryptCols[d.pathToDouble] = parquet.NewColumnDecryptionProperties(d.pathToDouble, parquet.WithDecryptKey(d.colEncryptionKey1))
	d.decryptionConfigs = append(d.decryptionConfigs,
		parquet.NewFileDecryptionProperties(parquet.WithFooterKey(d.footerEncryptionKey), parquet.WithColumnKeys(decryptCols)))

	// Decryption configuration 4: use plaintext footer mode, read only footer + plaintext
	// columns.
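	// A nil entry leaves ReaderProperties.FileDecryptProps unset in decryptFile below,
	// so the file is opened as a plaintext-footer file and only the columns that were
	// written unencrypted are read.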
	d.decryptionConfigs = append(d.decryptionConfigs, nil)
}

func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) {
	// if we get decryption_config_num = x then it means the actual number is x+1
	// and since we want decryption_config_num=4 we set the condition to 3
	props := parquet.NewReaderProperties(memory.DefaultAllocator)
	if decryptConfigNum != 3 {
		props.FileDecryptProps = d.decryptionConfigs[decryptConfigNum].Clone("")
	}

	fileReader, err := file.OpenParquetFile(filename, false, file.WithReadProps(props))
	if err != nil {
		panic(err)
	}
	defer fileReader.Close()
	// get metadata
	fileMetadata := fileReader.MetaData()
	// get number of rowgroups
	numRowGroups := len(fileMetadata.RowGroups)
	// number of columns
	numColumns := fileMetadata.Schema.NumColumns()
	d.Equal(8, numColumns)

	for r := 0; r < numRowGroups; r++ {
		rowGroupReader := fileReader.RowGroup(r)

		// get rowgroup meta
		rgMeta := fileMetadata.RowGroup(r)
		d.EqualValues(d.rowsPerRG, rgMeta.NumRows())

		valuesRead := 0
		rowsRead := int64(0)

		// get col reader for boolean column
		colReader, err := rowGroupReader.Column(0)
		if err != nil {
			panic(err)
		}
		boolReader := colReader.(*file.BooleanColumnChunkReader)

		// get column chunk metadata for boolean column
		boolMd, _ := rgMeta.ColumnChunk(0)
		d.EqualValues(d.rowsPerRG, boolMd.NumValues())

		// Read all rows in column
		i := 0
		for boolReader.HasNext() {
			var val [1]bool
			// read one value at a time. the number of rows read is returned. values
			// read contains the number of non-null rows
			rowsRead, valuesRead, _ = boolReader.ReadBatch(1, val[:], nil, nil)
			// ensure only 1 value is read
			d.EqualValues(1, rowsRead)
			// there are no null values
			d.EqualValues(1, valuesRead)
			// verify the value
			expected := i%2 == 0
			d.Equal(expected, val[0], "i: ", i)
			i++
		}
		d.EqualValues(i, boolMd.NumValues())

		// Get column reader for int32 column
		colReader, err = rowGroupReader.Column(1)
		if err != nil {
			panic(err)
		}
		int32reader := colReader.(*file.Int32ColumnChunkReader)

		int32md, _ := rgMeta.ColumnChunk(1)
		d.EqualValues(d.rowsPerRG, int32md.NumValues())
		// Read all rows in column
		i = 0
		for int32reader.HasNext() {
			var val [1]int32
			// read one value at a time. the number of rows read is returned. values
			// read contains the number of non-null rows
			rowsRead, valuesRead, _ = int32reader.ReadBatch(1, val[:], nil, nil)
			// ensure only 1 value is read
			d.EqualValues(1, rowsRead)
			// there are no null values
			d.EqualValues(1, valuesRead)
			// verify the value
			d.EqualValues(i, val[0])
			i++
		}
		d.EqualValues(i, int32md.NumValues())

		// Get column reader for int64 column
		colReader, err = rowGroupReader.Column(2)
		if err != nil {
			panic(err)
		}
		int64reader := colReader.(*file.Int64ColumnChunkReader)

		int64md, _ := rgMeta.ColumnChunk(2)
		// repeated column, we should have 2*d.rowsPerRG values
		d.EqualValues(2*d.rowsPerRG, int64md.NumValues())
		// Read all rows in column
		i = 0
		for int64reader.HasNext() {
			var (
				val [1]int64
				def [1]int16
				rep [1]int16
			)

			// read one value at a time. the number of rows read is returned. values
			// read contains the number of non-null rows
			rowsRead, valuesRead, _ = int64reader.ReadBatch(1, val[:], def[:], rep[:])
			// ensure only 1 value is read
			d.EqualValues(1, rowsRead)
			// there are no null values
			d.EqualValues(1, valuesRead)
			// verify the value
			expectedValue := int64(i) * 1000 * 1000 * 1000 * 1000
			d.Equal(expectedValue, val[0])
			if i%2 == 0 {
				d.EqualValues(1, rep[0])
			} else {
				d.Zero(rep[0])
			}
			i++
		}
		d.EqualValues(i, int64md.NumValues())

		// Get column reader for int96 column
		colReader, err = rowGroupReader.Column(3)
		if err != nil {
			panic(err)
		}
		int96reader := colReader.(*file.Int96ColumnChunkReader)

		int96md, _ := rgMeta.ColumnChunk(3)
		// Read all rows in column
		i = 0
		for int96reader.HasNext() {
			var (
				val [1]parquet.Int96
			)

			// read one value at a time. the number of rows read is returned. values
			// read contains the number of non-null rows
			rowsRead, valuesRead, _ = int96reader.ReadBatch(1, val[:], nil, nil)
			// ensure only 1 value is read
			d.EqualValues(1, rowsRead)
			// there are no null values
			d.EqualValues(1, valuesRead)
			// verify the value
			var expectedValue parquet.Int96
			binary.LittleEndian.PutUint32(expectedValue[:4], uint32(i))
			binary.LittleEndian.PutUint32(expectedValue[4:], uint32(i+1))
			binary.LittleEndian.PutUint32(expectedValue[8:], uint32(i+2))
			d.Equal(expectedValue, val[0])
			i++
		}
		d.EqualValues(i, int96md.NumValues())

		// these two columns are always encrypted when we write them, so don't
		// try to read them during the plaintext test.
		if props.FileDecryptProps != nil {
			// Get column reader for the float column
			colReader, err = rowGroupReader.Column(4)
			if err != nil {
				panic(err)
			}
			floatReader := colReader.(*file.Float32ColumnChunkReader)

			floatmd, _ := rgMeta.ColumnChunk(4)

			i = 0
			for floatReader.HasNext() {
				var value [1]float32
				// read one value at a time. the number of rows read is returned. values
				// read contains the number of non-null rows
				rowsRead, valuesRead, _ = floatReader.ReadBatch(1, value[:], nil, nil)
				// ensure only 1 value is read
				d.EqualValues(1, rowsRead)
				// there are no null values
				d.EqualValues(1, valuesRead)
				// verify the value
				expectedValue := float32(i) * 1.1
				d.Equal(expectedValue, value[0])
				i++
			}
			d.EqualValues(i, floatmd.NumValues())

			// Get column reader for the double column
			colReader, err = rowGroupReader.Column(5)
			if err != nil {
				panic(err)
			}
			dblReader := colReader.(*file.Float64ColumnChunkReader)

			dblmd, _ := rgMeta.ColumnChunk(5)

			i = 0
			for dblReader.HasNext() {
				var value [1]float64
				// read one value at a time. the number of rows read is returned. values
				// read contains the number of non-null rows
				rowsRead, valuesRead, _ = dblReader.ReadBatch(1, value[:], nil, nil)
				// ensure only 1 value is read
				d.EqualValues(1, rowsRead)
				// there are no null values
				d.EqualValues(1, valuesRead)
				// verify the value
				expectedValue := float64(i) * 1.1111111
				d.Equal(expectedValue, value[0])
				i++
			}
			d.EqualValues(i, dblmd.NumValues())
		}

		colReader, err = rowGroupReader.Column(6)
		if err != nil {
			panic(err)
		}
		bareader := colReader.(*file.ByteArrayColumnChunkReader)

		bamd, _ := rgMeta.ColumnChunk(6)

		i = 0
		for bareader.HasNext() {
			var value [1]parquet.ByteArray
			var def [1]int16

			rowsRead, valuesRead, _ := bareader.ReadBatch(1, value[:], def[:], nil)
			d.EqualValues(1, rowsRead)
			expected := [10]byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0}
			expected[7] = byte('0') + byte(i/100)
			expected[8] = byte('0') + byte(i/10)%10
			expected[9] = byte('0') + byte(i%10)
			if i%2 == 0 {
				d.Equal(1, valuesRead)
				d.Len(value[0], 10)
				d.EqualValues(expected[:], value[0])
				d.EqualValues(1, def[0])
			} else {
				d.Zero(valuesRead)
				d.Zero(def[0])
			}
			i++
		}
		d.EqualValues(i, bamd.NumValues())
	}
}

func (d *TestDecryptionSuite) checkResults(fileName string, decryptionConfig, encryptionConfig uint) {
	decFn := func() { d.decryptFile(fileName, int(decryptionConfig-1)) }

	// Encryption configuration number 5 contains aad_prefix and disable_aad_prefix_storage.
	// An exception is expected to be thrown if the file is not decrypted with aad_prefix.
	if encryptionConfig == 5 {
		if decryptionConfig == 1 || decryptionConfig == 3 {
			d.Panics(decFn)
			return
		}
	}

	// decryption config number two contains aad_prefix. an exception
	// is expected to be thrown if the file was not encrypted with the same aad_prefix
	if decryptionConfig == 2 {
		if encryptionConfig != 5 && encryptionConfig != 4 {
			d.Panics(decFn)
			return
		}
	}

	// decryption config 4 can only work when the encryption config is 3
	if decryptionConfig == 4 && encryptionConfig != 3 {
		return
	}
	d.NotPanics(decFn)
}

// Read encrypted parquet file.
// The test reads two parquet files that were encrypted using the same encryption config:
// one was generated by the encryption_write_config_test.go tests and is deleted
// once it has been read, and the second exists in the parquet-testing/data folder.
func (d *TestDecryptionSuite) TestDecryption() {
	tests := []struct {
		file   string
		config uint
	}{
		{"uniform_encryption.parquet.encrypted", 1},
		{"encrypt_columns_and_footer.parquet.encrypted", 2},
		{"encrypt_columns_plaintext_footer.parquet.encrypted", 3},
		{"encrypt_columns_and_footer_aad.parquet.encrypted", 4},
		{"encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted", 5},
		{"encrypt_columns_and_footer_ctr.parquet.encrypted", 6},
	}
	for _, tt := range tests {
		d.Run(tt.file, func() {
			// decrypt file that was generated in encryption-write-tests
			tmpFile := path.Join(tempdir, "tmp_"+tt.file)
			d.Require().FileExists(tmpFile)

			// iterate over decryption configs and use each one to read the encrypted file
			for idx := range d.decryptionConfigs {
				decConfig := idx + 1
				d.checkResults(tmpFile, uint(decConfig), tt.config)
			}
			os.Remove(tmpFile)

			file := path.Join(getDataDir(), tt.file)
			d.Require().FileExists(file)

			for idx := range d.decryptionConfigs {
				decConfig := idx + 1
				d.Run(fmt.Sprintf("config %d", decConfig), func() {
					d.checkResults(file, uint(decConfig), tt.config)
				})
			}
		})
	}
}
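// The sketch below is not part of the upstream test suite; it is a minimal,
// illustrative example of how the explicit-key path exercised by decryption
// configuration 3 above could be used on its own: build column decryption
// properties keyed by column path, attach them together with the footer key to
// the reader properties, and open the encrypted file. The function name and
// parameters are assumptions for illustration; the parquet/file calls are the
// same ones already used by decryptFile.
func openWithExplicitKeys(filename, footerKey, floatKey, doubleKey string) (*file.Reader, error) {
	// map each encrypted column path to its decryption key
	decryptCols := make(parquet.ColumnPathToDecryptionPropsMap)
	decryptCols["float_field"] = parquet.NewColumnDecryptionProperties("float_field", parquet.WithDecryptKey(floatKey))
	decryptCols["double_field"] = parquet.NewColumnDecryptionProperties("double_field", parquet.WithDecryptKey(doubleKey))

	// attach the footer key and the per-column keys to the reader properties
	props := parquet.NewReaderProperties(memory.DefaultAllocator)
	props.FileDecryptProps = parquet.NewFileDecryptionProperties(
		parquet.WithFooterKey(footerKey), parquet.WithColumnKeys(decryptCols))

	// memory mapping disabled (false), matching the other OpenParquetFile calls in this file
	return file.OpenParquetFile(filename, false, file.WithReadProps(props))
}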