github.com/apache/arrow/go/v10@v10.0.1/parquet/encryption_read_config_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet_test 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "os" 23 "path" 24 "testing" 25 26 "github.com/apache/arrow/go/v10/arrow/memory" 27 "github.com/apache/arrow/go/v10/parquet" 28 "github.com/apache/arrow/go/v10/parquet/file" 29 "github.com/apache/arrow/go/v10/parquet/internal/encryption" 30 "github.com/stretchr/testify/suite" 31 ) 32 33 /* 34 * This file contains a unit-test for reading encrypted Parquet files with 35 * different decryption configurations. 36 * 37 * The unit-test is called multiple times, each time to decrypt parquet files using 38 * different decryption configuration as described below. 39 * In each call two encrypted files are read: one temporary file that was generated using 40 * encryption_write_config_test.go test and will be deleted upon 41 * reading it, while the second resides in 42 * parquet-testing/data repository. Those two encrypted files were encrypted using the 43 * same encryption configuration. 44 * The encrypted parquet file names are passed as parameter to the unit-test. 
45 * 46 * A detailed description of the Parquet Modular Encryption specification can be found 47 * here: 48 * https://github.com/apache/parquet-format/blob/encryption/Encryption.md 49 * 50 * The following decryption configurations are used to decrypt each parquet file: 51 * 52 * - Decryption configuration 1: Decrypt using key retriever that holds the keys of 53 * two encrypted columns and the footer key. 54 * - Decryption configuration 2: Decrypt using key retriever that holds the keys of 55 * two encrypted columns and the footer key. Supplies 56 * aad_prefix to verify file identity. 57 * - Decryption configuration 3: Decrypt using explicit column and footer keys 58 * (instead of key retrieval callback). 59 * - Decryption configuration 4: PlainText Footer mode - test legacy reads, 60 * read the footer + all non-encrypted columns. 61 * (pairs with encryption configuration 3) 62 * 63 * The encrypted parquet files that are read were encrypted using one of the configurations 64 * below: 65 * 66 * - Encryption configuration 1: Encrypt all columns and the footer with the same key. 67 * (uniform encryption) 68 * - Encryption configuration 2: Encrypt two columns and the footer, with different 69 * keys. 70 * - Encryption configuration 3: Encrypt two columns, with different keys. 71 * Don’t encrypt footer (to enable legacy readers) 72 * - plaintext footer mode. 73 * - Encryption configuration 4: Encrypt two columns and the footer, with different 74 * keys. Supply aad_prefix for file identity 75 * verification. 76 * - Encryption configuration 5: Encrypt two columns and the footer, with different 77 * keys. Supply aad_prefix, and call 78 * disable_aad_prefix_storage to prevent file 79 * identity storage in file metadata. 80 * - Encryption configuration 6: Encrypt two columns and the footer, with different 81 * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 
82 */ 83 84 func getDataDir() string { 85 datadir := os.Getenv("PARQUET_TEST_DATA") 86 if datadir == "" { 87 panic("please point the PARQUET_TEST_DATA environment variable to the test data dir") 88 } 89 return datadir 90 } 91 92 type TestDecryptionSuite struct { 93 suite.Suite 94 95 pathToDouble string 96 pathToFloat string 97 decryptionConfigs []*parquet.FileDecryptionProperties 98 footerEncryptionKey string 99 colEncryptionKey1 string 100 colEncryptionKey2 string 101 fileName string 102 } 103 104 func (d *TestDecryptionSuite) TearDownSuite() { 105 os.Remove(tempdir) 106 } 107 108 func TestFileEncryptionDecryption(t *testing.T) { 109 suite.Run(t, new(EncryptionConfigTestSuite)) 110 suite.Run(t, new(TestDecryptionSuite)) 111 } 112 113 func (d *TestDecryptionSuite) SetupSuite() { 114 d.pathToDouble = "double_field" 115 d.pathToFloat = "float_field" 116 d.footerEncryptionKey = FooterEncryptionKey 117 d.colEncryptionKey1 = ColumnEncryptionKey1 118 d.colEncryptionKey2 = ColumnEncryptionKey2 119 d.fileName = FileName 120 121 d.createDecryptionConfigs() 122 } 123 124 func (d *TestDecryptionSuite) createDecryptionConfigs() { 125 // Decryption configuration 1: Decrypt using key retriever callback that holds the 126 // keys of two encrypted columns and the footer key. 127 stringKr1 := make(encryption.StringKeyIDRetriever) 128 stringKr1.PutKey("kf", d.footerEncryptionKey) 129 stringKr1.PutKey("kc1", d.colEncryptionKey1) 130 stringKr1.PutKey("kc2", d.colEncryptionKey2) 131 132 d.decryptionConfigs = append(d.decryptionConfigs, 133 parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr1))) 134 135 // Decryption configuration 2: Decrypt using key retriever callback that holds the 136 // keys of two encrypted columns and the footer key. Supply aad_prefix. 
137 stringKr2 := make(encryption.StringKeyIDRetriever) 138 stringKr2.PutKey("kf", d.footerEncryptionKey) 139 stringKr2.PutKey("kc1", d.colEncryptionKey1) 140 stringKr2.PutKey("kc2", d.colEncryptionKey2) 141 d.decryptionConfigs = append(d.decryptionConfigs, 142 parquet.NewFileDecryptionProperties(parquet.WithKeyRetriever(stringKr2), parquet.WithDecryptAadPrefix(d.fileName))) 143 144 // Decryption configuration 3: Decrypt using explicit column and footer keys. Supply 145 // aad_prefix. 146 decryptCols := make(parquet.ColumnPathToDecryptionPropsMap) 147 decryptCols[d.pathToFloat] = parquet.NewColumnDecryptionProperties(d.pathToFloat, parquet.WithDecryptKey(d.colEncryptionKey2)) 148 decryptCols[d.pathToDouble] = parquet.NewColumnDecryptionProperties(d.pathToDouble, parquet.WithDecryptKey(d.colEncryptionKey1)) 149 d.decryptionConfigs = append(d.decryptionConfigs, 150 parquet.NewFileDecryptionProperties(parquet.WithFooterKey(d.footerEncryptionKey), parquet.WithColumnKeys(decryptCols))) 151 152 // Decryption Configuration 4: use plaintext footer mode, read only footer + plaintext 153 // columns. 
154 d.decryptionConfigs = append(d.decryptionConfigs, nil) 155 } 156 157 func (d *TestDecryptionSuite) decryptFile(filename string, decryptConfigNum int) { 158 // if we get decryption_config_num = x then it means the actual number is x+1 159 // and since we want decryption_config_num=4 we set the condition to 3 160 props := parquet.NewReaderProperties(memory.DefaultAllocator) 161 if decryptConfigNum != 3 { 162 props.FileDecryptProps = d.decryptionConfigs[decryptConfigNum].Clone("") 163 } 164 165 fileReader, err := file.OpenParquetFile(filename, false, file.WithReadProps(props)) 166 if err != nil { 167 panic(err) 168 } 169 defer fileReader.Close() 170 // get metadata 171 fileMetadata := fileReader.MetaData() 172 // get number of rowgroups 173 numRowGroups := len(fileMetadata.RowGroups) 174 // number of columns 175 numColumns := fileMetadata.Schema.NumColumns() 176 d.Equal(8, numColumns) 177 178 for r := 0; r < numRowGroups; r++ { 179 rowGroupReader := fileReader.RowGroup(r) 180 181 // get rowgroup meta 182 rgMeta := fileMetadata.RowGroup(r) 183 184 valuesRead := 0 185 rowsRead := int64(0) 186 187 // get col reader for boolean column 188 colReader, err := rowGroupReader.Column(0) 189 if err != nil { 190 panic(err) 191 } 192 boolReader := colReader.(*file.BooleanColumnChunkReader) 193 194 // get column chunk metadata for boolean column 195 boolMd, _ := rgMeta.ColumnChunk(0) 196 197 // Read all rows in column 198 i := 0 199 for boolReader.HasNext() { 200 var val [1]bool 201 // read one value at a time. the number of rows read is returned. 
values 202 // read contains the number of non-null rows 203 rowsRead, valuesRead, _ = boolReader.ReadBatch(1, val[:], nil, nil) 204 // ensure only 1 value is read 205 d.EqualValues(1, rowsRead) 206 // there are no null values 207 d.EqualValues(1, valuesRead) 208 // verify the value 209 expected := i%2 == 0 210 d.Equal(expected, val[0], "i: ", i) 211 i++ 212 } 213 d.EqualValues(i, boolMd.NumValues()) 214 215 // Get column reader for int32 column 216 colReader, err = rowGroupReader.Column(1) 217 if err != nil { 218 panic(err) 219 } 220 int32reader := colReader.(*file.Int32ColumnChunkReader) 221 222 int32md, _ := rgMeta.ColumnChunk(1) 223 // Read all rows in column 224 i = 0 225 for int32reader.HasNext() { 226 var val [1]int32 227 // read one value at a time. the number of rows read is returned. values 228 // read contains the number of non-null rows 229 rowsRead, valuesRead, _ = int32reader.ReadBatch(1, val[:], nil, nil) 230 // ensure only 1 value is read 231 d.EqualValues(1, rowsRead) 232 // there are no null values 233 d.EqualValues(1, valuesRead) 234 // verify the value 235 d.EqualValues(i, val[0]) 236 i++ 237 } 238 d.EqualValues(i, int32md.NumValues()) 239 240 // Get column reader for int64 column 241 colReader, err = rowGroupReader.Column(2) 242 if err != nil { 243 panic(err) 244 } 245 int64reader := colReader.(*file.Int64ColumnChunkReader) 246 247 int64md, _ := rgMeta.ColumnChunk(2) 248 // Read all rows in column 249 i = 0 250 for int64reader.HasNext() { 251 var ( 252 val [1]int64 253 def [1]int16 254 rep [1]int16 255 ) 256 257 // read one value at a time. the number of rows read is returned. 
values 258 // read contains the number of non-null rows 259 rowsRead, valuesRead, _ = int64reader.ReadBatch(1, val[:], def[:], rep[:]) 260 // ensure only 1 value is read 261 d.EqualValues(1, rowsRead) 262 // there are no null values 263 d.EqualValues(1, valuesRead) 264 // verify the value 265 expectedValue := int64(i) * 1000 * 1000 * 1000 * 1000 266 d.Equal(expectedValue, val[0]) 267 if i%2 == 0 { 268 d.EqualValues(1, rep[0]) 269 } else { 270 d.Zero(rep[0]) 271 } 272 i++ 273 } 274 d.EqualValues(i, int64md.NumValues()) 275 276 // Get column reader for int96 column 277 colReader, err = rowGroupReader.Column(3) 278 if err != nil { 279 panic(err) 280 } 281 int96reader := colReader.(*file.Int96ColumnChunkReader) 282 283 int96md, _ := rgMeta.ColumnChunk(3) 284 // Read all rows in column 285 i = 0 286 for int96reader.HasNext() { 287 var ( 288 val [1]parquet.Int96 289 ) 290 291 // read one value at a time. the number of rows read is returned. values 292 // read contains the number of non-null rows 293 rowsRead, valuesRead, _ = int96reader.ReadBatch(1, val[:], nil, nil) 294 // ensure only 1 value is read 295 d.EqualValues(1, rowsRead) 296 // there are no null values 297 d.EqualValues(1, valuesRead) 298 // verify the value 299 var expectedValue parquet.Int96 300 binary.LittleEndian.PutUint32(expectedValue[:4], uint32(i)) 301 binary.LittleEndian.PutUint32(expectedValue[4:], uint32(i+1)) 302 binary.LittleEndian.PutUint32(expectedValue[8:], uint32(i+2)) 303 d.Equal(expectedValue, val[0]) 304 i++ 305 } 306 d.EqualValues(i, int96md.NumValues()) 307 308 // these two columns are always encrypted when we write them, so don't 309 // try to read them during the plaintext test. 
310 if props.FileDecryptProps != nil { 311 // Get column reader for the float column 312 colReader, err = rowGroupReader.Column(4) 313 if err != nil { 314 panic(err) 315 } 316 floatReader := colReader.(*file.Float32ColumnChunkReader) 317 318 floatmd, _ := rgMeta.ColumnChunk(4) 319 320 i = 0 321 for floatReader.HasNext() { 322 var value [1]float32 323 // read one value at a time. the number of rows read is returned. values 324 // read contains the number of non-null rows 325 rowsRead, valuesRead, _ = floatReader.ReadBatch(1, value[:], nil, nil) 326 // ensure only 1 value is read 327 d.EqualValues(1, rowsRead) 328 // there are no null values 329 d.EqualValues(1, valuesRead) 330 // verify the value 331 expectedValue := float32(i) * 1.1 332 d.Equal(expectedValue, value[0]) 333 i++ 334 } 335 d.EqualValues(i, floatmd.NumValues()) 336 337 // Get column reader for the double column 338 colReader, err = rowGroupReader.Column(5) 339 if err != nil { 340 panic(err) 341 } 342 dblReader := colReader.(*file.Float64ColumnChunkReader) 343 344 dblmd, _ := rgMeta.ColumnChunk(5) 345 346 i = 0 347 for dblReader.HasNext() { 348 var value [1]float64 349 // read one value at a time. the number of rows read is returned. 
values 350 // read contains the number of non-null rows 351 rowsRead, valuesRead, _ = dblReader.ReadBatch(1, value[:], nil, nil) 352 // ensure only 1 value is read 353 d.EqualValues(1, rowsRead) 354 // there are no null values 355 d.EqualValues(1, valuesRead) 356 // verify the value 357 expectedValue := float64(i) * 1.1111111 358 d.Equal(expectedValue, value[0]) 359 i++ 360 } 361 d.EqualValues(i, dblmd.NumValues()) 362 } 363 364 colReader, err = rowGroupReader.Column(6) 365 if err != nil { 366 panic(err) 367 } 368 bareader := colReader.(*file.ByteArrayColumnChunkReader) 369 370 bamd, _ := rgMeta.ColumnChunk(6) 371 372 i = 0 373 for bareader.HasNext() { 374 var value [1]parquet.ByteArray 375 var def [1]int16 376 377 rowsRead, valuesRead, _ := bareader.ReadBatch(1, value[:], def[:], nil) 378 d.EqualValues(1, rowsRead) 379 expected := [10]byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0} 380 expected[7] = byte('0') + byte(i/100) 381 expected[8] = byte('0') + byte(i/10)%10 382 expected[9] = byte('0') + byte(i%10) 383 if i%2 == 0 { 384 d.Equal(1, valuesRead) 385 d.Len(value[0], 10) 386 d.EqualValues(expected[:], value[0]) 387 d.EqualValues(1, def[0]) 388 } else { 389 d.Zero(valuesRead) 390 d.Zero(def[0]) 391 } 392 i++ 393 } 394 d.EqualValues(i, bamd.NumValues()) 395 } 396 } 397 398 func (d *TestDecryptionSuite) checkResults(fileName string, decryptionConfig, encryptionConfig uint) { 399 decFn := func() { d.decryptFile(fileName, int(decryptionConfig-1)) } 400 401 // Encryption configuration number 5 contains aad_prefix and disable_aad_prefix_storage 402 // an exception is expected to be thrown if the file is not decrypted with aad_prefix 403 if encryptionConfig == 5 { 404 if decryptionConfig == 1 || decryptionConfig == 3 { 405 d.Panics(decFn) 406 return 407 } 408 } 409 410 // decryption config number two contains aad_prefix. 
an exception 411 // is expected to be thrown if the file was not encrypted with the same aad_prefix 412 if decryptionConfig == 2 { 413 if encryptionConfig != 5 && encryptionConfig != 4 { 414 d.Panics(decFn) 415 return 416 } 417 } 418 419 // decryption config 4 can only work when the encryption config is 3 420 if decryptionConfig == 4 && encryptionConfig != 3 { 421 return 422 } 423 d.NotPanics(decFn) 424 } 425 426 // Read encrypted parquet file. 427 // the test reads two parquet files that were encrypted using the same encryption config 428 // one was generated in encryption_write_configurations_test.go tests and is deleted 429 // once the file is read and the second exists in parquet-testing/data folder 430 func (d *TestDecryptionSuite) TestDecryption() { 431 tests := []struct { 432 file string 433 config uint 434 }{ 435 {"uniform_encryption.parquet.encrypted", 1}, 436 {"encrypt_columns_and_footer.parquet.encrypted", 2}, 437 {"encrypt_columns_plaintext_footer.parquet.encrypted", 3}, 438 {"encrypt_columns_and_footer_aad.parquet.encrypted", 4}, 439 {"encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted", 5}, 440 {"encrypt_columns_and_footer_ctr.parquet.encrypted", 6}, 441 } 442 for _, tt := range tests { 443 d.Run(tt.file, func() { 444 // decrypt file that was generated in encryption-write-tests 445 tmpFile := path.Join(tempdir, "tmp_"+tt.file) 446 d.Require().FileExists(tmpFile) 447 448 // iterate over decryption configs and use each one to read the encrypted file 449 for idx := range d.decryptionConfigs { 450 decConfig := idx + 1 451 d.checkResults(tmpFile, uint(decConfig), tt.config) 452 } 453 os.Remove(tmpFile) 454 455 file := path.Join(getDataDir(), tt.file) 456 d.Require().FileExists(file) 457 458 for idx := range d.decryptionConfigs { 459 decConfig := idx + 1 460 d.Run(fmt.Sprintf("config %d", decConfig), func() { 461 d.checkResults(file, uint(decConfig), tt.config) 462 }) 463 } 464 }) 465 } 466 }