github.com/apache/arrow/go/v14@v14.0.1/parquet/encryption_write_config_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet_test 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "os" 23 "path/filepath" 24 "testing" 25 26 "github.com/apache/arrow/go/v14/parquet" 27 "github.com/apache/arrow/go/v14/parquet/compress" 28 "github.com/apache/arrow/go/v14/parquet/file" 29 "github.com/apache/arrow/go/v14/parquet/schema" 30 "github.com/stretchr/testify/suite" 31 ) 32 33 /* 34 * This file contains unit-tests for writing encrypted Parquet files with 35 * different encryption configurations. 36 * The files are saved in temporary folder and will be deleted after reading 37 * them in encryption_read_config_test.go test. 38 * 39 * A detailed description of the Parquet Modular Encryption specification can be found 40 * here: 41 * https://github.com/apache/parquet-format/blob/encryption/Encryption.md 42 * 43 * Each unit-test creates a single parquet file with eight columns using one of the 44 * following encryption configurations: 45 * 46 * - Encryption configuration 1: Encrypt all columns and the footer with the same key. 47 * (uniform encryption) 48 * - Encryption configuration 2: Encrypt two columns and the footer, with different 49 * keys. 50 * - Encryption configuration 3: Encrypt two columns, with different keys. 51 * Don’t encrypt footer (to enable legacy readers) 52 * - plaintext footer mode. 53 * - Encryption configuration 4: Encrypt two columns and the footer, with different 54 * keys. Supply aad_prefix for file identity 55 * verification. 56 * - Encryption configuration 5: Encrypt two columns and the footer, with different 57 * keys. Supply aad_prefix, and call 58 * disable_aad_prefix_storage to prevent file 59 * identity storage in file metadata. 60 * - Encryption configuration 6: Encrypt two columns and the footer, with different 61 * keys. Use the alternative (AES_GCM_CTR_V1) algorithm. 62 */ 63 64 var ( 65 tempdir string 66 ) 67 68 type EncryptionConfigTestSuite struct { 69 suite.Suite 70 71 pathToDoubleField string 72 pathToFloatField string 73 fileName string 74 numRgs int 75 rowsPerRG int 76 schema *schema.GroupNode 77 footerEncryptionKey string 78 columnEncryptionKey1 string 79 columnEncryptionKey2 string 80 } 81 82 func (en *EncryptionConfigTestSuite) encryptFile(configs *parquet.FileEncryptionProperties, filename string) { 83 filename = filepath.Join(tempdir, filename) 84 85 props := parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy), parquet.WithEncryptionProperties(configs)) 86 outFile, err := os.Create(filename) 87 en.Require().NoError(err) 88 en.Require().NotNil(outFile) 89 90 writer := file.NewParquetWriter(outFile, en.schema, file.WithWriterProps(props)) 91 defer writer.Close() 92 93 for r := 0; r < en.numRgs; r++ { 94 var ( 95 bufferedMode = r%2 == 0 96 rgr file.RowGroupWriter 97 colIndex = 0 98 ) 99 100 if bufferedMode { 101 rgr = writer.AppendBufferedRowGroup() 102 } else { 103 rgr = writer.AppendRowGroup() 104 } 105 106 nextColumn := func() file.ColumnChunkWriter { 107 defer func() { colIndex++ }() 108 if bufferedMode { 109 cw, _ := rgr.(file.BufferedRowGroupWriter).Column(colIndex) 110 return cw 111 } 112 cw, _ := rgr.(file.SerialRowGroupWriter).NextColumn() 113 return cw 114 } 115 116 // write the bool col 117 boolWriter := nextColumn().(*file.BooleanColumnChunkWriter) 118 for i := 0; i < en.rowsPerRG; i++ { 119 value := (i % 2) == 0 120 n, err := boolWriter.WriteBatch([]bool{value}, nil, nil) 121 en.EqualValues(1, n) 122 en.Require().NoError(err) 123 } 124 125 // write the int32 col 126 int32Writer := nextColumn().(*file.Int32ColumnChunkWriter) 127 for i := int32(0); i < int32(en.rowsPerRG); i++ { 128 n, err := int32Writer.WriteBatch([]int32{i}, nil, nil) 129 en.EqualValues(1, n) 130 en.Require().NoError(err) 131 } 132 133 // write the int64 column, each row repeats twice 134 int64Writer := nextColumn().(*file.Int64ColumnChunkWriter) 135 for i := 0; i < 2*en.rowsPerRG; i++ { 136 var ( 137 defLevel = [1]int16{1} 138 repLevel = [1]int16{0} 139 value int64 = int64(i) * 1000 * 1000 * 1000 * 1000 140 ) 141 if i%2 == 0 { 142 repLevel[0] = 1 143 } 144 145 n, err := int64Writer.WriteBatch([]int64{value}, defLevel[:], repLevel[:]) 146 en.EqualValues(1, n) 147 en.Require().NoError(err) 148 } 149 150 // write the int96 col 151 int96Writer := nextColumn().(*file.Int96ColumnChunkWriter) 152 for i := 0; i < en.rowsPerRG; i++ { 153 val := parquet.Int96{} 154 binary.LittleEndian.PutUint32(val[:], uint32(i)) 155 binary.LittleEndian.PutUint32(val[4:], uint32(i+1)) 156 binary.LittleEndian.PutUint32(val[8:], uint32(i+2)) 157 n, err := int96Writer.WriteBatch([]parquet.Int96{val}, nil, nil) 158 en.EqualValues(1, n) 159 en.Require().NoError(err) 160 } 161 162 // write the float column 163 floatWriter := nextColumn().(*file.Float32ColumnChunkWriter) 164 for i := 0; i < en.rowsPerRG; i++ { 165 val := float32(i) * 1.1 166 n, err := floatWriter.WriteBatch([]float32{val}, nil, nil) 167 en.EqualValues(1, n) 168 en.Require().NoError(err) 169 } 170 171 // write the double column 172 doubleWriter := nextColumn().(*file.Float64ColumnChunkWriter) 173 for i := 0; i < en.rowsPerRG; i++ { 174 value := float64(i) * 1.1111111 175 n, err := doubleWriter.WriteBatch([]float64{value}, nil, nil) 176 en.EqualValues(1, n) 177 en.Require().NoError(err) 178 } 179 180 // write the bytearray column. make every alternate value NULL 181 baWriter := nextColumn().(*file.ByteArrayColumnChunkWriter) 182 for i := 0; i < en.rowsPerRG; i++ { 183 var ( 184 n int64 185 err error 186 hello = []byte{'p', 'a', 'r', 'q', 'u', 'e', 't', 0, 0, 0} 187 ) 188 hello[7] = byte(int('0') + i/100) 189 hello[8] = byte(int('0') + (i/10)%10) 190 hello[9] = byte(int('0') + i%10) 191 if i%2 == 0 { 192 n, err = baWriter.WriteBatch([]parquet.ByteArray{hello}, []int16{1}, nil) 193 en.EqualValues(1, n) 194 } else { 195 n, err = baWriter.WriteBatch([]parquet.ByteArray{nil}, []int16{0}, nil) 196 en.Zero(n) 197 } 198 199 en.Require().NoError(err) 200 } 201 202 // write fixedlength byte array column 203 flbaWriter := nextColumn().(*file.FixedLenByteArrayColumnChunkWriter) 204 for i := 0; i < en.rowsPerRG; i++ { 205 v := byte(i) 206 value := parquet.FixedLenByteArray{v, v, v, v, v, v, v, v, v, v} 207 n, err := flbaWriter.WriteBatch([]parquet.FixedLenByteArray{value}, nil, nil) 208 en.EqualValues(1, n) 209 en.Require().NoError(err) 210 } 211 } 212 } 213 214 func (en *EncryptionConfigTestSuite) SetupSuite() { 215 var err error 216 tempdir, err = os.MkdirTemp("", "parquet-encryption-test-*") 217 en.Require().NoError(err) 218 fmt.Println(tempdir) 219 220 en.fileName = FileName 221 en.rowsPerRG = 50 222 en.numRgs = 5 223 en.pathToDoubleField = "double_field" 224 en.pathToFloatField = "float_field" 225 en.footerEncryptionKey = FooterEncryptionKey 226 en.columnEncryptionKey1 = ColumnEncryptionKey1 227 en.columnEncryptionKey2 = ColumnEncryptionKey2 228 229 fields := make(schema.FieldList, 0) 230 // create a primitive node named "boolean_field" with type BOOLEAN 231 // repetition:REQUIRED 232 fields = append(fields, schema.NewBooleanNode("boolean_field", parquet.Repetitions.Required, -1)) 233 // create a primitive node named "int32_field" with type INT32 repetition REQUIRED 234 // and logical type: TIME_MILLIS 235 f, _ := schema.NewPrimitiveNodeLogical("int32_field", parquet.Repetitions.Required, 236 schema.NewTimeLogicalType(true, schema.TimeUnitMillis), parquet.Types.Int32, 0, -1) 237 fields = append(fields, f) 238 239 // create a primitive node named "int64_field" with type int64, repetition:REPEATED 240 fields = append(fields, schema.NewInt64Node("int64_field", parquet.Repetitions.Repeated, -1)) 241 242 fields = append(fields, 243 schema.NewInt96Node("int96_field", parquet.Repetitions.Required, -1), 244 schema.NewFloat32Node("float_field", parquet.Repetitions.Required, -1), 245 schema.NewFloat64Node("double_field", parquet.Repetitions.Required, -1)) 246 247 // create a primitive node named ba_field with type:BYTE_ARRAY repetition:OPTIONAL 248 fields = append(fields, schema.NewByteArrayNode("ba_field", parquet.Repetitions.Optional, -1)) 249 250 // create a primitive node for flba_field 251 fields = append(fields, schema.NewFixedLenByteArrayNode("flba_field", parquet.Repetitions.Required, 10, -1)) 252 253 // flba_field fixedlenbytearray 254 en.schema, _ = schema.NewGroupNode("schema", parquet.Repetitions.Required, fields, -1) 255 } 256 257 // Encryption Config 1: Encrypt All columns and the footer with the same key 258 // (uniform encryption) 259 func (en *EncryptionConfigTestSuite) TestUniformEncryption() { 260 props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf")) 261 en.encryptFile(props, "tmp_uniform_encryption.parquet.encrypted") 262 } 263 264 // Encryption config 2: Encrypt Two Columns and the Footer, with different keys 265 func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooter() { 266 encryptCols := make(parquet.ColumnPathToEncryptionPropsMap) 267 encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1")) 268 encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2")) 269 270 props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols)) 271 en.encryptFile(props, "tmp_encrypt_columns_and_footer.parquet.encrypted") 272 } 273 274 // Encryption Config 3: encrypt two columns, with different keys. 275 // plaintext footer 276 // (plaintext footer mode, readable by legacy readers) 277 func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsPlaintextFooter() { 278 encryptCols := make(parquet.ColumnPathToEncryptionPropsMap) 279 encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1")) 280 encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2")) 281 282 props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithPlaintextFooter()) 283 en.encryptFile(props, "tmp_encrypt_columns_plaintext_footer.parquet.encrypted") 284 } 285 286 // Encryption Config 4: Encrypt two columns and the footer, with different keys 287 // use aad_prefix 288 func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterWithAadPrefix() { 289 encryptCols := make(parquet.ColumnPathToEncryptionPropsMap) 290 encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1")) 291 encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2")) 292 293 props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithAadPrefix(en.fileName)) 294 en.encryptFile(props, "tmp_encrypt_columns_and_footer_aad.parquet.encrypted") 295 } 296 297 // Encryption Config 5: Encrypt Two columns and the footer, with different keys 298 // use aad_prefix and disable_aad_prefix_storage 299 func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterWithAadPrefixDisableAadStorage() { 300 encryptCols := make(parquet.ColumnPathToEncryptionPropsMap) 301 encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1")) 302 encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2")) 303 304 props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithAadPrefix(en.fileName), parquet.DisableAadPrefixStorage()) 305 en.encryptFile(props, "tmp_encrypt_columns_and_footer_disable_aad_storage.parquet.encrypted") 306 } 307 308 // Encryption Config 6: Encrypt two columns and the footer, with different keys. 309 // Use AES_GCM_CTR_V1 310 func (en *EncryptionConfigTestSuite) TestEncryptTwoColumnsAndFooterAesGcmCtr() { 311 encryptCols := make(parquet.ColumnPathToEncryptionPropsMap) 312 encryptCols[en.pathToDoubleField] = parquet.NewColumnEncryptionProperties(en.pathToDoubleField, parquet.WithKey(en.columnEncryptionKey1), parquet.WithKeyID("kc1")) 313 encryptCols[en.pathToFloatField] = parquet.NewColumnEncryptionProperties(en.pathToFloatField, parquet.WithKey(en.columnEncryptionKey2), parquet.WithKeyID("kc2")) 314 315 props := parquet.NewFileEncryptionProperties(en.footerEncryptionKey, parquet.WithFooterKeyMetadata("kf"), parquet.WithEncryptedColumns(encryptCols), parquet.WithAlg(parquet.AesCtr)) 316 en.encryptFile(props, "tmp_encrypt_columns_and_footer_ctr.parquet.encrypted") 317 } 318 319 func TestFileEncryption(t *testing.T) { 320 suite.Run(t, new(EncryptionConfigTestSuite)) 321 }