github.com/apache/arrow/go/v14@v14.0.1/parquet/file/page_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "bytes" 21 "sync" 22 23 "github.com/apache/arrow/go/v14/arrow/memory" 24 "github.com/apache/arrow/go/v14/parquet" 25 "github.com/apache/arrow/go/v14/parquet/compress" 26 "github.com/apache/arrow/go/v14/parquet/internal/encoding" 27 "github.com/apache/arrow/go/v14/parquet/internal/encryption" 28 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 29 "github.com/apache/arrow/go/v14/parquet/internal/thrift" 30 "github.com/apache/arrow/go/v14/parquet/internal/utils" 31 "github.com/apache/arrow/go/v14/parquet/metadata" 32 libthrift "github.com/apache/thrift/lib/go/thrift" 33 "golang.org/x/xerrors" 34 ) 35 36 // PageWriter is the interface for both serialized and buffered page writers 37 type PageWriter interface { 38 // Closes the current page, flushing any buffered data pages/dictionary pages 39 // based on the input parameters. Subsequent calls have no effect. 40 Close(hasDict, fallback bool) error 41 // Write the provided datapage out to the underlying writer 42 WriteDataPage(page DataPage) (int64, error) 43 // Write the provided dictionary page out to the underlying writer 44 WriteDictionaryPage(page *DictionaryPage) (int64, error) 45 // returns true if there is a configured compressor for the data 46 HasCompressor() bool 47 // use the configured compressor and writer properties to compress the data in src 48 // using the buffer buf. Returns the slice of the compressed bytes which may be 49 // the bytes in the provided buffer 50 Compress(buf *bytes.Buffer, src []byte) []byte 51 // Allow reuse of the pagewriter object by resetting it using these values instead 52 // of having to create a new object. 53 Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error 54 } 55 56 type serializedPageWriter struct { 57 mem memory.Allocator 58 metaData *metadata.ColumnChunkMetaDataBuilder 59 sink utils.WriterTell 60 61 nvalues int64 62 dictPageOffset int64 63 dataPageOffset int64 64 totalUncompressed int64 65 totalCompressed int64 66 pageOrdinal int16 67 rgOrdinal int16 68 columnOrdinal int16 69 70 compressLevel int 71 compressor compress.Codec 72 metaEncryptor encryption.Encryptor 73 dataEncryptor encryption.Encryptor 74 encryptionBuf bytes.Buffer 75 76 dataPageAAD []byte 77 dataPageHeaderAAD []byte 78 79 dictEncodingStats map[parquet.Encoding]int32 80 dataEncodingStats map[parquet.Encoding]int32 81 82 thriftSerializer *thrift.Serializer 83 } 84 85 func createSerializedPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) { 86 var ( 87 compressor compress.Codec 88 err error 89 ) 90 if codec != compress.Codecs.Uncompressed { 91 compressor, err = compress.GetCodec(codec) 92 if err != nil { 93 return nil, err 94 } 95 } 96 97 pgwriter := &serializedPageWriter{ 98 sink: sink, 99 compressor: compressor, 100 compressLevel: compressionLevel, 101 metaData: metadata, 102 rgOrdinal: rowGroupOrdinal, 103 columnOrdinal: columnChunkOrdinal, 104 mem: mem, 105 metaEncryptor: metaEncryptor, 106 dataEncryptor: dataEncryptor, 107 dictEncodingStats: make(map[parquet.Encoding]int32), 108 dataEncodingStats: make(map[parquet.Encoding]int32), 109 thriftSerializer: thrift.NewThriftSerializer(), 110 } 111 if metaEncryptor != nil || dataEncryptor != nil { 112 pgwriter.initEncryption() 113 } 114 return pgwriter, nil 115 } 116 117 // NewPageWriter returns a page writer using either the buffered or serialized implementations 118 func NewPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, buffered bool, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) { 119 if buffered { 120 return newBufferedPageWriter(sink, codec, compressionLevel, metadata, rowGroupOrdinal, columnChunkOrdinal, mem, metaEncryptor, dataEncryptor) 121 } 122 return createSerializedPageWriter(sink, codec, compressionLevel, metadata, rowGroupOrdinal, columnChunkOrdinal, mem, metaEncryptor, dataEncryptor) 123 } 124 125 // Reset allows reusing the pagewriter object instead of creating a new one. 126 func (pw *serializedPageWriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error { 127 var ( 128 compressor compress.Codec 129 err error 130 ) 131 if codec != compress.Codecs.Uncompressed { 132 compressor, err = compress.GetCodec(codec) 133 if err != nil { 134 return err 135 } 136 } 137 138 pw.sink = sink 139 pw.compressor = compressor 140 pw.compressLevel = compressionLevel 141 pw.metaData = metadata 142 pw.rgOrdinal = rowGroupOrdinal 143 pw.columnOrdinal = columnChunkOrdinal 144 pw.metaEncryptor = metaEncryptor 145 pw.dataEncryptor = dataEncryptor 146 pw.dictEncodingStats = make(map[parquet.Encoding]int32) 147 pw.dataEncodingStats = make(map[parquet.Encoding]int32) 148 149 pw.nvalues = 0 150 pw.dictPageOffset = 0 151 pw.dataPageOffset = 0 152 pw.totalUncompressed = 0 153 pw.totalCompressed = 0 154 pw.pageOrdinal = 0 155 156 if metaEncryptor != nil || dataEncryptor != nil { 157 pw.initEncryption() 158 } 159 return nil 160 } 161 162 func (pw *serializedPageWriter) initEncryption() { 163 if pw.dataEncryptor != nil { 164 pw.dataPageAAD = []byte(encryption.CreateModuleAad(pw.dataEncryptor.FileAad(), encryption.DataPageModule, pw.rgOrdinal, pw.columnOrdinal, -1)) 165 } 166 if pw.metaEncryptor != nil { 167 pw.dataPageHeaderAAD = []byte(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), encryption.DataPageHeaderModule, pw.rgOrdinal, pw.columnOrdinal, -1)) 168 } 169 } 170 171 func (pw *serializedPageWriter) updateEncryption(moduleType int8) error { 172 switch moduleType { 173 case encryption.ColumnMetaModule: 174 pw.metaEncryptor.UpdateAad(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1)) 175 case encryption.DataPageModule: 176 encryption.QuickUpdatePageAad(pw.dataPageAAD, pw.pageOrdinal) 177 pw.dataEncryptor.UpdateAad(string(pw.dataPageAAD)) 178 case encryption.DataPageHeaderModule: 179 encryption.QuickUpdatePageAad(pw.dataPageHeaderAAD, pw.pageOrdinal) 180 pw.metaEncryptor.UpdateAad(string(pw.dataPageHeaderAAD)) 181 case encryption.DictPageHeaderModule: 182 pw.metaEncryptor.UpdateAad(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1)) 183 case encryption.DictPageModule: 184 pw.dataEncryptor.UpdateAad(encryption.CreateModuleAad(pw.dataEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1)) 185 default: 186 return xerrors.New("unknown module type in updateencryption") 187 } 188 return nil 189 } 190 191 func (pw *serializedPageWriter) Close(hasDict, fallback bool) error { 192 if pw.metaEncryptor != nil { 193 pw.updateEncryption(encryption.ColumnMetaModule) 194 } 195 196 chunkInfo := metadata.ChunkMetaInfo{ 197 NumValues: pw.nvalues, 198 DictPageOffset: pw.dictPageOffset, 199 IndexPageOffset: -1, 200 DataPageOffset: pw.dataPageOffset, 201 CompressedSize: pw.totalCompressed, 202 UncompressedSize: pw.totalUncompressed, 203 } 204 encodingStats := metadata.EncodingStats{ 205 DictEncodingStats: pw.dictEncodingStats, 206 DataEncodingStats: pw.dataEncodingStats, 207 } 208 pw.metaData.Finish(chunkInfo, hasDict, fallback, encodingStats, pw.metaEncryptor) 209 _, err := pw.metaData.WriteTo(pw.sink) 210 return err 211 } 212 213 func (pw *serializedPageWriter) Compress(buf *bytes.Buffer, src []byte) []byte { 214 maxCompressed := pw.compressor.CompressBound(int64(len(src))) 215 buf.Grow(int(maxCompressed)) 216 return pw.compressor.EncodeLevel(buf.Bytes(), src, pw.compressLevel) 217 } 218 219 var dataPageV1HeaderPool = sync.Pool{ 220 New: func() interface{} { return format.NewDataPageHeader() }, 221 } 222 223 func (pw *serializedPageWriter) setDataPageHeader(pageHdr *format.PageHeader, page *DataPageV1) { 224 pageHdr.Type = format.PageType_DATA_PAGE 225 hdr := dataPageV1HeaderPool.Get().(*format.DataPageHeader) 226 hdr.NumValues = page.nvals 227 hdr.Encoding = page.encoding 228 hdr.DefinitionLevelEncoding = page.defLvlEncoding 229 hdr.RepetitionLevelEncoding = page.repLvlEncoding 230 hdr.Statistics = page.statistics.ToThrift() 231 pageHdr.DataPageHeader = hdr 232 pageHdr.DataPageHeaderV2 = nil 233 pageHdr.DictionaryPageHeader = nil 234 } 235 236 var dataPageV2HeaderPool = sync.Pool{ 237 New: func() interface{} { return format.NewDataPageHeaderV2() }, 238 } 239 240 func (pw *serializedPageWriter) setDataPageV2Header(pageHdr *format.PageHeader, page *DataPageV2) { 241 pageHdr.Type = format.PageType_DATA_PAGE_V2 242 hdr := dataPageV2HeaderPool.Get().(*format.DataPageHeaderV2) 243 hdr.NumValues = page.nvals 244 hdr.NumNulls = page.nulls 245 hdr.NumRows = page.nrows 246 hdr.Encoding = page.encoding 247 hdr.DefinitionLevelsByteLength = page.defLvlByteLen 248 hdr.RepetitionLevelsByteLength = page.repLvlByteLen 249 hdr.IsCompressed = page.compressed 250 hdr.Statistics = page.statistics.ToThrift() 251 pageHdr.DataPageHeaderV2 = hdr 252 pageHdr.DataPageHeader = nil 253 pageHdr.DictionaryPageHeader = nil 254 } 255 256 func (pw *serializedPageWriter) HasCompressor() bool { return pw.compressor != nil } 257 func (pw *serializedPageWriter) NumValues() int64 { return pw.nvalues } 258 func (pw *serializedPageWriter) DictionaryPageOffset() int64 { return pw.dictPageOffset } 259 func (pw *serializedPageWriter) DataPageoffset() int64 { return pw.dataPageOffset } 260 func (pw *serializedPageWriter) TotalCompressedSize() int64 { return pw.totalCompressed } 261 func (pw *serializedPageWriter) TotalUncompressedSize() int64 { return pw.totalUncompressed } 262 263 func (pw *serializedPageWriter) WriteDictionaryPage(page *DictionaryPage) (int64, error) { 264 uncompressed := len(page.Data()) 265 266 var data []byte 267 if pw.HasCompressor() { 268 var buffer bytes.Buffer 269 data = pw.Compress(&buffer, page.Data()) 270 // data = buffer.Bytes() 271 } else { 272 data = page.Data() 273 } 274 275 dictPageHeader := &format.DictionaryPageHeader{ 276 NumValues: page.NumValues(), 277 Encoding: page.Encoding(), 278 IsSorted: libthrift.BoolPtr(page.IsSorted()), 279 } 280 281 if pw.dataEncryptor != nil { 282 pw.updateEncryption(encryption.DictPageModule) 283 pw.encryptionBuf.Reset() 284 pw.encryptionBuf.Grow(pw.dataEncryptor.CiphertextSizeDelta() + len(data)) 285 pw.dataEncryptor.Encrypt(&pw.encryptionBuf, data) 286 data = pw.encryptionBuf.Bytes() 287 } 288 289 pageHdr := pageHeaderPool.Get().(*format.PageHeader) 290 defer pageHeaderPool.Put(pageHdr) 291 pageHdr.Type = format.PageType_DICTIONARY_PAGE 292 pageHdr.UncompressedPageSize = int32(uncompressed) 293 pageHdr.CompressedPageSize = int32(len(data)) 294 pageHdr.DictionaryPageHeader = dictPageHeader 295 pageHdr.DataPageHeader = nil 296 pageHdr.DataPageHeaderV2 = nil 297 298 startPos := pw.sink.Tell() 299 if pw.dictPageOffset == 0 { 300 pw.dictPageOffset = int64(startPos) 301 } 302 303 if pw.metaEncryptor != nil { 304 if err := pw.updateEncryption(encryption.DictPageHeaderModule); err != nil { 305 return 0, err 306 } 307 } 308 headerSize, err := pw.thriftSerializer.Serialize(pageHdr, pw.sink, pw.metaEncryptor) 309 if err != nil { 310 return 0, err 311 } 312 written, err := pw.sink.Write(data) 313 if err != nil { 314 return 0, err 315 } 316 317 written += headerSize 318 319 pw.totalUncompressed += int64(uncompressed + headerSize) 320 pw.totalCompressed = int64(written) 321 pw.dictEncodingStats[parquet.Encoding(page.encoding)]++ 322 return int64(written), nil 323 } 324 325 var pageHeaderPool = sync.Pool{ 326 New: func() interface{} { 327 return format.NewPageHeader() 328 }, 329 } 330 331 func (pw *serializedPageWriter) WriteDataPage(page DataPage) (int64, error) { 332 uncompressed := page.UncompressedSize() 333 data := page.Data() 334 335 if pw.dataEncryptor != nil { 336 if err := pw.updateEncryption(encryption.DataPageModule); err != nil { 337 return 0, err 338 } 339 pw.encryptionBuf.Reset() 340 pw.encryptionBuf.Grow(pw.dataEncryptor.CiphertextSizeDelta() + len(data)) 341 pw.dataEncryptor.Encrypt(&pw.encryptionBuf, data) 342 data = pw.encryptionBuf.Bytes() 343 } 344 345 pageHdr := pageHeaderPool.Get().(*format.PageHeader) 346 defer pageHeaderPool.Put(pageHdr) 347 pageHdr.UncompressedPageSize = uncompressed 348 pageHdr.CompressedPageSize = int32(len(data)) 349 350 switch dpage := page.(type) { 351 case *DataPageV1: 352 pw.setDataPageHeader(pageHdr, dpage) 353 defer dataPageV1HeaderPool.Put(pageHdr.DataPageHeader) 354 case *DataPageV2: 355 pw.setDataPageV2Header(pageHdr, dpage) 356 defer dataPageV2HeaderPool.Put(pageHdr.DataPageHeaderV2) 357 default: 358 return 0, xerrors.New("parquet: unexpected page type") 359 } 360 361 startPos := pw.sink.Tell() 362 if pw.pageOrdinal == 0 { 363 pw.dataPageOffset = int64(startPos) 364 } 365 366 if pw.metaEncryptor != nil { 367 if err := pw.updateEncryption(encryption.DataPageHeaderModule); err != nil { 368 return 0, err 369 } 370 } 371 headerSize, err := pw.thriftSerializer.Serialize(pageHdr, pw.sink, pw.metaEncryptor) 372 if err != nil { 373 return 0, err 374 } 375 written, err := pw.sink.Write(data) 376 if err != nil { 377 return int64(written), err 378 } 379 written += headerSize 380 381 pw.totalUncompressed += int64(uncompressed) + int64(headerSize) 382 pw.totalCompressed += int64(written) 383 pw.nvalues += int64(page.NumValues()) 384 pw.dataEncodingStats[parquet.Encoding(page.Encoding())]++ 385 pw.pageOrdinal++ 386 return int64(written), nil 387 } 388 389 type bufferedPageWriter struct { 390 finalSink utils.WriterTell 391 inMemSink *encoding.BufferWriter 392 metadata *metadata.ColumnChunkMetaDataBuilder 393 pager *serializedPageWriter 394 hasDictionaryPages bool 395 } 396 397 func newBufferedPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, mem memory.Allocator, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) { 398 wr := &bufferedPageWriter{ 399 finalSink: sink, 400 metadata: metadata, 401 hasDictionaryPages: false, 402 inMemSink: encoding.NewBufferWriter(0, mem), 403 } 404 pager, err := createSerializedPageWriter(wr.inMemSink, codec, compressionLevel, metadata, rgOrdinal, columnOrdinal, mem, metaEncryptor, dataEncryptor) 405 if err != nil { 406 return nil, err 407 } 408 wr.pager = pager.(*serializedPageWriter) 409 return wr, nil 410 } 411 412 func (bw *bufferedPageWriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error { 413 bw.finalSink = sink 414 bw.metadata = metadata 415 bw.hasDictionaryPages = false 416 bw.inMemSink.Reset(0) 417 418 return bw.pager.Reset(bw.inMemSink, codec, compressionLevel, metadata, rgOrdinal, columnOrdinal, metaEncryptor, dataEncryptor) 419 } 420 421 func (bw *bufferedPageWriter) WriteDictionaryPage(page *DictionaryPage) (int64, error) { 422 bw.hasDictionaryPages = true 423 return bw.pager.WriteDictionaryPage(page) 424 } 425 426 func (bw *bufferedPageWriter) Close(hasDict, fallback bool) error { 427 if bw.pager.metaEncryptor != nil { 428 bw.pager.updateEncryption(encryption.ColumnMetaModule) 429 } 430 431 position := bw.finalSink.Tell() 432 dictOffset := int64(0) 433 if bw.hasDictionaryPages { 434 dictOffset = bw.pager.DictionaryPageOffset() + position 435 } 436 437 chunkInfo := metadata.ChunkMetaInfo{ 438 NumValues: bw.pager.NumValues(), 439 DictPageOffset: dictOffset, 440 IndexPageOffset: -1, 441 DataPageOffset: bw.pager.DataPageoffset() + position, 442 CompressedSize: bw.pager.TotalCompressedSize(), 443 UncompressedSize: bw.pager.TotalUncompressedSize(), 444 } 445 encodingStats := metadata.EncodingStats{ 446 DictEncodingStats: bw.pager.dictEncodingStats, 447 DataEncodingStats: bw.pager.dataEncodingStats, 448 } 449 bw.metadata.Finish(chunkInfo, hasDict, fallback, encodingStats, bw.pager.metaEncryptor) 450 bw.metadata.WriteTo(bw.inMemSink) 451 452 buf := bw.inMemSink.Finish() 453 defer buf.Release() 454 _, err := bw.finalSink.Write(buf.Bytes()) 455 return err 456 } 457 458 func (bw *bufferedPageWriter) WriteDataPage(page DataPage) (int64, error) { 459 return bw.pager.WriteDataPage(page) 460 } 461 462 func (bw *bufferedPageWriter) HasCompressor() bool { 463 return bw.pager.HasCompressor() 464 } 465 466 func (bw *bufferedPageWriter) Compress(buf *bytes.Buffer, src []byte) []byte { 467 return bw.pager.Compress(buf, src) 468 }