github.com/apache/arrow/go/v7@v7.0.1/parquet/file/page_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "bytes" 21 "sync" 22 23 "github.com/apache/arrow/go/v7/arrow/memory" 24 "github.com/apache/arrow/go/v7/parquet" 25 "github.com/apache/arrow/go/v7/parquet/compress" 26 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 27 "github.com/apache/arrow/go/v7/parquet/internal/encryption" 28 format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet" 29 "github.com/apache/arrow/go/v7/parquet/internal/thrift" 30 "github.com/apache/arrow/go/v7/parquet/internal/utils" 31 "github.com/apache/arrow/go/v7/parquet/metadata" 32 libthrift "github.com/apache/thrift/lib/go/thrift" 33 "golang.org/x/xerrors" 34 ) 35 36 // PageWriter is the interface for both serialized and buffered page writers 37 type PageWriter interface { 38 // Closes the current page, flushing any buffered data pages/dictionary pages 39 // based on the input parameters. Subsequent calls have no effect. 40 Close(hasDict, fallback bool) error 41 // Write the provided datapage out to the underlying writer 42 WriteDataPage(page DataPage) (int64, error) 43 // Write the provided dictionary page out to the underlying writer 44 WriteDictionaryPage(page *DictionaryPage) (int64, error) 45 // returns true if there is a configured compressor for the data 46 HasCompressor() bool 47 // use the configured compressor and writer properties to compress the data in src 48 // using the buffer buf. Returns the slice of the compressed bytes which may be 49 // the bytes in the provided buffer 50 Compress(buf *bytes.Buffer, src []byte) []byte 51 // Allow reuse of the pagewriter object by resetting it using these values instead 52 // of having to create a new object. 53 Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error 54 } 55 56 type serializedPageWriter struct { 57 mem memory.Allocator 58 metaData *metadata.ColumnChunkMetaDataBuilder 59 sink utils.WriterTell 60 61 nvalues int64 62 dictPageOffset int64 63 dataPageOffset int64 64 totalUncompressed int64 65 totalCompressed int64 66 pageOrdinal int16 67 rgOrdinal int16 68 columnOrdinal int16 69 70 compressLevel int 71 compressor compress.Codec 72 metaEncryptor encryption.Encryptor 73 dataEncryptor encryption.Encryptor 74 encryptionBuf bytes.Buffer 75 76 dataPageAAD []byte 77 dataPageHeaderAAD []byte 78 79 dictEncodingStats map[parquet.Encoding]int32 80 dataEncodingStats map[parquet.Encoding]int32 81 82 thriftSerializer *thrift.Serializer 83 } 84 85 func createSerializedPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) { 86 var ( 87 compressor compress.Codec 88 err error 89 ) 90 if codec != compress.Codecs.Uncompressed { 91 compressor, err = compress.GetCodec(codec) 92 if err != nil { 93 return nil, err 94 } 95 } 96 97 pgwriter := &serializedPageWriter{ 98 sink: sink, 99 compressor: compressor, 100 compressLevel: compressionLevel, 101 metaData: metadata, 102 rgOrdinal: rowGroupOrdinal, 103 columnOrdinal: columnChunkOrdinal, 104 mem: mem, 105 metaEncryptor: metaEncryptor, 106 dataEncryptor: dataEncryptor, 107 dictEncodingStats: make(map[parquet.Encoding]int32), 108 dataEncodingStats: make(map[parquet.Encoding]int32), 109 thriftSerializer: thrift.NewThriftSerializer(), 110 } 111 if metaEncryptor != nil || dataEncryptor != nil { 112 pgwriter.initEncryption() 113 } 114 return pgwriter, nil 115 } 116 117 // NewPageWriter returns a page writer using either the buffered or serialized implementations 118 func NewPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, mem memory.Allocator, buffered bool, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) { 119 if buffered { 120 return newBufferedPageWriter(sink, codec, compressionLevel, metadata, rowGroupOrdinal, columnChunkOrdinal, mem, metaEncryptor, dataEncryptor) 121 } 122 return createSerializedPageWriter(sink, codec, compressionLevel, metadata, rowGroupOrdinal, columnChunkOrdinal, mem, metaEncryptor, dataEncryptor) 123 } 124 125 // Reset allows reusing the pagewriter object instead of creating a new one. 126 func (pw *serializedPageWriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rowGroupOrdinal, columnChunkOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error { 127 var ( 128 compressor compress.Codec 129 err error 130 ) 131 if codec != compress.Codecs.Uncompressed { 132 compressor, err = compress.GetCodec(codec) 133 if err != nil { 134 return err 135 } 136 } 137 138 pw.sink = sink 139 pw.compressor = compressor 140 pw.compressLevel = compressionLevel 141 pw.metaData = metadata 142 pw.rgOrdinal = rowGroupOrdinal 143 pw.columnOrdinal = columnChunkOrdinal 144 pw.metaEncryptor = metaEncryptor 145 pw.dataEncryptor = dataEncryptor 146 pw.dictEncodingStats = make(map[parquet.Encoding]int32) 147 pw.dataEncodingStats = make(map[parquet.Encoding]int32) 148 149 pw.nvalues = 0 150 pw.dictPageOffset = 0 151 pw.dataPageOffset = 0 152 pw.totalUncompressed = 0 153 pw.totalCompressed = 0 154 pw.pageOrdinal = 0 155 156 if metaEncryptor != nil || dataEncryptor != nil { 157 pw.initEncryption() 158 } 159 return nil 160 } 161 162 func (pw *serializedPageWriter) initEncryption() { 163 if pw.dataEncryptor != nil { 164 pw.dataPageAAD = []byte(encryption.CreateModuleAad(pw.dataEncryptor.FileAad(), encryption.DataPageModule, pw.rgOrdinal, pw.columnOrdinal, -1)) 165 } 166 if pw.metaEncryptor != nil { 167 pw.dataPageHeaderAAD = []byte(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), encryption.DataPageHeaderModule, pw.rgOrdinal, pw.columnOrdinal, -1)) 168 } 169 } 170 171 func (pw *serializedPageWriter) updateEncryption(moduleType int8) error { 172 switch moduleType { 173 case encryption.ColumnMetaModule: 174 pw.metaEncryptor.UpdateAad(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1)) 175 case encryption.DataPageModule: 176 encryption.QuickUpdatePageAad(pw.dataPageAAD, pw.pageOrdinal) 177 pw.dataEncryptor.UpdateAad(string(pw.dataPageAAD)) 178 case encryption.DataPageHeaderModule: 179 encryption.QuickUpdatePageAad(pw.dataPageHeaderAAD, pw.pageOrdinal) 180 pw.metaEncryptor.UpdateAad(string(pw.dataPageHeaderAAD)) 181 case encryption.DictPageHeaderModule: 182 pw.metaEncryptor.UpdateAad(encryption.CreateModuleAad(pw.metaEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1)) 183 case encryption.DictPageModule: 184 pw.dataEncryptor.UpdateAad(encryption.CreateModuleAad(pw.dataEncryptor.FileAad(), moduleType, pw.rgOrdinal, pw.columnOrdinal, -1)) 185 } 186 return xerrors.New("unknown module type in updateencryption") 187 } 188 189 func (pw *serializedPageWriter) Close(hasDict, fallback bool) error { 190 if pw.metaEncryptor != nil { 191 pw.updateEncryption(encryption.ColumnMetaModule) 192 } 193 194 chunkInfo := metadata.ChunkMetaInfo{ 195 NumValues: pw.nvalues, 196 DictPageOffset: pw.dictPageOffset, 197 IndexPageOffset: -1, 198 DataPageOffset: pw.dataPageOffset, 199 CompressedSize: pw.totalCompressed, 200 UncompressedSize: pw.totalUncompressed, 201 } 202 encodingStats := metadata.EncodingStats{ 203 DictEncodingStats: pw.dictEncodingStats, 204 DataEncodingStats: pw.dataEncodingStats, 205 } 206 pw.metaData.Finish(chunkInfo, hasDict, fallback, encodingStats, pw.metaEncryptor) 207 _, err := pw.metaData.WriteTo(pw.sink) 208 return err 209 } 210 211 func (pw *serializedPageWriter) Compress(buf *bytes.Buffer, src []byte) []byte { 212 maxCompressed := pw.compressor.CompressBound(int64(len(src))) 213 buf.Grow(int(maxCompressed)) 214 return pw.compressor.EncodeLevel(buf.Bytes(), src, pw.compressLevel) 215 } 216 217 var dataPageV1HeaderPool = sync.Pool{ 218 New: func() interface{} { return format.NewDataPageHeader() }, 219 } 220 221 func (pw *serializedPageWriter) setDataPageHeader(pageHdr *format.PageHeader, page *DataPageV1) { 222 pageHdr.Type = format.PageType_DATA_PAGE 223 hdr := dataPageV1HeaderPool.Get().(*format.DataPageHeader) 224 hdr.NumValues = page.nvals 225 hdr.Encoding = page.encoding 226 hdr.DefinitionLevelEncoding = page.defLvlEncoding 227 hdr.RepetitionLevelEncoding = page.repLvlEncoding 228 hdr.Statistics = page.statistics.ToThrift() 229 pageHdr.DataPageHeader = hdr 230 pageHdr.DataPageHeaderV2 = nil 231 pageHdr.DictionaryPageHeader = nil 232 } 233 234 var dataPageV2HeaderPool = sync.Pool{ 235 New: func() interface{} { return format.NewDataPageHeaderV2() }, 236 } 237 238 func (pw *serializedPageWriter) setDataPageV2Header(pageHdr *format.PageHeader, page *DataPageV2) { 239 pageHdr.Type = format.PageType_DATA_PAGE_V2 240 hdr := dataPageV2HeaderPool.Get().(*format.DataPageHeaderV2) 241 hdr.NumValues = page.nvals 242 hdr.NumNulls = page.nulls 243 hdr.NumRows = page.nrows 244 hdr.Encoding = page.encoding 245 hdr.DefinitionLevelsByteLength = page.defLvlByteLen 246 hdr.RepetitionLevelsByteLength = page.repLvlByteLen 247 hdr.IsCompressed = page.compressed 248 hdr.Statistics = page.statistics.ToThrift() 249 pageHdr.DataPageHeaderV2 = hdr 250 pageHdr.DataPageHeader = nil 251 pageHdr.DictionaryPageHeader = nil 252 } 253 254 func (pw *serializedPageWriter) HasCompressor() bool { return pw.compressor != nil } 255 func (pw *serializedPageWriter) NumValues() int64 { return pw.nvalues } 256 func (pw *serializedPageWriter) DictionaryPageOffset() int64 { return pw.dictPageOffset } 257 func (pw *serializedPageWriter) DataPageoffset() int64 { return pw.dataPageOffset } 258 func (pw *serializedPageWriter) TotalCompressedSize() int64 { return pw.totalCompressed } 259 func (pw *serializedPageWriter) TotalUncompressedSize() int64 { return pw.totalUncompressed } 260 261 func (pw *serializedPageWriter) WriteDictionaryPage(page *DictionaryPage) (int64, error) { 262 uncompressed := len(page.Data()) 263 264 var data []byte 265 if pw.HasCompressor() { 266 var buffer bytes.Buffer 267 data = pw.Compress(&buffer, page.Data()) 268 // data = buffer.Bytes() 269 } else { 270 data = page.Data() 271 } 272 273 dictPageHeader := &format.DictionaryPageHeader{ 274 NumValues: page.NumValues(), 275 Encoding: page.Encoding(), 276 IsSorted: libthrift.BoolPtr(page.IsSorted()), 277 } 278 279 if pw.dataEncryptor != nil { 280 pw.updateEncryption(encryption.DictPageModule) 281 pw.encryptionBuf.Reset() 282 pw.encryptionBuf.Grow(pw.dataEncryptor.CiphertextSizeDelta() + len(data)) 283 pw.dataEncryptor.Encrypt(&pw.encryptionBuf, data) 284 data = pw.encryptionBuf.Bytes() 285 } 286 287 pageHdr := pageHeaderPool.Get().(*format.PageHeader) 288 defer pageHeaderPool.Put(pageHdr) 289 pageHdr.Type = format.PageType_DICTIONARY_PAGE 290 pageHdr.UncompressedPageSize = int32(uncompressed) 291 pageHdr.CompressedPageSize = int32(len(data)) 292 pageHdr.DictionaryPageHeader = dictPageHeader 293 pageHdr.DataPageHeader = nil 294 pageHdr.DataPageHeaderV2 = nil 295 296 startPos := pw.sink.Tell() 297 if pw.dictPageOffset == 0 { 298 pw.dictPageOffset = int64(startPos) 299 } 300 301 if pw.metaEncryptor != nil { 302 if err := pw.updateEncryption(encryption.DictPageHeaderModule); err != nil { 303 return 0, err 304 } 305 } 306 headerSize, err := pw.thriftSerializer.Serialize(pageHdr, pw.sink, pw.metaEncryptor) 307 if err != nil { 308 return 0, err 309 } 310 written, err := pw.sink.Write(data) 311 if err != nil { 312 return 0, err 313 } 314 315 written += headerSize 316 317 pw.totalUncompressed += int64(uncompressed + headerSize) 318 pw.totalCompressed = int64(written) 319 pw.dictEncodingStats[parquet.Encoding(page.encoding)]++ 320 return int64(written), nil 321 } 322 323 var pageHeaderPool = sync.Pool{ 324 New: func() interface{} { 325 return format.NewPageHeader() 326 }, 327 } 328 329 func (pw *serializedPageWriter) WriteDataPage(page DataPage) (int64, error) { 330 uncompressed := page.UncompressedSize() 331 data := page.Data() 332 333 if pw.dataEncryptor != nil { 334 if err := pw.updateEncryption(encryption.DataPageModule); err != nil { 335 return 0, err 336 } 337 pw.encryptionBuf.Reset() 338 pw.encryptionBuf.Grow(pw.dataEncryptor.CiphertextSizeDelta() + len(data)) 339 pw.dataEncryptor.Encrypt(&pw.encryptionBuf, data) 340 data = pw.encryptionBuf.Bytes() 341 } 342 343 pageHdr := pageHeaderPool.Get().(*format.PageHeader) 344 defer pageHeaderPool.Put(pageHdr) 345 pageHdr.UncompressedPageSize = uncompressed 346 pageHdr.CompressedPageSize = int32(len(data)) 347 348 switch dpage := page.(type) { 349 case *DataPageV1: 350 pw.setDataPageHeader(pageHdr, dpage) 351 defer dataPageV1HeaderPool.Put(pageHdr.DataPageHeader) 352 case *DataPageV2: 353 pw.setDataPageV2Header(pageHdr, dpage) 354 defer dataPageV2HeaderPool.Put(pageHdr.DataPageHeaderV2) 355 default: 356 return 0, xerrors.New("parquet: unexpected page type") 357 } 358 359 startPos := pw.sink.Tell() 360 if pw.pageOrdinal == 0 { 361 pw.dataPageOffset = int64(startPos) 362 } 363 364 if pw.metaEncryptor != nil { 365 if err := pw.updateEncryption(encryption.DataPageHeaderModule); err != nil { 366 return 0, err 367 } 368 } 369 headerSize, err := pw.thriftSerializer.Serialize(pageHdr, pw.sink, pw.metaEncryptor) 370 if err != nil { 371 return 0, err 372 } 373 written, err := pw.sink.Write(data) 374 if err != nil { 375 return int64(written), err 376 } 377 written += headerSize 378 379 pw.totalUncompressed += int64(uncompressed) + int64(headerSize) 380 pw.totalCompressed += int64(written) 381 pw.nvalues += int64(page.NumValues()) 382 pw.dataEncodingStats[parquet.Encoding(page.Encoding())]++ 383 pw.pageOrdinal++ 384 return int64(written), nil 385 } 386 387 type bufferedPageWriter struct { 388 finalSink utils.WriterTell 389 inMemSink *encoding.BufferWriter 390 metadata *metadata.ColumnChunkMetaDataBuilder 391 pager *serializedPageWriter 392 hasDictionaryPages bool 393 } 394 395 func newBufferedPageWriter(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, mem memory.Allocator, metaEncryptor, dataEncryptor encryption.Encryptor) (PageWriter, error) { 396 wr := &bufferedPageWriter{ 397 finalSink: sink, 398 metadata: metadata, 399 hasDictionaryPages: false, 400 inMemSink: encoding.NewBufferWriter(0, mem), 401 } 402 pager, err := createSerializedPageWriter(wr.inMemSink, codec, compressionLevel, metadata, rgOrdinal, columnOrdinal, mem, metaEncryptor, dataEncryptor) 403 if err != nil { 404 return nil, err 405 } 406 wr.pager = pager.(*serializedPageWriter) 407 return wr, nil 408 } 409 410 func (bw *bufferedPageWriter) Reset(sink utils.WriterTell, codec compress.Compression, compressionLevel int, metadata *metadata.ColumnChunkMetaDataBuilder, rgOrdinal, columnOrdinal int16, metaEncryptor, dataEncryptor encryption.Encryptor) error { 411 bw.finalSink = sink 412 bw.metadata = metadata 413 bw.hasDictionaryPages = false 414 bw.inMemSink.Reset(0) 415 416 return bw.pager.Reset(bw.inMemSink, codec, compressionLevel, metadata, rgOrdinal, columnOrdinal, metaEncryptor, dataEncryptor) 417 } 418 419 func (bw *bufferedPageWriter) WriteDictionaryPage(page *DictionaryPage) (int64, error) { 420 bw.hasDictionaryPages = true 421 return bw.pager.WriteDictionaryPage(page) 422 } 423 424 func (bw *bufferedPageWriter) Close(hasDict, fallback bool) error { 425 if bw.pager.metaEncryptor != nil { 426 bw.pager.updateEncryption(encryption.ColumnMetaModule) 427 } 428 429 position := bw.finalSink.Tell() 430 dictOffset := int64(0) 431 if bw.hasDictionaryPages { 432 dictOffset = bw.pager.DictionaryPageOffset() + position 433 } 434 435 chunkInfo := metadata.ChunkMetaInfo{ 436 NumValues: bw.pager.NumValues(), 437 DictPageOffset: dictOffset, 438 IndexPageOffset: -1, 439 DataPageOffset: bw.pager.DataPageoffset() + position, 440 CompressedSize: bw.pager.TotalCompressedSize(), 441 UncompressedSize: bw.pager.TotalUncompressedSize(), 442 } 443 encodingStats := metadata.EncodingStats{ 444 DictEncodingStats: bw.pager.dictEncodingStats, 445 DataEncodingStats: bw.pager.dataEncodingStats, 446 } 447 bw.metadata.Finish(chunkInfo, hasDict, fallback, encodingStats, bw.pager.metaEncryptor) 448 bw.metadata.WriteTo(bw.inMemSink) 449 450 buf := bw.inMemSink.Finish() 451 defer buf.Release() 452 _, err := bw.finalSink.Write(buf.Bytes()) 453 return err 454 } 455 456 func (bw *bufferedPageWriter) WriteDataPage(page DataPage) (int64, error) { 457 return bw.pager.WriteDataPage(page) 458 } 459 460 func (bw *bufferedPageWriter) HasCompressor() bool { 461 return bw.pager.HasCompressor() 462 } 463 464 func (bw *bufferedPageWriter) Compress(buf *bytes.Buffer, src []byte) []byte { 465 return bw.pager.Compress(buf, src) 466 }